From 4dfe0cb5a0f69fea5ae6f33a77a145cabc6a94f8 Mon Sep 17 00:00:00 2001 From: Digimer Date: Thu, 24 Sep 2020 02:09:18 -0400 Subject: [PATCH] * Created Cluster->boot_server, ->shutdown_server and ->migrate_server methods that handle booting, migrating and shutting down servers. Also created the private method ->_set_server_constraint which is used by migrate and boot to set resource constraints to control where a server boots or migrates to. * Did more work on parsing server data out of the CIB. There is still an issue with determining which node currently hosts a resource, however. * Renamed Server->boot to ->boot_virsh, ->shutdown to ->shutdown_virsh and ->migrate to ->migrate_virsh to clarify that these methods work on the raw virsh calls, outside of pacemaker (indeed, they are what the pacemaker RA uses to do what pacemaker asks). * Got more work done on the scan-cluster SA. * Created the empty files for the pending scan-server SA. Signed-off-by: Digimer --- Anvil/Tools/Cluster.pm | 757 +++++++++++++++++- Anvil/Tools/Server.pm | 40 +- ocf/alteeve/server | 12 +- scancore-agents/scan-cluster/scan-cluster | 29 +- scancore-agents/scan-cluster/scan-cluster.sql | 467 ++++++++++- scancore-agents/scan-cluster/scan-cluster.xml | 2 +- scancore-agents/scan-server/scan-server | 0 scancore-agents/scan-server/scan-server.sql | 0 scancore-agents/scan-server/scan-server.xml | 0 share/words.xml | 26 + tools/test.pl | 282 ++++++- 11 files changed, 1536 insertions(+), 79 deletions(-) create mode 100755 scancore-agents/scan-server/scan-server create mode 100644 scancore-agents/scan-server/scan-server.sql create mode 100644 scancore-agents/scan-server/scan-server.xml diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index afd48b5c..8d67420d 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -14,11 +14,15 @@ our $VERSION = "3.0.0"; my $THIS_FILE = "Cluster.pm"; ### Methods; +# boot_server # check_node_status # get_peers +# migrate_server # parse_cib +# shutdown_server # start_cluster # which_node +# _set_server_constraint =pod @@ -78,6 +82,162 @@ sub parent # Public methods # ############################################################################################################# +=head2 boot_server + +This uses pacemaker to boot a server. + +If there is a problem, C<< !!error!! >> is returned. + +Parameters; + +=head3 server (required) + +This is the name of the server to boot. + +=head3 node (optional) + +If set, a resource constraint is placed so that the server prefers one node over the other before it boots. + +B<< Note >>: The method relies on pacemaker to boot the server. If, for any reason, pacemaker decides the server can not be booted on the preferred node, it may boot on the other node. As such, this parameter does not guarantee that the server will be booted on the target node! + +=head3 wait (optional, default '1') + +This controls whether the method waits for the server to boot before returning. By default, it will go into a loop and check every 2 seconds to see if the server is running yet. Once it's found to be running, the method returns. If this is set to C<< 0 >>, the method will return as soon as the request to boot the server is issued. + +=cut +sub boot_server +{ + my $self = shift; + my $parameter = shift; + my $anvil = $self->parent; + my $debug = defined $parameter->{debug} ? 
$parameter->{debug} : 3; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Cluster->boot_server()" }}); + + my $node = defined $parameter->{node} ? $parameter->{node} : ""; + my $server = defined $parameter->{server} ? $parameter->{server} : ""; + my $wait = defined $parameter->{'wait'} ? $parameter->{'wait'} : 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + node => $node, + server => $server, + 'wait' => $wait, + }}); + + if (not $server) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Cluster->boot_server()", parameter => "server" }}); + return("!!error!!"); + } + + my $host_type = $anvil->Get->host_type({debug => $debug}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host_type => $host_type }}); + if ($host_type ne "node") + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0146", variables => { server => $server }}); + return("!!error!!"); + } + + my $problem = $anvil->Cluster->parse_cib({debug => $debug}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { problem => $problem }}); + if ($problem) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0145", variables => { server => $server }}); + return('!!error!!'); + } + + # Is this node fully in the cluster? + if (not $anvil->data->{cib}{parsed}{'local'}{ready}) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0147", variables => { server => $server }}); + return('!!error!!'); + } + + # Is the server one we know of? + if (not exists $anvil->data->{cib}{parsed}{data}{server}{$server}) + { + # The server isn't in the pacemaker config. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0149", variables => { server => $server }}); + return('!!error!!'); + } + + # Is the server already running? If so, do nothing. + my $status = $anvil->data->{cib}{parsed}{data}{server}{$server}{status}; + my $host = $anvil->data->{cib}{parsed}{data}{server}{$server}{host}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + status => $status, + host => $host, + }}); + + if ($status eq "running") + { + # Nothing to do. + if ((not $node) or ($host eq $node)) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0548", variables => { server => $server }}); + return(0); + } + else + { + # It's running, but on the other node. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "warning_0059", variables => { + server => $server, + requested_node => $node, + current_host => $host, + }}); + return(0); + } + } + + if ($node) + { + $anvil->Cluster->_set_server_constraint({ + server => $server, + preferred_node => $node, + }); + } + + # Now boot the server. + my ($output, $return_code) = $anvil->System->call({debug => 3, shell_call => $anvil->data->{path}{exe}{pcs}." resource enable ".$server}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + output => $output, + return_code => $return_code, + }}); + + if (not $wait) + { + # We're done. 
+ return(0); + } + + # Wait now for the server to start. + my $waiting = 1; + while($waiting) + { + $anvil->Cluster->parse_cib({debug => $debug}); + my $status = $anvil->data->{cib}{parsed}{data}{server}{$server}{status}; + my $host = $anvil->data->{cib}{parsed}{data}{server}{$server}{host}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + status => $status, + host => $host, + }}); + + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0552", variables => { server => $server }}); + if ($status eq "running") + { + # It's up. + $waiting = 0; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0553", variables => { server => $server }}); + } + else + { + # Wait a bit and check again. + sleep 2; + } + } + + return(0); +} + + =head2 check_node_status This takes a node name (generally the short host name) and, using a C<< parse_cib >> call data (made before calling this method), the node's ready state will be checked. If the node is ready, C<< 1 >> is returned. If not, C<< 0 >> is returned. If there is a problem, C<< !!error!! >> is returned. @@ -104,7 +264,7 @@ sub check_node_status if (not $node_name) { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Database->get_host_from_uuid()", parameter => "host_uuid" }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Cluster->check_node_status()", parameter => "node_name" }}); return("!!error!!"); } @@ -202,7 +362,7 @@ sub get_peers elsif ($host_uuid eq $anvil_node2_host_uuid) { # Found our Anvil!, and we're node 1. - $found = 1; + $found = 1; $anvil->data->{sys}{anvil}{i_am} = "node2"; $anvil->data->{sys}{anvil}{peer_is} = "node1"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { @@ -248,6 +408,181 @@ sub get_peers return($peer); } + +=head2 migrate_server + +This manipulates pacemaker's location constraints to trigger a pacemaker-controlled migration of a server. + +This method works by confirming that the server is running and is not on the target C<< node >>. If the server indeed needs to be migrated, a location constraint is set to give preference to the target node. Optionally, this method can wait until the migration is complete. + +B<< Note >>: This method does not make the actual C<< virsh >> call! To perform a migration B<< OUTSIDE >> pacemaker, use C<< Server->migrate_virsh() >>. + +Parameters; + +=head3 server (required) + +This is the server to migrate. + +=head3 node (required) + +This is the name of the node to move the server to. + +=head3 wait (optional, default '1') + +This controls whether the method waits for the migration to complete before returning. By default, it will go into a loop and check every 2 seconds to see if the migration has finished. Once the server is found to be running on the target node, the method returns. If this is set to C<< 0 >>, the method will return as soon as the migration request is issued. + +=cut +sub migrate_server +{ + my $self = shift; + my $parameter = shift; + my $anvil = $self->parent; + my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Cluster->migrate_server()" }}); + + my $server = defined $parameter->{server} ? 
$parameter->{server} : ""; + my $node = defined $parameter->{node} ? $parameter->{node} : ""; + my $wait = defined $parameter->{'wait'} ? $parameter->{'wait'} : 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + server => $server, + node => $node, + 'wait' => $wait, + }}); + + if (not $server) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Cluster->migrate_server()", parameter => "server" }}); + return("!!error!!"); + } + + my $host_type = $anvil->Get->host_type({debug => $debug}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host_type => $host_type }}); + if ($host_type ne "node") + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0154", variables => { server => $server }}); + return("!!error!!"); + } + + my $problem = $anvil->Cluster->parse_cib({debug => $debug}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { problem => $problem }}); + if ($problem) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0155", variables => { server => $server }}); + return('!!error!!'); + } + + # Are both nodes fully in the cluster? + if (not $anvil->data->{cib}{parsed}{'local'}{ready}) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0156", variables => { server => $server }}); + return('!!error!!'); + } + if (not $anvil->data->{cib}{parsed}{peer}{ready}) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0157", variables => { server => $server }}); + return('!!error!!'); + } + + # Is the server one we know of? + if (not exists $anvil->data->{cib}{parsed}{data}{server}{$server}) + { + # The server isn't in the pacemaker config. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0158", variables => { server => $server }}); + return('!!error!!'); + } + + # Is the server already running? If so, where? + my $status = $anvil->data->{cib}{parsed}{data}{server}{$server}{status}; + my $host = $anvil->data->{cib}{parsed}{data}{server}{$server}{host}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + status => $status, + host => $host, + }}); + + if ($status eq "off") + { + # It's not running on either node, nothing to do. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "warning_0061", variables => { + server => $server, + requested_node => $node, + }}); + return(0); + } + elsif (($status eq "running") && ($host eq $node)) + { + # Already running on the target. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0549", variables => { + server => $server, + requested_node => $node, + }}); + return(0); + } + elsif ($status ne "running") + { + # The server is in an unknown state. + # It's in an unknown state, abort. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "warning_0060", variables => { + server => $server, + current_host => $host, + current_state => $status, + }}); + return('!!error!!'); + } + + # TODO: Record that the server is migrating + + # change the constraint to trigger the move. 
+ if ($node) + { + $anvil->Cluster->_set_server_constraint({ + server => $server, + preferred_node => $node, + }); + } + + if (not $wait) + { + # We'll leave it to the scan-server scan agent to clear the migration flag from the database. + return(0); + } + + # Wait now for the migration to complete. + my $waiting = 1; + while($waiting) + { + $anvil->Cluster->parse_cib({debug => $debug}); + my $status = $anvil->data->{cib}{parsed}{data}{server}{$server}{status}; + my $host = $anvil->data->{cib}{parsed}{data}{server}{$server}{host}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + status => $status, + host => $host, + }}); + + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0550", variables => { + server => $server, + requested_node => $node, + }}); + if (($status eq "running") && ($host eq $node)) + { + # It's done. + $waiting = 0; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0551", variables => { + server => $server, + requested_node => $node, + }}); + } + else + { + # Wait a bit and check again. + sleep 2; + } + } + + + return(0); +} + + =head2 parse_cib This reads in the CIB XML and parses it. On success, it returns C<< 0 >>. On failure (ie: pcsd isn't running), returns C<< 1 >>. @@ -565,7 +900,17 @@ sub parse_cib }}); # Is this me or the peer? - if (($node_name ne $anvil->Get->host_name) && ($node_name ne $anvil->Get->short_host_name)) + if (($node_name eq $anvil->Get->host_name) or ($node_name eq $anvil->Get->short_host_name)) + { + # Me. + $anvil->data->{cib}{parsed}{'local'}{ready} = $ready; + $anvil->data->{cib}{parsed}{'local'}{name} = $node_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "cib::parsed::local::ready" => $anvil->data->{cib}{parsed}{'local'}{ready}, + "cib::parsed::local::name" => $anvil->data->{cib}{parsed}{'local'}{name}, + }}); + } + else + { + # It's our peer. 
$anvil->data->{cib}{parsed}{peer}{ready} = $ready; @@ -701,10 +1046,300 @@ sub parse_cib }}); } + # Hosted server information + foreach my $id (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{cib}{status}{node_state}}) + { + my $node_name = $anvil->data->{cib}{parsed}{configuration}{nodes}{$id}{uname}; + foreach my $lrm_id (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}}) + { + foreach my $lrm_resource_id (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}}) + { + my $lrm_resource_operations_count = keys %{$anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}}; + foreach my $lrm_rsc_op_id (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}}) + { + my $type = $anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{type}; + my $class = $anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{class}; + my $operation = $anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}{$lrm_rsc_op_id}{operation}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + lrm_resource_operations_count => $lrm_resource_operations_count, + type => $type, + class => $class, + operation => $operation, + lrm_rsc_op_id => $lrm_rsc_op_id, + }}); + + # Skip unless it's a server. + next if $type ne "server"; + + # This will be updated below if the server is running. + if (not exists $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}) + { + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{status} = "off"; + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_operation} = "unknown"; + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_operation_rc_code} = "-1"; + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{host} = ""; + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_monitor_rc_code} = "-1"; + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_failure_operation} = ""; + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_failure_return_code} = "-1"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "cib::parsed::data::server::${lrm_resource_id}::status" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{status}, + "cib::parsed::data::server::${lrm_resource_id}::host" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{host}, + "cib::parsed::data::server::${lrm_resource_id}::last_monitor_rc_code" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_monitor_rc_code}, + "cib::parsed::data::server::${lrm_resource_id}::last_operation" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_operation}, + "cib::parsed::data::server::${lrm_resource_id}::last_operation_rc_code" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_operation_rc_code}, + "cib::parsed::data::server::${lrm_resource_id}::last_failure_operation" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_operation}, + "cib::parsed::data::server::${lrm_resource_id}::last_failure_return_code" => 
$anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_failure_return_code}, + }}); + } + + # If there are two LRM resource operation IDs, then the server is + # running on this node. Generally (always?) there will be a + # '$lrm_rsc_op_id' called '_last_0'. If there is a second one + # with '_monitor' in it, the server is running locally (we always have + # a monitor operation defined). + if (($lrm_resource_operations_count > 1) && ($lrm_rsc_op_id !~ /_last_/)) + { + # The server is (should be) running. + # - return code is from the RA's last status check. + # - exit-reason is the STDERR of the RA + # - + my $last_return_code = $anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}{$lrm_rsc_op_id}{'rc-code'}; + my $status = "unknown"; + if ($last_return_code eq "0") + { + $status = "running"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { status => $status }}); + } + elsif ($last_return_code eq "7") + { + $status = "stopped"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { status => $status }}); + } + else + { + $status = "error_condition - rc: ".$last_return_code; + + # Log all variables in case there is anything useful. + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { status => $status }}); + foreach my $variable (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}{$lrm_rsc_op_id}}) + { + my $value = $anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}{$lrm_rsc_op_id}{$variable}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { + "cib::parsed::cib::status::node_state::${id}::lrm_id::${lrm_id}::lrm_resource::${lrm_resource_id}::lrm_rsc_op_id::${lrm_rsc_op_id}::${variable}" => $anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}{$lrm_rsc_op_id}{$variable}, + }}); + } + } + + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{status} = $status; + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{host} = $anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}{$lrm_rsc_op_id}{'on_node'}; + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_monitor_rc_code} = $anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}{$lrm_rsc_op_id}{'rc-code'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "cib::parsed::data::server::${lrm_resource_id}::status" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{status}, + "cib::parsed::data::server::${lrm_resource_id}::host" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{host}, + "cib::parsed::data::server::${lrm_resource_id}::last_monitor_rc_code" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_monitor_rc_code}, + }}); + } + elsif ($lrm_rsc_op_id =~ /_last_failure_/) + { + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_failure_operation} = $operation; + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_failure_return_code} = 
$anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}{$lrm_rsc_op_id}{'rc-code'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "cib::parsed::data::server::${lrm_resource_id}::last_failure_operation" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_failure_operation}, + "cib::parsed::data::server::${lrm_resource_id}::last_failure_return_code" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_failure_return_code}, + }}); + } + else + { + # This isn't a monirot operation, so it will contain the most + # recent data on the server. + if ($anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_operation} eq "unknown") + { + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_operation} = $operation; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "cib::parsed::data::server::${lrm_resource_id}::last_operation" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_operation}, + }}); + } + if ($anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_operation_rc_code} eq "-1") + { + $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_operation_rc_code} = $anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}{$lrm_rsc_op_id}{'rc-code'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "cib::parsed::data::server::${lrm_resource_id}::last_operation_rc_code" => $anvil->data->{cib}{parsed}{data}{server}{$lrm_resource_id}{last_operation_rc_code}, + }}); + } + } + + print "Node: [".$node_name."] (".$id."), lrm_id: [".$lrm_id."], lrm_resource_id: [".$lrm_resource_id."] (type: [".$type."], class: [".$class."]), lrm_rsc_op_id: [".$lrm_rsc_op_id."] (".$lrm_resource_operations_count.")\n"; + foreach my $variable (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}{$lrm_rsc_op_id}}) + { + my $value = $anvil->data->{cib}{parsed}{cib}{status}{node_state}{$id}{lrm_id}{$lrm_id}{lrm_resource}{$lrm_resource_id}{lrm_rsc_op_id}{$lrm_rsc_op_id}{$variable}; + print "- Variable: [".$variable."], value: [".$value."]\n"; + } + } + } + } + } + + # Debug code. 
+ foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{server}}) + { + my $last_operation = $anvil->data->{cib}{parsed}{data}{server}{$server}{last_operation}; + my $last_operation_rc_code = $anvil->data->{cib}{parsed}{data}{server}{$server}{last_operation_rc_code}; + my $status = $anvil->data->{cib}{parsed}{data}{server}{$server}{status}; + my $host = $anvil->data->{cib}{parsed}{data}{server}{$server}{host}; + my $last_monitor_rc_code = $anvil->data->{cib}{parsed}{data}{server}{$server}{last_monitor_rc_code}; + my $last_failure_operation = $anvil->data->{cib}{parsed}{data}{server}{$server}{last_failure_operation}; + my $last_failure_return_code = $anvil->data->{cib}{parsed}{data}{server}{$server}{last_failure_return_code}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + 's1:server' => $server, + 's2:host' => $host, + 's3:status' => $status, + 's4:last_monitor_rc_code' => $last_monitor_rc_code, + 's5:last_operation' => $last_operation, + 's6:last_operation_rc_code' => $last_operation_rc_code, + 's7:last_failure_operation' => $last_failure_operation, + 's8:last_failure_return_code' => $last_failure_return_code, + }}); + } + return($problem); } +=head2 shutdown_server + +This shuts down a server that is running on the Anvil! system. + +Parameters; + +=head3 server (required) + +This is the name of the server to shut down. + +=head3 wait (optional, default '1') + +This controls whether the method waits for the server to shut down before returning. By default, it will go into a loop and check every 2 seconds to see if the server is still running. Once it's found to be off, the method returns. If this is set to C<< 0 >>, the method will return as soon as the request to shut down the server is issued. + +=cut +sub shutdown_server +{ + my $self = shift; + my $parameter = shift; + my $anvil = $self->parent; + my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Cluster->shutdown_server()" }}); + + my $server = defined $parameter->{server} ? $parameter->{server} : ""; + my $wait = defined $parameter->{'wait'} ? $parameter->{'wait'} : 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + server => $server, + 'wait' => $wait, + }}); + + if (not $server) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Cluster->shutdown_server()", parameter => "server" }}); + return("!!error!!"); + } + + my $host_type = $anvil->Get->host_type({debug => $debug}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host_type => $host_type }}); + if ($host_type ne "node") + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0150", variables => { server => $server }}); + return("!!error!!"); + } + + my $problem = $anvil->Cluster->parse_cib({debug => $debug}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { problem => $problem }}); + if ($problem) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0151", variables => { server => $server }}); + return('!!error!!'); + } + + # Is this node fully in the cluster? 
+ if (not $anvil->data->{cib}{parsed}{'local'}{ready}) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0152", variables => { server => $server }}); + return('!!error!!'); + } + + # Is the server one we know of? + if (not exists $anvil->data->{cib}{parsed}{data}{server}{$server}) + { + # The server isn't in the pacemaker config. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0153", variables => { server => $server }}); + return('!!error!!'); + } + + # Is the server already off? If so, do nothing. + my $status = $anvil->data->{cib}{parsed}{data}{server}{$server}{status}; + my $host = $anvil->data->{cib}{parsed}{data}{server}{$server}{host}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + status => $status, + host => $host, + }}); + + if ($status eq "off") + { + # Already off. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0548", variables => { server => $server }}); + return(0); + } + elsif ($status ne "running") + { + # It's in an unknown state, abort. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "warning_0060", variables => { + server => $server, + current_host => $host, + current_state => $status, + }}); + return('!!error!!'); + } + + # Now shut down the server. + my ($output, $return_code) = $anvil->System->call({debug => 3, shell_call => $anvil->data->{path}{exe}{pcs}." resource disable ".$server}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + output => $output, + return_code => $return_code, + }}); + + if (not $wait) + { + # We're done. + return(0); + } + + # Wait now for the server to stop. + my $waiting = 1; + while($waiting) + { + $anvil->Cluster->parse_cib({debug => $debug}); + my $status = $anvil->data->{cib}{parsed}{data}{server}{$server}{status}; + my $host = $anvil->data->{cib}{parsed}{data}{server}{$server}{host}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + status => $status, + host => $host, + }}); + + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0554", variables => { server => $server }}); + if ($status eq "running") + { + # Wait a bit and check again. + sleep 2; + } + else + { + # It's down. + $waiting = 0; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0555", variables => { server => $server }}); + } + } + + return(0); +} + + =head2 start_cluster This will join the local node to the pacemaker cluster. Optionally, it can try to start the cluster on both nodes if C<< all >> is set. 
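To tie the three new pacemaker-level methods together, here is a minimal, illustrative usage sketch only; it assumes an initialized Anvil::Tools object with a database connection, and the server and node names (srv01-sql, mk-a02n01 / mk-a02n02) are example names borrowed from the test cluster, not requirements.

use strict;
use warnings;
use Anvil::Tools;

my $anvil = Anvil::Tools->new();
$anvil->Database->connect();

# Boot the server via pacemaker, preferring node 2, and wait for it to come up.
# These methods return '0' on success and '!!error!!' if there was a problem.
my $problem = $anvil->Cluster->boot_server({
	server => "srv01-sql",
	node   => "mk-a02n02",
});

# Live-migrate it back to node 1. This updates the location constraint and lets
# pacemaker do the move; it does not call 'virsh' directly.
$problem = $anvil->Cluster->migrate_server({
	server => "srv01-sql",
	node   => "mk-a02n01",
}) if not $problem;

# Finally, ask pacemaker to shut the server down, returning as soon as the
# request has been issued rather than waiting for it to stop.
$problem = $anvil->Cluster->shutdown_server({
	server => "srv01-sql",
	'wait' => 0,
}) if not $problem;

Note that boot_server() and migrate_server() only express a preference via a location constraint; pacemaker makes the final placement decision, which is why the methods confirm the resulting state instead of assuming the requested node was used.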
@@ -818,25 +1453,24 @@ sub which_node $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { anvil_name => $anvil_name, - node1_host_uuid => $node1_host_uuid - node2_host_uuid => $node2_host_uuid + node1_host_uuid => $node1_host_uuid, + node2_host_uuid => $node2_host_uuid, }}); if ($node_uuid eq $node1_host_uuid) { - $node_id = "node1"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { node_id => $node_id }}); + $node_is = "node1"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { node_is => $node_is }}); last; } elsif ($node_uuid eq $node2_host_uuid) { - $node_id = "node2"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { node_id => $node_id }}); + $node_is = "node2"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { node_is => $node_is }}); last; } } - return($node_is); } @@ -850,3 +1484,106 @@ sub which_node ############################################################################################################# # Private functions # ############################################################################################################# + +=head2 _set_server_constraint + +This is a private method used to set a preferential location constraint for a server. It takes a server name and a preferred host node. It checks to see if a location constraint exists and, if so, which node is preferred. If it is not the requested node, the constraint is updated. If no constraint exists, it is created. + +Returns C<< !!error!! >> if there is a problem, C<< 0 >> otherwise. + +Parameters; + +=head3 server (required) + +This is the name of the server whose preferred host node priority is being set. + +=head3 preferred_node (required) + +This is the name of the node that the server will prefer to run on. + +=cut +sub _set_server_constraint +{ + my $self = shift; + my $parameter = shift; + my $anvil = $self->parent; + my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Cluster->_set_server_constraint()" }}); + + my $preferred_node = defined $parameter->{preferred_node} ? $parameter->{preferred_node} : ""; + my $server = defined $parameter->{server} ? 
$parameter->{server} : ""; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + server => $server, + preferred_node => $preferred_node, + }}); + + if (not $server) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Cluster->_set_server_constraint()", parameter => "server" }}); + return("!!error!!"); + } + + if (not $preferred_node) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Cluster->_set_server_constraint()", parameter => "preferred_node" }}); + return("!!error!!"); + } + + if (not exists $anvil->data->{cib}{parsed}{data}{cluster}{name}) + { + my $problem = $anvil->Cluster->parse_cib({debug => $debug}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { problem => $problem }}); + if ($problem) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0145", variables => { server => $server }}); + + } + } + + # Is this node fully in the cluster? + if (not $anvil->data->{cib}{parsed}{'local'}{ready}) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0148", variables => { + server => $server, + node => $preferred_node, + }}); + return('!!error!!'); + } + + my $peer_name = $anvil->data->{cib}{parsed}{peer}{name}; + my $local_name = $anvil->data->{cib}{parsed}{'local'}{name}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + peer_name => $peer_name, + local_name => $local_name, + }}); + + my $shell_call = ""; + if ($preferred_node eq $peer_name) + { + $shell_call = $anvil->data->{path}{exe}{pcs}." constraint location ".$server." prefers ".$peer_name."=200 ".$local_name."=100"; + } + elsif ($preferred_node eq $local_name) + { + $shell_call = $anvil->data->{path}{exe}{pcs}." constraint location ".$server." prefers ".$peer_name."=100 ".$local_name."=200"; + } + else + { + # Invalid + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0144", variables => { + server => $server, + node => $preferred_node, + node1 => $local_name, + node2 => $peer_name, + }}); + return("!!error!!"); + } + + # Change the location constraint + my ($output, $return_code) = $anvil->System->call({debug => 3, shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + output => $output, + return_code => $return_code, + }}); + + return(0); +} diff --git a/Anvil/Tools/Server.pm b/Anvil/Tools/Server.pm index 6e789d2d..2657eebd 100755 --- a/Anvil/Tools/Server.pm +++ b/Anvil/Tools/Server.pm @@ -12,12 +12,12 @@ our $VERSION = "3.0.0"; my $THIS_FILE = "Server.pm"; ### Methods; -# boot +# boot_virsh # find # get_status # map_network -# migrate -# shutdown +# migrate_virsh +# shutdown_virsh =pod @@ -78,13 +78,13 @@ sub parent # Public methods # ############################################################################################################# -=head2 boot +=head2 boot_virsh This takes a server name and tries to boot it (using C<< virsh create /mnt/shared/definition/.xml >>. It requires that any supporting systems already be started (ie: DRBD resource is up). If booted, C<< 1 >> is returned. Otherwise, C<< 0 >> is returned. 
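As an aside, the location constraint assembled by _set_server_constraint() above boils down to a single 'pcs' call with a higher score on the preferred node. The following is a rough sketch of the string that gets built; the pcs path is a stand-in for $anvil->data->{path}{exe}{pcs}, and the server and node names are examples only.

use strict;
use warnings;

# Stand-ins for the configured pcs path and the node names parsed from the CIB.
my $pcs        = "/usr/sbin/pcs";
my $server     = "srv01-sql";
my $local_name = "mk-a02n01";
my $peer_name  = "mk-a02n02";

# Preferring the peer node; the preferred node gets the higher score (200 vs 100).
my $shell_call = $pcs." constraint location ".$server." prefers ".$peer_name."=200 ".$local_name."=100";
print $shell_call."\n";
# Expected output:
# /usr/sbin/pcs constraint location srv01-sql prefers mk-a02n02=200 mk-a02n01=100

Because 'prefers' only scores the two nodes rather than banning one, pacemaker is still free to place the server on the lower-scored node if the preferred node can not host it, which is why boot_server() and migrate_server() treat the requested node as a preference rather than a guarantee.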
- my ($booted) = $anvil->Server->boot({server => "test_server"}); +# my ($booted) = $anvil->Server->boot_virsh({server => "test_server"}); Parameters; @@ -99,13 +99,13 @@ By default, the definition file used will be named C<< .xml >> in the C< This is the name of the server, as it appears in C<< virsh >>. =cut -sub boot +sub boot_virsh { my $self = shift; my $parameter = shift; my $anvil = $self->parent; my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Server->boot()" }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Server->boot_virsh()" }}); my $server = defined $parameter->{server} ? $parameter->{server} : ""; my $definition = defined $parameter->{definition} ? $parameter->{definition} : ""; @@ -117,7 +117,7 @@ sub boot if (not $server) { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Server->boot()", parameter => "server" }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Server->boot_virsh()", parameter => "server" }}); return(1); } if (not $definition) @@ -595,11 +595,15 @@ Provision order: return(0); } -=head2 migrate +=head2 migrate_virsh This will migrate (push or pull) a server from one node to another. If the migration was successful, C<< 1 >> is returned. Otherwise, C<< 0 >> is returned with a (hopefully) useful error being logged. -NOTE: It is assumed that sanity checks are completed before this method is called. +Generally speaking, this is B<< NOT >> the method you want to call. + +B<< Warning >>: This method is meant to do the raw C<< virsh >> call, it is NOT designed to be called by pacemaker. To migrate via pacemaker, use C<< Cluster->migrate >>. + +B<< Note >>: It is assumed that sanity checks are completed before this method is called. Parameters; @@ -618,13 +622,13 @@ If set, the server will be pulled. This is the host name (or IP) Of the host that the server will be pushed to, if C<< source >> is not set. When this is not passed, the local full host name is used as default. =cut -sub migrate +sub migrate_virsh { my $self = shift; my $parameter = shift; my $anvil = $self->parent; my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Server->migrate()" }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Server->migrate_virsh()" }}); my $server = defined $parameter->{server} ? $parameter->{server} : ""; my $source = defined $parameter->{source} ? $parameter->{source} : ""; @@ -638,7 +642,7 @@ sub migrate if (not $server) { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Server->migrate()", parameter => "server" }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Server->migrate_virsh()", parameter => "server" }}); return($success); } @@ -745,13 +749,13 @@ sub migrate return($success); } -=head2 shutdown +=head2 shutdown_virsh This takes a server name and tries to shut it down. 
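For contrast with the pacemaker-level Cluster methods, the following is an illustrative sketch of driving the renamed virsh-level calls directly; the server and target names are examples, and in normal operation these calls are made by the ocf:alteeve:server resource agent rather than by hand.

use strict;
use warnings;
use Anvil::Tools;

my $anvil = Anvil::Tools->new();

# Boot the server with the raw 'virsh create' path; the DRBD resource is assumed
# to already be up, as noted in the boot_virsh() documentation above.
my ($booted) = $anvil->Server->boot_virsh({server => "srv01-sql"});

# Push-migrate it to the peer with a raw virsh migration. Pacemaker is not part
# of this call, which is why Cluster->migrate_server() is the usual entry point.
my ($migrated) = $anvil->Server->migrate_virsh({
	server => "srv01-sql",
	target => "mk-a02n02",
});

# And the raw shutdown, waiting up to 120 seconds for the guest to power off.
my ($shutdown) = $anvil->Server->shutdown_virsh({
	server => "srv01-sql",
	'wait' => 120,
});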
If the server was found locally, the shut down is requested and this method will wait for the server to actually shut down before returning. If shut down, C<< 1 >> is returned. If the server wasn't found or another problem occurs, C<< 0 >> is returned. - my ($shutdown) = $anvil->Server->shutdown({server => "test_server"}); + my ($shutdown) = $anvil->Server->shutdown_virsh({server => "test_server"}); Parameters; @@ -770,13 +774,13 @@ This is the name of the server (as it appears in C<< virsh >>) to shut down. By default, this method will wait indefinetly for the server to shut down before returning. If this is set to a non-zero number, the method will wait that number of seconds for the server to shut dwwn. If the server is still not off by then, C<< 0 >> is returned. =cut -sub shutdown +sub shutdown_virsh { my $self = shift; my $parameter = shift; my $anvil = $self->parent; my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Server->shutdown()" }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Server->shutdown_virsh()" }}); my $server = defined $parameter->{server} ? $parameter->{server} : ""; my $force = defined $parameter->{force} ? $parameter->{force} : 0; @@ -789,7 +793,7 @@ sub shutdown if (not $server) { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Server->shutdown()", parameter => "server" }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Server->shutdown_virsh()", parameter => "server" }}); return($success); } if (($wait) && ($wait =~ /\D/)) diff --git a/ocf/alteeve/server b/ocf/alteeve/server index be1606e2..b7ace50f 100755 --- a/ocf/alteeve/server +++ b/ocf/alteeve/server @@ -3,7 +3,7 @@ # This is the resource agent used to manage servers on the Anvil! Intelligent Availability platform. # # License: GNU General Public License (GPL) v2+ -# (c) 1997-2019 - Alteeve's Niche! Inc. +# (c) 1997-2020 - Alteeve's Niche! Inc. # # WARNING: This is a pretty purpose-specific resource agent. No effort was made to test this on an rgmanager # cluster or on any configuration outside how the Anvil! m3 uses it. If you plan to adapt it to @@ -665,7 +665,7 @@ sub start_server start_drbd_resource($anvil); # Still alive? Boot! - my ($success) = $anvil->Server->boot({debug => 3, server => $server}); + my ($success) = $anvil->Server->boot_virsh({debug => 3, server => $server}); if ($success) { # Success! @@ -955,7 +955,7 @@ sub stop_server $anvil->Server->get_status({debug => 3, server => $server}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0313", variables => { server => $server }}); - my $success = $anvil->Server->shutdown({debug => 3, server => $server}); + my $success = $anvil->Server->shutdown_virsh({debug => 3, server => $server}); if (not $success) { # Something went wrong. Details should be in the logs. @@ -997,7 +997,7 @@ sub server_status $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "log_0331", variables => { timeout => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} }}); } - # Is 'libvirtd' running? 
We'll wait up to halfthe timeout for it to start (in case it _just_ started) + # Is 'libvirtd' running? We'll wait up to half the timeout for it to start (in case it _just_ started) # before timing out. my $wait_until = time + ($anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} / 2000); # Devide by 2000 to convert to seconds and total second halved. my $look_for_pid = 0; @@ -1358,7 +1358,7 @@ sub migrate_server } # If we're still alive, we're ready to migrate. - ($migrated) = $anvil->Server->migrate({ + ($migrated) = $anvil->Server->migrate_virsh({ debug => 3, server => $server, source => $source, @@ -1413,7 +1413,7 @@ sub migrate_server validate_all($anvil); # Call the pull migation. - ($migrated) = $anvil->Server->migrate({ + ($migrated) = $anvil->Server->migrate_virsh({ debug => 3, server => $server, source => $source, diff --git a/scancore-agents/scan-cluster/scan-cluster b/scancore-agents/scan-cluster/scan-cluster index 8e457e3f..eb1cd862 100755 --- a/scancore-agents/scan-cluster/scan-cluster +++ b/scancore-agents/scan-cluster/scan-cluster @@ -54,7 +54,7 @@ if ($problem) $anvil->nice_exit({exit_code => 1}); } -$anvil->Log->entry({test => 1, source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_log_0001", variables => { program => $THIS_FILE }}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_log_0001", variables => { program => $THIS_FILE }}); if ($anvil->data->{switches}{purge}) { # This can be called when doing bulk-database purges. @@ -77,6 +77,7 @@ if ($host_type ne "node") # Read the data. collect_data($anvil); +# Read last scan $anvil->nice_exit({exit_code => 0}); @@ -108,17 +109,35 @@ sub collect_data ### TODO: If we're node 2, or not in the cluster, only update our information in the ### 'scan_cluster_nodes' table. Node 1 will update everything else if it's 'ready' (else node 2 - ### will, if it's ready) + ### will, if it's ready). + my $i_am = $anvil->Cluster->which_node({debug => 1}); + my $my_node_name = $anvil->data->{cib}{parsed}{'local'}{name}; + my $peer_node_name = $anvil->data->{cib}{parsed}{peer}{name}; + my $peer_ready = $anvil->data->{cib}{parsed}{peer}{ready}; + my $local_ready = $anvil->data->{cib}{parsed}{data}{node}{$my_node_name}{node_state}{ready}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + i_am => $i_am, + my_node_name => $my_node_name, + peer_node_name => $peer_node_name, + peer_ready => $peer_ready, + local_ready => $local_ready, + }}); - my $ready = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{ready}; + ### TODO: Change the logic so that when both nodes are in the cluster, the node with the lowest + ### load does the scan (typically the node without VMs). + if (($i_am eq "node2") && ($peer_ready)) + { + # We're not going to run. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_cluster_message_0001"}); + $anvil->nice_exit({exit_code => 0}); + } + # If we're still alive, we're either node 1, or we're node 2 and node 1 is not ready. 
If we're not ready, if ($stonith_max_attempts ne "INFINITY") { ### TODO: Call pcs to update } - # Pick up node data - return(0); diff --git a/scancore-agents/scan-cluster/scan-cluster.sql b/scancore-agents/scan-cluster/scan-cluster.sql index b1edf66f..9e550a40 100644 --- a/scancore-agents/scan-cluster/scan-cluster.sql +++ b/scancore-agents/scan-cluster/scan-cluster.sql @@ -55,59 +55,450 @@ CREATE TRIGGER trigger_scan_cluster CREATE TABLE scan_cluster_nodes ( - scan_cluster_node_uuid uuid primary key, - scan_cluster_node_host_uuid uuid not null, -- This is the host UUID of the node. - scan_cluster_node_name text not null, -- This is the host name as reported by pacemaker. It _should_ match up to a host name in 'hosts'. - scan_cluster_node_pacemaker_id numeric not null, -- This is the internal pacemaker ID number of this node. - my $node_id = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{id}; - my $in_ccm = $anvil->data->{cib}{parsed}{cib}{node_state}{$node_id}{in_ccm} eq "true" ? 1 : 0; # 'true' or 'false' - Corosync member - my $crmd = $anvil->data->{cib}{parsed}{cib}{node_state}{$node_id}{crmd} eq "online" ? 1 : 0; # 'online' or 'offline' - In corosync process group - my $join = $anvil->data->{cib}{parsed}{cib}{node_state}{$node_id}{'join'} eq "member" ? 1 : 0; # 'member' or 'down' - Completed controller join process + scan_cluster_node_uuid uuid primary key, + scan_cluster_node_scan_cluster_uuid uuid not null, -- The parent scan_cluster_uuid. + scan_cluster_node_host_uuid uuid not null, -- This is the host UUID of the node. + scan_cluster_node_name text not null, -- This is the host name as reported by pacemaker. It _should_ match up to a host name in 'hosts'. + scan_cluster_node_pacemaker_id numeric not null, -- This is the internal pacemaker ID number of this node. 
+ modified_date timestamp with time zone not null, - modified_date timestamp with time zone not null, + FOREIGN KEY(scan_cluster_node_scan_cluster_uuid) REFERENCES scan_cluster(scan_cluster_uuid), + FOREIGN KEY(scan_cluster_host_uuid) REFERENCES hosts(host_uuid) +); +ALTER TABLE scan_cluster_nodes OWNER TO admin; + +CREATE TABLE history.scan_cluster_nodes ( + history_id bigserial, + scan_cluster_node_uuid uuid, + scan_cluster_node_scan_cluster_uuid uuid, + scan_cluster_node_host_uuid uuid, + scan_cluster_node_name text, + scan_cluster_node_pacemaker_id numeric, + modified_date timestamp with time zone not null +); +ALTER TABLE history.scan_cluster_nodes OWNER TO admin; + +CREATE FUNCTION history_scan_cluster_nodes() RETURNS trigger +AS $$ +DECLARE + history_scan_cluster_nodes RECORD; +BEGIN + SELECT INTO history_scan_cluster_nodes * FROM scan_cluster_nodes WHERE scan_cluster_node_uuid=new.scan_cluster_node_uuid; + INSERT INTO history.scan_cluster_nodes + (scan_cluster_node_uuid, + scan_cluster_node_scan_cluster_uuid, + scan_cluster_node_host_uuid, + scan_cluster_node_name, + scan_cluster_node_pacemaker_id, + modified_date) + VALUES + (history_scan_cluster_nodes.scan_cluster_node_uuid, + history_scan_cluster_nodes.scan_cluster_node_scan_cluster_uuid, + history_scan_cluster_nodes.scan_cluster_node_host_uuid, + history_scan_cluster_nodes.scan_cluster_node_name, + history_scan_cluster_nodes.scan_cluster_node_pacemaker_id, + history_scan_cluster_nodes.modified_date); + RETURN NULL; +END; +$$ +LANGUAGE plpgsql; +ALTER FUNCTION history_scan_cluster_nodes() OWNER TO admin; + +CREATE TRIGGER trigger_scan_cluster_nodes + AFTER INSERT OR UPDATE ON scan_cluster_nodes + FOR EACH ROW EXECUTE PROCEDURE history_scan_cluster_nodes(); + + +CREATE TABLE scan_cluster_stoniths ( + scan_cluster_stonith_uuid uuid primary key, + scan_cluster_stonith_scan_cluster_uuid uuid not null, -- The parent scan_cluster_uuid. + scan_cluster_stonith_host_uuid uuid not null, -- This is the host UUID of the node. 
+ scan_cluster_stonith_name text not null, -- This is the 'stonith id' + scan_cluster_stonith_arguments text not null, -- This is the fence agent + collection of primitive variable=value pairs (the nvpairs) + scan_cluster_stonith_operations text not null, -- This is the collection of operation variable=value pairs (the nvpairs) + modified_date timestamp with time zone not null, + FOREIGN KEY(scan_cluster_stonith_scan_cluster_uuid) REFERENCES scan_cluster(scan_cluster_uuid), FOREIGN KEY(scan_cluster_host_uuid) REFERENCES hosts(host_uuid) ); -ALTER TABLE scan_cluster OWNER TO admin; +ALTER TABLE scan_cluster_stoniths OWNER TO admin; -CREATE TABLE history.scan_cluster ( - history_id bigserial, - scan_cluster_uuid uuid, - scan_cluster_host_uuid uuid, - scan_cluster_node_name text, - scan_cluster_stonith_enabled boolean, - scan_cluster_maintenance_mode boolean, - modified_date timestamp with time zone not null +CREATE TABLE history.scan_cluster_stoniths ( + history_id bigserial, + scan_cluster_stonith_uuid uuid, + scan_cluster_stonith_scan_cluster_uuid uuid, + scan_cluster_stonith_host_uuid uuid, + scan_cluster_stonith_name text, + scan_cluster_stonith_arguments text, + scan_cluster_stonith_operations text, + modified_date timestamp with time zone not null ); -ALTER TABLE history.scan_cluster OWNER TO admin; +ALTER TABLE history.scan_cluster_stoniths OWNER TO admin; -CREATE FUNCTION history_scan_cluster() RETURNS trigger +CREATE FUNCTION history_scan_cluster_stoniths() RETURNS trigger AS $$ DECLARE - history_scan_cluster RECORD; + history_scan_cluster_stoniths RECORD; BEGIN - SELECT INTO history_scan_cluster * FROM scan_cluster WHERE scan_cluster_uuid=new.scan_cluster_uuid; - INSERT INTO history.scan_cluster - (scan_cluster_uuid, - scan_cluster_host_uuid, - scan_cluster_name, - scan_cluster_stonith_enabled, - scan_cluster_maintenance_mode, + SELECT INTO history_scan_cluster_stoniths * FROM scan_cluster_stoniths WHERE scan_cluster_stonith_uuid=new.scan_cluster_stonith_uuid; + INSERT INTO history.scan_cluster_stoniths + (scan_cluster_stonith_uuid, + scan_cluster_stonith_scan_cluster_uuid, + scan_cluster_stonith_host_uuid, + scan_cluster_stonith_name, + scan_cluster_stonith_arguments, + scan_cluster_stonith_operations, modified_date) VALUES - (history_scan_cluster.scan_cluster_uuid, - history_scan_cluster.scan_cluster_host_uuid, - history_scan_cluster.scan_cluster_name, - history_scan_cluster.scan_cluster_stonith_enabled, - history_scan_cluster.scan_cluster_maintenance_mode, - history_scan_cluster.modified_date); + (history_scan_cluster_stoniths.scan_cluster_stonith_uuid, + history_scan_cluster_stoniths.scan_cluster_stonith_scan_cluster_uuid, + history_scan_cluster_stoniths.scan_cluster_stonith_host_uuid, + history_scan_cluster_stoniths.scan_cluster_stonith_name, + history_scan_cluster_stoniths.scan_cluster_stonith_arguments, + history_scan_cluster_stoniths.scan_cluster_stonith_operations, + history_scan_cluster_stoniths.modified_date); RETURN NULL; END; $$ LANGUAGE plpgsql; -ALTER FUNCTION history_scan_cluster() OWNER TO admin; +ALTER FUNCTION history_scan_cluster_stoniths() OWNER TO admin; + +CREATE TRIGGER trigger_scan_cluster_stoniths + AFTER INSERT OR UPDATE ON scan_cluster_stoniths + FOR EACH ROW EXECUTE PROCEDURE history_scan_cluster_stoniths(); + + +CREATE TABLE scan_cluster_servers ( + scan_cluster_server_uuid uuid primary key, + scan_cluster_server_scan_cluster_uuid uuid not null, -- The parent scan_cluster_uuid. 
+ scan_cluster_server_name text not null, -- This is the name of the server (ocf primitive id) + scan_cluster_server_state text not null, -- This is the 'running' or why it's off (off by user, etc) + scan_cluster_server_host_name uuid not null, -- This is the (cluster) name of the node hosting the server. Blank if the server is off. + scan_cluster_server_arguments text not null, -- This is the collection of primitive variable=value pairs (the nvpairs) + scan_cluster_server_operations text not null, -- This is the collection of operation variable=value pairs (the nvpairs) + scan_cluster_server_meta text not null, -- This is the collection of meta attribute variable=value pairs (the nvpairs) + modified_date timestamp with time zone not null, + + FOREIGN KEY(scan_cluster_server_scan_cluster_uuid) REFERENCES scan_cluster(scan_cluster_uuid), + FOREIGN KEY(scan_cluster_host_uuid) REFERENCES hosts(host_uuid) +); +ALTER TABLE scan_cluster_servers OWNER TO admin; + +CREATE TABLE history.scan_cluster_servers ( + history_id bigserial, + scan_cluster_server_uuid uuid, + scan_cluster_server_scan_cluster_uuid uuid, + scan_cluster_server_name text, + scan_cluster_server_arguments text, + scan_cluster_server_operations text, + scan_cluster_server_meta text, + modified_date timestamp with time zone not null +); +ALTER TABLE history.scan_cluster_servers OWNER TO admin; + +CREATE FUNCTION history_scan_cluster_servers() RETURNS trigger +AS $$ +DECLARE + history_scan_cluster_servers RECORD; +BEGIN + SELECT INTO history_scan_cluster_servers * FROM scan_cluster_servers WHERE scan_cluster_server_uuid=new.scan_cluster_server_uuid; + INSERT INTO history.scan_cluster_servers + (scan_cluster_server_uuid, + scan_cluster_server_scan_cluster_uuid, + scan_cluster_server_name, + scan_cluster_server_arguments, + scan_cluster_server_operations, + scan_cluster_server_meta, + modified_date) + VALUES + (history_scan_cluster_servers.scan_cluster_server_uuid, + history_scan_cluster_servers.scan_cluster_server_scan_cluster_uuid, + history_scan_cluster_servers.scan_cluster_server_host_uuid, + history_scan_cluster_servers.scan_cluster_server_name, + history_scan_cluster_servers.scan_cluster_server_arguments, + history_scan_cluster_servers.scan_cluster_server_operations, + history_scan_cluster_servers.modified_date); + RETURN NULL; +END; +$$ +LANGUAGE plpgsql; +ALTER FUNCTION history_scan_cluster_servers() OWNER TO admin; + +CREATE TRIGGER trigger_scan_cluster_servers + AFTER INSERT OR UPDATE ON scan_cluster_servers + FOR EACH ROW EXECUTE PROCEDURE history_scan_cluster_servers(); -CREATE TRIGGER trigger_scan_cluster - AFTER INSERT OR UPDATE ON scan_cluster - FOR EACH ROW EXECUTE PROCEDURE history_scan_cluster(); +-- Example CIB +# pcs resource + * srv07-el6 (ocf::alteeve:server): Stopped (disabled) + * srv01-sql (ocf::alteeve:server): Started mk-a02n01 + * srv02-lab1 (ocf::alteeve:server): Started mk-a02n01 + * srv08-m2-psql (ocf::alteeve:server): Stopped (disabled) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +-- \ No newline at end of file diff --git 
diff --git a/scancore-agents/scan-cluster/scan-cluster.xml b/scancore-agents/scan-cluster/scan-cluster.xml
index 487af25a..49b5eeeb 100644
--- a/scancore-agents/scan-cluster/scan-cluster.xml
+++ b/scancore-agents/scan-cluster/scan-cluster.xml
@@ -21,7 +21,7 @@ NOTE: All string keys MUST be prefixed with the agent name! ie: 'scan_cluster_lo
 	This host is a: [#!variable!host_type!#], this agent is only useful on nodes. Exiting.
-	
+	
 	We're node 2, and node 1 is running as well. Exiting as only one node needs to run this agent.
 
diff --git a/scancore-agents/scan-server/scan-server b/scancore-agents/scan-server/scan-server
new file mode 100755
index 00000000..e69de29b
diff --git a/scancore-agents/scan-server/scan-server.sql b/scancore-agents/scan-server/scan-server.sql
new file mode 100644
index 00000000..e69de29b
diff --git a/scancore-agents/scan-server/scan-server.xml b/scancore-agents/scan-server/scan-server.xml
new file mode 100644
index 00000000..e69de29b
diff --git a/share/words.xml b/share/words.xml
index 45b0ba52..fa74228c 100644
--- a/share/words.xml
+++ b/share/words.xml
@@ -216,6 +216,21 @@
 	The error was:
 	There appears to be no mail server in the database with the UUID: [#!variable!uuid!#].
 	There alert level: [#!variable!alert_level!#] is invalid. Valid values are '1' / 'critical', '2' / 'warning, '3' / 'notice', and '4' / 'info'.
 	Failed to write the email alert file: [#!variable!file!#]! Unable to process the alert. Check the logs above for possible reasons for the error.
+	I was asked to change the preferred host node of the server: [#!variable!server!#] to: [#!variable!node!#], but that doesn't match the name of either node in the cluster. The node names are: [#!variable!node1!#] and [#!variable!node2!#].
+	Unable to boot the server: [#!variable!server!#] as the cluster isn't running or there was a problem parsing the cluster CIB.
+	Unable to boot the server: [#!variable!server!#] as this host is not a node.
+	Unable to boot the server: [#!variable!server!#] as this node is not (yet) a full member of the cluster.
+	Unable to set the preferred host of the server: [#!variable!server!#] to: [#!variable!node!#] as this node is not (yet) a full member of the cluster.
+	Unable to boot the server: [#!variable!server!#] as this server was not found in the cluster information base (CIB).
+	Unable to shut down the server: [#!variable!server!#] as this host is not a node.
+	Unable to shut down the server: [#!variable!server!#] as the cluster isn't running or there was a problem parsing the cluster CIB.
+	Unable to shut down the server: [#!variable!server!#] as this node is not (yet) a full member of the cluster.
+	Unable to shut down the server: [#!variable!server!#] as this server was not found in the cluster information base (CIB).
+	Unable to migrate the server: [#!variable!server!#] as this host is not a node.
+	Unable to migrate the server: [#!variable!server!#] as the cluster isn't running or there was a problem parsing the cluster CIB.
+	Unable to migrate the server: [#!variable!server!#] as this node is not (yet) a full member of the cluster.
+	Unable to migrate the server: [#!variable!server!#] as the peer node is not (yet) a full member of the cluster.
+	Unable to migrate the server: [#!variable!server!#] as this server was not found in the cluster information base (CIB).
 	Current Network Interfaces and States
@@ -1032,6 +1047,14 @@
 	The file: [#!variable!file!#] needs to be updated.
 	The difference is:
 	The table: [#!variable!table!#] does NOT exists in the database on the host: [#!variable!host!#]. Will load the schema file: [#!variable!file!#] now.
 	The passed in 'temperature_state' value: [#!variable!temperature_state!#] is invalid. The value must be 'ok', 'warning' or 'critical'.
 	The passed in 'temperature_is' value: [#!variable!temperature_is!#] is invalid. The value must be 'nominal', 'warning' or 'critical'.
+	The server: [#!variable!server!#] is already running, no need to boot it.
+	The server: [#!variable!server!#] is already running on the target node: [#!variable!requested_node!#], migration not needed.
+	Waiting for the server: [#!variable!server!#] to finish migrating to the node: [#!variable!requested_node!#]...
+	The migration of the server: [#!variable!server!#] to the node: [#!variable!requested_node!#] is complete!
+	Waiting for the server: [#!variable!server!#] to boot...
+	The server: [#!variable!server!#] has booted!
+	Waiting for the server: [#!variable!server!#] to shut down...
+	The server: [#!variable!server!#] is now off.
 	The host name: [#!variable!target!#] does not resolve to an IP address.
@@ -1814,6 +1837,9 @@
 	[ Warning ] - The DR Host is set to the same machine as Node 2.
 	[ Warning ] - The 'libvirtd' daemon is not running. Checking to see if the server is running by looking for its PID (server state won't be available). Please start 'libvirtd'!
 	[ Warning ] - The server: [#!variable!server!#] is in a crashed state!
+	[ Warning ] - The server: [#!variable!server!#] was asked to be booted on: [#!variable!requested_node!#], but it is already running on: [#!variable!current_host!#].
+	[ Warning ] - The server: [#!variable!server!#] was asked to be shut down, but it's in an unexpected state: [#!variable!state!#] on the host: [#!variable!current_host!#]. Aborting.
+	[ Warning ] - The server: [#!variable!server!#] was asked to be migrated to: [#!variable!requested_node!#], but the server is off. Aborting.
 
diff --git a/tools/test.pl b/tools/test.pl
index 33c497ed..931f3ca5 100755
--- a/tools/test.pl
+++ b/tools/test.pl
@@ -29,7 +29,287 @@ print "Connecting to the database(s);\n";
 $anvil->Database->connect({debug => 3});
 $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, secure => 0, key => "log_0132"});
 
-my $not_in_cluster = $anvil->Cluster->parse_cib({debug => 2});
+$anvil->Cluster->shutdown_server({
+	debug  => 2, 
+	server => "srv07-el6", 
+});
+$anvil->Cluster->shutdown_server({
+	debug  => 2, 
+	server => "srv01-sql", 
+});
+exit;
+
+# The $cib string below held a full, static CIB XML snapshot used to exercise
+# parse_cib() offline; the XML itself is omitted here.
+my $cib = '...';
+my $not_in_cluster = $anvil->Cluster->parse_cib({debug => 2, cib => $cib});
 if ($not_in_cluster)
 {
 	print "This node isn't in the cluster.\n";