From bc3d04ad2e296b77b7456d49a0ce04a073151948 Mon Sep 17 00:00:00 2001 From: digimer Date: Tue, 6 Jun 2023 14:34:02 -0400 Subject: [PATCH] * Updated Cluster->add_server() to wait up to 15 seconds for a server to appear to ensure that the pcs call adds the server with the right requested running state. * Updated Cluster->recover_server() to set the desired recovery state before calling the crm_resource refresh. Signed-off-by: digimer --- Anvil/Tools/Cluster.pm | 83 ++++++++++++++++++----- Anvil/Tools/Server.pm | 1 + ocf/alteeve/server | 3 +- scancore-agents/scan-cluster/scan-cluster | 5 +- tools/anvil-provision-server | 2 + 5 files changed, 72 insertions(+), 22 deletions(-) diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index 5fac82fe..3f737456 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -185,23 +185,47 @@ sub add_server password => $anvil->Log->is_secure($password), }}); - # Verify that the server is here or on the peer. We need to add the command to t - $anvil->Server->find({ - debug => $debug, - server => $server_name, - }); - $anvil->Server->find({ - debug => $debug, - refresh => 0, - password => $password, - target => $peer_target_ip, - server => $server_name, - }); + # Verify that the server is here or on the peer. Given they could be called at the same time that the + # server is being provisioned, we'll wait up to 15 seconds for it to appear. 
+ my $waiting = 1; + my $wait_until = time + 15; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { wait_until => $wait_until }}); + while ($waiting) + { + $anvil->Server->find({ + debug => $debug, + server => $server_name, + }); + $anvil->Server->find({ + debug => $debug, + refresh => 0, + password => $password, + target => $peer_target_ip, + server => $server_name, + }); + + if (exists $anvil->data->{server}{location}{$server_name}{status}) + { + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + waiting => $waiting, + "server::location::${server_name}::status" => $anvil->data->{server}{location}{$server_name}{status}, + "server::location::${server_name}::host_name" => $anvil->data->{server}{location}{$server_name}{host_name}, + }}); + } + + if (($waiting) && (time > $wait_until)) + { + # Stop waiting. + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { waiting => $waiting }}); + } + } # The host here is the full host name. my $host_name = $anvil->Get->host_name(); - my $server_state = $anvil->data->{server}{location}{$server_name}{status}; - my $server_host = $anvil->data->{server}{location}{$server_name}{host_name}; + my $server_state = defined $anvil->data->{server}{location}{$server_name}{status} ? $anvil->data->{server}{location}{$server_name}{status} : ""; + my $server_host = defined $anvil->data->{server}{location}{$server_name}{host_name} ? $anvil->data->{server}{location}{$server_name}{host_name} : ""; my $target_role = $server_state eq "running" ? "started" : "stopped"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host_name => $host_name, @@ -4278,10 +4302,14 @@ This tries to recover a C<< FAILED >> resource (server). Parameters; -=head3 server_ (required) +=head3 server (required) This is the server (resource) name to try to recover. 
+=head3 running (required) + +This indicates if the server should be recovered into the running state when set to C<< 1 >>, or stopped state when set to C<< 0 >>. + =cut sub recover_server { @@ -4291,9 +4319,11 @@ sub recover_server my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Cluster->recover_server()" }}); - my $server = defined $parameter->{server} ? $parameter->{server} : ""; + my $running = defined $parameter->{running} ? $parameter->{running} : ""; + my $server = defined $parameter->{server} ? $parameter->{server} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - server => $server, + running => $running, + server => $server, }}); if (not $server) @@ -4301,8 +4331,15 @@ sub recover_server $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Cluster->recover_server()", parameter => "server" }}); return("!!error!!"); } + if ($running eq "") + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Cluster->recover_server()", parameter => "running" }}); + return("!!error!!"); + } - my $shell_call = $anvil->data->{path}{exe}{crm_resource}." --resource ".$server." --refresh"; + # Set the desired state post recovery. + my $wanted_state = $running ? "enable" : "disable"; + my $shell_call = $anvil->data->{path}{exe}{pcs}." resource ".$wanted_state." 
".$server; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }}); my ($output, $return_code) = $anvil->System->call({debug => $debug, shell_call => $shell_call}); @@ -4311,6 +4348,16 @@ sub recover_server return_code => $return_code, }}); + # Now tell it to refresh + $shell_call = $anvil->data->{path}{exe}{crm_resource}." --resource ".$server." --refresh"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({debug => $debug, shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + output => $output, + return_code => $return_code, + }}); + return(0); } diff --git a/Anvil/Tools/Server.pm b/Anvil/Tools/Server.pm index eadce06c..18997fd8 100644 --- a/Anvil/Tools/Server.pm +++ b/Anvil/Tools/Server.pm @@ -416,6 +416,7 @@ sub find # Clear any old data if ((exists $anvil->data->{server}{location}) && ($refresh)) { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0700", variables => { hash => "server::location" }}); delete $anvil->data->{server}{location}; } diff --git a/ocf/alteeve/server b/ocf/alteeve/server index 8e5d8b13..2c8cf9e0 100755 --- a/ocf/alteeve/server +++ b/ocf/alteeve/server @@ -100,8 +100,7 @@ $| = 1; # NOTE: Setting 'log_level' and 'log_secure' here will get overridden in the main lopp. Use the Log methods # in the loop as well to override defaults in code. my $anvil = Anvil::Tools->new(); - -$anvil->Log->level({set => 2}); +#$anvil->Log->level({set => 2}); ### Read or Set the environment variables # This is the name of the server we're managing. 
# Example values: diff --git a/scancore-agents/scan-cluster/scan-cluster b/scancore-agents/scan-cluster/scan-cluster index 724918be..ccef1f10 100755 --- a/scancore-agents/scan-cluster/scan-cluster +++ b/scancore-agents/scan-cluster/scan-cluster @@ -199,8 +199,9 @@ sub check_resources { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_log_0010"}); $anvil->Cluster->recover_server({ - debug => 2, - server => $server, + debug => 2, + server => $server, + running => $server_found, }); # It'll leave 'failed state' for a bit, so we need to wait. diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index 7510a30f..f1ad17c0 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -3463,6 +3463,8 @@ server_uuid=".$anvil->data->{new_server}{uuid}; $target_host_uuid = $node1_host_uuid; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { target_host_uuid => $target_host_uuid }}); } + my $host_name = $anvil->Get->host_name_from_uuid({host_uuid => $target_host_uuid}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_name => $host_name }}); my ($job_uuid) = $anvil->Database->insert_or_update_jobs({ debug => 2, job_command => $anvil->data->{path}{exe}{'anvil-provision-server'}.$anvil->Log->switches,