From e90dae96f7290006eb2764671d28862f518735be Mon Sep 17 00:00:00 2001 From: Digimer Date: Wed, 31 Aug 2022 18:12:07 -0400 Subject: [PATCH] * In Server->shutdown_virsh(), disabled trying to resume a paused VM. Also updated the logging around not waiting for a VM to stop. * Updated anvil-safe-stop to check for VMs running, even if the cluster is stopped, when --stop-servers is used. Signed-off-by: Digimer --- Anvil/Tools/Server.pm | 34 +++++++-------- share/words.xml | 3 ++ tools/anvil-safe-stop | 96 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 114 insertions(+), 19 deletions(-) diff --git a/Anvil/Tools/Server.pm b/Anvil/Tools/Server.pm index 41f65efc..fffcf80f 100644 --- a/Anvil/Tools/Server.pm +++ b/Anvil/Tools/Server.pm @@ -1959,21 +1959,22 @@ sub shutdown_virsh } elsif ($status eq "paused") { + ### TODO: No, don't do this! The server might be migrating # The server is paused. Resume it, wait a few, then proceed with the shutdown. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0314", variables => { server => $server }}); - my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{virsh}." resume $server"}); - if ($return_code) - { - # Looks like virsh isn't running. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0315", variables => { - server => $server, - return_code => $return_code, - output => $output, - }}); - $anvil->nice_exit({exit_code => 1}); - } - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0316"}); - sleep 3; +# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0314", variables => { server => $server }}); +# my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{virsh}." 
resume $server"}); +# if ($return_code) +# { +# # Looks like virsh isn't running. +# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0315", variables => { +# server => $server, +# return_code => $return_code, +# output => $output, +# }}); +# $anvil->nice_exit({exit_code => 1}); +# } +# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0316"}); +# sleep 3; } elsif ($status eq "pmsuspended") { @@ -2147,9 +2148,10 @@ WHERE { # Give up waiting. $waiting = 0; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0426", variables => { waiting => $waiting }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { waiting => $waiting }}); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0427", variables => { + my $key = $wait_time == 1 ? "log_0727" : "log_0427"; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => $key, variables => { server => $server, 'wait' => $wait_time, }}); diff --git a/share/words.xml b/share/words.xml index 2d29f214..ac1751d8 100644 --- a/share/words.xml +++ b/share/words.xml @@ -1372,6 +1372,8 @@ Note: This is a permanent action! If you protect this server again later, a full Done! The server: [#!variable!server!#] is no longer being protected on DR! The resource config file: [#!variable!config_file!#] doesn't exist locally, pulling a copy over from: [#!variable!source!#]. Re-parsing the replicated storage configuration. + The server: [#!variable!server!#] was found to be running outside the cluster. Asking it to shut down now. + The server: [#!variable!server!#] is still running two minutes after asking it to stop. It might have woken up on the first press and ignored the shutdown request (Hi Windows). Pressing the power button again. 
Starting: [#!variable!program!#]. @@ -2200,6 +2202,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is: The file: [#!variable!file!#] needs to be added to the database, but since the last scan it's size grew from: [#!variable!old_size_bytes!# (#!variable!old_size_hr!#)] to: [#!variable!new_size_bytes!# (#!variable!new_size_hr!#)]. A difference of: [#!variable!difference_bytes!# (#!variable!difference_hr!#)]. It might still be being uploaded, so we'll keep checking periodocally until the size stops changing. Found the missing file: [#!variable!file!#] in the directory: [#!variable!directory!#]. Updating the database now. Deleting the hash key: [#!variable!hash_key!#]. + [ Note ] - The server: [#!variable!server!#] is not yet off, but we've been told not to wait for it to stop. The host name: [#!variable!target!#] does not resolve to an IP address. diff --git a/tools/anvil-safe-stop b/tools/anvil-safe-stop index 3bf32ed2..53c280c0 100755 --- a/tools/anvil-safe-stop +++ b/tools/anvil-safe-stop @@ -277,7 +277,20 @@ sub process_servers $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0321"}); $anvil->Job->update_progress({progress => 10, message => "job_0321"}); } - my $waiting = 1; + + # Use virsh to check for servers, in case pacemaker lies to us. + $anvil->Server->find(); + my $progress = 10; + my $waiting = 1; + my $first_try = 0; + my $second_try = 0; + my $try_again = 0; + my $server_count = keys %{$anvil->data->{server}{location}}; + my $progress_steps = $server_count ? int(35 / $server_count) : 70; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:server_count' => $server_count, + 's2:progress_steps' => $progress_steps, + }}); while ($waiting) { # Is the cluster up? @@ -286,7 +299,84 @@ sub process_servers $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); if ($problem) { - # Nope. 
+ # Nope. Are we stopping servers? + if ($anvil->data->{switches}{'stop-servers'}) + { + # Yes, are any servers running (check virsh) + foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{server}{location}}) + { + my $status = $anvil->data->{server}{location}{$server_name}{status}; + my $host_name = $anvil->data->{server}{location}{$server_name}{host_name}; + $progress += $progress_steps; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:server_name' => $server_name, + 's2:status' => $status, + 's3:host_name' => $host_name, + 's4:progress' => $progress, + }}); + + if ($host_name eq $anvil->Get->host_name) + { + # Server is still running. + if (($status eq "running") && (not $first_try)) + { + # It's running despite the cluster being down, stop it. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "job_0419", variables => { server => $server_name }}); + $anvil->Job->update_progress({progress => $progress, message => "job_0419,!!server!".$server_name."!!"}); + $anvil->Server->shutdown_virsh({ + debug => 2, + server => $server_name, + wait_time => 1, + }); + + $waiting = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + elsif (($status eq "in shutdown") && ($try_again)) + { + # Hit the power button again. 
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "job_0420", variables => { server => $server_name }}); + $anvil->Job->update_progress({progress => $progress, message => "job_0420,!!server!".$server_name."!!"}); + $anvil->Server->shutdown_virsh({ + debug => 2, + server => $server_name, + wait_time => 1, + }); + + $waiting = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + } + } + + if ($waiting) + { + if (not $first_try) + { + $first_try = time; + $second_try = $first_try + 120; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + first_try => $first_try, + second_try => $second_try, + }}); + } + elsif ($try_again) + { + $try_again = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { try_again => $try_again }}); + } + elsif (($second_try) && (time > $second_try)) + { + $try_again = 1; + $second_try = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + second_try => $second_try, + try_again => $try_again, + }}); + } + } + } + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"}); $anvil->Job->update_progress({progress => 80, message => "job_0313"}); } @@ -304,7 +394,7 @@ sub process_servers $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:server' => $server, 's2:status' => $status, - 's2:host_name' => $host_name, + 's3:host_name' => $host_name, 's4:role' => $role, 's5:active' => $active, }});