From 751687129add9334cd52defef7f295a30b71591e Mon Sep 17 00:00:00 2001 From: digimer Date: Sat, 15 Jul 2023 16:19:21 -0400 Subject: [PATCH] * Updated anvil-daemon to not exit on RAM use if anvil-update-system is running. * Fixed a bug in anvil-safe-stop where it wouldn't trigger a migration when the peer is online. * Updated anvil-update-system to set job_data to 'failed' and exit with rc 4 if the os update failed. * Got striker-update-cluster to error out and exit if a called 'anvil-update-system' job failed. Signed-off-by: digimer --- share/words.xml | 4 ++++ tools/anvil-daemon | 12 +++++++++--- tools/anvil-safe-stop | 10 ++++++---- tools/anvil-update-system | 26 ++++++++++++++++++++++++++ tools/striker-update-cluster | 17 +++++++++++++++++ 5 files changed, 62 insertions(+), 7 deletions(-) diff --git a/share/words.xml b/share/words.xml index 57d838df..fae981e7 100644 --- a/share/words.xml +++ b/share/words.xml @@ -603,6 +603,10 @@ The error was: There was a problem with finding a common storage network between: [#!variable!node1_name!#] and: [#!variable!node2_name!#]. Found node 1 to have the IP: [#!variable!node1_ip!#] and node 2: [#!variable!node2_ip!#]. Is there a problem with '/etc/hosts'? Failed to find a network to use for storage replication. Is there a problem with '/etc/hosts'? '.]]> + Failed to withdraw the subnode from the node's cluster. Expected the 'anvil-safe-stop' call to return '0', but got: [#!variable!return_code!#]. The output, if anything, was: +======== +#!variable!output!# +======== diff --git a/tools/anvil-daemon b/tools/anvil-daemon index 4643c72c..441fa2c7 100755 --- a/tools/anvil-daemon +++ b/tools/anvil-daemon @@ -252,8 +252,8 @@ sub check_ram }}); if ($problem) { - # See if an 'anvil-sync-shared' job is running and, if so, don't exit. The file copy is - # counted and not an actual problem. + # See if an 'anvil-sync-shared', or an 'anvil-update-system' job is running and, if so, + # don't exit. The file copy or OS update is counted and not an actual problem. $anvil->Database->get_jobs({debug => 2}); foreach my $job_uuid (keys %{$anvil->data->{jobs}{running}}) { @@ -264,7 +264,13 @@ sub check_ram job_progress => $job_progress, }}); - if (($job_progress != 100) && ($job_command =~ /anvil-sync-shared/)) + if ( + ($job_progress != 100) && + ( + ($job_command =~ /anvil-update-system/) or + ($job_command =~ /anvil-sync-shared/) + ) + ) { # Don't abort. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0139", variables => { diff --git a/tools/anvil-safe-stop b/tools/anvil-safe-stop index 6c6b99cf..0ea2962e 100755 --- a/tools/anvil-safe-stop +++ b/tools/anvil-safe-stop @@ -274,9 +274,9 @@ sub process_servers 's2:progress_steps' => $progress_steps, }}); - # If we have one or more local servers, we need to know if both of us are in the cluster. If we're - # not, or the peer isn't, we can't migrate. - my $can_migrate = 0; + # If we have one or more local servers, we need to know if both subnodes are in the node's cluster. + # If we're not, or the peer isn't, we can't migrate. + my $can_migrate = 1; if ($server_count) { my $problem = $anvil->Cluster->parse_cib({debug => 2}); @@ -287,18 +287,20 @@ sub process_servers }}); if ($problem) { + # We're not in the node's cluster, we can't migrate. $can_migrate = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }}); } elsif ((not $anvil->data->{cib}{parsed}{'local'}{ready}) or (not $anvil->data->{cib}{parsed}{peer}{ready})) { + # One of the subnodes is not in the cluster, so we can't migrate. $can_migrate = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }}); } if ((not $anvil->data->{switches}{'stop-servers'}) && (not $can_migrate)) { - # Abort. + # We would have to stop the servers, and the user didn't tell us to do that, abort. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0372"}); $anvil->Job->update_progress({progress => 100, message => "error_0372"}); $anvil->nice_exit({exit_code => 1}); diff --git a/tools/anvil-update-system b/tools/anvil-update-system index 78b0497a..71898a6a 100755 --- a/tools/anvil-update-system +++ b/tools/anvil-update-system @@ -15,6 +15,7 @@ # 1 = No database connections available. # 2 = The job UUID was passed, but it wasn't valid. # 3 = It looks like the update failed, reset progress to '0'. +# 4 = Failed to withdraw the node from the cluster. # # TODO: # - Rebuild this to be 'striker-update-system' and have it update local strikers and all nodes. @@ -201,6 +202,31 @@ sub run_os_update output => $output, return_code => $return_code, }}); + + if ($return_code) + { + # Something went wrong, abort. + update_progress($anvil, 100, "error_0420,!!return_code!".$return_code."!!,!!output!".$output."!!"); + + # Set the job_data to 'failed' so that striker-update-cluster' knows to abort. + if ($anvil->data->{switches}{'job-uuid'}) + { + my $query = " +UPDATE + jobs +SET + job_data = 'failed', + modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)." +WHERE + job_uuid = ".$anvil->Database->quote($anvil->data->{switches}{'job-uuid'})." +;"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }}); + $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__}); + } + + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, key => "error_0035", variables => { output => $output } }); + $anvil->nice_exit({exit_code => 4}); + } } } diff --git a/tools/striker-update-cluster b/tools/striker-update-cluster index 5e9eabd5..2017e5ec 100755 --- a/tools/striker-update-cluster +++ b/tools/striker-update-cluster @@ -416,6 +416,14 @@ sub update_nodes $rebooted = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }}); } + + # Did it fail? + if ($anvil->data->{jobs}{job_data} eq "failed") + { + # Abort! + print "[ Error ] - There was a problem updating the subnode! Anvil! cluster update aborted.\n"; + $anvil->nice_exit({exit_code => 1}); + } } else { @@ -477,6 +485,7 @@ sub update_nodes }}); my ($output, $error, $return_code) = $anvil->Remote->call({ + debug => 2, shell_call => $shell_call, target => $anvil->data->{peer}{$short_host_name}{access}{ip}, }); @@ -881,6 +890,14 @@ sub update_strikers_and_dr $rebooted = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }}); } + + # Did it fail? + if ($anvil->data->{jobs}{job_data} eq "failed") + { + # Abort! + print "[ Error ] - There was a problem updating the system! Anvil! cluster update aborted.\n"; + $anvil->nice_exit({exit_code => 1}); + } } else {