diff --git a/share/words.xml b/share/words.xml
index 57d838df..fae981e7 100644
--- a/share/words.xml
+++ b/share/words.xml
@@ -603,6 +603,10 @@ The error was:
There was a problem with finding a common storage network between: [#!variable!node1_name!#] and: [#!variable!node2_name!#]. Found node 1 to have the IP: [#!variable!node1_ip!#] and node 2: [#!variable!node2_ip!#]. Is there a problem with '/etc/hosts'?
Failed to find a network to use for storage replication. Is there a problem with '/etc/hosts'?
'.]]>
+ Failed to withdraw the subnode from the node's cluster. Expected the 'anvil-safe-stop' call to return '0', but got: [#!variable!return_code!#]. The output, if anything, was:
+========
+#!variable!output!#
+========
diff --git a/tools/anvil-daemon b/tools/anvil-daemon
index 4643c72c..441fa2c7 100755
--- a/tools/anvil-daemon
+++ b/tools/anvil-daemon
@@ -252,8 +252,8 @@ sub check_ram
}});
if ($problem)
{
- # See if an 'anvil-sync-shared' job is running and, if so, don't exit. The file copy is
- # counted and not an actual problem.
+ # See if an 'anvil-sync-shared' or an 'anvil-update-system' job is running and, if so,
+ # don't exit. The file copy or OS update is counted, but is not an actual problem.
$anvil->Database->get_jobs({debug => 2});
foreach my $job_uuid (keys %{$anvil->data->{jobs}{running}})
{
@@ -264,7 +264,13 @@ sub check_ram
job_progress => $job_progress,
}});
- if (($job_progress != 100) && ($job_command =~ /anvil-sync-shared/))
+ if (
+ ($job_progress != 100) &&
+ (
+ ($job_command =~ /anvil-update-system/) or
+ ($job_command =~ /anvil-sync-shared/)
+ )
+ )
{
# Don't abort.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0139", variables => {
diff --git a/tools/anvil-safe-stop b/tools/anvil-safe-stop
index 6c6b99cf..0ea2962e 100755
--- a/tools/anvil-safe-stop
+++ b/tools/anvil-safe-stop
@@ -274,9 +274,9 @@ sub process_servers
's2:progress_steps' => $progress_steps,
}});
- # If we have one or more local servers, we need to know if both of us are in the cluster. If we're
- # not, or the peer isn't, we can't migrate.
- my $can_migrate = 0;
+ # If we have one or more local servers, we need to know if both subnodes are in the node's cluster.
+ # If we're not, or the peer isn't, we can't migrate.
+ my $can_migrate = 1;
if ($server_count)
{
my $problem = $anvil->Cluster->parse_cib({debug => 2});
@@ -287,18 +287,20 @@ sub process_servers
}});
if ($problem)
{
+ # We're not in the node's cluster, we can't migrate.
$can_migrate = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }});
}
elsif ((not $anvil->data->{cib}{parsed}{'local'}{ready}) or (not $anvil->data->{cib}{parsed}{peer}{ready}))
{
+ # One of the subnodes is not in the cluster, so we can't migrate.
$can_migrate = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }});
}
if ((not $anvil->data->{switches}{'stop-servers'}) && (not $can_migrate))
{
- # Abort.
+ # We would have to stop the servers, and the user didn't tell us to do that, abort.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0372"});
$anvil->Job->update_progress({progress => 100, message => "error_0372"});
$anvil->nice_exit({exit_code => 1});
diff --git a/tools/anvil-update-system b/tools/anvil-update-system
index 78b0497a..71898a6a 100755
--- a/tools/anvil-update-system
+++ b/tools/anvil-update-system
@@ -15,6 +15,7 @@
# 1 = No database connections available.
# 2 = The job UUID was passed, but it wasn't valid.
# 3 = It looks like the update failed, reset progress to '0'.
+# 4 = Failed to withdraw the subnode from the node's cluster.
#
# TODO:
# - Rebuild this to be 'striker-update-system' and have it update local strikers and all nodes.
@@ -201,6 +202,31 @@ sub run_os_update
output => $output,
return_code => $return_code,
}});
+
+ if ($return_code)
+ {
+ # Something went wrong, abort.
+ update_progress($anvil, 100, "error_0420,!!return_code!".$return_code."!!,!!output!".$output."!!");
+
+ # Set the job_data to 'failed' so that 'striker-update-cluster' knows to abort.
+ if ($anvil->data->{switches}{'job-uuid'})
+ {
+ my $query = "
+UPDATE
+ jobs
+SET
+ job_data = 'failed',
+ modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)."
+WHERE
+ job_uuid = ".$anvil->Database->quote($anvil->data->{switches}{'job-uuid'})."
+;";
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
+ $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
+ }
+
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, key => "error_0035", variables => { output => $output } });
+ $anvil->nice_exit({exit_code => 4});
+ }
}
}
diff --git a/tools/striker-update-cluster b/tools/striker-update-cluster
index 5e9eabd5..2017e5ec 100755
--- a/tools/striker-update-cluster
+++ b/tools/striker-update-cluster
@@ -416,6 +416,14 @@ sub update_nodes
$rebooted = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }});
}
+
+ # Did it fail?
+ if ($anvil->data->{jobs}{job_data} eq "failed")
+ {
+ # Abort!
+ print "[ Error ] - There was a problem updating the subnode! Anvil! cluster update aborted.\n";
+ $anvil->nice_exit({exit_code => 1});
+ }
}
else
{
@@ -477,6 +485,7 @@ sub update_nodes
}});
my ($output, $error, $return_code) = $anvil->Remote->call({
+ debug => 2,
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
@@ -881,6 +890,14 @@ sub update_strikers_and_dr
$rebooted = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }});
}
+
+ # Did it fail?
+ if ($anvil->data->{jobs}{job_data} eq "failed")
+ {
+ # Abort!
+ print "[ Error ] - There was a problem updating the system! Anvil! cluster update aborted.\n";
+ $anvil->nice_exit({exit_code => 1});
+ }
}
else
{