* Updated anvil-daemon to not exit on RAM use if anvil-update-system is running.

* Fixed a bug in anvil-safe-stop where it wouldn't trigger a migration when the peer is online.
* Updated anvil-update-system to set job_data to 'failed' and exit with rc 4 if the OS update failed.
* Got striker-update-cluster to error out and exit if a called 'anvil-update-system' job failed.

Signed-off-by: digimer <mkelly@alteeve.ca>
main
digimer 1 year ago
parent c1e4380a64
commit 751687129a
  1. 4
      share/words.xml
  2. 12
      tools/anvil-daemon
  3. 10
      tools/anvil-safe-stop
  4. 26
      tools/anvil-update-system
  5. 17
      tools/striker-update-cluster

@ -603,6 +603,10 @@ The error was:
<key name="error_0417">There was a problem with finding a common storage network between: [#!variable!node1_name!#] and: [#!variable!node2_name!#]. Found node 1 to have the IP: [#!variable!node1_ip!#] and node 2: [#!variable!node2_ip!#]. Is there a problem with '/etc/hosts'?</key>
<key name="error_0418">Failed to find a network to use for storage replication. Is there a problem with '/etc/hosts'?</key>
<key name="error_0419"><![CDATA[[ Error ] - The resource to refresh must be provided with '--resource <res>'.]]></key>
<key name="error_0420">Failed to withdraw the subnode from the node's cluster. Expected the 'anvil-safe-stop' call to return '0', but got: [#!variable!return_code!#]. The output, if anything, was:
========
#!variable!output!#
========</key>
<!-- Files templates -->
<!-- NOTE: Translating these files requires an understanding of which lines are translatable -->

@ -252,8 +252,8 @@ sub check_ram
}});
if ($problem)
{
# See if an 'anvil-sync-shared' job is running and, if so, don't exit. The file copy is
# counted and not an actual problem.
# See if an 'anvil-sync-shared', or an 'anvil-update-system' job is running and, if so,
# don't exit. The file copy or OS update is counted and not an actual problem.
$anvil->Database->get_jobs({debug => 2});
foreach my $job_uuid (keys %{$anvil->data->{jobs}{running}})
{
@ -264,7 +264,13 @@ sub check_ram
job_progress => $job_progress,
}});
if (($job_progress != 100) && ($job_command =~ /anvil-sync-shared/))
if (
($job_progress != 100) &&
(
($job_command =~ /anvil-update-system/) or
($job_command =~ /anvil-sync-shared/)
)
)
{
# Don't abort.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0139", variables => {

@ -274,9 +274,9 @@ sub process_servers
's2:progress_steps' => $progress_steps,
}});
# If we have one or more local servers, we need to know if both of us are in the cluster. If we're
# not, or the peer isn't, we can't migrate.
my $can_migrate = 0;
# If we have one or more local servers, we need to know if both subnodes are in the node's cluster.
# If we're not, or the peer isn't, we can't migrate.
my $can_migrate = 1;
if ($server_count)
{
my $problem = $anvil->Cluster->parse_cib({debug => 2});
@ -287,18 +287,20 @@ sub process_servers
}});
if ($problem)
{
# We're not in the node's cluster, we can't migrate.
$can_migrate = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }});
}
elsif ((not $anvil->data->{cib}{parsed}{'local'}{ready}) or (not $anvil->data->{cib}{parsed}{peer}{ready}))
{
# One of the subnodes is not in the cluster, so we can't migrate.
$can_migrate = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }});
}
if ((not $anvil->data->{switches}{'stop-servers'}) && (not $can_migrate))
{
# Abort.
# We would have to stop the servers, and the user didn't tell us to do that, abort.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0372"});
$anvil->Job->update_progress({progress => 100, message => "error_0372"});
$anvil->nice_exit({exit_code => 1});

@ -15,6 +15,7 @@
# 1 = No database connections available.
# 2 = The job UUID was passed, but it wasn't valid.
# 3 = It looks like the update failed, reset progress to '0'.
# 4 = Failed to withdraw the node from the cluster.
#
# TODO:
# - Rebuild this to be 'striker-update-system' and have it update local strikers and all nodes.
@ -201,6 +202,31 @@ sub run_os_update
output => $output,
return_code => $return_code,
}});
if ($return_code)
{
# Something went wrong, abort.
update_progress($anvil, 100, "error_0420,!!return_code!".$return_code."!!,!!output!".$output."!!");
# Set the job_data to 'failed' so that 'striker-update-cluster' knows to abort.
if ($anvil->data->{switches}{'job-uuid'})
{
my $query = "
UPDATE
jobs
SET
job_data = 'failed',
modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)."
WHERE
job_uuid = ".$anvil->Database->quote($anvil->data->{switches}{'job-uuid'})."
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, key => "error_0035", variables => { output => $output } });
$anvil->nice_exit({exit_code => 4});
}
}
}

@ -416,6 +416,14 @@ sub update_nodes
$rebooted = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }});
}
# Did it fail?
if ($anvil->data->{jobs}{job_data} eq "failed")
{
# Abort!
print "[ Error ] - There was a problem updating the subnode! Anvil! cluster update aborted.\n";
$anvil->nice_exit({exit_code => 1});
}
}
else
{
@ -477,6 +485,7 @@ sub update_nodes
}});
my ($output, $error, $return_code) = $anvil->Remote->call({
debug => 2,
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
@ -881,6 +890,14 @@ sub update_strikers_and_dr
$rebooted = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }});
}
# Did it fail?
if ($anvil->data->{jobs}{job_data} eq "failed")
{
# Abort!
print "[ Error ] - There was a problem updating the system! Anvil! cluster update aborted.\n";
$anvil->nice_exit({exit_code => 1});
}
}
else
{

Loading…
Cancel
Save