* Added checks to anvil-provision-server to fail out if either of the SN IPs is not found when generating a DRBD resource config.

* Added logging to anvil-provision-server and anvil-daemon to try to find the cause of jobs being re-run after completing. This may already have been fixed by a change that stops job_progress updates from going to 100 too early in some cases.

Signed-off-by: digimer <mkelly@alteeve.ca>
This commit is contained in:
digimer 2023-06-19 21:44:45 -04:00
parent 58371d22b6
commit c9e11fbbfc
5 changed files with 45 additions and 7 deletions

View File

@ -600,6 +600,8 @@ The error was:
#!variable!error!#
========
</key>
<key name="error_0417">There was a problem with finding a common storage network between: [#!variable!node1_name!#] and: [#!variable!node2_name!#] using the common interface: [#!variable!interface!#]. Found node 1 to have the IP: [#!variable!node1_ip!#] and node 2: [#!variable!node2_ip!#]. Is there a problem with '/etc/hosts'?</key>
<key name="error_0418">Failed to find a network to use for storage replication. Is there a problem with '/etc/hosts'?</key>
<!-- Files templates -->
<!-- NOTE: Translating these files requires an understanding of which lines are translatable -->

View File

@ -1414,14 +1414,20 @@ sub run_jobs
my $ended_within = $startup ? 1 : 300;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { ended_within => $ended_within }});
$anvil->Database->get_jobs({ended_within => $ended_within});
$anvil->Database->get_jobs({
debug => 2,
ended_within => $ended_within,
});
foreach my $modified_date (sort {$a cmp $b} keys %{$anvil->data->{jobs}{modified_date}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { modified_date => $modified_date }});
foreach my $job_uuid (sort {$a cmp $b} keys %{$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}})
{
# Reload the jobs so we get an updated view of them.
$anvil->Database->get_jobs({ended_within => $ended_within});
$anvil->Database->get_jobs({
debug => 2,
ended_within => $ended_within,
});
# Collect the data.
my $job_command = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_command};

View File

@ -195,8 +195,8 @@ sub migrate_server
if (not exists $anvil->data->{cib}{parsed}{data}{server}{$server})
{
# Nope.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0548", variables => { server => $server }});
$anvil->Job->update_progress({progress => 100, message => "log_0548,!!server!".$server."!!"});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0158", variables => { server => $server }});
$anvil->Job->update_progress({progress => 100, message => "error_0158,!!server!".$server."!!"});
$anvil->nice_exit({exit_code => 1});
}

View File

@ -71,7 +71,7 @@ if ($anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->Job->clear();
$anvil->Job->get_job_details();
$anvil->Job->get_job_details({debug => 2});
$anvil->Job->update_progress({
progress => 1,
job_picked_up_by => $$,
@ -346,6 +346,7 @@ sub run_jobs
# Done!
$anvil->Job->update_progress({
debug => 2,
progress => 100,
message => "job_0202",
});
@ -473,7 +474,7 @@ sub write_definition
# The peer is done, it'll pick up the XML definition when ScanCore runs
$anvil->Job->update_progress({
progress => 100,
progress => 85,
message => "job_0204",
});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0204"});
@ -1216,6 +1217,9 @@ sub get_sn_details
line => __LINE__,
});
$anvil->data->{job}{sn_network} = "";
$anvil->data->{job}{node1_sn_ip} = "";
$anvil->data->{job}{node2_sn_ip} = "";
my $node1_short_host_name = $anvil->data->{job}{node1_short_host_name};
my $node2_short_host_name = $anvil->data->{job}{node2_short_host_name};
my $matched_ips = keys %{$match};
@ -1226,6 +1230,20 @@ sub get_sn_details
{
### TODO: This always chooses SN1 at this time, we need to support (later) VM
### build-time SN selection when 2+ SNs exist.
# Valid data?
if ((not $match->{$node1_short_host_name}{$interface}{ip}) or (not $match->{$node2_short_host_name}{$interface}{ip}))
{
# Probably a bug, maybe a broken /etc/hosts file?
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0417", variables => {
node1_name => $node1_short_host_name,
node2_name => $node2_short_host_name,
interface => $interface,
node1_ip => defined $match->{$node1_short_host_name}{$interface}{ip} ? $match->{$node1_short_host_name}{$interface}{ip} : "",
node2_ip => defined $match->{$node2_short_host_name}{$interface}{ip} ? $match->{$node2_short_host_name}{$interface}{ip} : "",
}});
next;
}
# Found an SN.
$anvil->data->{job}{sn_network} = uc(($interface =~ /^(sn\d+)_/)[0]);
$anvil->data->{job}{node1_sn_ip} = $match->{$node1_short_host_name}{$interface}{ip};
@ -1238,6 +1256,18 @@ sub get_sn_details
}
}
if ((not $anvil->data->{job}{node1_sn_ip}) or (not $anvil->data->{job}{node2_sn_ip}))
{
# Fail out.
$anvil->Job->update_progress({
progress => 100,
message => "error_0418",
job_status => "failed",
});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0418"});
$anvil->nice_exit({exit_code => 1});
}
return(0);
}

View File

@ -17,7 +17,7 @@
# 3 = It looks like the update failed, reset progress to '0'.
#
# TODO:
# - Use this to update local repos for when we get to the Install Manifest stage.
# - Rebuild this to be 'striker-update-system' and have it update local strikers and all nodes.
#
use strict;