* Added checks to anvil-provision-server to fail out if either of the SN IPs is not found when generating a DRBD resource config.
* Added logging to anvil-provision-server and anvil-daemon to try to find the cause of jobs being re-run after completing. This may already be fixed by a correction to job_progress updates, which were going to 100 too early in some cases. Signed-off-by: digimer <mkelly@alteeve.ca>
This commit is contained in:
parent
58371d22b6
commit
c9e11fbbfc
@ -600,6 +600,8 @@ The error was:
|
||||
#!variable!error!#
|
||||
========
|
||||
</key>
|
||||
<key name="error_0417">There was a problem with finding a common storage network between: [#!variable!node1_name!#] and: [#!variable!node2_name!#] using the common interface: [#!variable!interface!#]. Found node 1 to have the IP: [#!variable!node1_ip!#] and node 2: [#!variable!node2_ip!#]. Is there a problem with '/etc/hosts'?</key>
|
||||
<key name="error_0418">Failed to find a network to use for storage replication. Is there a problem with '/etc/hosts'?</key>
|
||||
|
||||
<!-- Files templates -->
|
||||
<!-- NOTE: Translating these files requires an understanding of which lines are translatable -->
|
||||
|
@ -1414,14 +1414,20 @@ sub run_jobs
|
||||
my $ended_within = $startup ? 1 : 300;
|
||||
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { ended_within => $ended_within }});
|
||||
|
||||
$anvil->Database->get_jobs({ended_within => $ended_within});
|
||||
$anvil->Database->get_jobs({
|
||||
debug => 2,
|
||||
ended_within => $ended_within,
|
||||
});
|
||||
foreach my $modified_date (sort {$a cmp $b} keys %{$anvil->data->{jobs}{modified_date}})
|
||||
{
|
||||
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { modified_date => $modified_date }});
|
||||
foreach my $job_uuid (sort {$a cmp $b} keys %{$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}})
|
||||
{
|
||||
# Reload the jobs so we get an updated view of them.
|
||||
$anvil->Database->get_jobs({ended_within => $ended_within});
|
||||
$anvil->Database->get_jobs({
|
||||
debug => 2,
|
||||
ended_within => $ended_within,
|
||||
});
|
||||
|
||||
# Collect the data.
|
||||
my $job_command = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_command};
|
||||
|
@ -195,8 +195,8 @@ sub migrate_server
|
||||
if (not exists $anvil->data->{cib}{parsed}{data}{server}{$server})
|
||||
{
|
||||
# Nope.
|
||||
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0548", variables => { server => $server }});
|
||||
$anvil->Job->update_progress({progress => 100, message => "log_0548,!!server!".$server."!!"});
|
||||
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0158", variables => { server => $server }});
|
||||
$anvil->Job->update_progress({progress => 100, message => "error_0158,!!server!".$server."!!"});
|
||||
$anvil->nice_exit({exit_code => 1});
|
||||
}
|
||||
|
||||
|
@ -71,7 +71,7 @@ if ($anvil->data->{switches}{'job-uuid'})
|
||||
{
|
||||
# Load the job data.
|
||||
$anvil->Job->clear();
|
||||
$anvil->Job->get_job_details();
|
||||
$anvil->Job->get_job_details({debug => 2});
|
||||
$anvil->Job->update_progress({
|
||||
progress => 1,
|
||||
job_picked_up_by => $$,
|
||||
@ -346,6 +346,7 @@ sub run_jobs
|
||||
|
||||
# Done!
|
||||
$anvil->Job->update_progress({
|
||||
debug => 2,
|
||||
progress => 100,
|
||||
message => "job_0202",
|
||||
});
|
||||
@ -473,7 +474,7 @@ sub write_definition
|
||||
|
||||
# The peer is done, it'll pick up the XML definition when ScanCore runs
|
||||
$anvil->Job->update_progress({
|
||||
progress => 100,
|
||||
progress => 85,
|
||||
message => "job_0204",
|
||||
});
|
||||
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0204"});
|
||||
@ -1216,6 +1217,9 @@ sub get_sn_details
|
||||
line => __LINE__,
|
||||
});
|
||||
|
||||
$anvil->data->{job}{sn_network} = "";
|
||||
$anvil->data->{job}{node1_sn_ip} = "";
|
||||
$anvil->data->{job}{node2_sn_ip} = "";
|
||||
my $node1_short_host_name = $anvil->data->{job}{node1_short_host_name};
|
||||
my $node2_short_host_name = $anvil->data->{job}{node2_short_host_name};
|
||||
my $matched_ips = keys %{$match};
|
||||
@ -1226,6 +1230,20 @@ sub get_sn_details
|
||||
{
|
||||
### TODO: This always chooses SN1 at this time, we need to support (later) VM
|
||||
### build-time SN selection when 2+ SNs exist.
|
||||
# Valid data?
|
||||
if ((not $match->{$node1_short_host_name}{$interface}{ip}) or (not $match->{$node2_short_host_name}{$interface}{ip}))
|
||||
{
|
||||
# Probably a bug, maybe a broken /etc/hosts file?
|
||||
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0417", variables => {
|
||||
node1_name => $node1_short_host_name,
|
||||
node2_name => $node2_short_host_name,
|
||||
interface => $interface,
|
||||
node1_ip => defined $match->{$node1_short_host_name}{$interface}{ip} ? $match->{$node1_short_host_name}{$interface}{ip} : "",
|
||||
node2_ip => defined $match->{$node2_short_host_name}{$interface}{ip} ? $match->{$node2_short_host_name}{$interface}{ip} : "",
|
||||
}});
|
||||
next;
|
||||
}
|
||||
|
||||
# Found an SN.
|
||||
$anvil->data->{job}{sn_network} = uc(($interface =~ /^(sn\d+)_/)[0]);
|
||||
$anvil->data->{job}{node1_sn_ip} = $match->{$node1_short_host_name}{$interface}{ip};
|
||||
@ -1238,6 +1256,18 @@ sub get_sn_details
|
||||
}
|
||||
}
|
||||
|
||||
if ((not $anvil->data->{job}{node1_sn_ip}) or (not $anvil->data->{job}{node2_sn_ip}))
|
||||
{
|
||||
# Fail out.
|
||||
$anvil->Job->update_progress({
|
||||
progress => 100,
|
||||
message => "error_0418",
|
||||
job_status => "failed",
|
||||
});
|
||||
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0418"});
|
||||
$anvil->nice_exit({exit_code => 1});
|
||||
}
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
@ -17,7 +17,7 @@
|
||||
# 3 = It looks like the update failed, reset progress to '0'.
|
||||
#
|
||||
# TODO:
|
||||
# - Use this to update local repos for when we get to the Install Manifest stage.
|
||||
# - Rebuild this to be 'striker-update-system' and have it update local strikers and all nodes.
|
||||
#
|
||||
|
||||
use strict;
|
||||
|
Loading…
Reference in New Issue
Block a user