From c9e11fbbfc756dc9aa962081d7a16bea254a1cad Mon Sep 17 00:00:00 2001 From: digimer Date: Mon, 19 Jun 2023 21:44:45 -0400 Subject: [PATCH] * Added checks to anvil-provision-server to fail out if either of the SN IPs is not found when generating a DRBD resource config. * Added logging to anvil-provision-server and anvil-daemon to try to find the cause of jobs being re-run after completing. This may already have been fixed by correcting job_progress updates that were going to 100 too early in some cases. Signed-off-by: digimer --- share/words.xml | 2 ++ tools/anvil-daemon | 10 ++++++++-- tools/anvil-migrate-server | 4 ++-- tools/anvil-provision-server | 34 ++++++++++++++++++++++++++++++++-- tools/anvil-update-system | 2 +- 5 files changed, 45 insertions(+), 7 deletions(-) diff --git a/share/words.xml b/share/words.xml index f61139c6..3a4baadf 100644 --- a/share/words.xml +++ b/share/words.xml @@ -600,6 +600,8 @@ The error was: #!variable!error!# ======== + There was a problem with finding a common storage network between: [#!variable!node1_name!#] and: [#!variable!node2_name!#] using the common interface: [#!variable!interface!#]. Found node 1 to have the IP: [#!variable!node1_ip!#] and node 2: [#!variable!node2_ip!#]. Is there a problem with '/etc/hosts'? + Failed to find a network to use for storage replication. Is there a problem with '/etc/hosts'? diff --git a/tools/anvil-daemon b/tools/anvil-daemon index 12f167f8..2116e9a7 100755 --- a/tools/anvil-daemon +++ b/tools/anvil-daemon @@ -1414,14 +1414,20 @@ sub run_jobs my $ended_within = $startup ? 
1 : 300; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { ended_within => $ended_within }}); - $anvil->Database->get_jobs({ended_within => $ended_within}); + $anvil->Database->get_jobs({ + debug => 2, + ended_within => $ended_within, + }); foreach my $modified_date (sort {$a cmp $b} keys %{$anvil->data->{jobs}{modified_date}}) { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { modified_date => $modified_date }}); foreach my $job_uuid (sort {$a cmp $b} keys %{$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}}) { # Reload the jobs so we get an updated view of them. - $anvil->Database->get_jobs({ended_within => $ended_within}); + $anvil->Database->get_jobs({ + debug => 2, + ended_within => $ended_within, + }); # Collect the data. my $job_command = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_command}; diff --git a/tools/anvil-migrate-server b/tools/anvil-migrate-server index fd6b75a3..826bd3ee 100755 --- a/tools/anvil-migrate-server +++ b/tools/anvil-migrate-server @@ -195,8 +195,8 @@ sub migrate_server if (not exists $anvil->data->{cib}{parsed}{data}{server}{$server}) { # Nope. 
- $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0548", variables => { server => $server }}); - $anvil->Job->update_progress({progress => 100, message => "log_0548,!!server!".$server."!!"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0158", variables => { server => $server }}); + $anvil->Job->update_progress({progress => 100, message => "error_0158,!!server!".$server."!!"}); $anvil->nice_exit({exit_code => 1}); } diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index 452e1577..e18d1a80 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -71,7 +71,7 @@ if ($anvil->data->{switches}{'job-uuid'}) { # Load the job data. $anvil->Job->clear(); - $anvil->Job->get_job_details(); + $anvil->Job->get_job_details({debug => 2}); $anvil->Job->update_progress({ progress => 1, job_picked_up_by => $$, @@ -346,6 +346,7 @@ sub run_jobs # Done! $anvil->Job->update_progress({ + debug => 2, progress => 100, message => "job_0202", }); @@ -473,7 +474,7 @@ sub write_definition # The peer is done, it'll pick up the XML definition when ScanCore runs $anvil->Job->update_progress({ - progress => 100, + progress => 85, message => "job_0204", }); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0204"}); @@ -1216,6 +1217,9 @@ sub get_sn_details line => __LINE__, }); + $anvil->data->{job}{sn_network} = ""; + $anvil->data->{job}{node1_sn_ip} = ""; + $anvil->data->{job}{node2_sn_ip} = ""; my $node1_short_host_name = $anvil->data->{job}{node1_short_host_name}; my $node2_short_host_name = $anvil->data->{job}{node2_short_host_name}; my $matched_ips = keys %{$match}; @@ -1226,6 +1230,20 @@ sub get_sn_details { ### TODO: This always chooses SN1 at this time, we need to support (later) VM ### build-time SN selection when 2+ SNs exist. + # Valid data? 
+ if ((not $match->{$node1_short_host_name}{$interface}{ip}) or (not $match->{$node2_short_host_name}{$interface}{ip})) + { + # Probably a bug, maybe a broken /etc/hosts file? + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0417", variables => { + node1_name => $node1_short_host_name, + node2_name => $node2_short_host_name, + interface => $interface, + node1_ip => defined $match->{$node1_short_host_name}{$interface}{ip} ? $match->{$node1_short_host_name}{$interface}{ip} : "", + node2_ip => defined $match->{$node2_short_host_name}{$interface}{ip} ? $match->{$node2_short_host_name}{$interface}{ip} : "", + }}); + next; + } + # Found an SN. $anvil->data->{job}{sn_network} = uc(($interface =~ /^(sn\d+)_/)[0]); $anvil->data->{job}{node1_sn_ip} = $match->{$node1_short_host_name}{$interface}{ip}; @@ -1238,6 +1256,18 @@ sub get_sn_details } } + if ((not $anvil->data->{job}{node1_sn_ip}) or (not $anvil->data->{job}{node2_sn_ip})) + { + # Fail out. + $anvil->Job->update_progress({ + progress => 100, + message => "error_0418", + job_status => "failed", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0418"}); + $anvil->nice_exit({exit_code => 1}); + } + return(0); } diff --git a/tools/anvil-update-system b/tools/anvil-update-system index 78874e2f..f73c0d44 100755 --- a/tools/anvil-update-system +++ b/tools/anvil-update-system @@ -17,7 +17,7 @@ # 3 = It looks like the update failed, reset progress to '0'. # # TODO: -# - Use this to update local repos for when we get to the Install Manifest stage. +# - Rebuild this to be 'striker-update-system' and have it update local strikers and all nodes. # use strict;