From 156a0ca201ce227864ea38e249d3e5094c70bb3e Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 16 Jun 2023 11:43:49 -0400 Subject: [PATCH] Updated anvil-daemon's new job launching logic to allow the restart of a running job that failed out early. Signed-off-by: digimer --- Anvil/Tools.pm | 1 + share/words.xml | 1 + tools/anvil-daemon | 17 ++++++++++++++++- tools/anvil-watch-power | 2 ++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Anvil/Tools.pm b/Anvil/Tools.pm index fcd00b3a..4cd64e43 100644 --- a/Anvil/Tools.pm +++ b/Anvil/Tools.pm @@ -1095,6 +1095,7 @@ sub _set_paths bonds => "/proc/net/bonding", 'cgi-bin' => "/var/www/cgi-bin", drbd_resources => "/etc/drbd.d/", + drbd_kernel_proc => "/sys/kernel/debug/drbd/resources", fence_agents => "/usr/sbin", firewalld_services => "/usr/lib/firewalld/services", firewalld_zones_etc => "/etc/firewalld/zones", # Changes when firewall-cmd ... --permanent is used. diff --git a/share/words.xml b/share/words.xml index 6aa50e92..f61139c6 100644 --- a/share/words.xml +++ b/share/words.xml @@ -2393,6 +2393,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is: The server: [#!variable!server!#] needs it's pacemaker configuration updated. Running: [#!variable!command!#]. Running the scan-agent: [#!variable!agent!#] now to ensure that the database has an updated view of resources. I was about to start: [#!variable!command!#] with the job UUID: [#!variable!this_job_uuid!#]. However, another job using the same command with the job UUID: [#!variable!other_job_uuid!#]. To avoid race conditions, only one process with a given command is run at the same time. + The job with the command: [#!variable!command!#] and job UUID: [#!variable!job_uuid!#] is restarting. The host name: [#!variable!target!#] does not resolve to an IP address. diff --git a/tools/anvil-daemon b/tools/anvil-daemon index 3c1be196..12f167f8 100755 --- a/tools/anvil-daemon +++ b/tools/anvil-daemon @@ -1637,7 +1637,22 @@ sub run_jobs if (exists $anvil->data->{jobs}{running}{$started_job_uuid}) { - if ($anvil->data->{jobs}{running}{$started_job_uuid}{job_progress} != 100) + # If the previously running job and this job have the same + # UUID, it failed and needs to restart. + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + job_uuid => $job_uuid, + started_job_uuid => $started_job_uuid, + "jobs::running::${started_job_uuid}::job_progress" => $anvil->data->{jobs}{running}{$started_job_uuid}{job_progress}, + }}); + if ($started_job_uuid eq $job_uuid) + { + # We're restarting. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0741", variables => { + command => $job_command, + job_uuid => $job_uuid, + }}); + } + elsif ($anvil->data->{jobs}{running}{$started_job_uuid}{job_progress} != 100) { # Don't start it in this pass. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0741", variables => { diff --git a/tools/anvil-watch-power b/tools/anvil-watch-power index 197f74ca..20dc7af5 100755 --- a/tools/anvil-watch-power +++ b/tools/anvil-watch-power @@ -52,6 +52,8 @@ while(1) sleep 2; } +$anvil->nice_exit({exit_code => 0}); + sub show_power_data {