From 6f0bc0d86fa2b7590c747498258586ac1c160e8f Mon Sep 17 00:00:00 2001 From: Digimer Date: Sat, 6 Oct 2018 03:16:08 -0400 Subject: [PATCH] * Fixed a major bug where anvil-daemon would reset the job_progress to 0 when clearing the 'reboot_needed' flag, causing anvil-daemon to pick the job up and again reboot, repeatedly. * Updated Jobs->update_progress to take 'picked_up_by' as an optional parameter, defaulting to '$$' (the caller's PID). * Created System->get_uptime() to return the current uptime in seconds. * Added a delay to anvil-manage-power to not proceed with a reboot if the uptime is less than 600 seconds. This way, if any future bug causes an infinite reboot, there will be more time to determine what's wrong and debug the system between reboots. Signed-off-by: Digimer --- Anvil/Tools/Job.pm | 26 +++++++++++++------ Anvil/Tools/System.pm | 30 +++++++++++++++++++++ share/words.xml | 4 +++ tools/anvil-daemon | 56 ++++++++++++++++++---------------------- tools/anvil-manage-power | 38 +++++++++++++++++++++++---- 5 files changed, 110 insertions(+), 44 deletions(-) diff --git a/Anvil/Tools/Job.pm b/Anvil/Tools/Job.pm index f53129be..eedda4e3 100755 --- a/Anvil/Tools/Job.pm +++ b/Anvil/Tools/Job.pm @@ -210,6 +210,10 @@ This is the UUID of the job to update. If it isn't set, but C<< jobs::job_uuid > If set, this message will be appended to C<< job_status >>. If set to 'C<< clear >>', previous records will be removed. +=head3 picked_up_by (optional, default '$$' (caller's PID)) + +If set, this is used for the C<< job_picked_up_by >> column. If it isn't set, the process ID of the caller is used. + =head3 progress (required) This is a number to set the current progress to. @@ -222,15 +226,22 @@ sub update_progress my $anvil = $self->parent; my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; - my $job_uuid = defined $parameter->{job_uuid} ? $parameter->{job_uuid} : ""; - my $message = defined $parameter->{message} ? $parameter->{message} : ""; - my $progress = defined $parameter->{progress} ? $parameter->{progress} : ""; + my $job_uuid = defined $parameter->{job_uuid} ? $parameter->{job_uuid} : ""; + my $message = defined $parameter->{message} ? $parameter->{message} : ""; + my $picked_up_by = defined $parameter->{picked_up_by} ? $parameter->{picked_up_by} : ""; + my $progress = defined $parameter->{progress} ? $parameter->{progress} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + picked_up_by => $picked_up_by, progress => $progress, message => $message, job_uuid => $job_uuid, "jobs::job_uuid" => $anvil->data->{jobs}{job_uuid}, }}); + if ($picked_up_by eq "") + { + $picked_up_by = $$; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { picked_up_by => $picked_up_by }}); + } $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { job_uuid => $job_uuid }}); if ((not $job_uuid) && ($anvil->data->{jobs}{job_uuid})) @@ -269,14 +280,13 @@ sub update_progress } # Get the current job_status and append this new one. - my $job_picked_up_by = $$; my $job_picked_up_at = 0; my $job_status = ""; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { message => $message, job_picked_up_by => $job_picked_up_by }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { message => $message, picked_up_by => $picked_up_by }}); if ($message eq "clear") { - $job_picked_up_by = 0; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { job_picked_up_by => $job_picked_up_by }}); + $picked_up_by = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { picked_up_by => $picked_up_by }}); } else { @@ -363,7 +373,7 @@ WHERE debug => $debug, update_progress_only => 1, job_uuid => $job_uuid, - job_picked_up_by => $job_picked_up_by, + job_picked_up_by => $picked_up_by, job_picked_up_at => $job_picked_up_at, job_progress => $progress, job_status => $job_status, diff --git a/Anvil/Tools/System.pm b/Anvil/Tools/System.pm index 4cc9dd56..e1470e1f 100755 --- a/Anvil/Tools/System.pm +++ b/Anvil/Tools/System.pm @@ -882,6 +882,36 @@ sub get_ips return(0); } +=head2 get_uptime + +This returns, in seconds, how long the host has been up and running for. + +This method takes no parameters. + +=cut +sub get_uptime +{ + my $self = shift; + my $parameter = shift; + my $anvil = $self->parent; + my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "System->_is_local()" }}); + + my $uptime = $anvil->Storage->read_file({ + force_read => 1, + cache => 0, + file => $anvil->data->{path}{proc}{uptime}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uptime => $uptime }}); + + # Clean it up. We'll have gotten two numbers, the uptime in seconds (to two decimal places) and the + # total idle time. We only care about the int number. + $uptime =~ s/^(\d+)\..*$/$1/; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uptime => $uptime }}); + + return($uptime); +} + =head2 hostname Get or set the local hostname. The current (or new) "static" (traditional) host name and the "pretty" (descriptive) host names are returned. diff --git a/share/words.xml b/share/words.xml index c131d398..25f4ec43 100644 --- a/share/words.xml +++ b/share/words.xml @@ -372,6 +372,10 @@ The database connection error was: Writing: [#!variable!to_write!#] record(s) to resync the database on: [#!variable!host_name!#]. The connection to the database on: [#!variable!host!#] isn't established, trying again... The connection to the database on: [#!variable!host!#] has been successfully established. + The system has only been running for: [#!variable!uptime!#] seconds. To minimize the impact of a bug causing a rapid reboot cycle, the request to: [#!variable!task!#] will be paused until the system has been running for at least ten minutes. We will proceed in: [#!variable!difference!#] seconds (at #!variable!say_time!#). + power off + reboot + Delay complete, proceeding with the #!variable!task!# operation now. Test diff --git a/tools/anvil-daemon b/tools/anvil-daemon index 66d33723..79e54b6e 100755 --- a/tools/anvil-daemon +++ b/tools/anvil-daemon @@ -207,19 +207,10 @@ sub boot_time_tasks my ($anvil) = @_; # If the uptime is less than ten minutes, clear the reboot flag. - my $uptime = $anvil->Storage->read_file({ - force_read => 1, - cache => 0, - file => $anvil->data->{path}{proc}{uptime}, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { uptime => $uptime }}); - - # Clean it up. We'll have gotten two numbers, the uptime in seconds (to two decimal places) and the - # total idle time. We only care about the int number. - $uptime =~ s/^(\d+)\..*$/$1/; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { uptime => $uptime }}); + my $uptime = $anvil->System->get_uptime; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime }}); - # Now find out if a reboot is needed and when it was last changed. + # Now find out if a reboot is listed as needed and when it was last changed. my $reboot_needed = 0; my $changed_seconds_ago = 0; my $query = " @@ -235,9 +226,10 @@ AND AND variable_name = 'reboot::needed' ;"; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }}); my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__}); my $count = @{$results}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { results => $results, count => $count, }}); @@ -246,7 +238,7 @@ AND $reboot_needed = $results->[0]->[0]; $changed_seconds_ago = $results->[0]->[1]; $changed_seconds_ago =~ s/^(\d+)\..*$/$1/; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed, changed_seconds_ago => $changed_seconds_ago, }}); @@ -255,34 +247,36 @@ AND # If a reboot is needed, see if the uptime is less than the time since the reboot needed flag was # set. If the uptime is less, then the system rebooted since it was requested so clear it. h/t to # Lisa Seelye (@thedoh) for this idea! - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { - reboot_needed => $reboot_needed, - changed_seconds_ago => $changed_seconds_ago, - uptime => $uptime, + my $difference = ($changed_seconds_ago - $uptime); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "s1:reboot_needed" => $reboot_needed, + "s2:changed_seconds_ago" => $changed_seconds_ago, + "s3:uptime" => $uptime, + "s4:difference" => $difference, }}); if (($reboot_needed) && ($uptime < $changed_seconds_ago)) { # Clear the reboot request. - $reboot_needed = $anvil->System->reboot_needed({set => 0}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { reboot_needed => $reboot_needed }}); + $reboot_needed = $anvil->System->reboot_needed({debug => 2, set => 0}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }}); # Check to see if there was a reboot job in progress. If so, finish it off. - my $job_uuid = $anvil->Job->get_job_uuid({program => "anvil-manage-power"}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { job_uuid => $job_uuid }}); + my $job_uuid = $anvil->Job->get_job_uuid({debug => 2, program => "anvil-manage-power"}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); if ($job_uuid) { # Update the percentage to '100' and then clear the old PID. my $date_time = $anvil->Get->date_and_time(); - if ($anvil->data->{jobs}{job_uuid}) - { - $anvil->Job->update_progress({ - progress => 100, - message => "message_0064,!!date_and_time!".$date_time."!!", - job_uuid => $job_uuid - }); - } - $anvil->Job->clear({job_uuid => $job_uuid}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { date_time => $date_time }}); + + $anvil->Job->update_progress({ + debug => 2, + progress => 100, + message => "message_0064,!!date_and_time!".$date_time."!!", + job_uuid => $job_uuid, + picked_up_by => 0, + }); } } diff --git a/tools/anvil-manage-power b/tools/anvil-manage-power index eb27cf6c..08f5413b 100755 --- a/tools/anvil-manage-power +++ b/tools/anvil-manage-power @@ -34,7 +34,7 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1}); -$anvil->Storage->read_config({file => "/etc/anvil/anvil.conf"}); +$anvil->Storage->read_config({file => $anvil->data->{path}{configs}{'anvil.conf'}}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); # Read switches @@ -45,6 +45,7 @@ $anvil->data->{switches}{'y'} = ""; $anvil->data->{switches}{'yes'} = ""; $anvil->data->{switches}{'reboot-needed'} = ""; $anvil->data->{switches}{'job-uuid'} = ""; +$anvil->data->{switches}{'no-delay'} = ""; $anvil->Get->switches; if ($anvil->data->{switches}{'power-off'}) @@ -58,7 +59,7 @@ if ($anvil->data->{switches}{'yes'}) # Connect to DBs. $anvil->Database->connect; -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"}); if (not $anvil->data->{sys}{database}{connections}) { # No databases, exit. @@ -128,7 +129,7 @@ if ($anvil->data->{switches}{'reboot-needed'} eq "1") # Enable if (not $reboot_needed) { - $reboot_needed = $anvil->System->reboot_needed({set => 1}); + $reboot_needed = $anvil->System->reboot_needed({debug => 2, set => 1}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }}); print $anvil->Words->string({key => "message_0048"})."\n"; } @@ -143,7 +144,7 @@ elsif ($anvil->data->{switches}{'reboot-needed'} eq "0") # Disabled if ($reboot_needed) { - $reboot_needed = $anvil->System->reboot_needed({set => 0}); + $reboot_needed = $anvil->System->reboot_needed({debug => 2, set => 0}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }}); print $anvil->Words->string({key => "message_0050"})."\n"; } @@ -162,7 +163,7 @@ elsif ($anvil->data->{switches}{'reboot-needed'}) # Get the current state if ($reboot_needed) { - # Report that we're in mainteance mode + # Report that we need to reboot print $anvil->Words->string({key => "message_0053"})."\n"; } else @@ -185,9 +186,36 @@ sub do_poweroff my ($anvil, $task) = @_; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { task => $task }}); + # We'll wait until the system has at least 10 minutes of uptime, unless '--no-wait' was given. + my $uptime = $anvil->data->{switches}{'no-wait'} ? 0 : $anvil->System->get_uptime; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "switches::no-wait" => $anvil->data->{switches}{'no-delay'}, + uptime => $uptime, + }}); + my $say_task = $task eq "poweroff" ? "message_0062" : "message_0063"; my $percent = $task eq "poweroff" ? 100 : 50; print $anvil->Words->string({key => $say_task})."\n"; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => $say_task}); + + # To minimize the trouble of a problem where the reboot needed flag isn't cleared, and so the system + # wants to repeatedly reboot, we need to add a delay to not let anvil-daemon ask us to + # reboot/power-off until the system uptime is more than ten minutes. + if (($uptime) && ($uptime < 600)) + { + # We'll wait until the system has been running for ten minutes. + my $difference = 600 - $uptime; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0224", variables => { + task => $task eq "poweroff" ? "#!string!log_0225!#" : "#!string!log_0226!#", + difference => $difference, + uptime => $uptime, + say_time => $anvil->Get->date_and_time({offset => $difference, time_only => 1}), + }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => $say_task}); + + sleep $difference; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0227", variables => { task => $task eq "poweroff" ? "#!string!log_0225!#" : "#!string!log_0226!#" }}); + } # If I don't have a job_uuid, try to find one. my $job_uuid = "";