* Fixed a major bug where anvil-daemon would reset the job_progress to 0 when clearing the 'reboot_needed' flag, causing anvil-daemon to pick the job up and again reboot, repeatedly.

* Updated Jobs->update_progress to take 'picked_up_by' as an optional parameter, defaulting to '$$' (the caller's PID).
* Created System->get_uptime() to return the current uptime in seconds.
* Added a delay to anvil-manage-power to not proceed with a reboot if the uptime is less than 600 seconds. This way, if any future bug causes an infinite reboot, there will be more time to determine what's wrong and debug the system between reboots.

Signed-off-by: Digimer <digimer@alteeve.ca>
main
Digimer 6 years ago
parent 46916b658b
commit 6f0bc0d86f
  1. 20
      Anvil/Tools/Job.pm
  2. 30
      Anvil/Tools/System.pm
  3. 4
      share/words.xml
  4. 48
      tools/anvil-daemon
  5. 38
      tools/anvil-manage-power

@ -210,6 +210,10 @@ This is the UUID of the job to update. If it isn't set, but C<< jobs::job_uuid >
If set, this message will be appended to C<< job_status >>. If set to 'C<< clear >>', previous records will be removed.
=head3 picked_up_by (optional, default '$$' (caller's PID))
If set, this is used for the C<< job_picked_up_by >> column. If it isn't set, the process ID of the caller is used.
=head3 progress (required)
This is a number to set the current progress to.
@ -224,13 +228,20 @@ sub update_progress
my $job_uuid = defined $parameter->{job_uuid} ? $parameter->{job_uuid} : "";
my $message = defined $parameter->{message} ? $parameter->{message} : "";
my $picked_up_by = defined $parameter->{picked_up_by} ? $parameter->{picked_up_by} : "";
my $progress = defined $parameter->{progress} ? $parameter->{progress} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
picked_up_by => $picked_up_by,
progress => $progress,
message => $message,
job_uuid => $job_uuid,
"jobs::job_uuid" => $anvil->data->{jobs}{job_uuid},
}});
if ($picked_up_by eq "")
{
$picked_up_by = $$;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { picked_up_by => $picked_up_by }});
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { job_uuid => $job_uuid }});
if ((not $job_uuid) && ($anvil->data->{jobs}{job_uuid}))
@ -269,14 +280,13 @@ sub update_progress
}
# Get the current job_status and append this new one.
my $job_picked_up_by = $$;
my $job_picked_up_at = 0;
my $job_status = "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { message => $message, job_picked_up_by => $job_picked_up_by }});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { message => $message, picked_up_by => $picked_up_by }});
if ($message eq "clear")
{
$job_picked_up_by = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { job_picked_up_by => $job_picked_up_by }});
$picked_up_by = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { picked_up_by => $picked_up_by }});
}
else
{
@ -363,7 +373,7 @@ WHERE
debug => $debug,
update_progress_only => 1,
job_uuid => $job_uuid,
job_picked_up_by => $job_picked_up_by,
job_picked_up_by => $picked_up_by,
job_picked_up_at => $job_picked_up_at,
job_progress => $progress,
job_status => $job_status,

@ -882,6 +882,36 @@ sub get_ips
return(0);
}
=head2 get_uptime
This returns, in seconds, how long the host has been up and running for.
This method takes no parameters.
=cut
sub get_uptime
{
my $self = shift;
my $parameter = shift;
my $anvil = $self->parent;
my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "System->_is_local()" }});
my $uptime = $anvil->Storage->read_file({
force_read => 1,
cache => 0,
file => $anvil->data->{path}{proc}{uptime},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uptime => $uptime }});
# Clean it up. We'll have gotten two numbers, the uptime in seconds (to two decimal places) and the
# total idle time. We only care about the int number.
$uptime =~ s/^(\d+)\..*$/$1/;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uptime => $uptime }});
return($uptime);
}
=head2 hostname
Get or set the local hostname. The current (or new) "static" (traditional) host name and the "pretty" (descriptive) host names are returned.

@ -372,6 +372,10 @@ The database connection error was:
<key name="log_0221">Writing: [#!variable!to_write!#] record(s) to resync the database on: [#!variable!host_name!#].</key>
<key name="log_0222">The connection to the database on: [#!variable!host!#] isn't established, trying again...</key>
<key name="log_0223">The connection to the database on: [#!variable!host!#] has been successfully established.</key>
<key name="log_0224">The system has only been running for: [#!variable!uptime!#] seconds. To minimize the impact of a bug causing a rapid reboot cycle, the request to: [#!variable!task!#] will be paused until the system has been running for at least ten minutes. We will proceed in: [#!variable!difference!#] seconds (at #!variable!say_time!#).</key>
<key name="log_0225">power off</key>
<key name="log_0226">reboot</key>
<key name="log_0227">Delay complete, proceeding with the #!variable!task!# operation now.</key>
<!-- Test words. Do NOT change unless you update 't/Words.t' or tests will needlessly fail. -->
<key name="t_0000">Test</key>

@ -207,19 +207,10 @@ sub boot_time_tasks
my ($anvil) = @_;
# If the uptime is less than ten minutes, clear the reboot flag.
my $uptime = $anvil->Storage->read_file({
force_read => 1,
cache => 0,
file => $anvil->data->{path}{proc}{uptime},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { uptime => $uptime }});
# Clean it up. We'll have gotten two numbers, the uptime in seconds (to two decimal places) and the
# total idle time. We only care about the int number.
$uptime =~ s/^(\d+)\..*$/$1/;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { uptime => $uptime }});
my $uptime = $anvil->System->get_uptime;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime }});
# Now find out if a reboot is needed and when it was last changed.
# Now find out if a reboot is listed as needed and when it was last changed.
my $reboot_needed = 0;
my $changed_seconds_ago = 0;
my $query = "
@ -235,9 +226,10 @@ AND
AND
variable_name = 'reboot::needed'
;";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
my $count = @{$results};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
results => $results,
count => $count,
}});
@ -246,7 +238,7 @@ AND
$reboot_needed = $results->[0]->[0];
$changed_seconds_ago = $results->[0]->[1];
$changed_seconds_ago =~ s/^(\d+)\..*$/$1/;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
reboot_needed => $reboot_needed,
changed_seconds_ago => $changed_seconds_ago,
}});
@ -255,35 +247,37 @@ AND
# If a reboot is needed, see if the uptime is less than the time since the reboot needed flag was
# set. If the uptime is less, then the system rebooted since it was requested so clear it. h/t to
# Lisa Seelye (@thedoh) for this idea!
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
reboot_needed => $reboot_needed,
changed_seconds_ago => $changed_seconds_ago,
uptime => $uptime,
my $difference = ($changed_seconds_ago - $uptime);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"s1:reboot_needed" => $reboot_needed,
"s2:changed_seconds_ago" => $changed_seconds_ago,
"s3:uptime" => $uptime,
"s4:difference" => $difference,
}});
if (($reboot_needed) && ($uptime < $changed_seconds_ago))
{
# Clear the reboot request.
$reboot_needed = $anvil->System->reboot_needed({set => 0});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { reboot_needed => $reboot_needed }});
$reboot_needed = $anvil->System->reboot_needed({debug => 2, set => 0});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }});
# Check to see if there was a reboot job in progress. If so, finish it off.
my $job_uuid = $anvil->Job->get_job_uuid({program => "anvil-manage-power"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { job_uuid => $job_uuid }});
my $job_uuid = $anvil->Job->get_job_uuid({debug => 2, program => "anvil-manage-power"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
if ($job_uuid)
{
# Update the percentage to '100' and then clear the old PID.
my $date_time = $anvil->Get->date_and_time();
if ($anvil->data->{jobs}{job_uuid})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { date_time => $date_time }});
$anvil->Job->update_progress({
debug => 2,
progress => 100,
message => "message_0064,!!date_and_time!".$date_time."!!",
job_uuid => $job_uuid
job_uuid => $job_uuid,
picked_up_by => 0,
});
}
$anvil->Job->clear({job_uuid => $job_uuid});
}
}
return(0);

@ -34,7 +34,7 @@ if (($running_directory =~ /^\./) && ($ENV{PWD}))
my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1});
$anvil->Storage->read_config({file => "/etc/anvil/anvil.conf"});
$anvil->Storage->read_config({file => $anvil->data->{path}{configs}{'anvil.conf'}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});
# Read switches
@ -45,6 +45,7 @@ $anvil->data->{switches}{'y'} = "";
$anvil->data->{switches}{'yes'} = "";
$anvil->data->{switches}{'reboot-needed'} = "";
$anvil->data->{switches}{'job-uuid'} = "";
$anvil->data->{switches}{'no-delay'} = "";
$anvil->Get->switches;
if ($anvil->data->{switches}{'power-off'})
@ -58,7 +59,7 @@ if ($anvil->data->{switches}{'yes'})
# Connect to DBs.
$anvil->Database->connect;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# No databases, exit.
@ -128,7 +129,7 @@ if ($anvil->data->{switches}{'reboot-needed'} eq "1")
# Enable
if (not $reboot_needed)
{
$reboot_needed = $anvil->System->reboot_needed({set => 1});
$reboot_needed = $anvil->System->reboot_needed({debug => 2, set => 1});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }});
print $anvil->Words->string({key => "message_0048"})."\n";
}
@ -143,7 +144,7 @@ elsif ($anvil->data->{switches}{'reboot-needed'} eq "0")
# Disabled
if ($reboot_needed)
{
$reboot_needed = $anvil->System->reboot_needed({set => 0});
$reboot_needed = $anvil->System->reboot_needed({debug => 2, set => 0});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }});
print $anvil->Words->string({key => "message_0050"})."\n";
}
@ -162,7 +163,7 @@ elsif ($anvil->data->{switches}{'reboot-needed'})
# Get the current state
if ($reboot_needed)
{
# Report that we're in mainteance mode
# Report that we need to reboot
print $anvil->Words->string({key => "message_0053"})."\n";
}
else
@ -185,9 +186,36 @@ sub do_poweroff
my ($anvil, $task) = @_;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { task => $task }});
# We'll wait until the system has at least 10 minutes of uptime, unless '--no-wait' was given.
my $uptime = $anvil->data->{switches}{'no-wait'} ? 0 : $anvil->System->get_uptime;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"switches::no-wait" => $anvil->data->{switches}{'no-delay'},
uptime => $uptime,
}});
my $say_task = $task eq "poweroff" ? "message_0062" : "message_0063";
my $percent = $task eq "poweroff" ? 100 : 50;
print $anvil->Words->string({key => $say_task})."\n";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => $say_task});
# To minimize the trouble of a problem where the reboot needed flag isn't cleared, and so the system
# wants to repeatedly reboot, we need to add a delay to not let anvil-daemon ask us to
# reboot/power-off until the system uptime is more than ten minutes.
if (($uptime) && ($uptime < 600))
{
# We'll wait until the system has been running for ten minutes.
my $difference = 600 - $uptime;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0224", variables => {
task => $task eq "poweroff" ? "#!string!log_0225!#" : "#!string!log_0226!#",
difference => $difference,
uptime => $uptime,
say_time => $anvil->Get->date_and_time({offset => $difference, time_only => 1}),
}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => $say_task});
sleep $difference;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0227", variables => { task => $task eq "poweroff" ? "#!string!log_0225!#" : "#!string!log_0226!#" }});
}
# If I don't have a job_uuid, try to find one.
my $job_uuid = "";

Loading…
Cancel
Save