anvil/tools/anvil-daemon

324 lines
12 KiB
Plaintext
Raw Normal View History

#!/usr/bin/perl
#
# This is the master daemon that manages all periodically run processes on Striker dashboards and Anvil!
# nodes.
#
# Exit codes;
# 0 = Normal exit
# 1 = md5sum of this program changed. Exited to reload.
#
# TODO:
#
use strict;
use warnings;
use Anvil::Tools;
use Proc::Simple;
# Work out this program's file name (used as the 'source' in log entries) and the directory it was started
# from. When '$0' has no directory part (ie: the program was started as 'perl anvil-daemon'), the first
# match fails; fall back to '$0' itself and to '.' so that neither value is left undefined.
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
$THIS_FILE = $0 if not defined $THIS_FILE;

# The file name is quoted with \Q...\E so that any regex meta-characters in it are matched literally.
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0];
$running_directory = "." if not defined $running_directory;

# Turn a relative directory into an absolute one, when we know the working directory.
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	if ($running_directory =~ /^\.(?:\/|$)/)
	{
		# '.' or './foo'; swap the leading dot for the working directory.
		$running_directory =~ s/^\./$ENV{PWD}/;
	}
	else
	{
		# '../foo' or '.hidden/foo'; these must be appended to the working directory, not spliced
		# into it (replacing only the first dot would give '$PWD./foo').
		$running_directory = $ENV{PWD}."/".$running_directory;
	}
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1});

# Connect to the database(s). If we have no connections, we'll proceed anyway as one of the 'run_once' tasks
# is to setup the database server.
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});

# There are some things we only want to run on (re)start and don't need to always run.
run_once($anvil);

# Calculate my sum so that we can exit if it changes later.
$anvil->Storage->record_md5sums;

# Disconnect. We'll reconnect inside the loop
$anvil->Database->disconnect;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});

# These are the things we always want running.
while(1)
{
	# Reconnect for this pass. The recurring tasks read and update the 'jobs' table, so they need a live
	# database handle; we dropped ours after the start-up tasks above.
	$anvil->Database->connect();
	
	# Run the recurring tasks.
	keep_running($anvil);
	
	# Release the connection(s) again until the next pass.
	$anvil->Database->disconnect;
	
	# Exit if called with '--run-once'
	if ($anvil->data->{switches}{'run-once'})
	{
		$anvil->nice_exit({code => 0});
	}
	
	# Has the file on disk changed? If so, exit with code 1 so that we get reloaded.
	if ($anvil->Storage->check_md5sums)
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "warn", key => "message_0014"});
		$anvil->nice_exit({code => 1});
	}
	
	# Sleep for 2s before the next pass.
	sleep 2;
}

# Not reached in normal operation (the loop above only ends via 'nice_exit'); kept as a safety net.
$anvil->nice_exit({code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
# One-shot start-up tasks. This is called once, before the main loop, each time the daemon starts (or is
# restarted / reloaded). It;
# - Calls 'anvil-prep-database' and logs any output it returns.
# - Reads the system uptime and, if the machine has been up for less than ten minutes, calls
#   'anvil-clear-reboot' to clear the reboot request.
# 
# Parameters;
# - $anvil - The Anvil::Tools object.
# 
# Always returns '0'.
sub run_once
{
	my ($anvil) = @_;
	
	# Make sure the database is ready.
	my $prep_call = $anvil->data->{path}{exe}{'anvil-prep-database'};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $prep_call }});
	
	my $prep_output = $anvil->System->call({shell_call => $prep_call, source => $THIS_FILE, line => __LINE__});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { database_output => $prep_output }}) if $prep_output;
	
	# Read the uptime file fresh from disk (no caching).
	my $raw_uptime = $anvil->Storage->read_file({
		file       => $anvil->data->{path}{proc}{uptime},
		cache      => 0,
		force_read => 1,
		debug      => 2,
	});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $raw_uptime }});
	
	# The file holds two numbers; the uptime in seconds (to two decimal places) and the total idle time.
	# Keep only the whole seconds of the first number.
	(my $uptime_seconds = $raw_uptime) =~ s/^(\d+)\..*$/$1/;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime_seconds }});
	
	# Nothing more to do unless we booted less than ten minutes ago.
	return(0) if not ($uptime_seconds < 600);
	
	# Freshly booted, so clear the reboot request.
	my $clear_output = $anvil->System->call({
		shell_call => $anvil->data->{path}{exe}{'anvil-clear-reboot'},
		debug      => 2,
		source     => $THIS_FILE,
		line       => __LINE__,
	});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $clear_output }});
	
	return(0);
}
# Recurring tasks. This is called on every pass of the main loop. It;
# - Checks the background job handles stored in 'jobs::handles' (set by 'run_jobs()') and, for each job
#   that is no longer running, logs its exit status, cleans up and forgets the handle and clears the job's
#   'job_picked_up_by' column via 'clear_job()'.
# - Calls 'update_state_file()'.
# - Calls 'run_jobs()'.
# 
# Parameters;
# - $anvil - The Anvil::Tools object.
# 
# Always returns '0'.
sub keep_running
{
	my ($anvil) = @_;
	
	# Check for jobs that were running and have now exited. The handles are stored by 'run_jobs()' under
	# 'jobs::handles', keyed by job UUID.
	my $handles = $anvil->data->{jobs}{handles};
	if (ref($handles) eq "HASH")
	{
		# Walk the keys only (iterating the hash itself would also yield the handle objects as if
		# they were UUIDs). 'keys' returns a copy of the key list, so deleting entries below is safe.
		foreach my $job_uuid (sort keys %{$handles})
		{
			my $handle = $handles->{$job_uuid};
			
			# If it's not a handle, delete it.
			if (not ref($handle))
			{
				delete $handles->{$job_uuid};
				next;
			}
			
			my $running = $handle->poll();
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { 
				"jobs::handles::${job_uuid}" => $handle, 
				running                      => $running, 
			}});
			next if $running;
			
			# It's not running anymore; record how it exited.
			my $exit_status = $handle->exit_status();
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { exit_status => $exit_status }});
			
			# Free up memory and forget the handle, so that this job is not processed again on
			# every later pass.
			$handle->cleanup();
			delete $handles->{$job_uuid};
			
			# Update the table to clear the 'job_picked_up_by' column.
			clear_job($anvil, $job_uuid);
		}
	}
	
	# Update hardware state files.
	update_state_file($anvil);
	
	# Run any pending jobs by calling 'anvil-jobs' with the 'job_uuid' as a background process.
	run_jobs($anvil);
	
	return(0);
}
# Marks a job as no longer being handled by any process; the job's 'job_picked_up_by' column is set to '0'
# and its 'modified_date' is set to the current database timestamp ('sys::database::timestamp').
# 
# Parameters;
# - $anvil    - The Anvil::Tools object.
# - $job_uuid - The 'job_uuid' of the row in the 'jobs' table to update.
# 
# Always returns '0'.
sub clear_job
{
	my ($anvil, $job_uuid) = @_;
	
	# Both values are passed through the database handle's 'quote' method before going into the query.
	my $dbh          = $anvil->data->{sys}{database}{use_handle};
	my $quoted_stamp = $dbh->quote($anvil->data->{sys}{database}{timestamp});
	my $quoted_uuid  = $dbh->quote($job_uuid);
	
	# Assemble the statement one line at a time. The empty first and last elements give the query its
	# leading and trailing newline.
	my $query = join("\n", 
		"", 
		"UPDATE", 
		"jobs", 
		"SET", 
		"job_picked_up_by = '0',", 
		"modified_date = ".$quoted_stamp, 
		"WHERE", 
		"job_uuid = ".$quoted_uuid, 
		"", 
	);
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
	
	$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
	
	return(0);
}
# This looks at every job assigned to this host. For each one;
# - A JSON entry is prepared if the job is unfinished, or if it finished and was last updated less than five
#   minutes ago.
# - Finished jobs (progress is '100') are otherwise ignored.
# - If 'job_picked_up_by' holds a PID that is still alive, the job is left alone.
# - If that PID is gone, an alert is logged and 'job_picked_up_by' is cleared.
# - The job's command is then started in the background with '--job-uuid <uuid>' appended, the handle is
#   stored in 'jobs::handles::<job_uuid>' and the new PID is recorded in 'job_picked_up_by'.
# 
# Parameters;
# - $anvil - The Anvil::Tools object.
# 
# Always returns '0'.
# 
# NOTE(review): The JSON document is assembled here but is not written anywhere by this function; writing
#               it out to the jobs.json file still needs to be implemented.
sub run_jobs
{
	my ($anvil) = @_;
	
	# We'll also update the jobs.json file. Entries are collected here and joined at the end so that no
	# trailing comma is left before the closing bracket.
	my @job_entries = ();
	
	# Renders a value as a JSON string, escaping quotes, backslashes and control characters.
	my $json_string = sub {
		my ($value) = @_;
		$value = "" if not defined $value;
		$value =~ s/\\/\\\\/g;
		$value =~ s/"/\\"/g;
		$value =~ s/\n/\\n/g;
		$value =~ s/\r/\\r/g;
		$value =~ s/\t/\\t/g;
		$value =~ s/([\x00-\x1f])/sprintf("\\u%04x", ord($1))/ge;
		return('"'.$value.'"');
	};
	
	# Get a list of this host's jobs. Every output column needs its own comma; without one, the next
	# name is treated as an alias for the previous column and the row comes back a column short.
	my $query = "
SELECT 
	job_uuid, 
	job_command, 
	job_data, 
	job_picked_up_by, 
	job_picked_up_at, 
	extract(epoch from job_picked_up_at), 
	job_updated, 
	extract(epoch from modified_date), 
	job_progress 
FROM 
	jobs 
WHERE 
	job_host_uuid = ".$anvil->data->{sys}{database}{use_handle}->quote($anvil->Get->host_uuid)."
;";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
	
	my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
	if (ref($results) ne "ARRAY")
	{
		# The query didn't give us a list of rows, so there is nothing we can do on this pass.
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { results => defined $results ? $results : "" }});
		return(0);
	}
	my $count = @{$results};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		results => $results, 
		count   => $count, 
	}});
	
	foreach my $row (@{$results})
	{
		# The indexes below follow the column order in the SELECT above.
		my $job_uuid         =         $row->[0]; 
		my $job_command      =         $row->[1]; 
		my $job_data         = defined $row->[2] ? $row->[2] : ""; 
		my $job_picked_up_by =         $row->[3]; 
		my $job_picked_up_at =         $row->[4]; 
		my $unix_picked_up   = defined $row->[5] ? $row->[5] : 0; 
		my $job_updated      =         $row->[6]; 
		my $unix_updated     = defined $row->[7] ? $row->[7] : 0; 
		my $job_progress     =         $row->[8]; 
		
		# Take one reading of the clock so that both ages are relative to the same moment.
		my $now                 = time;
		my $started_seconds_ago = $now - $unix_picked_up;
		my $updated_seconds_ago = $now - $unix_updated;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			job_uuid            => $job_uuid, 
			job_command         => $job_command, 
			job_data            => $job_data, 
			job_picked_up_by    => $job_picked_up_by, 
			job_picked_up_at    => $job_picked_up_at, 
			unix_picked_up      => $unix_picked_up, 
			job_updated         => $job_updated, 
			unix_updated        => $unix_updated, 
			job_progress        => $job_progress, 
			started_seconds_ago => $started_seconds_ago, 
			updated_seconds_ago => $updated_seconds_ago, 
		}});
		
		# Build this job's JSON entry. An ordered list of pairs is used so that the keys keep a 
		# stable order and each key appears exactly once.
		my @fields = (
			[job_uuid            => $job_uuid], 
			[job_command         => $job_command], 
			[job_data            => $job_data], 
			[job_picked_up_at    => $job_picked_up_at], 
			[job_updated         => $job_updated], 
			[job_progress        => $job_progress], 
			[started_seconds_ago => $started_seconds_ago], 
			[updated_seconds_ago => $updated_seconds_ago], 
		);
		my $job_entry = "{ ".join(", ", map { $json_string->($_->[0]).":".$json_string->($_->[1]) } @fields)." }";
		
		# If the job is done, only record it if it was last updated less than 5 minutes ago, then
		# move on; there is nothing to run.
		if ($job_progress eq "100")
		{
			push @job_entries, $job_entry if $updated_seconds_ago < 300;
			next;
		}
		
		# If we're here, the job isn't done. So first, record it.
		push @job_entries, $job_entry;
		
		# See if the job was picked up by another running instance.
		if ($job_picked_up_by)
		{
			# Check if the PID is still active.
			$anvil->System->pids({ignore_me => 1});
			
			### TODO: Add a check to verify the job isn't hung.
			# Skip if this job is in progress.
			next if exists $anvil->data->{pids}{$job_picked_up_by};
			
			# The previous job is gone, but the job isn't finished. Start it again.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "striker_warning_0007", variables => { 
				command => $job_command, 
				pid     => $job_picked_up_by, 
				percent => $job_progress, 
			}});
			clear_job($anvil, $job_uuid);
		}
		
		# Start the job, appending '--job-uuid' to the command. The returned handle is kept so that 
		# 'keep_running()' can later check whether the job has exited.
		$anvil->data->{jobs}{handles}{$job_uuid} = $anvil->System->call({
			debug       => 2, 
			background  => 1, 
			stdout_file => "/tmp/anvil.job.".$job_uuid.".stdout", 
			stderr_file => "/tmp/anvil.job.".$job_uuid.".stderr", 
			shell_call  => $job_command." --job-uuid ".$job_uuid, 
			source      => $THIS_FILE, 
			line        => __LINE__, 
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid} }});
		
		# Record the PID
		my $pid = $anvil->data->{jobs}{handles}{$job_uuid}->pid();
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pid => $pid }});
		
		my $update_query = "
UPDATE 
	jobs 
SET 
	job_picked_up_by = ".$anvil->data->{sys}{database}{use_handle}->quote($pid).", 
	modified_date    = ".$anvil->data->{sys}{database}{use_handle}->quote($anvil->data->{sys}{database}{timestamp})." 
WHERE 
	job_uuid         = ".$anvil->data->{sys}{database}{use_handle}->quote($job_uuid)."
";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $update_query }});
		$anvil->Database->write({query => $update_query, source => $THIS_FILE, line => __LINE__});
	}
	
	# Close the jobs file. (See the review note in the header; nothing consumes this yet.)
	my $jobs_file = "{\"jobs\":[\n".join(",\n", @job_entries)."\n]}\n";
	
	return(0);
}
# Runs the 'anvil-update-states' program and logs whatever it returns. Per the original author's note, that
# program scans the local machine's state (hardware and software) and writes it out to a file.
# 
# Parameters;
# - $anvil - The Anvil::Tools object.
# 
# Always returns '0'.
sub update_state_file
{
	my ($anvil) = @_;
	
	my $update_call = $anvil->data->{path}{exe}{'anvil-update-states'};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $update_call }});
	
	my $call_output = $anvil->System->call({shell_call => $update_call, source => $THIS_FILE, line => __LINE__});
	
	# Only log the output when there was some.
	return(0) if not $call_output;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { states_output => $call_output }});
	
	return(0);
}