#!/usr/bin/perl
# 
# This is the master daemon that manages all periodically run processes on Striker dashboards and Anvil! 
# nodes.
# 
# Exit codes;
# 0 = Normal exit
# 1 = md5sum of this program changed. Exited to reload.
# 
# TODO: 
# 

use strict;
use warnings;
use Anvil::Tools;
use Proc::Simple;
use Scalar::Util qw(blessed);

# Work out the program name and the directory it was started from. If '$0' has no directory part, fall back
# to the bare program name and the current directory instead of working with undefined values.
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
$THIS_FILE = $0 if not defined $THIS_FILE;
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0];
$running_directory = "." if not defined $running_directory;
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1});

# Connect to the database(s). If we have no connections, we'll proceed anyway as one of the 'run_once' tasks
# is to setup the database server.
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});

# There are some things we only want to run on (re)start and don't need to always run.
run_once($anvil);

# Calculate my sum so that we can exit if it changes later.
$anvil->Storage->record_md5sums;

# Disconnect. We'll reconnect inside the loop
$anvil->Database->disconnect;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});

# These are the things we always want running.
while(1)
{
	# Reconnect for this pass. The connection was closed above (and is closed again at the end of every
	# pass), while the job handling in 'keep_running' needs the database handle.
	$anvil->Database->connect();
	
	# Loop and sleep for 2s.
	keep_running($anvil);
	
	# Exit if called with '--run-once'
	if ($anvil->data->{switches}{'run-once'})
	{
		$anvil->nice_exit({code => 0});
	}
	
	# Has the file on disk changed?
	if ($anvil->Storage->check_md5sums)
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "warn", key => "message_0014"});
		$anvil->nice_exit({code => 1});
	}
	
	# Disconnect before sleeping; we reconnect at the top of the next pass.
	$anvil->Database->disconnect;
	
	# Sleep now.
	sleep 2;
}

# The loop above only ends via 'nice_exit', so this is not expected to be reached. It is kept as a safety
# net.
$anvil->nice_exit({code => 0});


#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################

# These are tools that don't need to constantly run. They'll typically run when the server starts up or the 
# daemon is restarted or reloaded.
# 
# Parameters;
#   $anvil - The Anvil::Tools object.
# 
# Returns '0'.
sub run_once
{
	my ($anvil) = @_;
	
	# Check that the database is ready.
	my $shell_call = $anvil->data->{path}{exe}{'anvil-prep-database'};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }});
	
	my $database_output = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
	if ($database_output)
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { database_output => $database_output }});
	}
	
	# If the uptime is less than ten minutes, clear the reboot flag.
	my $uptime = $anvil->Storage->read_file({
		debug      => 2,
		force_read => 1,
		cache      => 0,
		file       => $anvil->data->{path}{proc}{uptime},
	});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime }});
	
	# Clean it up. We'll have gotten two numbers, the uptime in seconds (to two decimal places) and the 
	# total idle time. We only care about the int number.
	$uptime =~ s/^(\d+)\..*$/$1/;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime }});
	
	if ($uptime < 600)
	{
		# Clear the reboot request.
		my $output = $anvil->System->call({
			debug      => 2,
			shell_call => $anvil->data->{path}{exe}{'anvil-clear-reboot'},
			source     => $THIS_FILE,
			line       => __LINE__,
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output }});
	}
	
	return(0);
}

# These are tools that need to keep running. Each pass reaps background jobs that have exited, refreshes the
# state file and then starts any pending jobs.
# 
# Parameters;
#   $anvil - The Anvil::Tools object.
# 
# Returns '0'.
sub keep_running
{
	my ($anvil) = @_;
	
	# Check for jobs that were running and now exited. The handles are stored by 'run_jobs' under 
	# 'jobs::handles::<job_uuid>'.
	if (exists $anvil->data->{jobs}{handles})
	{
		# Walk a copy of the key list so that entries can be deleted safely while looping.
		foreach my $job_uuid (sort keys %{$anvil->data->{jobs}{handles}})
		{
			my $handle = $anvil->data->{jobs}{handles}{$job_uuid};
			
			# If it's not a handle, delete it.
			if ((not blessed($handle)) or (not $handle->can('poll')))
			{
				delete $anvil->data->{jobs}{handles}{$job_uuid};
				next;
			}
			
			my $running = $handle->poll();
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { 
				"jobs::handles::${job_uuid}" => $handle,
				running                      => $running,
			}});
			
			# If it's not running, update the table to clear the 'job_picked_up_by' column.
			if (not $running)
			{
				my $exit_status = $handle->exit_status();
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { exit_status => $exit_status }});
				
				# Free up memory
				$handle->cleanup();
				
				clear_job($anvil, $job_uuid);
				
				# Forget the handle so that this job isn't polled and cleared again on 
				# the next pass.
				delete $anvil->data->{jobs}{handles}{$job_uuid};
			}
		}
	}
	
	# Update hardware state files.
	update_state_file($anvil);
	
	# Run any pending jobs by calling 'anvil-jobs' with the 'job_uuid' as a background process.
	run_jobs($anvil);
	
	return(0);
}

# This clears the 'job_picked_up_by' (sets it to '0') for the given job and updates its 'modified_date'.
# 
# Parameters;
#   $anvil    - The Anvil::Tools object.
#   $job_uuid - The 'job_uuid' of the row in 'jobs' to update.
# 
# Returns '0'.
sub clear_job
{
	my ($anvil, $job_uuid) = @_;
	
	my $query = "
UPDATE 
    jobs 
SET 
    job_picked_up_by = '0', 
    modified_date    = ".$anvil->data->{sys}{database}{use_handle}->quote($anvil->data->{sys}{database}{timestamp})." 
WHERE 
    job_uuid         = ".$anvil->data->{sys}{database}{use_handle}->quote($job_uuid)." 
";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
	$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
	
	return(0);
}

# This escapes a value so that it can be placed between double-quotes in a JSON document. An undefined value
# is treated as an empty string.
sub json_escape
{
	my ($string) = @_;
	
	$string = "" if not defined $string;
	$string =~ s/\\/\\\\/g;
	$string =~ s/"/\\"/g;
	$string =~ s/\n/\\n/g;
	$string =~ s/\r/\\r/g;
	$string =~ s/\t/\\t/g;
	$string =~ s/([\x00-\x1f])/sprintf("\\u%04x", ord($1))/ge;
	
	return($string);
}

# This takes a hash reference describing one job and returns it as a JSON object string (without a trailing
# comma or newline).
sub job_json_entry
{
	my ($job) = @_;
	
	my @fields = qw(job_uuid job_command job_data job_picked_up_at job_updated job_progress started_seconds_ago updated_seconds_ago);
	my @pairs  = map { "\"".$_."\":\"".json_escape($job->{$_})."\"" } @fields;
	
	return("{ ".join(", ", @pairs)." }");
}

# This will check for any jobs that aren't at 100%. For each found, if 'picked_up_by' is set, a check is made
# to see if the PID is still alive. If it isn't, or if 'picked_up_by' is not set, the appropriate tool is 
# invoked to handle it.
# 
# Parameters;
#   $anvil - The Anvil::Tools object.
# 
# Returns '0'.
sub run_jobs
{
	my ($anvil) = @_;
	
	# We'll also build the body of the jobs.json file; one entry per job to record.
	my @job_entries = ();
	
	# Get a list of pending or incomplete jobs. The order of the columns here must match the row indexes 
	# read in the loop below.
	my $query = "
SELECT 
    job_uuid, 
    job_command, 
    job_data, 
    job_picked_up_by, 
    job_picked_up_at, 
    extract(epoch from job_picked_up_at), 
    job_updated, 
    extract(epoch from modified_date), 
    job_progress 
FROM 
    jobs 
WHERE 
    job_host_uuid = ".$anvil->data->{sys}{database}{use_handle}->quote($anvil->Get->host_uuid)." 
;";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
	
	my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
	my $count   = @{$results};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		results => $results, 
		count   => $count,
	}});
	foreach my $row (@{$results})
	{
		my $job_uuid            =         $row->[0]; 
		my $job_command         =         $row->[1]; 
		my $job_data            = defined $row->[2] ? $row->[2] : ""; 
		my $job_picked_up_by    =         $row->[3]; 
		my $job_picked_up_at    =         $row->[4]; 
		my $unix_picked_up      = defined $row->[5] ? $row->[5] : 0; 
		my $job_updated         =         $row->[6]; 
		my $unix_updated        = defined $row->[7] ? $row->[7] : 0; 
		my $job_progress        = defined $row->[8] ? $row->[8] : 0; 
		my $now                 = time;
		my $started_seconds_ago = $now - $unix_picked_up;
		my $updated_seconds_ago = $now - $unix_updated;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			job_uuid            => $job_uuid, 
			job_command         => $job_command, 
			job_data            => $job_data, 
			job_picked_up_by    => $job_picked_up_by, 
			job_picked_up_at    => $job_picked_up_at, 
			unix_picked_up      => $unix_picked_up, 
			job_updated         => $job_updated, 
			unix_updated        => $unix_updated, 
			job_progress        => $job_progress, 
			started_seconds_ago => $started_seconds_ago, 
			updated_seconds_ago => $updated_seconds_ago, 
		}});
		
		my $json_entry = job_json_entry({
			job_uuid            => $job_uuid, 
			job_command         => $job_command, 
			job_data            => $job_data, 
			job_picked_up_at    => $job_picked_up_at, 
			job_updated         => $job_updated, 
			job_progress        => $job_progress, 
			started_seconds_ago => $started_seconds_ago, 
			updated_seconds_ago => $updated_seconds_ago, 
		});
		
		# If the job is done, see if it was recently enough to record in the jobs.json file.
		if ($job_progress eq "100")
		{
			# Record in JSON if it was last updated less than 5 minutes ago.
			if ($updated_seconds_ago < 300)
			{
				push @job_entries, $json_entry;
			}
			next;
		}
		
		# If we're here, the job isn't done. So first, record it.
		push @job_entries, $json_entry;
		
		# See if the job was picked up by another running instance.
		if ($job_picked_up_by)
		{
			# Check if the PID is still active.
			$anvil->System->pids({ignore_me => 1});
			
			### TODO: Add a check to verify the job isn't hung.
			# Skip if this job is in progress.
			next if exists $anvil->data->{pids}{$job_picked_up_by};
			
			# The previous job is gone, but the job isn't finished. Start it again.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "striker_warning_0007", variables => { 
				command => $job_command, 
				pid     => $job_picked_up_by, 
				percent => $job_progress, 
			}});
			clear_job($anvil, $job_uuid);
		}
		
		# Start the job, appending '--job-uuid' to the command.
		$anvil->data->{jobs}{handles}{$job_uuid} = $anvil->System->call({
			debug       => 2,
			background  => 1, 
			stdout_file => "/tmp/anvil.job.".$job_uuid.".stdout",
			stderr_file => "/tmp/anvil.job.".$job_uuid.".stderr",
			shell_call  => $job_command." --job-uuid ".$job_uuid, 
			source      => $THIS_FILE, 
			line        => __LINE__,
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid} }});
		
		# Record the PID
		my $pid = $anvil->data->{jobs}{handles}{$job_uuid}->pid();
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pid => $pid }});
		
		my $update_query = "
UPDATE 
    jobs 
SET 
    job_picked_up_by = ".$anvil->data->{sys}{database}{use_handle}->quote($pid).", 
    modified_date    = ".$anvil->data->{sys}{database}{use_handle}->quote($anvil->data->{sys}{database}{timestamp})." 
WHERE 
    job_uuid         = ".$anvil->data->{sys}{database}{use_handle}->quote($job_uuid)." 
";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $update_query }});
		$anvil->Database->write({query => $update_query, source => $THIS_FILE, line => __LINE__});
	}
	
	# Assemble the jobs file. Joining the entries (rather than appending a comma after each one) keeps 
	# the JSON valid.
	### TODO: The assembled body is not written out anywhere yet; the target path needs to be confirmed.
	my $jobs_file = "{\"jobs\":[\n".join(",\n", @job_entries)."\n]}\n";
	
	return(0);
}

# This calls 'anvil-update-states' which will scan the local machine's state (hardware and software) and 
# write it out to an HTML file.
# 
# Parameters;
#   $anvil - The Anvil::Tools object.
# 
# Returns '0'.
sub update_state_file
{
	my ($anvil) = @_;
	
	my $shell_call = $anvil->data->{path}{exe}{'anvil-update-states'};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }});
	
	my $states_output = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
	if ($states_output)
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { states_output => $states_output }});
	}
	
	return(0);
}