anvil/tools/anvil-daemon
Digimer 07c3b405ad * Starting work on adding "Install Target" function (will likely rename this, but basic same function as IT in m2).
* Added 'sys::database::failed_connection_log_level' to allow silencing of log messages when a Striker peer database is not available.
* Started updating the .spec for the new release to add supported packages needed for PXE/dhcp/tftpboot.
* Added to repo tftpboot files as pulling them out of the packages and moving them into the right place relative to the modest size of adding them directly to our source wasn't justified.
* Created the still very very early 'tools/anvil-manage-firewall' tool.

Signed-off-by: Digimer <digimer@alteeve.ca>
2018-10-10 19:43:23 -04:00

571 lines
21 KiB
Perl
Executable File

#!/usr/bin/perl
#
# This is the master daemon that manages all periodically run processes on Striker dashboards and Anvil!
# nodes.
#
# Exit codes;
# 0 = Normal exit
# 1 = md5sum of this program changed. Exited to reload.
# 2 = Unable to connect to any database, even after trying to initialize the local system.
#
# TODO:
# - Need to check what kind of machine this is and not prep the database unless its a dashboard.
# - Add a "running: pending,yes,done,dead" and show an appropriate icon beside jobs
# - Decide if holding before the main loop until 'systemctl is-system-running' returns 'running' is a good
# idea or not.
#
# NOTE:
# - For later; 'reboot --force --force' immediately kills the OS, like disabling ACPI on EL6 and hitting the power button. Might be useful in ScanCore down the road.
#
use strict;
use warnings;
use Anvil::Tools;
use Proc::Simple;
#use Time::HiRes qw ( time sleep );
use JSON;
use HTML::Strip;
use HTML::FromText;
use Data::Dumper;
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
$running_directory =~ s/^\./$ENV{PWD}/;
}
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});
# Connect to the database(s). If we have no connections, we'll proceed anyway as one of the 'run_once' tasks
# is to setup the database server.
$anvil->Database->connect({debug => 3, check_if_configured => 1});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
# If I have no databases, sleep for a second and then exit (systemd will restart us).
if (not $anvil->data->{sys}{database}{connections})
{
# Try to configure the local database, and then try to connect again.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0201"});
prep_database($anvil);
sleep 1;
# Try connecting again
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# Still nothing, sleep and exit.
print $anvil->Words->string({key => "error_0003"})."\n";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, key => "error_0003"});
$anvil->nice_exit({exit_code => 2});
}
}
# Read switches
$anvil->data->{switches}{'refresh-json'} = "";
$anvil->data->{switches}{'run-once'} = 0;
$anvil->data->{switches}{'main-loop-only'} = 0;
$anvil->data->{switches}{'no-start'} = 0;
$anvil->data->{switches}{'startup-only'} = 0;
$anvil->Get->switches;
if ($anvil->data->{switches}{'refresh-json'})
{
$anvil->data->{switches}{'run-once'} = 1;
$anvil->data->{switches}{'main-loop-only'} = 1;
$anvil->data->{switches}{'no-start'} = 1;
}
# There are some things we only want to run on (re)start and don't need to always run.
run_once($anvil) if not $anvil->data->{switches}{'main-loop-only'};
# Calculate my sum so that we can exit if it changes later.
$anvil->Storage->record_md5sums;
# Disconnect. We'll reconnect inside the loop
$anvil->Database->disconnect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0203"});
# This will prevent restarting while jobs are running.
$anvil->data->{sys}{jobs_running} = 0;
# Once a minute, we'll check the md5sums and see if we should restart. We don't check every loop as it places a
my $system_check = 60;
my $now_time = time;
my $next_check = $now_time + $system_check;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
"s1:system_check" => $system_check,
"s2:now_time" => $now_time,
"s3:next_check" => $next_check,
}});
# When we periodically check if system files have changed, we'll also ask Database>connect() to check if it
# needs to be configured or updated. This is done periodically as it is expensive to run on every loop.
my $check_if_database_is_configured = 0;
# These are the things we always want running.
while(1)
{
# Reload defaults, re-read the config and then connect to the database(s)
$anvil->_set_paths();
$anvil->_set_defaults();
$anvil->Storage->read_config({force_read => 1, file => $anvil->data->{path}{configs}{'anvil.conf'}});
$anvil->Database->connect({check_if_configured => $check_if_database_is_configured});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
# Mark that we don't want to check the database now.
$check_if_database_is_configured = 0;
if ($anvil->data->{sys}{database}{connections})
{
# Run the normal tasks
keep_running($anvil);
}
else
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, key => "log_0202"});
}
# Exit if called with '--run-once'
if ($anvil->data->{switches}{'run-once'})
{
$anvil->nice_exit({code => 0});
}
# Has the file on disk changed?
$now_time = time;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
"s1:now_time" => $now_time,
"s2:next_check" => $next_check,
}});
if ($now_time >= $next_check)
{
# Even if it is time to check, don't if a job is running.
if ((not $anvil->data->{sys}{jobs_running}) && ($anvil->Storage->check_md5sums))
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "warn", key => "message_0014"});
$anvil->nice_exit({code => 1});
}
# Mark that we want to check the database config next time.
$check_if_database_is_configured = 1;
# Update the next check time.
$next_check = $now_time + $system_check;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
"s1:system_check" => $system_check,
"s2:next_check" => $next_check,
}});
}
# Exit if 'run-once' selected.
if ($anvil->data->{switches}{'run-once'})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "warn", key => "message_0055"});
$anvil->nice_exit({code => 0});
}
# Disconnect from the database(s) and sleep now.
$anvil->Database->disconnect();
sleep(1);
}
$anvil->nice_exit({code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
# These are tools that don't need to constantly run. They'll typically run when the server starts up or the
# daemon is restarted or reloaded.
sub run_once
{
my ($anvil) = @_;
# Check that the database is ready.
prep_database($anvil);
# Check to see if we need to do boot-time tasks. We only run these if we've just booted
boot_time_tasks($anvil);
if ($anvil->data->{switches}{'startup-only'})
{
$anvil->nice_exit({code => 0});
}
return(0);
}
# This handles tasks that need to run on boot (if any)
sub boot_time_tasks
{
my ($anvil) = @_;
# If the uptime is less than ten minutes, clear the reboot flag.
my $uptime = $anvil->System->get_uptime;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime }});
# Now find out if a reboot is listed as needed and when it was last changed.
my $reboot_needed = 0;
my $changed_seconds_ago = 0;
my $query = "
SELECT
variable_value,
(SELECT extract(epoch from now()) - extract(epoch from modified_date)) AS changed_seconds_ago
FROM
variables
WHERE
variable_source_table = 'hosts'
AND
variable_source_uuid = ".$anvil->data->{sys}{database}{use_handle}->quote($anvil->data->{sys}{host_uuid})."
AND
variable_name = 'reboot::needed'
;";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
my $count = @{$results};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
results => $results,
count => $count,
}});
if ($count)
{
$reboot_needed = $results->[0]->[0];
$changed_seconds_ago = $results->[0]->[1];
$changed_seconds_ago =~ s/^(\d+)\..*$/$1/;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
reboot_needed => $reboot_needed,
changed_seconds_ago => $changed_seconds_ago,
}});
}
# If a reboot is needed, see if the uptime is less than the time since the reboot needed flag was
# set. If the uptime is less, then the system rebooted since it was requested so clear it. h/t to
# Lisa Seelye (@thedoh) for this idea!
my $difference = ($changed_seconds_ago - $uptime);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"s1:reboot_needed" => $reboot_needed,
"s2:changed_seconds_ago" => $changed_seconds_ago,
"s3:uptime" => $uptime,
"s4:difference" => $difference,
}});
if (($reboot_needed) && ($uptime < $changed_seconds_ago))
{
# Clear the reboot request.
$reboot_needed = $anvil->System->reboot_needed({debug => 2, set => 0});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }});
# Check to see if there was a reboot job in progress. If so, finish it off.
my $job_uuid = $anvil->Job->get_job_uuid({debug => 2, program => "anvil-manage-power"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
if ($job_uuid)
{
# Update the percentage to '100' and then clear the old PID.
my $date_time = $anvil->Get->date_and_time();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { date_time => $date_time }});
$anvil->Job->update_progress({
debug => 2,
progress => 100,
message => "message_0064,!!date_and_time!".$date_time."!!",
job_uuid => $job_uuid,
picked_up_by => 0,
});
}
}
return(0);
}
# Configure the local database, if needed.
sub prep_database
{
my ($anvil) = @_;
my $shell_call = $anvil->data->{path}{exe}{'anvil-prep-database'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }});
my $database_output = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
if ($database_output)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { database_output => $database_output }});
}
return(0);
}
# These are tools that need to keep running.
sub keep_running
{
my ($anvil) = @_;
# Check for jobs that were running and now exited.
if (exists $anvil->data->{processes})
{
foreach my $job_uuid (%{$anvil->data->{jobs}{handles}})
{
# If it's not a handle, delete it.
my $running = $anvil->data->{jobs}{handles}{$job_uuid}->poll();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid},
running => $running,
}});
# If it's not running, update the table to clear the 'job_picked_up_by' column.
if (not $running)
{
my $exit_status = $anvil->data->{jobs}{handles}{$job_uuid}->exit_status();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => {
job_uuid => $job_uuid,
exit_status => $exit_status,
}});
# Free up memory
$anvil->data->{jobs}{handles}{$job_uuid}->cleanup();
$anvil->Job->clear({job_uuid => $job_uuid});
}
}
}
# Update hardware state files if the system isn't configured. Running it always is too intensive.
my $configured = $anvil->System->check_if_configured;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { configured => $configured }});
if (not $configured)
{
update_state_file($anvil);
}
# Run any pending jobs by calling 'anvil-jobs' with the 'job_uuid' as a background process.
run_jobs($anvil);
return(0);
}
# This will check for any jobs that aren't at 100%. For each found, if 'picked_up_by' is set, a check is made
# to see if the PID is still alive. If it isn't, or if 'picked_up_by' is not set, the appropriate tool is
# invoked to handle it.
sub run_jobs
{
my ($anvil) = @_;
# This will be set to 1 if any jobs are not complete, preventing a restart of the daemon if it's
# changed on disk.
$anvil->data->{sys}{jobs_running} = 0;
# We'll also update the jobs.json file.
my $jobs_file = "{\"jobs\":[\n";
# Get a list of pending or incomplete jobs.
my $return = $anvil->Database->get_jobs({debug => 3, ended_within => 300});
my $count = @{$return};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
'return' => $return,
count => $count,
}});
foreach my $hash_ref (@{$return})
{
my $job_uuid = $hash_ref->{job_uuid};
my $job_command = $hash_ref->{job_command};
my $job_data = $hash_ref->{job_data};
my $job_picked_up_by = $hash_ref->{job_picked_up_by};
my $job_picked_up_at = $hash_ref->{job_picked_up_at};
my $job_updated = $hash_ref->{job_updated};
my $job_name = $hash_ref->{job_name};
my $job_progress = $hash_ref->{job_progress};
my $job_title = $hash_ref->{job_title};
my $job_description = $hash_ref->{job_description};
my $job_status = $hash_ref->{job_status};
my $started_seconds_ago = $job_picked_up_at ? (time - $job_picked_up_at) : 0;
my $updated_seconds_ago = $job_updated ? (time - $job_updated) : 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
job_uuid => $job_uuid,
job_command => $job_command,
job_data => $job_data,
job_picked_up_by => $job_picked_up_by,
job_picked_up_at => $job_picked_up_at,
job_updated => $job_updated,
job_name => $job_name,
job_progress => $job_progress,
job_title => $job_title,
job_description => $job_description,
job_status => $job_status,
started_seconds_ago => $started_seconds_ago,
updated_seconds_ago => $updated_seconds_ago,
}});
if ($job_progress ne "100")
{
$anvil->data->{sys}{jobs_running} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "sys::jobs_running" => $anvil->data->{sys}{jobs_running} }});
}
# See if the job was picked up by a now-dead instance.
if ($job_picked_up_by)
{
# Check if the PID is still active.
$anvil->System->pids({ignore_me => 1});
### TODO: Add a check to verify the job isn't hung.
# Skip if this job is in progress.
if (not exists $anvil->data->{pids}{$job_picked_up_by})
{
# If the job is done, just clear the 'job_picked_up_by' and be done.
if ($job_progress ne "100")
{
# It's possible that the job updated to 100% and exited after we
# gathered the job data, so we won't restart until we've seen it not
# running and not at 100% after 5 loops.
if ((not exists $anvil->data->{lost_job_count}{$job_uuid}) or (not defined $anvil->data->{lost_job_count}{$job_uuid}))
{
$anvil->data->{lost_job_count}{$job_uuid} = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }});
}
if ($anvil->data->{lost_job_count}{$job_uuid} > 5)
{
# The previous job is gone, but the job isn't finished. Start it again.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "striker_warning_0007", variables => {
command => $job_command,
pid => $job_picked_up_by,
percent => $job_progress,
}});
# Clear some variables.
$job_progress = 0;
$job_status = "message_0056";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
job_progress => $job_progress,
job_status => $job_status,
}});
# Clear the job.
$anvil->Job->clear({debug => 3, job_uuid => $job_uuid});
$anvil->data->{lost_job_count}{$job_uuid} = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }});
}
else
{
$anvil->data->{lost_job_count}{$job_uuid}++;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }});
}
}
# Clear the PID
$job_picked_up_by = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { job_picked_up_by => $job_picked_up_by }});
}
}
# Convert the double-banged strings into a proper message.
my $say_title = $job_title ? $anvil->Words->parse_banged_string({key_string => $job_title}) : "";
my $say_description = $job_description ? $anvil->Words->parse_banged_string({key_string => $job_description}) : "";
my $say_status = $job_status ? $anvil->Words->parse_banged_string({key_string => $job_status}) : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
job_title => $job_title,
say_description => $say_description,
say_status => $say_status,
}});
# Make the status HTML friendly. Strip any embedded HTML then encode the text string.
if ($say_status)
{
my $html_strip = HTML::Strip->new();
$say_status = $html_strip->parse($say_status);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { say_status => $say_status }});
# Now make the resulting text string HTML friendly
my $text_to_html = HTML::FromText->new({
urls => 1,
email => 1,
lines => 1,
});
$say_status = $text_to_html->parse($say_status);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { say_status => $say_status }});
}
# Add this to the jobs.json file
my $json_string = to_json ({
job_uuid => $job_uuid,
job_command => $job_command,
job_data => $job_data,
job_picked_up_at => $job_picked_up_at,
job_updated => $job_updated,
job_name => $job_name,
job_progress => $job_progress,
job_title => $say_title,
job_description => $say_description,
job_status => $say_status,
started_seconds_ago => $started_seconds_ago,
updated_seconds_ago => $updated_seconds_ago,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { json_string => $json_string }});
$jobs_file .= $json_string.",\n";
# If the job is done, move on.
next if $job_progress eq "100";
# If the job is not running, start it.
if ((not $job_picked_up_by) && ($job_progress ne "100") && (not $anvil->data->{switches}{'no-start'}))
{
# Start the job, appending '--job-uuid' to the command.
my $command = $job_command." --job-uuid ".$job_uuid;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0210", variables => { command => $command }});
$anvil->data->{jobs}{handles}{$job_uuid} = $anvil->System->call({
debug => 3,
background => 1,
stdout_file => "/tmp/anvil.job.".$job_uuid.".stdout",
stderr_file => "/tmp/anvil.job.".$job_uuid.".stderr",
shell_call => $command,
source => $THIS_FILE,
line => __LINE__,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid} }});
# Log the PID (the job should update the database).
my $pid = $anvil->data->{jobs}{handles}{$job_uuid}->pid();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pid => $pid }});
}
}
# Close the jobs file.
$jobs_file =~ s/,\n$/\n/ms;
$jobs_file .= "]}\n";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { jobs_file => $jobs_file }});
# Write the JSON file
my $output_json = $anvil->data->{path}{directories}{html}."/status/jobs.json";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output_xml => $output_json }});
$anvil->Storage->write_file({
file => $output_json,
body => $jobs_file,
overwrite => 1,
mode => "0644",
user => "apache",
group => "apache"
});
return(0);
}
# This calls 'anvil-update-states' which will scan the local machine's state (hardware and software) and
# record write it out to an HTML file
sub update_state_file
{
my ($anvil) = @_;
my $states_output = $anvil->System->call({debug => 3, shell_call => $anvil->data->{path}{exe}{'anvil-update-states'}, source => $THIS_FILE, line => __LINE__});
if ($states_output)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { states_output => $states_output }});
}
return(0);
}