Local modifications to ClusterLabs/Anvil by Alteeve
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

510 lines
18 KiB

#!/usr/bin/perl
#
# This is the master daemon that manages all periodically run processes on Striker dashboards and Anvil!
# nodes.
#
# Exit codes;
# 0 = Normal exit
# 1 = md5sum of this program changed. Exited to reload.
# 2 = Unable to connect to any database, even after trying to initialize the local system.
#
# TODO:
# - Need to check what kind of machine this is and not prep the database unless its a dashboard.
# - Add a "running: pending,yes,done,dead" and show an appropriate icon beside jobs
# - Decide if holding before the main loop until 'systemctl is-system-running' returns 'running' is a good
# idea or not.
#
# NOTE:
# - For later; 'reboot --force --force' immediately kills the OS, like disabling ACPI on EL6 and hitting the power button. Might be useful in ScanCore down the road.
#
use strict;
use warnings;
use Anvil::Tools;
use Proc::Simple;
use JSON;
use HTML::Strip;
use HTML::FromText;
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
$running_directory =~ s/^\./$ENV{PWD}/;
}
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1});
# Connect to the database(s). If we have no connections, we'll proceed anyway as one of the 'run_once' tasks
# is to setup the database server.
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});
# If I have no databases, sleep for a second and then exit (systemd will restart us).
if (not $anvil->data->{sys}{database}{connections})
{
# Try to configure the local database, and then try to connect again.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0201"});
prep_database($anvil);
sleep 1;
# Try connecting again
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# Still nothing, sleep and exit.
print $anvil->Words->string({key => "error_0003"})."\n";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, key => "error_0003"});
$anvil->nice_exit({exit_code => 2});
}
}
# Read switches
$anvil->data->{switches}{'refresh-json'} = "";
$anvil->data->{switches}{'run-once'} = 0;
$anvil->data->{switches}{'main-loop-only'} = 0;
$anvil->data->{switches}{'no-start'} = 0;
$anvil->data->{switches}{'startup-only'} = 0;
$anvil->Get->switches;
if ($anvil->data->{switches}{'refresh-json'})
{
$anvil->data->{switches}{'run-once'} = 1;
$anvil->data->{switches}{'main-loop-only'} = 1;
$anvil->data->{switches}{'no-start'} = 1;
}
# There are some things we only want to run on (re)start and don't need to always run.
run_once($anvil) if not $anvil->data->{switches}{'main-loop-only'};
# Calculate my sum so that we can exit if it changes later.
$anvil->Storage->record_md5sums;
# Disconnect. We'll reconnect inside the loop
$anvil->Database->disconnect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0203"});
# This will prevent restarting while jobs are running.
$anvil->data->{sys}{jobs_running} = 0;
# These are the things we always want running.
while(1)
{
# Connect to the database(s)
$anvil->Storage->read_config({file => "/etc/anvil/anvil.conf"});
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});
if ($anvil->data->{sys}{database}{connections})
{
# Loop and sleep for 2s.
keep_running($anvil);
}
else
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, key => "log_0202"});
}
# Exit if called with '--run-once'
if ($anvil->data->{switches}{'run-once'})
{
$anvil->nice_exit({code => 0});
}
# Has the file on disk changed?
if ((not $anvil->data->{sys}{jobs_running}) && ($anvil->Storage->check_md5sums))
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "warn", key => "message_0014"});
$anvil->nice_exit({code => 1});
}
# Exit if 'run-once' selected.
if ($anvil->data->{switches}{'run-once'})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "warn", key => "message_0055"});
$anvil->nice_exit({code => 0});
}
# Disconnect from the database(s) and sleep now.
$anvil->Database->disconnect();
sleep 2;
}
$anvil->nice_exit({code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
# These are tools that don't need to constantly run. They'll typically run when the server starts up or the
# daemon is restarted or reloaded.
sub run_once
{
my ($anvil) = @_;
# Check that the database is ready.
prep_database($anvil);
# Check to see if we need to do boot-time tasks. We only run these if we've just booted
boot_time_tasks($anvil);
if ($anvil->data->{switches}{'startup-only'})
{
$anvil->nice_exit({code => 0});
}
return(0);
}
# This handles tasks that need to run on boot (if any)
sub boot_time_tasks
{
my ($anvil) = @_;
# If the uptime is less than ten minutes, clear the reboot flag.
my $uptime = $anvil->Storage->read_file({
debug => 2,
force_read => 1,
cache => 0,
file => $anvil->data->{path}{proc}{uptime},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime }});
# Clean it up. We'll have gotten two numbers, the uptime in seconds (to two decimal places) and the
# total idle time. We only care about the int number.
$uptime =~ s/^(\d+)\..*$/$1/;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime }});
# Now find out if a reboot is needed and when it was last changed.
my $reboot_needed = 0;
my $changed_seconds_ago = 0;
my $query = "
SELECT
variable_value,
(SELECT extract(epoch from now()) - extract(epoch from modified_date)) AS changed_seconds_ago
FROM
variables
WHERE
variable_source_table = 'hosts'
AND
variable_source_uuid = ".$anvil->data->{sys}{database}{use_handle}->quote($anvil->data->{sys}{host_uuid})."
AND
variable_name = 'reboot::needed'
;";
my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
my $count = @{$results};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
results => $results,
count => $count,
}});
if ($count)
{
$reboot_needed = $results->[0]->[0];
$changed_seconds_ago = $results->[0]->[1];
$changed_seconds_ago =~ s/^(\d+)\..*$/$1/;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
reboot_needed => $reboot_needed,
changed_seconds_ago => $changed_seconds_ago,
}});
}
# If a reboot is needed, see if the uptime is less than the time since the reboot needed flag was
# set. If the uptime is less, then the system rebooted since it was requested so clear it. h/t to
# Lisa Seelye (@thedoh) for this idea!
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
reboot_needed => $reboot_needed,
changed_seconds_ago => $changed_seconds_ago,
uptime => $uptime,
}});
if (($reboot_needed) && ($uptime < $changed_seconds_ago))
{
# Clear the reboot request.
$reboot_needed = $anvil->System->reboot_needed({set => 0});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }});
# Check to see if there was a reboot job in progress. If so, finish it off.
my $job_uuid = $anvil->Job->get_job_uuid({debug => 2, program => "anvil-manage-power"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
if ($job_uuid)
{
# Update the percentage to '100' and then clear the old PID.
my $date_time = $anvil->Get->date_and_time();
if ($anvil->data->{jobs}{job_uuid})
{
$anvil->Job->update_progress({
debug => 2,
progress => 100,
message => "message_0064,!!date_and_time!".$date_time."!!",
job_uuid => $job_uuid
});
}
$anvil->Job->clear({job_uuid => $job_uuid});
}
}
return(0);
}
# Configure the local database, if needed.
sub prep_database
{
my ($anvil) = @_;
my $shell_call = $anvil->data->{path}{exe}{'anvil-prep-database'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }});
my $database_output = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
if ($database_output)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { database_output => $database_output }});
}
return(0);
}
# These are tools that need to keep running.
sub keep_running
{
my ($anvil) = @_;
# Check for jobs that were running and now exited.
if (exists $anvil->data->{processes})
{
foreach my $job_uuid (%{$anvil->data->{jobs}{handles}})
{
# If it's not a handle, delete it.
my $running = $anvil->data->{jobs}{handles}{$job_uuid}->poll();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => {
"jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid},
running => $running,
}});
# If it's not running, update the table to clear the 'job_picked_up_by' column.
if (not $running)
{
my $exit_status = $anvil->data->{jobs}{handles}{$job_uuid}->exit_status();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { exit_status => $exit_status }});
# Free up memory
$anvil->data->{jobs}{handles}{$job_uuid}->cleanup();
$anvil->Job->clear({job_uuid => $job_uuid});
}
}
}
# Update hardware state files.
update_state_file($anvil);
# Run any pending jobs by calling 'anvil-jobs' with the 'job_uuid' as a background process.
run_jobs($anvil);
return(0);
}
# This will check for any jobs that aren't at 100%. For each found, if 'picked_up_by' is set, a check is made
# to see if the PID is still alive. If it isn't, or if 'picked_up_by' is not set, the appropriate tool is
# invoked to handle it.
sub run_jobs
{
my ($anvil) = @_;
# This will be set to 1 if any jobs are not complete, preventing a restart of the daemon if it's
# changed on disk.
$anvil->data->{sys}{jobs_running} = 0;
# We'll also update the jobs.json file.
my $jobs_file = "{\"jobs\":[\n";
# Get a list of pending or incomplete jobs.
my $return = $anvil->Database->get_jobs({ended_within => 300});
my $count = @{$return};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'return' => $return,
count => $count,
}});
foreach my $hash_ref (@{$return})
{
my $job_uuid = $hash_ref->{job_uuid};
my $job_command = $hash_ref->{job_command};
my $job_data = $hash_ref->{job_data};
my $job_picked_up_by = $hash_ref->{job_picked_up_by};
my $job_picked_up_at = $hash_ref->{job_picked_up_at};
my $job_updated = $hash_ref->{job_updated};
my $job_name = $hash_ref->{job_name};
my $job_progress = $hash_ref->{job_progress};
my $job_title = $hash_ref->{job_title};
my $job_description = $hash_ref->{job_description};
my $job_status = $hash_ref->{job_status};
my $started_seconds_ago = $job_picked_up_at ? (time - $job_picked_up_at) : 0;
my $updated_seconds_ago = $job_updated ? (time - $job_updated) : 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
job_uuid => $job_uuid,
job_command => $job_command,
job_data => $job_data,
job_picked_up_by => $job_picked_up_by,
job_picked_up_at => $job_picked_up_at,
job_updated => $job_updated,
job_name => $job_name,
job_progress => $job_progress,
job_title => $job_title,
job_description => $job_description,
job_status => $job_status,
started_seconds_ago => $started_seconds_ago,
updated_seconds_ago => $updated_seconds_ago,
}});
if ($job_progress ne "100")
{
$anvil->data->{sys}{jobs_running} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "sys::jobs_running" => $anvil->data->{sys}{jobs_running} }});
}
# See if the job was picked up by a now-dead instance.
if ($job_picked_up_by)
{
# Check if the PID is still active.
$anvil->System->pids({ignore_me => 1});
### TODO: Add a check to verify the job isn't hung.
# Skip if this job is in progress.
if (not exists $anvil->data->{pids}{$job_picked_up_by})
{
# If the job is done, just clear the 'job_picked_up_by' and be done.
$anvil->Job->clear({job_uuid => $job_uuid});
if ($job_progress ne "100")
{
# The previous job is gone, but the job isn't finished. Start it again.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "striker_warning_0007", variables => {
command => $job_command,
pid => $job_picked_up_by,
percent => $job_progress,
}});
# Clear some variables.
$job_progress = 0;
$job_status = "message_0056";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
job_progress => $job_progress,
job_status => $job_status,
}});
}
# Clear the PID
$job_picked_up_by = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_picked_up_by => $job_picked_up_by }});
}
}
# Convert the double-banged strings into a proper message.
my $say_title = $job_title ? $anvil->Words->parse_banged_string({debug => 2, key_string => $job_title}) : "";
my $say_description = $job_description ? $anvil->Words->parse_banged_string({debug => 2, key_string => $job_description}) : "";
my $say_status = $job_status ? $anvil->Words->parse_banged_string({debug => 2, key_string => $job_status}) : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
job_title => $job_title,
say_description => $say_description,
say_status => $say_status,
}});
# Make the status HTML friendly. Strip any embedded HTML then encode the text string.
if ($say_status)
{
my $html_strip = HTML::Strip->new();
$say_status = $html_strip->parse($say_status);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { say_status => $say_status }});
# Now make the resulting text string HTML friendly
my $text_to_html = HTML::FromText->new({
urls => 1,
email => 1,
lines => 1,
});
$say_status = $text_to_html->parse($say_status);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { say_status => $say_status }});
}
# Add this to the jobs.json file
my $json_string = to_json ({
job_uuid => $job_uuid,
job_command => $job_command,
job_data => $job_data,
job_picked_up_at => $job_picked_up_at,
job_updated => $job_updated,
job_name => $job_name,
job_progress => $job_progress,
job_title => $say_title,
job_description => $say_description,
job_status => $say_status,
started_seconds_ago => $started_seconds_ago,
updated_seconds_ago => $updated_seconds_ago,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { json_string => $json_string }});
$jobs_file .= $json_string.",\n";
# If the job is done, move on.
next if $job_progress eq "100";
# If the job is not running, start it.
if ((not $job_picked_up_by) && (not $anvil->data->{switches}{'no-start'}))
{
# Start the job, appending '--job-uuid' to the command.
$anvil->data->{jobs}{handles}{$job_uuid} = $anvil->System->call({
debug => 2,
background => 1,
stdout_file => "/tmp/anvil.job.".$job_uuid.".stdout",
stderr_file => "/tmp/anvil.job.".$job_uuid.".stderr",
shell_call => $job_command." --job-uuid ".$job_uuid,
source => $THIS_FILE,
line => __LINE__,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid} }});
# Log the PID (the job should update the database).
my $pid = $anvil->data->{jobs}{handles}{$job_uuid}->pid();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pid => $pid }});
}
}
# Close the jobs file.
$jobs_file =~ s/,\n$/\n/ms;
$jobs_file .= "]}\n";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { jobs_file => $jobs_file }});
# Write the JSON file
my $output_json = $anvil->data->{path}{directories}{html}."/status/jobs.json";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output_xml => $output_json }});
$anvil->Storage->write_file({
file => $output_json,
body => $jobs_file,
overwrite => 1,
mode => "0644",
user => "apache",
group => "apache"
});
return(0);
}
# This calls 'anvil-update-states' which will scan the local machine's state (hardware and software) and
# record write it out to an HTML file
sub update_state_file
{
my ($anvil) = @_;
my $shell_call = $anvil->data->{path}{exe}{'anvil-update-states'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }});
my $states_output = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
if ($states_output)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { states_output => $states_output }});
}
return(0);
}