anvil/tools/anvil-daemon
Digimer 911f7cfb6a This is another big commit with a lot of DB work. Getting closer to sorting out the frequent resyncs.
* Changes Database->connect to always use the first DB connected to, not the local one if that applies. This treats the first DB (sorted by UUID) as "primary" and the second (or third...) as more of a backup.
* Moved db_in_use and lock_request to use the 'states' table instead of the variables table. These are set and removed so often that it was messing things up with resyncs when the data is transient anyway. Fixed multiple bugs with both so they set and clear properly.
* Created Database->read_state() to assist with the above changes.
* Updated Database->refresh_timestamp() to specifically check that the returned time stamp differs from the previously used one, looping until they differ if needed.
* Disabled striker-manage-install-target when called to update the repos, as the Install Target function doesn't work at this point.

Signed-off-by: Digimer <digimer@alteeve.ca>
2022-06-16 20:10:43 -04:00


#!/usr/bin/perl
#
# This is the master daemon that manages all periodically run processes on Striker dashboards, Anvil! cluster
# nodes and DR hosts.
#
# Exit codes;
# 0 = Normal exit or md5sum of this program changed and it exited to reload.
# 1 = Not running as root.
# 2 = Unable to connect to any database, even after trying to initialize the local system.
#
# TODO:
# - Need to check what kind of machine this is and not prep the database unless it's a dashboard.
# - Add a "running: pending,yes,done,dead" and show an appropriate icon beside jobs
# - Decide if holding before the main loop until 'systemctl is-system-running' returns 'running' is a good
# idea or not.
# - Write the status of this and the scancore daemon to /etc/anvil/anvil.motd and symlink it to /etc/motd.d/
# - Write a script that runs in crontab at UTC 17:00 that sends an email if Scancore or anvil-daemon is disabled.
# - Examine limits in: https://www.freedesktop.org/software/systemd/man/systemd.exec.html#LimitCPU=
# - Write a background program to scan the BCN and use OUI data to try to find / auto-configure PDUs and UPSes
# -
# - Increase DRBD's default timeout
# - Check for and enable persistent journald logging
#
# NOTE:
# - For later; 'reboot --force --force' immediately kills the OS, like disabling ACPI on EL6 and hitting the
# power button. Might be useful in ScanCore down the road.
#
# Switches:
#
# --main-loop-only
#
# This skips the one-time, start-up tasks and just goes into the main loop.
#
# --no-start
#
# This will prevent any pending jobs from being picked up and started in this run. Note that other job checks will still happen.
#
# --refresh-json
#
# This just updates the JSON files used by the web interface. It is the same as '--run-once --main-loop-only --no-start'
#
# --run-once
#
# This will tell the program to exit after running the main loop once.
#
# --startup-only
#
# This will tell the program to exit after running the start up tasks, so the main loop won't run.
#
use strict;
use warnings;
use Anvil::Tools;
use Proc::Simple;
#use Time::HiRes qw ( time sleep );
use JSON;
use HTML::Strip;
use HTML::FromText;
use Data::Dumper;
use Text::Diff;
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
$running_directory =~ s/^\./$ENV{PWD}/;
}
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
# Prevent a discrepancy between UID/GID and EUID/EGID from throwing an error.
$< = $>;
$( = $);
# NOTE: Setting 'log_level' and 'log_secure' here will get overridden in the main loop. Use the Log methods
# in the loop as well to override defaults in code.
my $anvil = Anvil::Tools->new();
# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0))
{
# Not root
print $anvil->Words->string({key => "error_0005"})."\n";
$anvil->nice_exit({exit_code => 1});
}
# If, for some reason, anvil.conf is lost, create it.
$anvil->System->_check_anvil_conf();
# Connect to the database(s). If we have no connections, we'll proceed anyway as one of the 'run_once' tasks
# is to set up the database server.
$anvil->Database->connect({
check_if_configured => 1,
check_for_resync => 1,
});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0132"});
# If I have no databases, sleep for a second and then exit (systemd will restart us).
if (not $anvil->data->{sys}{database}{connections})
{
# If this is a dashboard, try to configure and then connect to the local database. If this isn't a
# dashboard, then just go into a loop waiting for a database to be configured.
if ($anvil->Get->host_type eq "striker")
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0201"});
prep_database($anvil);
# Try connecting again
$anvil->Database->connect({check_if_configured => 1, check_for_resync => 1});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# Still nothing, sleep and exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "error_0003"});
$anvil->nice_exit({exit_code => 2});
}
}
else
{
# Wait until we have one.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "error_0075"});
until($anvil->data->{sys}{database}{connections})
{
sleep 10;
check_network($anvil);
$anvil->refresh();
$anvil->Database->connect({check_if_configured => 1, check_for_resync => 1});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 3, key => "log_0439"});
}
}
}
}
# Read switches
$anvil->data->{switches}{'refresh-json'} = "";
$anvil->data->{switches}{'run-once'} = 0;
$anvil->data->{switches}{'main-loop-only'} = 0;
$anvil->data->{switches}{'no-start'} = 0;
$anvil->data->{switches}{'purge'} = 0;
$anvil->data->{switches}{'startup-only'} = 0;
$anvil->Get->switches;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});
if ($anvil->data->{switches}{'refresh-json'})
{
$anvil->data->{switches}{'run-once'} = 1;
$anvil->data->{switches}{'main-loop-only'} = 1;
$anvil->data->{switches}{'no-start'} = 1;
}
# This is used to track initial checks / repairs of network issues.
$anvil->data->{sys}{network}{initial_checks} = 0;
# There are some things we only want to run on (re)start and don't need to always run.
run_once($anvil) if not $anvil->data->{switches}{'main-loop-only'};
# Calculate my sum so that we can exit if it changes later.
$anvil->Storage->record_md5sums;
# What time is it, Mr. Fox?
my $now_time = time;
# To avoid multiple dashboards running a network scan and OUI parse, the dashboard peer with the lowest
# host_uuid sets its daily checks to run now, and the other(s) get a two-hour delay.
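# For example, with two Striker dashboards, the peer whose host_uuid sorts first gets a delay of '0' and
# runs its daily checks right away; the other keeps the default 7,200 second (two hour) delay set in
# set_delay().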
my $delay = set_delay($anvil);
# Once a minute, we'll check the md5sums and see if we should restart.
# Once a day, we'll refresh an Install Target's RPM repository (has no effect on non-Striker hosts).
$anvil->data->{timing}{minute_checks} = 60;
$anvil->data->{timing}{ten_minute_checks} = 600;
$anvil->data->{timing}{daily_checks} = 86400;
$anvil->data->{timing}{repo_update_interval} = 86400;
$anvil->data->{timing}{next_minute_check} = $now_time - 1;
$anvil->data->{timing}{next_ten_minute_check} = $now_time - 1;
$anvil->data->{timing}{next_daily_check} = ($now_time + $delay) - 1;
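# Note: setting the 'next_*' values one second in the past makes the minute and ten-minute checks fire on
# the first pass through the main loop, while the daily check waits out any delay calculated above.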
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"s1:timing::minute_checks" => $anvil->data->{timing}{minute_checks},
"s2:timing::ten_minute_checks" => $anvil->data->{timing}{ten_minute_checks},
"s3:timing::daily_checks" => $anvil->data->{timing}{daily_checks},
"s4:timing::repo_update_interval" => $anvil->data->{timing}{repo_update_interval},
"s5:now_time" => $now_time,
"s6:delay" => $delay,
"s7:timing::next_minute_check" => $anvil->data->{timing}{next_minute_check},
"s8:timing::next_ten_minute_check" => $anvil->data->{timing}{next_ten_minute_check},
"s9:timing::next_daily_check" => $anvil->data->{timing}{next_daily_check},
}});
# Disconnect. We'll reconnect inside the loop
$anvil->Database->disconnect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0203"});
# This will prevent restarting while jobs are running.
$anvil->data->{sys}{jobs_running} = 0;
# When we periodically check if system files have changed, we'll also ask Database->connect() to check if it
# needs to be configured or updated. This is done periodically as it is expensive to run on every loop.
my $check_if_database_is_configured = 0;
# These are the things we always want running.
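# Each pass of the loop below: reload config and reconnect to the database(s), run the always-on tasks
# (keep_running), run any periodic tasks that have come due (handle_periodic_tasks), check our own RAM use,
# then disconnect and sleep for two seconds.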
while(1)
{
# Reload defaults, re-read the config and then connect to the database(s)
$anvil->refresh();
# If, for some reason, anvil.conf is lost, create it.
$anvil->System->_check_anvil_conf();
$anvil->Database->connect({check_if_configured => $check_if_database_is_configured, check_for_resync => 1});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0132"});
# Mark that we don't want to check the database now.
$check_if_database_is_configured = 0;
# If this host is mapping the network, we'll skip a lot of stuff. If set for over an hour, we'll
# clear it.
$anvil->data->{sys}{mapping_network} = check_if_mapping($anvil);
if ($anvil->data->{sys}{database}{connections})
{
# Run the normal tasks
keep_running($anvil);
# Handle periodic tasks
handle_periodic_tasks($anvil) if not $anvil->data->{sys}{mapping_network};
}
else
{
# No databases available, we'll update the state file in case this host is having its
# network mapped and the interface used to talk to the databases went down. That's all we
# can do though.
update_state_file($anvil);
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, key => "log_0202"});
}
# Exit if 'run-once' selected.
if ($anvil->data->{switches}{'run-once'})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "message_0055"});
$anvil->nice_exit({exit_code => 0});
}
# Check how much RAM we're using.
check_ram($anvil);
# Disconnect from the database(s) and sleep now.
$anvil->Database->disconnect({debug => 2});
sleep(2);
}
$anvil->nice_exit({exit_code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
# If we're using too much ram, send an alert and exit.
sub check_ram
{
my ($anvil) = @_;
# problem: 0 == ok, 1 == too much RAM used, 2 == no PID found
my ($problem, $ram_used) = $anvil->System->check_ram_use({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
problem => $problem,
ram_used => $anvil->Convert->add_commas({number => $ram_used})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}).")",
}});
if ($problem)
{
# See if an 'anvil-sync-shared' job is running and, if so, don't exit. The RAM used by the file copy is
# counted, but it's not an actual problem.
$anvil->Database->get_jobs({debug => 2});
foreach my $job_uuid (keys %{$anvil->data->{jobs}{running}})
{
my $job_command = $anvil->data->{jobs}{running}{$job_uuid}{job_command};
my $job_progress = $anvil->data->{jobs}{running}{$job_uuid}{job_progress};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
job_command => $job_command,
job_progress => $job_progress,
}});
if (($job_progress != 100) && ($job_command =~ /anvil-sync-shared/))
{
# Don't abort.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0139", variables => {
job_command => $job_command,
ram_used => $anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}),
ram_used_bytes => $anvil->Convert->add_commas({number => $ram_used}),
}});
return(0);
}
}
# Send an alert and exit.
$anvil->Alert->register({alert_level => "notice", message => "error_0357", variables => {
program => $THIS_FILE,
ram_used => $anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}),
ram_used_bytes => $anvil->Convert->add_commas({number => $ram_used}),
}, set_by => $THIS_FILE, sort_position => 0});
$anvil->Email->send_alerts();
# Log the same
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0357", variables => {
program => $THIS_FILE,
ram_used => $anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}),
ram_used_bytes => $anvil->Convert->add_commas({number => $ram_used}),
}});
# Exit with RC0 so that systemctl restarts us
$anvil->nice_exit({exit_code => 0});
}
return(0);
}
# Check to see if we're mapping the network on this host.
sub check_if_mapping
{
my ($anvil) = @_;
$anvil->data->{sys}{mapping_network} = 0;
if ($anvil->data->{sys}{database}{connections})
{
my ($map_network_value, $map_network_uuid, $map_network_mtime, $map_network_modified_date) = $anvil->Database->read_variable({
debug => 3,
variable_name => "config::map_network",
variable_source_table => "hosts",
variable_source_uuid => $anvil->data->{sys}{host_uuid},
});
# We'll run for a day (should be cancelled by the program when the user's done, so this
# shouldn't fire in practice).
my $expire_age = 86400;
my $map_network_age = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:map_network_value' => $map_network_value,
's2:map_network_mtime' => $map_network_mtime,
's3:map_network_modified_date' => $map_network_modified_date,
's4:map_network_uuid' => $map_network_uuid,
}});
if ($map_network_uuid)
{
$map_network_age = time - $map_network_mtime;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { map_network_age => $map_network_age }});
}
if ($map_network_value)
{
# How long ago was it set?
$anvil->data->{switches}{'clear-mapping'} = "" if not defined $anvil->data->{switches}{'clear-mapping'};
if (($map_network_age >= $expire_age) or ($anvil->data->{switches}{'clear-mapping'}))
{
# Clear it.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0470"});
$anvil->Database->insert_or_update_variables({
debug => 3,
variable_value => 0,
variable_uuid => $map_network_uuid,
update_value_only => 1,
});
}
else
{
# Mark it so we only track the network.
my $say_age = $anvil->Convert->add_commas({number => $expire_age});
my $timeout = $anvil->Convert->add_commas({number => ($expire_age - $map_network_age)});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0471", variables => {
age => $say_age,
timeout => $timeout,
}});
$anvil->data->{sys}{mapping_network} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "sys::mapping_network" => $anvil->data->{sys}{mapping_network} }});
# Close any open ssh connections.
foreach my $ssh_fh_key (keys %{$anvil->data->{cache}{ssh_fh}})
{
my $ssh_fh = $anvil->data->{cache}{ssh_fh}{$ssh_fh_key};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
ssh_fh_key => $ssh_fh_key,
ssh_fh => $ssh_fh,
}});
if ($ssh_fh =~ /^Net::OpenSSH/)
{
$ssh_fh->disconnect();
}
delete $anvil->data->{cache}{ssh_fh}{$ssh_fh_key};
}
}
}
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "sys::mapping_network" => $anvil->data->{sys}{mapping_network} }});
return($anvil->data->{sys}{mapping_network});
}
# This decides if the local system will delay daily runs on start-up.
sub set_delay
{
my ($anvil) = @_;
my $delay = 7200;
my $host_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }});
if ($host_type eq "striker")
{
foreach my $uuid (sort {$a cmp $b} keys %{$anvil->data->{database}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"sys::host_uuid" => $anvil->data->{sys}{host_uuid},
uuid => $uuid,
}});
if ($uuid eq $anvil->data->{sys}{host_uuid})
{
$delay = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { delay => $delay }});
}
last;
}
}
else
{
# Not a dashboard, don't delay
$delay = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { delay => $delay }});
}
return($delay);
}
# This checks whether the network is OK and, if the system has been up long enough, checks for and tries
# to repair network issues.
sub check_network
{
my ($anvil) = @_;
# The network sometimes doesn't come up, but we don't want to try recovering it too soon. As such,
# we'll start watching the network after the uptime is 2 minutes.
my $uptime = $anvil->Get->uptime;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { uptime => $uptime }});
if ($uptime > 120)
{
# Check that bonds are up. Degraded bonds will be left alone.
if (not $anvil->data->{sys}{network}{initial_checks})
{
my $running = $anvil->System->check_daemon({daemon => "NetworkManager"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { running => $running }});
if (not $running)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0250", variables => { daemon => "NetworkManager" }});
my $return_code = $anvil->System->start_daemon({daemon => "NetworkManager"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { return_code => $return_code }});
}
#$anvil->Network->check_network({heal => "all"});
$anvil->data->{sys}{network}{initial_checks} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
"sys::network::initial_checks" => $anvil->data->{sys}{network}{initial_checks},
}});
}
else
{
### NOTE: This is constantly trying to "fix" healthy bonds, without a known way to trigger it for
### debugging. As such, it is disabled for now.
#$anvil->Network->check_network({heal => "down_only"});
}
check_firewall($anvil);
}
# Check that all users can ping.
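# The expected output is a single, tab-separated line along the lines of (example values):
#   net.ipv4.ping_group_range = 0	2147483647
# If the upper bound is below 2000, it is raised below so that all users can ping.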
if (1)
{
my $shell_call = $anvil->data->{path}{exe}{sysctl}." net.ipv4.ping_group_range";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output }});
if ($output =~ /net.ipv4.ping_group_range = (\d+)\t(\d+)$/)
{
my $lowest_uid = $1;
my $highest_uid = $2;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
lowest_uid => $lowest_uid,
highest_uid => $highest_uid,
}});
if ($highest_uid < 2000)
{
# Tell the user we're enabling ping for all users.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0683"});
my $shell_call = $anvil->data->{path}{exe}{sysctl}." -w net.ipv4.ping_group_range=\"0 2147483647\"";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { output => $output }});
}
}
}
return(0);
}
# This handles running tasks that only run on some loops.
sub handle_periodic_tasks
{
my ($anvil) = @_;
my $now_time = time;
my $host_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"s1:now_time" => $now_time,
"s2:timing::next_minute_check" => $anvil->data->{timing}{next_minute_check},
"s3:timing::next_ten_minute_check" => $anvil->data->{timing}{next_ten_minute_check},
"s4:timing::next_daily_check" => $anvil->data->{timing}{next_daily_check},
"s5:host_type" => $host_type,
}});
# Time to run once per minute tasks.
if ($now_time >= $anvil->data->{timing}{next_minute_check})
{
# Check if the firewall needs to be updated.
check_network($anvil);
# Check to see if the PXE environment needs to be updated.
check_install_target($anvil);
# Check that the users we care about have ssh public keys and they're recorded in ssh_keys.
$anvil->System->check_ssh_keys({debug => 2});
$anvil->System->update_hosts({debug => 3});
# Check if the files on disk have changed. Even if it is time to check, don't check while a job is
# running.
if ((not $anvil->data->{sys}{jobs_running}) && ($anvil->Storage->check_md5sums))
{
# NOTE: We exit with '0' to prevent systemctl from showing a scary red message.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "message_0014"});
$anvil->nice_exit({exit_code => 0});
}
# Mark that we want to check the database config next time.
$check_if_database_is_configured = 1;
# Update the next check time.
$anvil->data->{timing}{next_minute_check} = $now_time + $anvil->data->{timing}{minute_checks};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
"s1:timing::minute_checks" => $anvil->data->{timing}{minute_checks},
"s2:timing::next_minute_check" => $anvil->data->{timing}{next_minute_check},
}});
# Even when this runs, it should finish in under ten seconds so we don't need to background it.
my ($parse_output, $return_code) = $anvil->System->call({debug => 3, shell_call => $anvil->data->{path}{exe}{'anvil-parse-fence-agents'}.$anvil->Log->switches, source => $THIS_FILE, line => __LINE__});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { parse_output => $parse_output }});
# Scan the local network.
update_state_file($anvil);
# Make sure the shared directories exist.
foreach my $target (sort {$a cmp $b} keys %{$anvil->data->{path}{directories}{shared}})
{
my $directory = $anvil->data->{path}{directories}{shared}{$target};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
target => $target,
directory => $directory,
}});
if (not -e $anvil->data->{path}{directories}{shared}{$target})
{
my $failed = $anvil->Storage->make_directory({
directory => $directory,
group => "apache",
user => "apache",
mode => "0775",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { failed => $failed }});
if ($failed)
{
# Something went wrong.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "log_0254", variables => {
directory => $directory,
}});
}
else
{
# Success
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0255", variables => {
directory => $directory,
}});
}
}
}
# Check mail server config.
my $problem = $anvil->Email->check_config({debug => 3});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { problem => $problem }});
# Check if any files have been uploaded to /mnt/shared/incoming on striker
check_incoming($anvil);
# Check for stale db_in_use states.
check_db_in_use_states($anvil);
}
# Now check to see if it's time to run less frequent tasks.
if ($now_time >= $anvil->data->{timing}{next_ten_minute_check})
{
my $host_type = $anvil->Get->host_type();
my $host_uuid = $anvil->Get->host_uuid();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
host_type => $host_type,
host_uuid => $host_uuid,
}});
# Are we a Striker and are there two or more connections? If so, evaluate if we should shut
# down our database.
if ($host_type eq "striker")
{
# If we're the active database, dump our database out and rsync it to our peers.
my $peers = keys %{$anvil->data->{database}};
my $connections = $anvil->data->{sys}{database}{connections};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
peers => $peers,
connections => $connections,
}});
if (exists $anvil->data->{cache}{database_handle}{$host_uuid})
{
# Verify that the database is up.
my $running = $anvil->System->check_daemon({daemon => $anvil->data->{sys}{daemon}{postgresql}});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { running => $running }});
if ($running)
{
# Backup our DB.
my $dump_file = $anvil->Database->backup_database({debug => 2});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { dump_file => $dump_file }});
# Now rsync it to our peer(s)
foreach my $this_host_uuid (sort {$a cmp $b} keys %{$anvil->data->{database}})
{
next if $this_host_uuid eq $host_uuid;
my $destination = "root\@".$anvil->data->{database}{$this_host_uuid}{host}.":".$anvil->data->{path}{directories}{pgsql}."/";
my $password = $anvil->data->{database}{$this_host_uuid}{password};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
this_host_uuid => $this_host_uuid,
destination => $destination,
password => $anvil->Log->is_secure($password),
}});
my $start_time = time;
my $failed = $anvil->Storage->rsync({
debug => 3,
destination => $destination,
password => $password,
source => $dump_file,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }});
my $rsync_time = time - $start_time;
my $size = $anvil->Convert->bytes_to_human_readable({'bytes' => $anvil->data->{file_stat}{$dump_file}{size}});
my $size_bytes = $anvil->Convert->add_commas({number => $anvil->data->{file_stat}{$dump_file}{size}});
my $target_name = $anvil->Get->host_name_from_uuid({debug => 3, host_uuid => $this_host_uuid});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0658", variables => {
file => $dump_file,
host_name => $target_name,
took => $rsync_time,
size => $size,
size_bytes => $size_bytes,
}});
}
}
}
}
# Reap old db_in_use states over 6 hours old.
my $query = "DELETE FROM states WHERE state_name LIKE 'db_in_use%' AND modified_date < (SELECT now() - interval '6 hour');\n";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
$anvil->Database->write({debug => 2, query => $query, source => $THIS_FILE, line => __LINE__});
# Update the next check time.
$anvil->data->{timing}{next_ten_minute_check} = $now_time + $anvil->data->{timing}{ten_minute_checks};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"s1:timing::ten_minute_checks" => $anvil->data->{timing}{ten_minute_checks},
"s2:timing::next_ten_minute_check" => $anvil->data->{timing}{next_ten_minute_check},
}});
}
# Now check to see if it's time to run daily tasks.
if ($now_time >= $anvil->data->{timing}{next_daily_check})
{
### NOTE: We call it once/day, but this will also trigger on restart of anvil-daemon. As such, we
### don't use '--force' and let striker-manage-install-target skip the repo update if it happened
### recently enough.
if ($host_type eq "striker")
{
# Age out old data. This takes up to a minute.
$anvil->Database->_age_out_data();
# Archive old data
$anvil->Database->archive_database();
### TODO: This is here only to handle the period of time where we disabled postgres
### on boot. This should be removed sometime after 2022-08-01
$anvil->System->enable_daemon({daemon => $anvil->data->{sys}{daemon}{postgresql}});
# Record a job, don't call it directly. It takes too long to run.
my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
file => $THIS_FILE,
line => __LINE__,
job_command => $anvil->data->{path}{exe}{'striker-manage-install-target'}." --refresh".$anvil->Log->switches,
job_data => "",
job_name => "install-target::refresh",
job_title => "job_0015",
job_description => "job_0017",
job_progress => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { job_uuid => $job_uuid }});
# Update the OUI data.
($job_uuid) = $anvil->Database->insert_or_update_jobs({
file => $THIS_FILE,
line => __LINE__,
job_command => $anvil->data->{path}{exe}{'striker-parse-oui'}.$anvil->Log->switches,
job_data => "",
job_name => "oui-data::refresh",
job_title => "job_0064",
job_description => "job_0065",
job_progress => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
# Scan the networks
($job_uuid) = $anvil->Database->insert_or_update_jobs({
file => $THIS_FILE,
line => __LINE__,
job_command => $anvil->data->{path}{exe}{'striker-scan-network'}.$anvil->Log->switches,
job_data => "",
job_name => "scan-network::refresh",
job_title => "job_0066",
job_description => "job_0067",
job_progress => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
}
# Update the next check time.
$anvil->data->{timing}{next_daily_check} = $now_time + $anvil->data->{timing}{daily_checks};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"s1:timing::daily_checks" => $anvil->data->{timing}{daily_checks},
"s2:timing::next_daily_check" => $anvil->data->{timing}{next_daily_check},
}});
}
return(0);
}
### NOTE: This logic plays out in a slightly different way in Database->shutdown().
# Check for stale db_in_use states.
sub check_db_in_use_states
{
my ($anvil) = @_;
# We only reap db_in_use entries for us.
$anvil->System->pids();
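# System->pids() (re)populates 'pids::<pid>' for every running process; it's checked below to see if the
# PID recorded in each state is still alive. State names are assumed to follow the pattern
# 'db_in_use::<database_host_uuid>::<pid>', matching the regex used further down.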
my $query = "
SELECT
state_uuid,
state_name,
state_note
FROM
states
WHERE
state_name LIKE 'db_in_use::%'
AND
state_host_uuid = ".$anvil->Database->quote($anvil->Get->host_uuid)."
;";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
my $count = @{$results};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
results => $results,
count => $count,
}});
if ($count)
{
foreach my $row (@{$results})
{
my $state_uuid = $row->[0];
my $state_name = $row->[1];
my $state_note = $row->[2];
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:state_uuid' => $state_uuid,
's2:state_name' => $state_name,
's3:state_note' => $state_note,
}});
my ($db_uuid, $state_pid) = ($state_name =~ /db_in_use::(.*?)::(\d+)$/);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:db_uuid' => $anvil->Get->host_name_from_uuid({host_uuid => $db_uuid})." (".$db_uuid.")",
's4:state_pid' => $state_pid,
}});
if (not exists $anvil->data->{pids}{$state_pid})
{
# Reap the 'db_in_use' state.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0140", variables => {
db => $anvil->Get->host_name_from_uuid({host_uuid => $db_uuid})." (".$db_uuid.")",
pid => $state_pid,
}});
my $query = "DELETE FROM states WHERE state_uuid = ".$anvil->Database->quote($state_uuid).";";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
$anvil->Database->write({debug => 2, query => $query, source => $THIS_FILE, line => __LINE__});
}
}
}
return(0);
}
# On dashboards, this checks to see if any files are in /mnt/shared/incoming and, if so, that they've been processed.
sub check_incoming
{
my ($anvil) = @_;
my $system_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { system_type => $system_type }});
if ($system_type eq "striker")
{
# Look for files in /mnt/shared/incoming that are not yet in the database.
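# For example (hypothetical file name), '/mnt/shared/incoming/.rhel8.iso.Abc123' is skipped as an
# in-flight rsync temp file; once it's renamed to '/mnt/shared/incoming/rhel8.iso' and isn't yet in the
# 'files' table, an 'anvil-sync-shared' job is registered with 'job_data' set to
# 'file=/mnt/shared/incoming/rhel8.iso'.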
my $directory = $anvil->data->{path}{directories}{shared}{incoming};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { directory => $directory }});
local(*DIRECTORY);
opendir(DIRECTORY, $directory);
while(my $file = readdir(DIRECTORY))
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { file => $file }});
next if $file eq ".";
next if $file eq "..";
next if $file =~ /^\./; # These are files still being rsync'ed
my $full_path = $directory."/".$file;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { full_path => $full_path }});
# Skip anything that is not a file.
next if not -f $full_path;
# Is this file already in the DB?
my $query = "SELECT file_uuid FROM files WHERE file_name = ".$anvil->Database->quote($file).";";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
my $count = @{$results};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
results => $results,
count => $count,
}});
if (not $count)
{
# Add it to the database.
my $size = (stat($full_path))[7];
my $say_size_human = $anvil->Convert->bytes_to_human_readable({'bytes' => $size});
my $say_size_comma = $anvil->Convert->add_commas({number => $size});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
size => $size,
say_size_human => $say_size_human,
say_size_comma => $say_size_comma,
}});
# Register a job to call anvil-sync-shared
my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
file => $THIS_FILE,
line => __LINE__,
job_command => $anvil->data->{path}{exe}{'anvil-sync-shared'},
job_data => "file=".$full_path,
job_name => "storage::move_incoming",
job_title => "job_0132",
job_description => "job_0133",
job_progress => 0,
job_host_uuid => $anvil->data->{sys}{host_uuid},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
}
}
closedir(DIRECTORY);
}
return(0);
}
# This calls striker-manage-install-target to see whether dhcpd is running. Either way, the config
# variable 'install-target::enabled' is set/updated. On non-Striker hosts, this simply returns without
# doing anything.
sub check_install_target
{
my ($anvil) = @_;
my $system_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { system_type => $system_type }});
if ($system_type ne "striker")
{
# Not a dashboard, nothing to do.
return(0);
}
my $status = "unavailable";
my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{'striker-manage-install-target'}." --status --check --no-refresh".$anvil->Log->switches});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output => $output }});
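# The output is expected to contain a line like 'status=0' (dhcpd / Install Target disabled) or
# 'status=1' (enabled); anything else leaves the status as "unavailable".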
foreach my $line (split/\n/, $output)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { line => $line }});
if ($line =~ /status=(\d)/)
{
my $digit = $1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { digit => $digit }});
if ($digit == 0)
{
$status = "disabled";
}
elsif ($digit == 1)
{
$status = "enabled";
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { status => $status }});
last;
}
}
# Record the status
$anvil->Database->insert_or_update_variables({
variable_name => "install-target::enabled",
variable_source_uuid => $anvil->Get->host_uuid,
variable_source_table => "hosts",
variable_value => $status,
variable_default => "unavailable",
variable_description => "striker_0110",
variable_section => "system",
});
return(0);
}
# These are tools that don't need to constantly run. They'll typically run when the server starts up or the
# daemon is restarted or reloaded.
sub run_once
{
my ($anvil) = @_;
# Check that the database is ready.
prep_database($anvil);
# Check to see if we need to do boot-time tasks. We only run these if we've just booted.
boot_time_tasks($anvil);
# Check the ssh stuff.
# NOTE: This actually runs again in the minutes tasks, but needs to run on boot as well.
$anvil->System->check_ssh_keys({debug => 2});
# Check setuid wrappers
check_setuid_wrappers($anvil);
# Check journald is configured for persistent storage.
check_journald($anvil);
if ($anvil->data->{switches}{'startup-only'})
{
$anvil->nice_exit({exit_code => 0});
}
return(0);
}
sub check_journald
{
my ($anvil) = @_;
# Check journald.conf to ensure logging is configured to be persistent.
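# The rewrite below handles three assumed cases in journald.conf: 'Storage=persistent' (nothing to do),
# 'Storage=<something else>' (replaced in place), and a commented '#Storage=...' line (a
# 'Storage=persistent' line is inserted ahead of it). If none are found, the setting is appended.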
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 'path::configs::journald.conf' => $anvil->data->{path}{configs}{'journald.conf'} }});
my $persistent_seen = 0;
my $change_storage = 0;
my $old_journald_conf = $anvil->Storage->read_file({file => $anvil->data->{path}{configs}{'journald.conf'}});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { old_journald_conf => $old_journald_conf }});
foreach my $line (split/\n/, $old_journald_conf)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { line => $line }});
if ($line =~ /^Storage=(.*)$/)
{
my $value = $1;
if ($value eq "persistent")
{
$persistent_seen = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { persistent_seen => $persistent_seen }});
}
else
{
$change_storage = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { change_storage => $change_storage }});
}
}
}
# Make sure the journald directory exists.
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 'path::directories::journald' => $anvil->data->{path}{directories}{journald} }});
if (not -d $anvil->data->{path}{directories}{journald})
{
$anvil->Storage->make_directory({
debug => 2,
directory => $anvil->data->{path}{directories}{journald},
});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0248", variables => { directory => $anvil->data->{path}{directories}{journald} }});
}
# Make sure the journald is configured for persistent storage.
if (not $persistent_seen)
{
my $storage_added = 0;
my $new_journald_conf = "";
foreach my $line (split/\n/, $old_journald_conf)
{
if (($line =~ /^Storage=/) && ($change_storage))
{
if (not $storage_added)
{
$storage_added = 1;
$new_journald_conf .= "Storage=persistent\n";
}
next;
}
if (($line =~ /^#Storage=/) && (not $storage_added))
{
$storage_added = 1;
$new_journald_conf .= "Storage=persistent\n";
}
$new_journald_conf .= $line."\n";
}
if (not $storage_added)
{
$new_journald_conf .= "Storage=persistent\n";
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_journald_conf => $new_journald_conf }});
$anvil->Storage->write_file({
debug => 3,
secure => 0,
file => $anvil->data->{path}{configs}{'journald.conf'},
body => $new_journald_conf,
mode => "0644",
overwrite => 1,
});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0013", variables => { file => $anvil->data->{path}{configs}{'journald.conf'} }});
# Restart the journald service.
my $shell_call = $anvil->data->{path}{exe}{systemctl}." restart systemd-journald.service";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({debug => 3, shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
}
return(0);
}
# This creates, as needed, the setuid wrappers used by apache to make certain system calls.
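# In short: a tiny C wrapper is written out, compiled with gcc, owned by root and marked setuid (mode
# 4755); the wrapper itself switches to the 'admin' UID/GID before exec'ing 'striker-get-peer-data'.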
sub check_setuid_wrappers
{
my ($anvil) = @_;
my $host_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_type => $host_type }});
if ($host_type ne "striker")
{
# Not a dashboard, setuid scripts aren't needed.
return(0);
}
# Does the call_striker-get-peer-data wrapper exist yet?
if (-e $anvil->data->{path}{exe}{'call_striker-get-peer-data'})
{
# Exists, skipping.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0436", variables => { wrapper => $anvil->data->{path}{exe}{'call_striker-get-peer-data'} }});
}
else
{
# What is the admin user and group ID?
my $admin_uid = getpwnam('admin');
my $admin_gid = getgrnam('admin');
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
admin_uid => $admin_uid,
admin_gid => $admin_gid,
}});
return(0) if not $admin_uid;
return(0) if not $admin_gid;
# Write the body out
my $call_striker_get_peer_data_body = "#define REAL_PATH \"".$anvil->data->{path}{exe}{'striker-get-peer-data'}."\"\n";
$call_striker_get_peer_data_body .= "main(ac, av)\n";
$call_striker_get_peer_data_body .= "char **av;\n";
$call_striker_get_peer_data_body .= "{\n";
$call_striker_get_peer_data_body .= " setuid(".$admin_uid.");\n";
$call_striker_get_peer_data_body .= " setgid(".$admin_gid.");\n";
$call_striker_get_peer_data_body .= " execv(REAL_PATH, av);\n";
$call_striker_get_peer_data_body .= "}\n";
my $error = $anvil->Storage->write_file({
debug => 3,
file => $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c",
body => $call_striker_get_peer_data_body,
mode => '644',
overwrite => 1,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { error => $error }});
# If it wrote out, compile it.
if (not -e $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c")
{
# Failed to write.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "error_0071", variables => { file => $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c" }});
}
else
{
# Compile it
my ($output, $return_code) = $anvil->System->call({
debug => 3,
shell_call => $anvil->data->{path}{exe}{gcc}." -o ".$anvil->data->{path}{exe}{'call_striker-get-peer-data'}." ".$anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
return_code => $return_code,
}});
# If it compiled, setuid it.
if (not -e $anvil->data->{path}{exe}{'call_striker-get-peer-data'})
{
# Something went wrong compiling it.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "error_0072", variables => { file => $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c" }});
}
else
{
$anvil->Storage->change_owner({
debug => 3,
path => $anvil->data->{path}{exe}{'call_striker-get-peer-data'},
user => 'root',
group => 'root',
});
$anvil->Storage->change_mode({
debug => 3,
path => $anvil->data->{path}{exe}{'call_striker-get-peer-data'},
mode => '4755',
});
}
}
}
return(0);
}
# Configure/update the firewall.
sub check_firewall
{
my ($anvil) = @_;
# Don't call this if we're not configured yet.
my $configured = $anvil->System->check_if_configured({debug => 3});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { configured => $configured }});
# Check if the firewall needs to be updated.
if ($configured)
{
my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{'anvil-manage-firewall'}.$anvil->Log->switches});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output => $output, return_code => $return_code }});
}
return(0);
}
# This handles tasks that need to run on boot (if any)
sub boot_time_tasks
{
my ($anvil) = @_;
# Get the current uptime; it's compared below against when the reboot flag was last changed.
my $uptime = $anvil->Get->uptime;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime }});
# Now find out if a reboot is listed as needed and when it was last changed.
my $reboot_needed = 0;
my $changed_seconds_ago = 0;
my $query = "
SELECT
variable_value,
(SELECT extract(epoch from now()) - extract(epoch from modified_date)) AS changed_seconds_ago
FROM
variables
WHERE
variable_source_table = 'hosts'
AND
variable_source_uuid = ".$anvil->Database->quote($anvil->Get->host_uuid)."
AND
variable_name = 'reboot::needed'
;";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
my $count = @{$results};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
results => $results,
count => $count,
}});
if ($count)
{
$reboot_needed = $results->[0]->[0];
$changed_seconds_ago = $results->[0]->[1];
$changed_seconds_ago =~ s/^(\d+)\..*$/$1/;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
reboot_needed => $reboot_needed,
changed_seconds_ago => $changed_seconds_ago,
}});
}
### TODO: This shouldn't be needed anymore. anvil-manage-power doesn't set the progress to '50' prior
### to reboot anymore.
# If a reboot is needed, see if the uptime is less than the time since the reboot needed flag was
# set. If the uptime is less, then the system rebooted since it was requested so clear it. h/t to
# Lisa Seelye (@thedoh) for this idea!
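# Worked example (hypothetical numbers): if 'reboot::needed' was last changed 600 seconds ago but the
# uptime is only 120 seconds, the host rebooted after the flag was set, so the flag is cleared below.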
my $difference = ($changed_seconds_ago - $uptime);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"s1:reboot_needed" => $reboot_needed,
"s2:changed_seconds_ago" => $changed_seconds_ago,
"s3:uptime" => $uptime,
"s4:difference" => $difference,
}});
if ($reboot_needed)
{
if ($uptime < $changed_seconds_ago)
{
# Clear the reboot request.
$reboot_needed = $anvil->System->reboot_needed({debug => 2, set => 0});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }});
# Check to see if there was a reboot job in progress. If so, finish it off.
my $job_uuid = $anvil->Job->get_job_uuid({
debug => 2,
program => "anvil-manage-power",
incomplete => 1,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
if ($job_uuid)
{
# Update the percentage to '100' and then clear the old PID.
my $date_time = $anvil->Get->date_and_time();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { date_time => $date_time }});
$anvil->Job->update_progress({
progress => 100,
message => "message_0064,!!date_and_time!".$date_time."!!",
job_uuid => $job_uuid,
picked_up_by => 0,
});
}
}
}
else
{
# Update our status
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0572"});
$anvil->Database->get_hosts({debug => 2});
my $host_uuid = $anvil->Get->host_uuid({debug => 2});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_uuid => $host_uuid }});
$anvil->Database->insert_or_update_hosts({
debug => 2,
host_ipmi => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_ipmi},
host_key => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_key},
host_name => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_name},
host_type => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type},
host_uuid => $host_uuid,
host_status => "online",
});
# Make sure our stop reason is cleared.
my $variable_uuid = $anvil->Database->insert_or_update_variables({
variable_name => 'system::stop_reason',
variable_value => '',
variable_default => '',
variable_description => 'striker_0279',
variable_section => 'system',
variable_source_uuid => $host_uuid,
variable_source_table => 'hosts',
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { variable_uuid => $variable_uuid }});
}
# Make sure /etc/hosts is updated.
$anvil->System->update_hosts();
# This handles weird bits for things like bug work-arounds.
handle_special_cases($anvil);
# Now look for jobs that have a job status of 'anvil_startup'
run_jobs($anvil, 1);
# Check if the firewall needs to be updated.
check_firewall($anvil);
# If we're a striker, check apache
my $host_type = $anvil->Get->host_type;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }});
if ($host_type eq "striker")
{
$anvil->Striker->check_httpd_conf({debug => 3});
}
return(0);
}
# This handles weird bits for things like bug work-arounds.
sub handle_special_cases
{
my ($anvil) = @_;
my $host_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }});
if ($host_type ne "striker")
{
### TODO: Test that this is fixed. The bug is now ERRATA
# RHBZ #1961562 - https://bugzilla.redhat.com/show_bug.cgi?id=1961562#c16
# We're a node or DR host. We need to touch this file.
my $work_around_file = "/etc/qemu/firmware/50-edk2-ovmf-cc.json";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { work_around_file => $work_around_file }});
if (not -e $work_around_file)
{
$anvil->Storage->write_file({
debug => 2,
file => $work_around_file,
body => "",
overwrite => 0,
backup => 0,
mode => "0644",
user => "root",
group => "root",
});
}
# Make sure DRBD compiled after a kernel upgrade.
$anvil->DRBD->_initialize_kmod({debug => 2});
}
### TODO: Remove these later. This is here to clean up how we used to handle db_in_use and lock_request flags.
if (1)
{
# Broadly clear all states that are '0' now.
my $queries = [];
push @{$queries}, "DELETE FROM states WHERE state_name LIKE 'db_in_use::%' AND state_note != '1';";
push @{$queries}, "DELETE FROM history.variables WHERE variable_name = 'lock_request';";
push @{$queries}, "DELETE FROM variables WHERE variable_name = 'lock_request';";
foreach my $query (@{$queries})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
}
$anvil->Database->write({debug => 2, query => $queries, source => $THIS_FILE, line => __LINE__});
}
return(0);
}
# Configure the local database, if needed.
sub prep_database
{
my ($anvil) = @_;
# If there's a backup file, we're configured and possibly just off.
my $prep_database = 1;
foreach my $uuid (keys %{$anvil->data->{database}})
{
my $dump_file = $anvil->data->{path}{directories}{pgsql}."/anvil_db_dump.".$uuid.".sql";
$dump_file =~ s/\/\//\//g;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { dump_file => $dump_file }});
if (-e $dump_file)
{
# No need to prepare.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0665", variables => { file => $dump_file }});
$prep_database = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { prep_database => $prep_database }});
}
}
# Only run this if we're a dashboard.
my $host_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }});
if ($host_type eq "striker")
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
prep_database => $prep_database,
"sys::database::connections" => $anvil->data->{sys}{database}{connections},
}});
if ($prep_database)
{
### NOTE: This failed once, in case / until it happens again, we'll force log level 2 and secure logging.
my $shell_call = $anvil->data->{path}{exe}{'striker-prep-database'}." -vv --log-secure";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($database_output, $return_code) = $anvil->System->call({debug => 2, shell_call => $shell_call, source => $THIS_FILE, line => __LINE__ });
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
database_output => $database_output,
return_code => $return_code,
}});
}
elsif (not $anvil->data->{sys}{database}{connections})
{
# Start the daemon locally, if needed.
my $running = $anvil->System->check_daemon({daemon => "postgresql"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { running => $running }});
if ($running == 2)
{
# Not installed, nothing to do.
}
elsif (not $running)
{
# Start it.
my $return_code = $anvil->System->start_daemon({daemon => "postgresql"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { return_code => $return_code }});
}
}
}
return(0);
}
# These are tools that need to keep running.
sub keep_running
{
my ($anvil) = @_;
# Check for jobs that were running and now exited.
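# 'jobs::handles::<job_uuid>' holds the Proc::Simple handles for previously backgrounded jobs. poll()
# returns true while the child is still running; once it exits, the exit status is logged, cleanup()
# releases the handle and Job->clear() clears 'job_picked_up_by' in the database.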
if ((not $anvil->data->{sys}{mapping_network}) && (exists $anvil->data->{processes}))
{
foreach my $job_uuid (keys %{$anvil->data->{jobs}{handles}})
{
# Poll the handle to see if the job is still running.
my $running = $anvil->data->{jobs}{handles}{$job_uuid}->poll();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
"jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid},
running => $running,
}});
# If it's not running, update the table to clear the 'job_picked_up_by' column.
if (not $running)
{
my $exit_status = $anvil->data->{jobs}{handles}{$job_uuid}->exit_status();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => {
job_uuid => $job_uuid,
exit_status => $exit_status,
}});
# Free up memory
$anvil->data->{jobs}{handles}{$job_uuid}->cleanup();
$anvil->Job->clear({job_uuid => $job_uuid});
}
}
}
# If we're configured, write out the status JSON file. If we're not configured, update hardware state files.
my $configured = $anvil->System->check_if_configured;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { configured => $configured }});
if ((not $anvil->data->{sys}{mapping_network}) && ($configured))
{
# Write out state information for all known Anvil! systems and the information from
# unconfigured nodes and DR hosts, using just database data (hence, fast enough to run
# constantly).
$anvil->System->generate_state_json({debug => 2});
}
else
{
# Run this to monitor the network in real time.
update_state_file($anvil);
}
# Run any pending jobs by calling 'anvil-jobs' with the 'job_uuid' as a background process.
run_jobs($anvil, 0) if not $anvil->data->{sys}{mapping_network};
return(0);
}
# This will check for any jobs that aren't at 100%. For each found, if 'picked_up_by' is set, a check is made
# to see if the PID is still alive. If it isn't, or if 'picked_up_by' is not set, the appropriate tool is
# invoked to handle it.
sub run_jobs
{
my ($anvil, $startup) = @_;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { startup => $startup }});
# This will be set to 1 if any jobs are not complete, preventing a restart of the daemon if it's
# changed on disk.
$anvil->data->{sys}{jobs_running} = 0;
# We'll also update the jobs.json file.
my $jobs_file = "{\"jobs\":[\n";
# Get a list of pending or incomplete jobs.
my $ended_within = $startup ? 1 : 300;
my $return = $anvil->Database->get_jobs({ended_within => $ended_within});
my $count = @{$return};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
ended_within => $ended_within,
'return' => $return,
count => $count,
}});
foreach my $hash_ref (@{$return})
{
my $job_uuid = $hash_ref->{job_uuid};
my $job_command = $hash_ref->{job_command};
my $job_data = $hash_ref->{job_data};
my $job_picked_up_by = $hash_ref->{job_picked_up_by};
my $job_picked_up_at = $hash_ref->{job_picked_up_at};
my $job_updated = $hash_ref->{job_updated};
my $job_name = $hash_ref->{job_name};
my $job_progress = $hash_ref->{job_progress};
my $job_title = $hash_ref->{job_title};
my $job_description = $hash_ref->{job_description};
my $job_status = $hash_ref->{job_status};
my $started_seconds_ago = $job_picked_up_at ? (time - $job_picked_up_at) : 0;
my $updated_seconds_ago = $job_updated ? (time - $job_updated) : 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
job_uuid => $job_uuid,
job_command => $job_command,
job_data => $job_data,
job_picked_up_by => $job_picked_up_by,
job_picked_up_at => $job_picked_up_at,
job_updated => $job_updated,
job_name => $job_name,
job_progress => $job_progress,
job_title => $job_title,
job_description => $job_description,
job_status => $job_status,
started_seconds_ago => $started_seconds_ago,
updated_seconds_ago => $updated_seconds_ago,
}});
# If this is a start-up call, only start jobs whose status is 'anvil_startup'.
if (($startup) && ($job_status ne "anvil_startup"))
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0639", variables => {
job_uuid => $job_uuid,
job_command => $job_command,
}});
next;
}
if ($job_progress ne "100")
{
$anvil->data->{sys}{jobs_running} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "sys::jobs_running" => $anvil->data->{sys}{jobs_running} }});
}
# See if the job was picked up by a now-dead instance.
if ($job_picked_up_by)
{
# Check if the PID is still active.
$anvil->System->pids({ignore_me => 1});
### TODO: Add a check to verify the job isn't hung.
# If the PID is still in the process list, the job is in progress and we leave it alone. If it's gone, handle the possibly-lost job.
if (not exists $anvil->data->{pids}{$job_picked_up_by})
{
# If the job isn't at 100%, it may have been lost. If it is done, we just clear 'job_picked_up_by' below.
if ($job_progress ne "100")
{
# It's possible that the job updated to 100% and exited after we
# gathered the job data, so we won't restart it until we've seen it not
# running and not at 100% for more than five consecutive loops.
if ((not exists $anvil->data->{lost_job_count}{$job_uuid}) or (not defined $anvil->data->{lost_job_count}{$job_uuid}))
{
$anvil->data->{lost_job_count}{$job_uuid} = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }});
}
if ($anvil->data->{lost_job_count}{$job_uuid} > 5)
{
# The previous instance is gone but the job isn't finished. Clear it so it can be restarted below.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0007", variables => {
command => $job_command,
pid => $job_picked_up_by,
percent => $job_progress,
}});
# Clear some variables.
$job_progress = 0;
$job_status = "message_0056";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
job_progress => $job_progress,
job_status => $job_status,
}});
# Clear the job.
$anvil->Job->clear({debug => 2, job_uuid => $job_uuid});
$anvil->data->{lost_job_count}{$job_uuid} = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }});
}
else
{
$anvil->data->{lost_job_count}{$job_uuid}++;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }});
}
}
# Clear the PID
$job_picked_up_by = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_picked_up_by => $job_picked_up_by }});
}
}
# Convert the double-banged strings into a proper message.
my $say_title = $job_title ? $anvil->Words->parse_banged_string({key_string => $job_title}) : "";
my $say_description = $job_description ? $anvil->Words->parse_banged_string({key_string => $job_description}) : "";
my $say_status = $job_status ? $anvil->Words->parse_banged_string({key_string => $job_status}) : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
say_title => $say_title,
say_description => $say_description,
say_status => $say_status,
}});
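# As a rough illustration only (the exact encoding is handled by Words->parse_banged_string()), a
# "double-banged" string pairs a word key with embedded variables, something like:
#
#   "job_0001,!!name!example!!"  -->  the translated 'job_0001' text with 'name' set to "example"
#
# so the say_* values above are human-readable text, while the job_* columns stay as raw keys.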
# Make the status HTML friendly. Strip any embedded HTML then encode the text string.
if ($say_status)
{
my $html_strip = HTML::Strip->new();
$say_status = $html_strip->parse($say_status);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { say_status => $say_status }});
# Now make the resulting text string HTML friendly
my $text_to_html = HTML::FromText->new({
urls => 1,
email => 1,
lines => 1,
});
$say_status = $text_to_html->parse($say_status);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { say_status => $say_status }});
}
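# A hedged sketch of the intent of the two passes above (the exact output depends on the
# HTML::Strip and HTML::FromText modules, so treat this as illustrative, not literal):
#
#   "<b>Wiped /dev/sda</b>\nDone."    # raw status with embedded HTML
#     -> HTML::Strip    : "Wiped /dev/sda\nDone."
#     -> HTML::FromText : "Wiped /dev/sda<br>Done."    # 'lines => 1' preserves the line break
#
# leaving text that is safe to embed in the web interface's status display.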
# Add this to the jobs.json file
my $json_string = to_json ({
job_uuid => $job_uuid,
job_command => $job_command,
job_data => $job_data,
job_picked_up_at => $job_picked_up_at,
job_updated => $job_updated,
job_name => $job_name,
job_progress => $job_progress,
job_title => $say_title,
job_description => $say_description,
job_status => $say_status,
started_seconds_ago => $started_seconds_ago,
updated_seconds_ago => $updated_seconds_ago,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { json_string => $json_string }});
$jobs_file .= $json_string.",\n";
# If the job is done, move on.
next if $job_progress eq "100";
next if $anvil->data->{switches}{'no-start'};
# If we're not starting up, skip jobs flagged 'anvil_startup'; they only run when anvil-daemon starts.
if ((not $startup) && ($job_status eq "anvil_startup"))
{
# Skip this, it will run next time anvil-daemon restarts.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0593", variables => {
command => $job_command,
job_uuid => $job_uuid,
}});
next;
}
# If the job is not running, start it.
if (not $job_picked_up_by)
{
my $command = $job_command." --job-uuid ".$job_uuid;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0210", variables => { command => $command }});
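# For example (program path and UUID hypothetical), the resulting call might look like:
#
#   /usr/sbin/anvil-configure-host --job-uuid 52c569f6-1234-5678-90ab-cdef01234567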
# Have we started this job recently?
if (exists $anvil->data->{jobs}{$job_uuid}{started})
{
my $last_start = time - $anvil->data->{jobs}{$job_uuid}{started};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { last_start => $last_start }});
if ($last_start < 60)
{
# Skip; it was started too recently.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0578", variables => {
command => $command,
last_start => $last_start,
}});
next;
}
}
# Start the job ('--job-uuid' was appended above) as a background process.
($anvil->data->{jobs}{handles}{$job_uuid}, my $return_code) = $anvil->System->call({
background => 1,
stdout_file => "/tmp/anvil.job.".$job_uuid.".stdout",
stderr_file => "/tmp/anvil.job.".$job_uuid.".stderr",
shell_call => $command,
source => $THIS_FILE,
line => __LINE__,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid},
return_code => $return_code,
}});
# Log the PID (the job should update the database).
my $pid = $anvil->data->{jobs}{handles}{$job_uuid}->pid();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pid => $pid }});
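# The handle stored above is the background process object returned by System->call(). A hedged
# sketch of how it is used later, limited to the methods seen in this file (UUID made up):
#
#   my $handle = $anvil->data->{jobs}{handles}{$job_uuid};
#   my $pid    = $handle->pid();            # PID of the background child
#   # ... later, once the child is no longer running:
#   my $rc     = $handle->exit_status();    # exit code of the finished job
#   $handle->cleanup();                     # free the handle's memory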
# Record that we've tried to start this job, so that we don't try to restart it for any reason for at least a minute.
$anvil->data->{jobs}{$job_uuid}{started} = time;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs::${job_uuid}::started" => $anvil->data->{jobs}{$job_uuid}{started} }});
}
}
# Close the jobs file.
$jobs_file =~ s/,\n$/\n/ms;
$jobs_file .= "]}\n";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { jobs_file => $jobs_file }});
# Write the JSON file
my $output_json = $anvil->data->{path}{directories}{html}."/status/jobs.json";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output_json => $output_json }});
$anvil->Storage->write_file({
file => $output_json,
body => $jobs_file,
overwrite => 1,
backup => 0,
mode => "0644",
user => "apache",
group => "apache",
});
return(0);
}
# This calls 'anvil-update-states', which scans the local machine's state (hardware and software) and
# writes it out to an HTML file.
sub update_state_file
{
my ($anvil) = @_;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0480"});
#my $shell_call = $anvil->data->{path}{exe}{'anvil-update-states'}.$anvil->Log->switches;
my $shell_call = $anvil->data->{path}{exe}{'anvil-update-states'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }});
my ($states_output, $return_code) = $anvil->System->call({debug => 3, shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
states_output => $states_output,
return_code => $return_code,
}});
return(0);
}