#!/usr/bin/perl
# 
# This is the master daemon that manages all periodically run processes on Striker dashboards, Anvil! cluster
# nodes and DR hosts.
# 
# Exit codes;
# 0 = Normal exit or md5sum of this program changed and it exited to reload.
# 1 = Not running as root.
# 2 = Unable to connect to any database, even after trying to initialize the local system.
# 
# TODO: 
# - Need to check what kind of machine this is and not prep the database unless it's a dashboard.
# - Add a "running: pending,yes,done,dead" and show an appropriate icon beside jobs
# - Decide if holding before the main loop until 'systemctl is-system-running' returns 'running' is a good
#   idea or not.
# - Write the status of this and the scancore daemon to /etc/anvil/anvil.motd and symlink it to /etc/motd.d/
# - Write a script that runs in crontab at UTC 17:00 that sends an email if Scancore or anvil-daemon are disabled.
# - Examine limits in: https://www.freedesktop.org/software/systemd/man/systemd.exec.html#LimitCPU=
# - Write a background program to scan the BCN and uses OUI data to try and find / auto-configure PDUs and UPSes
# - 
# - Increase DRBD's default timeout
# - Check for and enable persistent journald logging
# 
# NOTE: 
# - For later; 'reboot --force --force' immediately kills the OS, like disabling ACPI on EL6 and hitting the
#   power button. Might be useful in ScanCore down the road.
# 
# Switches:
# 
# --main-loop-only
# 
#   This skips the one-time, start-up tasks and just goes into the main-loop.
# 
# --no-start
# 
#   This will prevent any pending jobs from being picked up and started in this run. Note that other job
#   checks will still happen.
# 
# --refresh-json
# 
#   This just updates the JSON files used by the web interface. It is the same as
#   '--run-once --main-loop-only --no-start'
# 
# --run-once
# 
#   This will tell the program to exit after running the main loop once.
# 
# --startup-only
# 
#   This will tell the program to exit after running the start up tasks, so the main loop won't run.
#
use strict;
use warnings;
use Anvil::Tools;
use Proc::Simple;
#use Time::HiRes qw ( time sleep );
use JSON;
use HTML::Strip;
use HTML::FromText;
use Data::Dumper;
use Text::Diff;

# Work out this program's file name and the directory it was invoked from; both are derived from $0.
my $THIS_FILE         = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD})) {
    $running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

# Prevent a discrepancy between UID/GID and EUID/EGID from throwing an error.
$< = $>;
$( = $);

# NOTE: Setting 'log_level' and 'log_secure' here will get overridden in the main loop. Use the Log methods
#       in the loop as well to override defaults in code.
my $anvil = Anvil::Tools->new();

# Make sure we're running as 'root'.
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0)) {
    # Not root
    print $anvil->Words->string({key => "error_0005"})."\n";
    $anvil->nice_exit({exit_code => 1});
}

# If, for some reason, anvil.conf is lost, create it.
$anvil->System->_check_anvil_conf();

# Connect to the database(s). If we have no connections, we'll proceed anyway as one of the 'run_once' tasks
# is to setup the database server.
$anvil->Database->connect({check_if_configured => 1, check_for_resync => 1});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0132"});

# Without any database connection, what happens next depends on the host type.
if (not $anvil->data->{sys}{database}{connections}) {
    # If this is a dashboard, try to configure and then connect to the local database. If this isn't a
    # dashboard, then just go into a loop waiting for a database to be configured.
    if ($anvil->Get->host_type eq "striker") {
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0201"});
        prep_database($anvil);

        # Try connecting again
        $anvil->refresh();
        $anvil->Database->connect();
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0132"});
        if (not $anvil->data->{sys}{database}{connections}) {
            # Still nothing; log it and exit with code 2.
            $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "error_0003"});
            $anvil->nice_exit({exit_code => 2});
        }
    }
    else {
        # Wait until we have one, checking the network between attempts.
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "error_0075"});
        while (not $anvil->data->{sys}{database}{connections}) {
            sleep 10;

            check_network($anvil);

            $anvil->refresh();
            $anvil->Database->connect({check_if_configured => 1, check_for_resync => 1});
            $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0132"});
            if (not $anvil->data->{sys}{database}{connections}) {
                $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 3, key => "log_0439"});
            }
        }
    }
}

# Read switches
$anvil->data->{switches}{'refresh-json'}   = "";
$anvil->data->{switches}{'run-once'}       = 0;
$anvil->data->{switches}{'main-loop-only'} = 0;
$anvil->data->{switches}{'no-start'}       = 0;
$anvil->data->{switches}{'startup-only'}   = 0;
$anvil->Get->switches;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});

# '--refresh-json' is shorthand for '--run-once --main-loop-only --no-start'.
if ($anvil->data->{switches}{'refresh-json'}) {
    $anvil->data->{switches}{'run-once'}       = 1;
    $anvil->data->{switches}{'main-loop-only'} = 1;
    $anvil->data->{switches}{'no-start'}       = 1;
}

# This is used to track initial checks / repairs of network issues.
$anvil->data->{sys}{network}{initial_checks} = 0;

# There are some things we only want to run on (re)start and don't need to always run.
if (not $anvil->data->{switches}{'main-loop-only'}) {
    run_once($anvil);
}

# Calculate my sum so that we can exit if it changes later.
$anvil->Storage->record_md5sums;

# What time is it, Mr. Fox?
my $now_time = time;

# To avoid multiple dashboards running a network scan and OUI parse, the dashboard peer with the lowest
# host_uuid sets its daily checks to run now, and the other(s) will get a two hour delay.
my $delay = set_delay($anvil);

# Once a minute, we'll check the md5sums and see if we should restart.
# Once a day, we'll refresh an Install Target's RPM repository (has no effect on non-Striker dashboards).
$anvil->data->{timing}{minute_checks}        = 60;
$anvil->data->{timing}{daily_checks}         = 86400;
$anvil->data->{timing}{repo_update_interval} = 86400;
$anvil->data->{timing}{next_minute_check}    = $now_time - 1;
$anvil->data->{timing}{next_daily_check}     = ($now_time + $delay) - 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
    "s1:timing::minute_checks"        => $anvil->data->{timing}{minute_checks},
    "s2:timing::daily_checks"         => $anvil->data->{timing}{daily_checks},
    "s3:timing::repo_update_interval" => $anvil->data->{timing}{repo_update_interval},
    "s4:now_time"                     => $now_time,
    "s5:delay"                        => $delay,
    "s6:timing::next_minute_check"    => $anvil->data->{timing}{next_minute_check},
    "s7:timing::next_daily_check"     => $anvil->data->{timing}{next_daily_check},
}});

# Disconnect. We'll reconnect inside the loop
$anvil->Database->disconnect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0203"});

# This will prevent restarting while jobs are running.
$anvil->data->{sys}{jobs_running} = 0;

# When we periodically check if system files have changed, we'll also ask Database->connect() to check if it
# needs to be configured or updated. This is done periodically as it is expensive to run on every loop.
# NOTE: This is a file-scoped lexical; handle_periodic_tasks() sets it to 1 after the per-minute checks.
my $check_if_database_is_configured = 0;

# These are the things we always want running.
while (1) {
    # Reload defaults, re-read the config and then connect to the database(s)
    $anvil->refresh();

    # If, for some reason, anvil.conf is lost, create it.
    $anvil->System->_check_anvil_conf();

    $anvil->Database->connect({check_if_configured => $check_if_database_is_configured, check_for_resync => 1});
    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0132"});

    # Mark that we don't want to check the database now.
    $check_if_database_is_configured = 0;

    # If this host is mapping the network, we'll skip a lot of stuff. check_if_mapping() clears the flag
    # itself once it has been set for longer than its expiry age.
    $anvil->data->{sys}{mapping_network} = check_if_mapping($anvil);

    if ($anvil->data->{sys}{database}{connections}) {
        # Run the normal tasks
        keep_running($anvil);

        # Handle periodic tasks, unless the network is being mapped.
        if (not $anvil->data->{sys}{mapping_network}) {
            handle_periodic_tasks($anvil);
        }
    }
    else {
        # No databases available, we'll update the state file in case this host is having its
        # network mapped and the interface used to talk to the databases went down. That's all we
        # can do though.
        update_state_file($anvil);
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, key => "log_0202"});
    }

    # Exit if 'run-once' selected.
    if ($anvil->data->{switches}{'run-once'}) {
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "message_0055"});
        $anvil->nice_exit({exit_code => 0});
    }

    # Disconnect from the database(s) and sleep now.
    $anvil->Database->disconnect();
    sleep(2);
}

$anvil->nice_exit({exit_code => 0});


#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################

# Check to see if we're mapping the network on this host.
# Reads the 'config::map_network' variable for this host and records the result in 'sys::mapping_network'.
# Returns 1 while network mapping is active, 0 otherwise (including when no database is connected). When the
# flag is older than the expiry age, or '--clear-mapping' was given, the stored variable is reset to 0.
sub check_if_mapping {
    my ($anvil) = @_;

    # Default to "not mapping"; this is what is returned when there is no database to ask.
    $anvil->data->{sys}{mapping_network} = 0;
    if ($anvil->data->{sys}{database}{connections}) {
        my ($map_network_value, $map_network_uuid, $map_network_mtime, $map_network_modified_date) = $anvil->Database->read_variable({
            debug                 => 3,
            variable_name         => "config::map_network",
            variable_source_table => "hosts",
            variable_source_uuid  => $anvil->data->{sys}{host_uuid},
        });

        # The flag is honoured for a day. The program that set it should clear it when the user is
        # done, so the expiry shouldn't fire in practice.
        my $max_age         = 86400;
        my $map_network_age = 0;
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
            's1:map_network_value'         => $map_network_value,
            's2:map_network_mtime'         => $map_network_mtime,
            's3:map_network_modified_date' => $map_network_modified_date,
            's4:map_network_uuid'          => $map_network_uuid,
        }});

        # The age can only be calculated if the variable exists.
        if ($map_network_uuid) {
            $map_network_age = time - $map_network_mtime;
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { map_network_age => $map_network_age }});
        }

        if ($map_network_value) {
            # Make sure the switch is defined before testing it.
            if (not defined $anvil->data->{switches}{'clear-mapping'}) {
                $anvil->data->{switches}{'clear-mapping'} = "";
            }

            if (($map_network_age < $max_age) and (not $anvil->data->{switches}{'clear-mapping'})) {
                # Still valid; mark it so we only track the network.
                my $say_age = $anvil->Convert->add_commas({number => $max_age});
                my $timeout = $anvil->Convert->add_commas({number => ($max_age - $map_network_age)});
                $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0471", variables => {
                    age     => $say_age,
                    timeout => $timeout,
                }});

                $anvil->data->{sys}{mapping_network} = 1;
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "sys::mapping_network" => $anvil->data->{sys}{mapping_network} }});

                # Close any open ssh connections and drop them from the cache.
                foreach my $cache_key (keys %{$anvil->data->{cache}{ssh_fh}}) {
                    my $handle = $anvil->data->{cache}{ssh_fh}{$cache_key};
                    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
                        ssh_fh_key => $cache_key,
                        ssh_fh     => $handle,
                    }});

                    # Only call disconnect() on values that stringify as Net::OpenSSH objects.
                    if ($handle =~ /^Net::OpenSSH/) {
                        $handle->disconnect();
                    }
                    delete $anvil->data->{cache}{ssh_fh}{$cache_key};
                }
            }
            else {
                # Expired, or the user asked to clear it. Reset the stored value.
                $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0470"});
                $anvil->Database->insert_or_update_variables({
                    debug             => 3,
                    variable_value    => 0,
                    variable_uuid     => $map_network_uuid,
                    update_value_only => 1,
                });
            }
        }
    }

    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "sys::mapping_network" => $anvil->data->{sys}{mapping_network} }});
    return($anvil->data->{sys}{mapping_network});
}

# This decides if the local system will delay daily runs on start-up.
# Returns the number of seconds to delay the first daily run: 0 on non-Striker hosts and on the Striker whose
# host UUID sorts first among the configured databases, 7200 (two hours) on any other Striker.
sub set_delay {
    my ($anvil) = @_;

    my $delay = 7200;
    my $type  = $anvil->Get->host_type();
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { type => $type }});
    if ($type eq "striker") {
        # NOTE: 'keys' returns hash keys in no particular order, so the list is sorted to make "the
        #       first key" mean "the lowest UUID" on every peer. Only that first key is examined.
        foreach my $uuid (sort {$a cmp $b} keys %{$anvil->data->{database}}) {
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
                "sys::host_uuid" => $anvil->data->{sys}{host_uuid},
                uuid             => $uuid,
            }});
            if ($uuid eq $anvil->data->{sys}{host_uuid}) {
                $delay = 0;
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { delay => $delay }});
            }
            last;
        }
    }
    else {
        # Not a dashboard, don't delay
        $delay = 0;
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { delay => $delay }});
    }

    return($delay);
}

# This checks to see if it's time to see if the network is ok and, if the system has been up long enough,
# checks and tries to repair network issues. Always returns 0.
sub check_network {
    my ($anvil) = @_;

    # The network sometimes doesn't come up, but we don't want to try recovering it too soon. As such,
    # we'll start watching the network after the uptime is 2 minutes.
    my $uptime = $anvil->Get->uptime;
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime }});
    if ($uptime > 120) {
        # Check that bonds are up. Degraded bonds will be left alone.
        if (not $anvil->data->{sys}{network}{initial_checks}) {
            # First pass since start-up; make sure NetworkManager is running, then heal everything.
            my $running = $anvil->System->check_daemon({daemon => "NetworkManager"});
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { running => $running }});
            if (not $running) {
                $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "message_0250", variables => { daemon => "NetworkManager" }});
                my $return_code = $anvil->System->start_daemon({daemon => "NetworkManager"});
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { return_code => $return_code }});
            }

            $anvil->Network->check_network({heal => "all"});

            $anvil->data->{sys}{network}{initial_checks} = 1;
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
                "sys::network::initial_checks" => $anvil->data->{sys}{network}{initial_checks},
            }});
        }
        else {
            # Later passes only try to recover interfaces that are down.
            $anvil->Network->check_network({heal => "down_only"});
        }

        check_firewall($anvil);
    }

    return(0);
}

# This handles running tasks that only run on some loops; once-per-minute tasks and once-per-day tasks, as
# scheduled by 'timing::next_minute_check' and 'timing::next_daily_check'. Returns 0. May exit the program
# (exit code 0) if the program's files have changed on disk and no jobs are running.
sub handle_periodic_tasks {
    my ($anvil) = @_;

    my $now_time = time;
    my $type     = $anvil->Get->host_type();
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
        "s1:now_time"                  => $now_time,
        "s2:timing::next_minute_check" => $anvil->data->{timing}{next_minute_check},
        "s3:timing::next_daily_check"  => $anvil->data->{timing}{next_daily_check},
        "s4:type"                      => $type,
    }});

    # Time to run once per minute tasks.
    if ($now_time >= $anvil->data->{timing}{next_minute_check}) {
        # Check the network (which also checks if the firewall needs to be updated).
        check_network($anvil);

        # Check to see if the PXE environment needs to be updated.
        check_install_target($anvil);

        # Check that the users we care about have ssh public keys and they're recorded in ssh_keys.
        $anvil->System->check_ssh_keys({debug => 3});

        $anvil->System->update_hosts({debug => 3});

        # Check if the files on disk have changed. Even if it is time to check, don't if a job is
        # running.
        # NOTE: The "jobs running" flag is 'sys::jobs_running' (that is what the main script
        #       initialises); reading 'timing::jobs_running' here meant the guard never triggered.
        if ((not $anvil->data->{sys}{jobs_running}) && ($anvil->Storage->check_md5sums)) {
            # NOTE: We exit with '0' to prevent systemctl from showing a scary red message.
            $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "message_0014"});
            $anvil->nice_exit({exit_code => 0});
        }

        # Mark that we want to check the database config next time.
        $check_if_database_is_configured = 1;

        # Update the next check time.
        $anvil->data->{timing}{next_minute_check} = $now_time + $anvil->data->{timing}{minute_checks};
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
            "s1:timing::minute_checks"     => $anvil->data->{timing}{minute_checks},
            "s2:timing::next_minute_check" => $anvil->data->{timing}{next_minute_check},
        }});

        # Even when this runs, it should finish in under ten seconds so we don't need to background it.
        my ($parse_output, $return_code) = $anvil->System->call({debug => 3, shell_call => $anvil->data->{path}{exe}{'anvil-parse-fence-agents'}.$anvil->Log->switches, source => $THIS_FILE, line => __LINE__});
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { parse_output => $parse_output }});

        # Scan the local network.
        update_state_file($anvil);

        # Make sure the shared directories exist.
        foreach my $target (sort {$a cmp $b} keys %{$anvil->data->{path}{directories}{shared}}) {
            my $directory = $anvil->data->{path}{directories}{shared}{$target};
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
                target    => $target,
                directory => $directory,
            }});
            if (not -e $directory) {
                my $failed = $anvil->Storage->make_directory({
                    directory => $directory,
                    group     => "apache",
                    user      => "apache",
                    mode      => "0775",
                });
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { failed => $failed }});
                if ($failed) {
                    # Something went wrong.
                    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "log_0254", variables => {
                        directory => $directory,
                    }});
                }
                else {
                    # Success
                    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0255", variables => {
                        directory => $directory,
                    }});
                }
            }
        }

        # Check mail server config.
        my $problem = $anvil->Email->check_config({debug => 3});
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { problem => $problem }});
    }

    # Now check to see if it's time to run daily tasks.
    if ($now_time >= $anvil->data->{timing}{next_daily_check}) {
        ### NOTE: We call it once/day, but this will also trigger on restart of anvil-daemon. As such, we
        ###       don't use '--force' and let striker-manage-install-target skip the repo update if it happened
        ###       recently enough.
        if ($type eq "striker") {
            # Age out old data. This takes up to a minute.
            $anvil->Database->_age_out_data();

            # Archive old data
            $anvil->Database->archive_database();

            # Record a job, don't call it directly. It takes too long to run.
            my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
                file            => $THIS_FILE,
                line            => __LINE__,
                job_command     => $anvil->data->{path}{exe}{'striker-manage-install-target'}." --refresh".$anvil->Log->switches,
                job_data        => "",
                job_name        => "install-target::refresh",
                job_title       => "job_0015",
                job_description => "job_0017",
                job_progress    => 0,
            });
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});

            # Update the OUI data.
            ($job_uuid) = $anvil->Database->insert_or_update_jobs({
                file            => $THIS_FILE,
                line            => __LINE__,
                job_command     => $anvil->data->{path}{exe}{'striker-parse-oui'}.$anvil->Log->switches,
                job_data        => "",
                job_name        => "oui-data::refresh",
                job_title       => "job_0064",
                job_description => "job_0065",
                job_progress    => 0,
            });
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});

            # Scan the networks
            ($job_uuid) = $anvil->Database->insert_or_update_jobs({
                file            => $THIS_FILE,
                line            => __LINE__,
                job_command     => $anvil->data->{path}{exe}{'striker-scan-network'}.$anvil->Log->switches,
                job_data        => "",
                job_name        => "scan-network::refresh",
                job_title       => "job_0066",
                job_description => "job_0067",
                job_progress    => 0,
            });
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
        }

        # Update the next check time.
        $anvil->data->{timing}{next_daily_check} = $now_time + $anvil->data->{timing}{daily_checks};
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
            "s1:timing::daily_checks"     => $anvil->data->{timing}{daily_checks},
            "s2:timing::next_daily_check" => $anvil->data->{timing}{next_daily_check},
        }});
    }

    return(0);
}

# This calls striker-manage-install-target to see if the dhcpd is running or not. If it is or isn't, the config
# variable 'install-target::enabled' is set/updated. On non-Striker hosts, this simply returns without doing
# anything. Always returns 0.
sub check_install_target {
    my ($anvil) = @_;

    my $system_type = $anvil->Get->host_type();
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { system_type => $system_type }});
    if ($system_type ne "striker") {
        # Not a dashboard, nothing to do.
        return(0);
    }

    # 'unavailable' is recorded unless a 'status=<digit>' line says otherwise.
    my $status = "unavailable";
    my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{'striker-manage-install-target'}." --status --check --no-refresh".$anvil->Log->switches});
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output => $output }});
    foreach my $line (split/\n/, $output) {
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { line => $line }});

        if ($line =~ /status=(\d)/) {
            my $digit = $1;
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { digit => $digit }});

            # 0 == disabled, 1 == enabled, any other digit leaves the status as 'unavailable'.
            if ($digit == 0) {
                $status = "disabled";
            }
            elsif ($digit == 1) {
                $status = "enabled";
            }
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { status => $status }});
            last;
        }
    }

    # Record the status
    $anvil->Database->insert_or_update_variables({
        variable_name         => "install-target::enabled",
        variable_source_uuid  => $anvil->Get->host_uuid,
        variable_source_table => "hosts",
        variable_value        => $status,
        variable_default      => "unavailable",
        variable_description  => "striker_0110",
        variable_section      => "system",
    });

    return(0);
}

# These are tools that don't need to constantly run. They'll typically run when the server starts up or the
# daemon is restarted or reloaded. Exits the program (code 0) when '--startup-only' is set, else returns 0.
sub run_once {
    my ($anvil) = @_;

    # Check that the database is ready.
    prep_database($anvil);

    # Check to see if we need to do boot-time tasks. We only run these if we've just booted
    boot_time_tasks($anvil);

    # Check the ssh stuff.
    # NOTE: This actually runs again in the minutes tasks, but needs to run on boot as well.
    $anvil->System->check_ssh_keys();

    # Check setuid wrappers
    check_setuid_wrappers($anvil);

    # Check journald is configured for persistent storage.
    check_journald($anvil);

    if ($anvil->data->{switches}{'startup-only'}) {
        $anvil->nice_exit({exit_code => 0});
    }

    return(0);
}

# Makes sure journald.conf has 'Storage=persistent' and that the journald directory exists. If the config
# needed to change, it is rewritten and systemd-journald is restarted. Always returns 0.
sub check_journald {
    my ($anvil) = @_;

    # Check the journald.conf to ensure logging is configured to be persistent.
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 'path::configs::journald.conf' => $anvil->data->{path}{configs}{'journald.conf'} }});
    my $peristent_seen    = 0;
    my $change_storage    = 0;
    my $old_journald_conf = $anvil->Storage->read_file({file => $anvil->data->{path}{configs}{'journald.conf'}});
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { old_journald_conf => $old_journald_conf }});
    foreach my $line (split/\n/, $old_journald_conf) {
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { line => $line }});
        if ($line =~ /^Storage=(.*)$/) {
            my $value = $1;
            if ($value eq "persistent") {
                $peristent_seen = 1;
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { peristent_seen => $peristent_seen }});
            }
            else {
                # An active 'Storage=' line with some other value needs to be replaced.
                $change_storage = 1;
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { change_storage => $change_storage }});
            }
        }
    }

    # Make sure the journald directory exists.
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 'path::directories::journald' => $anvil->data->{path}{directories}{journald} }});
    if (not -d $anvil->data->{path}{directories}{journald}) {
        $anvil->Storage->make_directory({
            debug     => 2,
            directory => $anvil->data->{path}{directories}{journald},
        });
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0248", variables => { directory => $anvil->data->{path}{directories}{journald} }});
    }

    # Make sure the journald is configured for persistent storage.
    if (not $peristent_seen) {
        my $storage_added     = 0;
        my $new_journald_conf = "";
        foreach my $line (split/\n/, $old_journald_conf) {
            # Replace active, non-persistent 'Storage=' lines; only the first becomes the new
            # setting, any others are dropped.
            if (($line =~ /^Storage=/) && ($change_storage)) {
                if (not $storage_added) {
                    $storage_added     =  1;
                    $new_journald_conf .= "Storage=persistent\n";
                }
                next;
            }
            # Otherwise, insert the setting just above the first commented-out '#Storage=' line.
            if (($line =~ /^#Storage=/) && (not $storage_added)) {
                $storage_added     =  1;
                $new_journald_conf .= "Storage=persistent\n";
            }
            $new_journald_conf .= $line."\n";
        }
        # No suitable place was found, so append it.
        if (not $storage_added) {
            $new_journald_conf .= "Storage=persistent\n";
        }
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_journald_conf => $new_journald_conf }});

        $anvil->Storage->write_file({
            debug     => 3,
            secure    => 0,
            file      => $anvil->data->{path}{configs}{'journald.conf'},
            body      => $new_journald_conf,
            mode      => "0644",
            overwrite => 1,
        });
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0013", variables => { file => $anvil->data->{path}{configs}{'journald.conf'} }});

        # Restart the journald service.
        my $shell_call = $anvil->data->{path}{exe}{systemctl}." restart systemd-journald.service";
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
        my ($output, $return_code) = $anvil->System->call({debug => 3, shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
            output      => $output,
            return_code => $return_code,
        }});
    }

    return(0);
}

# This creates, as needed, the setuid wrappers used by apache to make certain system calls. Only runs on
# Striker dashboards. Always returns 0.
sub check_setuid_wrappers {
    my ($anvil) = @_;

    my $host_type = $anvil->Get->host_type();
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_type => $host_type }});
    if ($host_type ne "striker") {
        # Not a dashboard, setuid scripts aren't needed.
        return(0);
    }

    # Does the call_striker-get-peer-data wrapper exist yet?
    if (-e $anvil->data->{path}{exe}{'call_striker-get-peer-data'}) {
        # Exists, skipping.
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0436", variables => { wrapper => $anvil->data->{path}{exe}{'call_striker-get-peer-data'} }});
    }
    else {
        # What is the admin user and group ID?
        my $admin_uid = getpwnam('admin');
        my $admin_gid = getgrnam('admin');
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
            admin_uid => $admin_uid,
            admin_gid => $admin_gid,
        }});

        # Without an 'admin' user and group there is nothing to build the wrapper for.
        # NOTE: This must be 'return', not 'next'; we're not inside a loop here, so 'next' would
        #       either die or jump out to a loop in some caller.
        return(0) if not $admin_uid;
        return(0) if not $admin_gid;

        # Write the body out
        my $call_striker_get_peer_data_body =  "#define REAL_PATH \"".$anvil->data->{path}{exe}{'striker-get-peer-data'}."\"\n";
           $call_striker_get_peer_data_body .= "main(ac, av)\n";
           $call_striker_get_peer_data_body .= "char **av;\n";
           $call_striker_get_peer_data_body .= "{\n";
           $call_striker_get_peer_data_body .= " setuid(".$admin_uid.");\n";
           $call_striker_get_peer_data_body .= " setgid(".$admin_gid.");\n";
           $call_striker_get_peer_data_body .= " execv(REAL_PATH, av);\n";
           $call_striker_get_peer_data_body .= "}\n";
        my $error = $anvil->Storage->write_file({
            debug     => 3,
            file      => $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c",
            body      => $call_striker_get_peer_data_body,
            mode      => '644',
            overwrite => 1,
        });
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { error => $error }});

        # If it wrote out, compile it.
        if (not -e $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c") {
            # Failed to write.
            $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "error_0071", variables => { file => $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c" }});
        }
        else {
            # Compile it
            my ($output, $return_code) = $anvil->System->call({
                debug      => 3,
                shell_call => $anvil->data->{path}{exe}{gcc}." -o ".$anvil->data->{path}{exe}{'call_striker-get-peer-data'}." ".$anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c",
            });
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
                output      => $output,
                return_code => $return_code,
            }});

            # If it compiled, setuid it.
            if (not -e $anvil->data->{path}{exe}{'call_striker-get-peer-data'}) {
                # Something went wrong compiling it.
                $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "error_0072", variables => { file => $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c" }});
            }
            else {
                $anvil->Storage->change_owner({
                    debug => 3,
                    path  => $anvil->data->{path}{exe}{'call_striker-get-peer-data'},
                    user  => 'root',
                    group => 'root',
                });
                $anvil->Storage->change_mode({
                    debug => 3,
                    path  => $anvil->data->{path}{exe}{'call_striker-get-peer-data'},
                    mode  => '4755',
                });
            }
        }
    }

    return(0);
}

# Configure/update the firewall by calling 'anvil-manage-firewall', but only once the system reports itself
# as configured. Always returns 0.
sub check_firewall {
    my ($anvil) = @_;

    # Don't call this if we're not configured yet.
    my $configured = $anvil->System->check_if_configured({debug => 3});
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { configured => $configured }});

    # Check the firewall needs to be updated.
    if ($configured) {
        my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{'anvil-manage-firewall'}.$anvil->Log->switches});
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output => $output, return_code => $return_code }});
    }

    return(0);
}

# This handles tasks that need to run on boot (if any). Always returns 0.
sub boot_time_tasks {
    my ($anvil) = @_;

    # The uptime is compared below against how long ago the 'reboot::needed' flag was last changed.
    my $uptime = $anvil->Get->uptime;
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { uptime => $uptime }});

    # Now find out if a reboot is listed as needed and when it was last changed.
    my $reboot_needed       = 0;
    my $changed_seconds_ago = 0;
    my $query = "
SELECT 
    variable_value, 
    (SELECT extract(epoch from now()) - extract(epoch from modified_date)) AS changed_seconds_ago 
FROM 
    variables 
WHERE 
    variable_source_table = 'hosts' 
AND 
    variable_source_uuid  = ".$anvil->Database->quote($anvil->Get->host_uuid)." 
AND 
    variable_name         = 'reboot::needed' 
;";
    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
    my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
    my $count   = @{$results};
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
        results => $results,
        count   => $count,
    }});
    if ($count) {
        $reboot_needed       =  $results->[0]->[0];
        $changed_seconds_ago =  $results->[0]->[1];
        # Drop the fractional seconds.
        $changed_seconds_ago =~ s/^(\d+)\..*$/$1/;
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
            reboot_needed       => $reboot_needed,
            changed_seconds_ago => $changed_seconds_ago,
        }});
    }

    # If a reboot is needed, see if the uptime is less than the time since the reboot needed flag was
    # set. If the uptime is less, then the system rebooted since it was requested so clear it. h/t to
    # Lisa Seelye (@thedoh) for this idea!
    my $difference = ($changed_seconds_ago - $uptime);
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
        "s1:reboot_needed"       => $reboot_needed,
        "s2:changed_seconds_ago" => $changed_seconds_ago,
        "s3:uptime"              => $uptime,
        "s4:difference"          => $difference,
    }});
    if ($reboot_needed) {
        if ($uptime < $changed_seconds_ago) {
            # Clear the reboot request.
            $reboot_needed = $anvil->System->reboot_needed({debug => 2, set => 0});
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }});

            # Check to see if there was a reboot job in progress. If so, finish it off.
            my $job_uuid = $anvil->Job->get_job_uuid({debug => 2, program => "anvil-manage-power"});
            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});

            if ($job_uuid) {
                # Update the percentage to '100' and then clear the old PID.
                my $date_time = $anvil->Get->date_and_time();
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { date_time => $date_time }});

                $anvil->Job->update_progress({
                    progress     => 100,
                    message      => "message_0064,!!date_and_time!".$date_time."!!",
                    job_uuid     => $job_uuid,
                    picked_up_by => 0,
                });
            }
        }
    }
    else {
        # Update our status
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0572"});

        $anvil->Database->get_hosts({debug => 2});
        my $host_uuid = $anvil->Get->host_uuid({debug => 2});
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_uuid => $host_uuid }});
        $anvil->Database->insert_or_update_hosts({
            debug       => 2,
            host_ipmi   => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_ipmi},
            host_key    => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_key},
            host_name   => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_name},
            host_type   => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type},
            host_uuid   => $host_uuid,
            host_status => "online",
        });

        # Make sure our stop reason is cleared.
        my $variable_uuid = $anvil->Database->insert_or_update_variables({
            variable_name         => 'system::stop_reason',
            variable_value        => '',
            variable_default      => '',
            variable_description  => 'striker_0279',
            variable_section      => 'system',
            variable_source_uuid  => $host_uuid,
            variable_source_table => 'hosts',
        });
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { variable_uuid => $variable_uuid }});
    }

    # Make sure /etc/hosts is updated.
    $anvil->System->update_hosts();

    # This handles weird bits for things like bug work-arounds.
    handle_special_cases($anvil);

    # Now look for jobs that have a job status of 'anvil_startup'
    run_jobs($anvil, 1);

    # Check the firewall needs to be updated.
    check_firewall($anvil);

    # If we're a striker, check apache
    my $host_type = $anvil->Get->host_type;
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }});
    if ($host_type eq "striker") {
        $anvil->Striker->check_httpd_conf({debug => 2});
    }

    return(0);
}

# This handles weird bits for things like bug work-arounds. Always returns 0.
sub handle_special_cases {
    my ($anvil) = @_;

    # RHBZ #1961562 - https://bugzilla.redhat.com/show_bug.cgi?id=1961562#c16
    my $host_type = $anvil->Get->host_type();
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }});
    if ($host_type ne "striker") {
        # We're a node or DR host. We need to touch this file (an empty file is written if missing).
        my $work_around_file = "/etc/qemu/firmware/50-edk2-ovmf-cc.json";
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { work_around_file => $work_around_file }});
        if (not -e $work_around_file) {
            $anvil->Storage->write_file({
                debug     => 2,
                file      => $work_around_file,
                body      => "",
                overwrite => 0,
                backup    => 0,
                mode      => "0644",
                user      => "root",
                group     => "root",
            });
        }
    }

    return(0);
}

# Configure the local database, if needed.
sub prep_database {
    my ($anvil) = @_;

    # Only run this if we're a dashboard.
my $host_type = $anvil->Get->host_type(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }}); if ($host_type eq "striker") { my ($database_output, $return_code) = $anvil->System->call({debug => 2, shell_call => $anvil->data->{path}{exe}{'striker-prep-database'}.$anvil->Log->switches, source => $THIS_FILE, line => __LINE__ }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { database_output => $database_output, return_code => $return_code, }}); } return(0); } # These are tools that need to keep running. sub keep_running { my ($anvil) = @_; # Check for jobs that were running and now exited. if ((not $anvil->data->{sys}{mapping_network}) && (exists $anvil->data->{processes})) { foreach my $job_uuid (%{$anvil->data->{jobs}{handles}}) { # If it's not a handle, delete it. my $running = $anvil->data->{jobs}{handles}{$job_uuid}->poll(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid}, running => $running, }}); # If it's not running, update the table to clear the 'job_picked_up_by' column. if (not $running) { my $exit_status = $anvil->data->{jobs}{handles}{$job_uuid}->exit_status(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { job_uuid => $job_uuid, exit_status => $exit_status, }}); # Free up memory $anvil->data->{jobs}{handles}{$job_uuid}->cleanup(); $anvil->Job->clear({job_uuid => $job_uuid}); } } } # If we're configured, write out the status JSON file. If we're not configured, Update hardware state files. my $configured = $anvil->System->check_if_configured; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { configured => $configured }}); if ((not $anvil->data->{sys}{mapping_network}) && ($configured)) { # Write out state information for all known Anvil! 
systems and the information from # unconfigured nods and DR hosts, using just database data (hence, fast enough to run # constantly). $anvil->System->generate_state_json({debug => 3}); } else { # Run this to monitor the network in real time. update_state_file($anvil); } # Run any pending jobs by calling 'anvil-jobs' with the 'job_uuid' as a background process. run_jobs($anvil, 0) if not $anvil->data->{sys}{mapping_network}; return(0); } # This will check for any jobs that aren't at 100%. For each found, if 'picked_up_by' is set, a check is made # to see if the PID is still alive. If it isn't, or if 'picked_up_by' is not set, the appropriate tool is # invoked to handle it. sub run_jobs { my ($anvil, $startup) = @_; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup => $startup }}); # This will be set to 1 if any jobs are not complete, preventing a restart of the daemon if it's # changed on disk. $anvil->data->{sys}{jobs_running} = 0; # We'll also update the jobs.json file. my $jobs_file = "{\"jobs\":[\n"; # Get a list of pending or incomplete jobs. my $ended_within = $startup ? 1 : 300; my $return = $anvil->Database->get_jobs({ended_within => $ended_within}); my $count = @{$return}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { ended_within => $ended_within, 'return' => $return, count => $count, }}); foreach my $hash_ref (@{$return}) { my $job_uuid = $hash_ref->{job_uuid}; my $job_command = $hash_ref->{job_command}; my $job_data = $hash_ref->{job_data}; my $job_picked_up_by = $hash_ref->{job_picked_up_by}; my $job_picked_up_at = $hash_ref->{job_picked_up_at}; my $job_updated = $hash_ref->{job_updated}; my $job_name = $hash_ref->{job_name}; my $job_progress = $hash_ref->{job_progress}; my $job_title = $hash_ref->{job_title}; my $job_description = $hash_ref->{job_description}; my $job_status = $hash_ref->{job_status}; my $started_seconds_ago = $job_picked_up_at ? 
(time - $job_picked_up_at) : 0; my $updated_seconds_ago = $job_updated ? (time - $job_updated) : 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid, job_command => $job_command, job_data => $job_data, job_picked_up_by => $job_picked_up_by, job_picked_up_at => $job_picked_up_at, job_updated => $job_updated, job_name => $job_name, job_progress => $job_progress, job_title => $job_title, job_description => $job_description, job_status => $job_status, started_seconds_ago => $started_seconds_ago, updated_seconds_ago => $updated_seconds_ago, }}); # If this is a start-up call, only start jobs whose status is 'anvil_startup'. if (($startup) && ($job_status ne "anvil_startup")) { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0639", variables => { job_uuid => $job_uuid, job_command => $job_command, }}); next; } if ($job_progress ne "100") { $anvil->data->{sys}{jobs_running} = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "sys::jobs_running" => $anvil->data->{sys}{jobs_running} }}); } # See if the job was picked up by a now-dead instance. if ($job_picked_up_by) { # Check if the PID is still active. $anvil->System->pids({ignore_me => 1}); ### TODO: Add a check to verify the job isn't hung. # Skip if this job is in progress. if (not exists $anvil->data->{pids}{$job_picked_up_by}) { # If the job is done, just clear the 'job_picked_up_by' and be done. if ($job_progress ne "100") { # It's possible that the job updated to 100% and exited after we # gathered the job data, so we won't restart until we've seen it not # running and not at 100% after 5 loops. 
if ((not exists $anvil->data->{lost_job_count}{$job_uuid}) or (not defined $anvil->data->{lost_job_count}{$job_uuid})) { $anvil->data->{lost_job_count}{$job_uuid} = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }}); } if ($anvil->data->{lost_job_count}{$job_uuid} > 5) { # The previous job is gone, but the job isn't finished. Start it again. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0007", variables => { command => $job_command, pid => $job_picked_up_by, percent => $job_progress, }}); # Clear some variables. $job_progress = 0; $job_status = "message_0056"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_progress => $job_progress, job_status => $job_status, }}); # Clear the job. $anvil->Job->clear({debug => 2, job_uuid => $job_uuid}); $anvil->data->{lost_job_count}{$job_uuid} = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }}); } else { $anvil->data->{lost_job_count}{$job_uuid}++; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }}); } } # Clear the PID $job_picked_up_by = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_picked_up_by => $job_picked_up_by }}); } } # Convert the double-banged strings into a proper message. my $say_title = $job_title ? $anvil->Words->parse_banged_string({key_string => $job_title}) : ""; my $say_description = $job_description ? $anvil->Words->parse_banged_string({key_string => $job_description}) : ""; my $say_status = $job_status ? 
$anvil->Words->parse_banged_string({key_string => $job_status}) : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { job_title => $job_title, say_description => $say_description, say_status => $say_status, }}); # Make the status HTML friendly. Strip any embedded HTML then encode the text string. if ($say_status) { my $html_strip = HTML::Strip->new(); $say_status = $html_strip->parse($say_status); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { say_status => $say_status }}); # Now make the resulting text string HTML friendly my $text_to_html = HTML::FromText->new({ urls => 1, email => 1, lines => 1, }); $say_status = $text_to_html->parse($say_status); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { say_status => $say_status }}); } # Add this to the jobs.json file my $json_string = to_json ({ job_uuid => $job_uuid, job_command => $job_command, job_data => $job_data, job_picked_up_at => $job_picked_up_at, job_updated => $job_updated, job_name => $job_name, job_progress => $job_progress, job_title => $say_title, job_description => $say_description, job_status => $say_status, started_seconds_ago => $started_seconds_ago, updated_seconds_ago => $updated_seconds_ago, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { json_string => $json_string }}); $jobs_file .= $json_string.",\n"; # If the job is done, move on. next if $job_progress eq "100"; next if $anvil->data->{switches}{'no-start'}; # If 'startup' is set, we only care if 'job_status' is 'anvil_startup' if ((not $startup) && ($say_status eq "anvil_startup")) { # Skip this, it will run next time anvil-daemon restarts. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0593", variables => { command => $job_command, job_uuid => $job_uuid, }}); next; } # If the job is not running, start it. 
if (not $job_picked_up_by) { my $command = $job_command." --job-uuid ".$job_uuid; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0210", variables => { command => $command }}); # Have we started this job recently? if (exists $anvil->data->{jobs}{$job_uuid}{started}) { my $last_start = time - $anvil->data->{jobs}{$job_uuid}{started}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { last_start => $last_start }}); if ($last_start < 60) { # Skip, Started too recently. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0578", variables => { command => $command, last_start => $last_start, }}); next; } } # Start the job, appending '--job-uuid' to the command. ($anvil->data->{jobs}{handles}{$job_uuid}, my $return_code) = $anvil->System->call({ background => 1, stdout_file => "/tmp/anvil.job.".$job_uuid.".stdout", stderr_file => "/tmp/anvil.job.".$job_uuid.".stderr", shell_call => $command, source => $THIS_FILE, line => __LINE__, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid}, return_code => $return_code, }}); # Log the PID (the job should update the database). my $pid = $anvil->data->{jobs}{handles}{$job_uuid}->pid(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pid => $pid }}); # Record that we've tried to start this job, so that we don't try to restart it for any reason for at least a minute. $anvil->data->{jobs}{$job_uuid}{started} = time; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'jobs::$job_uuid::started' => $anvil->data->{jobs}{$job_uuid}{started} }}); } } # Close the jobs file. 
$jobs_file =~ s/,\n$/\n/ms; $jobs_file .= "]}\n"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { jobs_file => $jobs_file }}); # Write the JSON file my $output_json = $anvil->data->{path}{directories}{html}."/status/jobs.json"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output_xml => $output_json }}); $anvil->Storage->write_file({ file => $output_json, body => $jobs_file, overwrite => 1, backup => 0, mode => "0644", user => "apache", group => "apache", }); return(0); } # This calls 'anvil-update-states' which will scan the local machine's state (hardware and software) and # record write it out to an HTML file sub update_state_file { my ($anvil) = @_; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0480"}); #my $shell_call = $anvil->data->{path}{exe}{'anvil-update-states'}.$anvil->Log->switches; my $shell_call = $anvil->data->{path}{exe}{'anvil-update-states'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my ($states_output, $return_code) = $anvil->System->call({debug => 3, shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { states_output => $states_output, return_code => $return_code, }}); return(0); }