#!/usr/bin/perl
# 
# This is the master daemon that manages all periodically run processes on Striker dashboards, Anvil! cluster
# nodes and DR hosts.
# 
# Exit codes;
# 0 = Normal exit or md5sum of this program changed and it exited to reload.
# 1 = Not running as root.
# 2 = Unable to connect to any database, even after trying to initialize the local system.
# 
# TODO:
# - Need to check what kind of machine this is and not prep the database unless it's a dashboard.
# - Add a "running: pending,yes,done,dead" and show an appropriate icon beside jobs
# - Decide if holding before the main loop until 'systemctl is-system-running' returns 'running' is a good
#   idea or not.
# - Write the status of this and the scancore daemon to /etc/anvil/anvil.motd and symlink it to /etc/motd.d/
# - Write a script that runs in crontab at UTC 17:00 that sends an email if Scancore or anvil-daemon are disabled.
# - Examine limits in: https://www.freedesktop.org/software/systemd/man/systemd.exec.html#LimitCPU=
# - Write a background program to scan the BCN and use OUI data to try and find / auto-configure PDUs and UPSes
# - 
# - Increase DRBD's default timeout
# - Check for and enable persistent journald logging
# 
# NOTE:
# - For later; 'reboot --force --force' immediately kills the OS, like disabling ACPI on EL6 and hitting the
#   power button. Might be useful in ScanCore down the road.
# 
# Switches:
# 
# --main-loop-only
# 
#   This skips the one-time, start-up tasks and just goes into the main loop.
# 
# --no-start
# 
#   This will prevent any pending jobs from being picked up and started in this run. Note that other job checks will still happen.
# 
# --refresh-json
# 
#   This just updates the JSON files used by the web interface. It is the same as '--run-once --main-loop-only --no-start'
# 
# --run-once
# 
#   This will tell the program to exit after running the main loop once.
# 
# --startup-only
# 
#   This will tell the program to exit after running the start-up tasks, so the main loop won't run.
# 
use strict;
use warnings;
use Anvil::Tools;
use Proc::Simple;
#use Time::HiRes qw ( time sleep );
use JSON;
use HTML::Strip;
use HTML::FromText;
use Data::Dumper;
use Text::Diff;

# Work out this program's file name and the directory it was launched from, so relative invocations still
# resolve correctly.
my $THIS_FILE         = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

# Prevent a discrepancy between UID/GID and EUID/EGID from throwing an error.
$< = $>;
$( = $);

# NOTE: Setting 'log_level' and 'log_secure' here will get overridden in the main loop. Use the Log methods
#       in the loop as well to override defaults in code.
my $anvil = Anvil::Tools->new();

# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0))
{
	# Not root
	print $anvil->Words->string({key => "error_0005"})."\n";
	$anvil->nice_exit({exit_code => 1});
}

# If, for some reason, anvil.conf is lost, create it.
$anvil->System->_check_anvil_conf();

# Connect to the database(s). If we have no connections, we'll proceed anyway as one of the 'run_once' tasks
# is to setup the database server.
$anvil->Database->connect({
	check_if_configured => 1, 
	check_for_resync    => 1, 
});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0132"});

# If I have no databases, sleep for a second and then exit (systemd will restart us).
if (not $anvil->data->{sys}{database}{connections})
{
	# If this is a dashboard, try to configure and then connect to the local database. If this isn't a
	# dashboard, then just go into a loop waiting for a database to be configured.
	if ($anvil->Get->host_type eq "striker")
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0201"});
		prep_database($anvil);
		
		# Try connecting again
		$anvil->Database->connect({check_if_configured => 1, check_for_resync => 1});
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0132"});
		if (not $anvil->data->{sys}{database}{connections})
		{
			# Still nothing, sleep and exit. (exit code 2 == no DB available; systemd restarts us)
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "error_0003"});
			$anvil->nice_exit({exit_code => 2});
		}
	}
	else
	{
		# Not a dashboard; wait until we have a database, retrying every ten seconds.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "error_0075"});
		until($anvil->data->{sys}{database}{connections})
		{
			sleep 10;
			# The network may be the reason we can't reach a database, so try to repair it first.
			check_network($anvil);
			$anvil->refresh();
			$anvil->Database->connect({check_if_configured => 1, check_for_resync => 1});
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0132"});
			if (not $anvil->data->{sys}{database}{connections})
			{
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 3, key => "log_0439"});
			}
		}
	}
}

# Read switches (defaults first, then let Get->switches override from the command line).
$anvil->data->{switches}{'refresh-json'}   = "";
$anvil->data->{switches}{'run-once'}       = 0;
$anvil->data->{switches}{'main-loop-only'} = 0;
$anvil->data->{switches}{'no-start'}       = 0;
$anvil->data->{switches}{'startup-only'}   = 0;
$anvil->Get->switches;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});

# '--refresh-json' is shorthand for the three switches below.
if ($anvil->data->{switches}{'refresh-json'})
{
	$anvil->data->{switches}{'run-once'}       = 1;
	$anvil->data->{switches}{'main-loop-only'} = 1;
	$anvil->data->{switches}{'no-start'}       = 1;
}

# This is used to track initial checks / repairs of network issues.
$anvil->data->{sys}{network}{initial_checks} = 0;

# There are some things we only want to run on (re)start and don't need to always run.
run_once($anvil) if not $anvil->data->{switches}{'main-loop-only'};

# Calculate my sum so that we can exit if it changes later.
$anvil->Storage->record_md5sums;

# What time is it, Mr. Fox?
my $now_time = time;

# To avoid multiple dashboards running a network scan and OUI parse, the dashboard peer with the lowest
# host_uuid sets it's daily checks to run now, and the other(s) will get a two hour's delay.
my $delay = set_delay($anvil);

# Once a minute, we'll check the md5sums and see if we should restart.
# Once a day, we'll refresh an Install Target's RPM repository (has no effect on non-Striker dashboards).
# All intervals are in seconds; 'next_*' values are epoch timestamps seeded to fire on the first loop pass.
$anvil->data->{timing}{minute_checks}         = 60;
$anvil->data->{timing}{ten_minute_checks}     = 600;
$anvil->data->{timing}{daily_checks}          = 86400;
$anvil->data->{timing}{repo_update_interval}  = 86400;
$anvil->data->{timing}{next_minute_check}     = $now_time - 1;
$anvil->data->{timing}{next_ten_minute_check} = $now_time - 1;
$anvil->data->{timing}{next_daily_check}      = ($now_time + $delay) - 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
	"s1:timing::minute_checks"         => $anvil->data->{timing}{minute_checks}, 
	"s2:timing::ten_minute_checks"     => $anvil->data->{timing}{ten_minute_checks}, 
	"s3:timing::daily_checks"          => $anvil->data->{timing}{daily_checks}, 
	"s4:timing::repo_update_interval"  => $anvil->data->{timing}{repo_update_interval}, 
	"s5:now_time"                      => $now_time, 
	"s6:delay"                         => $delay, 
	"s7:timing::next_minute_check"     => $anvil->data->{timing}{next_minute_check}, 
	"s8:timing::next_ten_minute_check" => $anvil->data->{timing}{next_ten_minute_check}, 
	"s9:timing::next_daily_check"      => $anvil->data->{timing}{next_daily_check}, 
}});

# Disconnect. We'll reconnect inside the loop
$anvil->Database->disconnect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0203"});

# This will prevent restarting while jobs are running.
$anvil->data->{sys}{jobs_running} = 0;

# When we periodically check if system files have changed, we'll also ask Database>connect() to check if it
# needs to be configured or updated. This is done periodically as it is expensive to run on every loop.
my $check_if_database_is_configured = 0;

# These are the things we always want running. Main daemon loop: reconnect, run tasks, disconnect, sleep.
while(1)
{
	# Reload defaults, re-read the config and then connect to the database(s)
	$anvil->refresh();
	
	# If, for some reason, anvil.conf is lost, create it.
	$anvil->System->_check_anvil_conf();
	
	$anvil->Database->connect({check_if_configured => $check_if_database_is_configured, check_for_resync => 1});
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0132"});
	
	# Mark that we don't want to check the database now. (It gets re-armed by the minute-check in
	# handle_periodic_tasks(), which shares this file-scoped lexical.)
	$check_if_database_is_configured = 0;
	
	# If this host is mapping the network, we'll skip a lot of stuff. If set for over an hour, we'll
	# clear it.
	$anvil->data->{sys}{mapping_network} = check_if_mapping($anvil);
	
	if ($anvil->data->{sys}{database}{connections})
	{
		# Run the normal tasks
		keep_running($anvil);
		
		# Handle periodic tasks
		handle_periodic_tasks($anvil) if not $anvil->data->{sys}{mapping_network};
	}
	else
	{
		# No databases available, we'll update the state file in case this host is having it's
		# network mapped and the interface used to talk to the databases went down. That's all we
		# can do though.
		update_state_file($anvil);
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, key => "log_0202"});
	}
	
	# Exit if 'run-once' selected.
	if ($anvil->data->{switches}{'run-once'})
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "message_0055"});
		$anvil->nice_exit({exit_code => 0});
	}
	
	# Check how much RAM we're using; this may exit the daemon (systemd restarts us).
	check_ram($anvil);
	
	# Disconnect from the database(s) and sleep now.
	$anvil->Database->disconnect({debug => 2});
	sleep(2);
}
$anvil->nice_exit({exit_code => 0});


#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################

# If we're using too much ram, send an alert and exit. Takes the Anvil::Tools handle; always returns 0 when
# it returns at all (a real RAM problem ends in nice_exit).
sub check_ram
{
	my ($anvil) = @_;
	
	# Problem 0 == ok, 1 == too much ram used, 2 == no pid found
	my ($problem, $ram_used) = $anvil->System->check_ram_use({program => $THIS_FILE});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		problem  => $problem, 
		ram_used => $anvil->Convert->add_commas({number => $ram_used})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}).")", 
	}});
	
	if ($problem)
	{
		# See if an [anvil-sync-shared' job is running and, if so, don't exit. The file copy is
		# counted and not an actual problem.
		$anvil->Database->get_jobs({debug => 2});
		foreach my $job_uuid (keys %{$anvil->data->{jobs}{running}})
		{
			my $job_command  = $anvil->data->{jobs}{running}{$job_uuid}{job_command};
			my $job_progress = $anvil->data->{jobs}{running}{$job_uuid}{job_progress};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				job_command  => $job_command, 
				job_progress => $job_progress, 
			}});
			if (($job_progress != 100) && ($job_command =~ /anvil-sync-shared/))
			{
				# Don't abort.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0139", variables => { 
					job_command    => $job_command, 
					ram_used       => $anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}), 
					ram_used_bytes => $anvil->Convert->add_commas({number => $ram_used}), 
				}});
				return(0);
			}
		}
		
		# Send an alert and exit.
		$anvil->Alert->register({alert_level => "notice", message => "error_0357", variables => { 
			program        => $THIS_FILE, 
			ram_used       => $anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}), 
			ram_used_bytes => $anvil->Convert->add_commas({number => $ram_used}), 
		}, set_by => $THIS_FILE, sort_position => 0});
		$anvil->Email->send_alerts();
		
		# Log the same
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0357", variables => { 
			program        => $THIS_FILE, 
			ram_used       => $anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}), 
			ram_used_bytes => $anvil->Convert->add_commas({number => $ram_used}), 
		}});
		
		# Exit with RC0 so that systemctl restarts
		$anvil->nice_exit({exit_code => 0});
	}
	
	return(0);
}

# Check to see if we're mapping the network on this host. Reads the 'config::map_network' variable for this
# host from the database; returns 1 if mapping is active (and closes cached SSH handles), 0 otherwise.
sub check_if_mapping
{
	my ($anvil) = @_;
	
	$anvil->data->{sys}{mapping_network} = 0;
	if ($anvil->data->{sys}{database}{connections})
	{
		my ($map_network_value, $map_network_uuid, $map_network_mtime, $map_network_modified_date) = $anvil->Database->read_variable({
			debug                 => 3, 
			variable_name         => "config::map_network", 
			variable_source_table => "hosts", 
			variable_source_uuid  => $anvil->data->{sys}{host_uuid}, 
		});
		
		# We'll run for a day (should be cancelled by the program when the user's done, so this
		# shouldn't fire in practice).
		my $expire_age      = 86400;
		my $map_network_age = 0;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			's1:map_network_value'         => $map_network_value, 
			's2:map_network_mtime'         => $map_network_mtime, 
			's3:map_network_modified_date' => $map_network_modified_date, 
			's4:map_network_uuid'          => $map_network_uuid, 
		}});
		if ($map_network_uuid)
		{
			$map_network_age = time - $map_network_mtime;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { map_network_age => $map_network_age }});
		}
		
		if ($map_network_value)
		{
			# How long ago was it set?
			$anvil->data->{switches}{'clear-mapping'} = "" if not defined $anvil->data->{switches}{'clear-mapping'};
			if (($map_network_age >= $expire_age) or ($anvil->data->{switches}{'clear-mapping'}))
			{
				# Clear it.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0470"});
				$anvil->Database->insert_or_update_variables({
					debug             => 3, 
					variable_value    => 0, 
					variable_uuid     => $map_network_uuid, 
					update_value_only => 1, 
				});
			}
			else
			{
				# Mark it so we only track the network.
				my $say_age = $anvil->Convert->add_commas({number => $expire_age});
				my $timeout = $anvil->Convert->add_commas({number => ($expire_age - $map_network_age)});
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0471", variables => { 
					age     => $say_age, 
					timeout => $timeout, 
				}});
				$anvil->data->{sys}{mapping_network} = 1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "sys::mapping_network" => $anvil->data->{sys}{mapping_network} }});
				
				# Close any open ssh connections.
				foreach my $ssh_fh_key (keys %{$anvil->data->{cache}{ssh_fh}})
				{
					my $ssh_fh = $anvil->data->{cache}{ssh_fh}{$ssh_fh_key};
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
						ssh_fh_key => $ssh_fh_key, 
						ssh_fh     => $ssh_fh, 
					}});
					if ($ssh_fh =~ /^Net::OpenSSH/)
					{
						$ssh_fh->disconnect();
					}
					delete $anvil->data->{cache}{ssh_fh}{$ssh_fh_key};
				}
			}
		}
	}
	
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "sys::mapping_network" => $anvil->data->{sys}{mapping_network} }});
	return($anvil->data->{sys}{mapping_network});
}

# This decides if the local system will delay daily runs on start-up.
# This decides if the local system will delay daily runs on start-up. Returns the delay in seconds (7200 for
# all but one Striker peer, 0 otherwise).
sub set_delay
{
	my ($anvil) = @_;
	
	my $delay = 7200;
	my $type  = $anvil->Get->host_type();
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { type => $type }});
	if ($type eq "striker")
	{
		### NOTE(review): 'keys' returns hash keys in an unpredictable order and the unconditional
		###               'last' below exits after the first key, so which Striker peer gets
		###               delay=0 looks effectively arbitrary. The caller's comment says the
		###               *lowest* host_uuid should run now — a 'sort' over the keys appears to be
		###               intended here. TODO: confirm and fix.
		foreach my $uuid (keys %{$anvil->data->{database}})
		{
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
				"sys::host_uuid" => $anvil->data->{sys}{host_uuid}, 
				uuid             => $uuid, 
			}});
			if ($uuid eq $anvil->data->{sys}{host_uuid})
			{
				$delay = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { delay => $delay }});
			}
			last;
		}
	}
	else
	{
		# Not a dashboard, don't delay
		$delay = 0;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { delay => $delay }});
	}
	
	return($delay);
}

# This checks to see if it's time to see if the network is ok and, if the system has been up long enough,
# checks and tries to repair network issues. Always returns 0.
sub check_network
{
	my ($anvil) = @_;
	
	# The network sometimes doesn't come up, but we don't want to try recovering it too soon. As such,
	# we'll start watching the network after the uptime is 2 minutes.
	my $uptime = $anvil->Get->uptime;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { uptime => $uptime }});
	if ($uptime > 120)
	{
		# Check that bonds are up. Degraded bonds will be left alone.
		if (not $anvil->data->{sys}{network}{initial_checks})
		{
			# First pass only: make sure NetworkManager is running, starting it if needed.
			my $running = $anvil->System->check_daemon({daemon => "NetworkManager"});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { running => $running }});
			if (not $running)
			{
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0250", variables => { daemon => "NetworkManager" }});
				my $return_code = $anvil->System->start_daemon({daemon => "NetworkManager"});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { return_code => $return_code }});
			}
			#$anvil->Network->check_network({heal => "all"});
			$anvil->data->{sys}{network}{initial_checks} = 1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
				"sys::network::initial_checks" => $anvil->data->{sys}{network}{initial_checks}, 
			}});
		}
		else
		{
			### NOTE: This is constantly trying to "fix" healthy bonds, without a known way to
			###       trigger to debug. As such, disabling for now.
			#$anvil->Network->check_network({heal => "down_only"});
		}
		check_firewall($anvil);
	}
	
	# Check that all users can ping.
	if (1)
	{
		my $shell_call = $anvil->data->{path}{exe}{sysctl}." net.ipv4.ping_group_range";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
		
		my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output }});
		if ($output =~ /net.ipv4.ping_group_range = (\d+)\t(\d+)$/)
		{
			my $lowest_uid  = $1;
			my $highest_uid = $2;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				lowest_uid  => $lowest_uid, 
				highest_uid => $highest_uid, 
			}});
			
			if ($highest_uid < 2000)
			{
				# Tell the user we're enabling ping for all users.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0683"});
				my $shell_call = $anvil->data->{path}{exe}{sysctl}." -w net.ipv4.ping_group_range=\"0 2147483647\"";
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { shell_call => $shell_call }});
				my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { output => $output }});
			}
		}
	}
	
	return(0);
}

# This handles running tasks that only run on some loops: once-per-minute, once-per-ten-minutes and
# once-per-day groups, gated by the 'timing::next_*' timestamps seeded at start-up. Always returns 0.
sub handle_periodic_tasks
{
	my ($anvil) = @_;
	
	my $now_time = time;
	my $type     = $anvil->Get->host_type();
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
		"s1:now_time"                      => $now_time, 
		"s2:timing::next_minute_check"     => $anvil->data->{timing}{next_minute_check}, 
		"s3:timing::next_ten_minute_check" => $anvil->data->{timing}{next_ten_minute_check}, 
		"s4:timing::next_daily_check"      => $anvil->data->{timing}{next_daily_check}, 
		"s5:type"                          => $type, 
	}});
	
	# Time to run once per minute tasks.
	if ($now_time >= $anvil->data->{timing}{next_minute_check})
	{
		# Check the firewall needs to be updated.
		check_network($anvil);
		
		# Check to see if the PXE environment needs to be updated.
		check_install_target($anvil);
		
		# Check that the users we care about have ssh public keys and they're recorded in ssh_keys.
		$anvil->System->check_ssh_keys({debug => 2});
		$anvil->System->update_hosts({debug => 3});
		
		# Check if the files on disk have changed. Even if it is time to check, don't if a job is
		# running.
		if ((not $anvil->data->{timing}{jobs_running}) && ($anvil->Storage->check_md5sums))
		{
			# NOTE: We exit with '0' to prevent systemctl from showing a scary red message.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "message_0014"});
			$anvil->nice_exit({exit_code => 0});
		}
		
		# Mark that we want to check the database config next time. (File-scoped lexical shared
		# with the main loop.)
		$check_if_database_is_configured = 1;
		
		# Update the next check time.
		$anvil->data->{timing}{next_minute_check} = $now_time + $anvil->data->{timing}{minute_checks};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
			"s1:timing::minute_checks"     => $anvil->data->{timing}{minute_checks}, 
			"s2:timing::next_minute_check" => $anvil->data->{timing}{next_minute_check}, 
		}});
		
		# Even when this runs, it should finish in under ten seconds so we don't need to background it.
		my ($parse_output, $return_code) = $anvil->System->call({debug => 3, shell_call => $anvil->data->{path}{exe}{'anvil-parse-fence-agents'}.$anvil->Log->switches, source => $THIS_FILE, line => __LINE__});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { parse_output => $parse_output }});
		
		# Scan the local network.
		update_state_file($anvil);
		
		# Make sure the shared directories exist.
		foreach my $target (sort {$a cmp $b} keys %{$anvil->data->{path}{directories}{shared}})
		{
			my $directory = $anvil->data->{path}{directories}{shared}{$target};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
				target    => $target, 
				directory => $directory, 
			}});
			if (not -e $anvil->data->{path}{directories}{shared}{$target})
			{
				my $failed = $anvil->Storage->make_directory({
					directory => $directory, 
					group     => "apache", 
					user      => "apache", 
					mode      => "0775", 
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { failed => $failed }});
				if ($failed)
				{
					# Something went wrong.
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "log_0254", variables => { 
						directory => $directory, 
					}});
				}
				else
				{
					# Success
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0255", variables => { 
						directory => $directory, 
					}});
				}
			}
		}
		
		# Check mail server config.
		my $problem = $anvil->Email->check_config({debug => 3});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { problem => $problem }});
		
		# Check if any files have been uploaded to /mnt/shared/incoming on striker
		check_incoming($anvil);
		
		# Check for stale db_in_use states.
		check_db_in_use_states($anvil);
	}
	
	# Now check to see if it's time to run less frequent tasks.
	if ($now_time >= $anvil->data->{timing}{next_ten_minute_check})
	{
		my $host_type = $anvil->Get->host_type();
		my $host_uuid = $anvil->Get->host_uuid();
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			host_type => $host_type, 
			host_uuid => $host_uuid, 
		}});
		
		# Are we a Striker and is there two or more connections? If so, evaluate if we should shut
		# down our database.
		if ($host_type eq "striker")
		{
			if ($anvil->data->{sys}{database}{connections} > 1)
			{
				# Make sure that all active databases are in the host's table. If they're
				# not, we're still early in setup. To do this, we create an array of hosts
				# and then query both/all DBs to ensure they all have all hosts.
				my $all_in_hosts = 1;
				my $db_hosts     = [];
				foreach my $uuid (sort {$a cmp $b} keys %{$anvil->data->{cache}{database_handle}})
				{
					push @{$db_hosts}, $uuid;
				}
				foreach my $db_uuid (@{$db_hosts})
				{
					my $query = "SELECT COUNT(*) FROM hosts WHERE host_uuid = ".$anvil->Database->quote($db_uuid).";";
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						's1:db_uuid' => $db_uuid, 
						's2:query'   => $query, 
					}});
					foreach my $host_uuid (@{$db_hosts})
					{
						my $count = $anvil->Database->query({debug => 2, uuid => $db_uuid, query => $query, source => $THIS_FILE, line => __LINE__})->[0]->[0];
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							's1:host_uuid' => $host_uuid, 
							's2:db_uuid'   => $db_uuid, 
							's2:count'     => $count, 
						}});
						if (not $count)
						{
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "warning_0143", variables => { 
								db_uuid   => $db_uuid, 
								host_uuid => $host_uuid, 
							}});
							$all_in_hosts = 0;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_in_hosts => $all_in_hosts }});
						}
					}
				}
				
				# Sort by UUID, skip the first, and see if we're one of the others.
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_in_hosts => $all_in_hosts }});
				if ($all_in_hosts)
				{
					my $first_uuid = "";
					foreach my $uuid (sort {$a cmp $b} keys %{$anvil->data->{cache}{database_handle}})
					{
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { uuid => $uuid }});
						if (not $first_uuid)
						{
							$first_uuid = $uuid;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { first_uuid => $first_uuid }});
							
							# Skip the first UUID so it doesn't evaluate for
							# shutdown.
							next;
						}
						elsif ($uuid eq $host_uuid)
						{
							# This won't return until we're down.
							$anvil->Database->shutdown({debug => 2});
						}
					}
				}
			}
			
			# If we're the active database, dump our database out and rsync it to our peers.
			my $peers       = keys %{$anvil->data->{database}};
			my $connections = $anvil->data->{sys}{database}{connections};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				peers       => $peers, 
				connections => $connections, 
			}});
			if (exists $anvil->data->{cache}{database_handle}{$host_uuid})
			{
				# Verify that the database is up.
				my $running = $anvil->System->check_daemon({daemon => $anvil->data->{sys}{daemon}{postgresql}});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { running => $running }});
				if ($running)
				{
					# Backup our DB.
					my $dump_file = $anvil->Database->backup_database({debug => 2});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { dump_file => $dump_file }});
					
					# Now rsync it to our peer(s)
					foreach my $this_host_uuid (sort {$a cmp $b} keys %{$anvil->data->{database}})
					{
						next if $this_host_uuid eq $host_uuid;
						my $destination = "root\@".$anvil->data->{database}{$this_host_uuid}{host}.":".$anvil->data->{path}{directories}{pgsql}."/";
						my $password    = $anvil->data->{database}{$this_host_uuid}{password};
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							this_host_uuid => $this_host_uuid, 
							destination    => $destination, 
							password       => $anvil->Log->is_secure($password), 
						}});
						
						my $start_time = time;
						my $failed     = $anvil->Storage->rsync({
							debug       => 3, 
							destination => $destination, 
							password    => $password, 
							source      => $dump_file, 
						});
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }});
						
						my $rsync_time  = time - $start_time;
						my $size        = $anvil->Convert->bytes_to_human_readable({'bytes' => $anvil->data->{file_stat}{$dump_file}{size}});
						my $size_bytes  = $anvil->Convert->add_commas({number => $anvil->data->{file_stat}{$dump_file}{size}});
						my $target_name = $anvil->Get->host_name_from_uuid({debug => 3, host_uuid => $this_host_uuid});
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0658", variables => { 
							file       => $dump_file, 
							host_name  => $target_name, 
							took       => $rsync_time, 
							size       => $size, 
							size_bytes => $size_bytes, 
						}});
					}
				}
			}
		}
		
		# Update the next check time.
		$anvil->data->{timing}{next_ten_minute_check} = $now_time + $anvil->data->{timing}{ten_minute_checks};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			"s1:timing::ten_minute_checks"     => $anvil->data->{timing}{ten_minute_checks}, 
			"s2:timing::next_ten_minute_check" => $anvil->data->{timing}{next_ten_minute_check}, 
		}});
	}
	
	# Now check to see if it's time to run daily tasks.
	if ($now_time >= $anvil->data->{timing}{next_daily_check})
	{
		### NOTE: We call it once/day, but this will also trigger on restart of anvil-daemon. As such, we
		###       don't use '--force' and let striker-manage-install-target skip the repo update if it happened
		###       recently enough.
		if ($type eq "striker")
		{
			# Age out old data. This takes up to a minute.
			$anvil->Database->_age_out_data();
			
			# Archive old data
			$anvil->Database->archive_database();
			
			# Record a job, don't call it directly. It takes too long to run.
			my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
				file            => $THIS_FILE, 
				line            => __LINE__, 
				job_command     => $anvil->data->{path}{exe}{'striker-manage-install-target'}." --refresh".$anvil->Log->switches, 
				job_data        => "", 
				job_name        => "install-target::refresh", 
				job_title       => "job_0015", 
				job_description => "job_0017", 
				job_progress    => 0, 
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
			
			# Update the OUI data.
			($job_uuid) = $anvil->Database->insert_or_update_jobs({
				file            => $THIS_FILE, 
				line            => __LINE__, 
				job_command     => $anvil->data->{path}{exe}{'striker-parse-oui'}.$anvil->Log->switches, 
				job_data        => "", 
				job_name        => "oui-data::refresh", 
				job_title       => "job_0064", 
				job_description => "job_0065", 
				job_progress    => 0, 
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
			
			# Scan the networks
			($job_uuid) = $anvil->Database->insert_or_update_jobs({
				file            => $THIS_FILE, 
				line            => __LINE__, 
				job_command     => $anvil->data->{path}{exe}{'striker-scan-network'}.$anvil->Log->switches, 
				job_data        => "", 
				job_name        => "scan-network::refresh", 
				job_title       => "job_0066", 
				job_description => "job_0067", 
				job_progress    => 0, 
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
		}
		
		# Update the next check time.
		$anvil->data->{timing}{next_daily_check} = $now_time + $anvil->data->{timing}{daily_checks};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
			"s1:timing::daily_checks"     => $anvil->data->{timing}{daily_checks}, 
			"s2:timing::next_daily_check" => $anvil->data->{timing}{next_daily_check}, 
		}});
	}
	
	return(0);
}

### NOTE: This logic plays out in a slightly different way in Database->shutdown().
# Check for stale db_in_use states. Deletes 'db_in_use::<this_host>::<pid>' state rows whose PID is no
# longer alive on this host. Always returns 0.
sub check_db_in_use_states
{
	my ($anvil) = @_;
	
	# We only reap db_in_use entries for us. System->pids() populates $anvil->data->{pids} with the
	# currently running process IDs, checked below.
	$anvil->System->pids();
	
	# Quote and then strip the quotes so the UUID is safe to embed in the LIKE pattern below.
	my $host_uuid = $anvil->Database->quote($anvil->Get->host_uuid);
	   $host_uuid =~ s/^'(.*)'$/$1/;
	my $query     = "
SELECT 
    state_uuid, 
    state_name, 
    state_note 
FROM 
    states 
WHERE 
    state_name 
LIKE 
    'db_in_use::".$host_uuid."::%' 
;";
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
	
	my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
	my $count   = @{$results};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		results => $results, 
		count   => $count, 
	}});
	if ($count)
	{
		foreach my $row (@{$results})
		{
			my $state_uuid = $row->[0];
			my $state_name = $row->[1];
			my $state_note = $row->[2];
			my $state_pid  = ($state_name =~ /db_in_use::.*?::(.*)$/)[0];
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:state_uuid' => $state_uuid, 
				's2:state_name' => $state_name, 
				's3:state_note' => $state_note, 
				's4:state_pid'  => $state_pid, 
			}});
			
			if (not exists $anvil->data->{pids}{$state_pid})
			{
				# Reap the 'db_is_use'.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0140", variables => { pid => $state_pid }});
				my $query = "DELETE FROM states WHERE state_uuid = ".$anvil->Database->quote($state_uuid).";";
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
				$anvil->Database->write({debug => 2, query => $query, source => $THIS_FILE, line => __LINE__});
			}
			
			### TODO: What are the chances of a PID being reused in the minute between
			###       the program's death and us detecting it? Should we filter the
			###       'pids::::command' value against our programs and scan agents?
		}
	}
	
	return(0);
}

# On dashboards, this checks to see if any files are in /mnt/shared/incoming and, if so, that they've been processed.
# On Strikers, look for files in /mnt/shared/incoming that are not yet registered in the 'files' table and
# queue an anvil-sync-shared job for each. No-op on non-Striker hosts. Always returns 0.
sub check_incoming
{
	my ($anvil) = @_;
	
	my $system_type = $anvil->Get->host_type();
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { system_type => $system_type }});
	if ($system_type eq "striker")
	{
		# Look for files in /mnt/shared/incoming that are not yet in the database.
		my $directory = $anvil->data->{path}{directories}{shared}{incoming};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { directory => $directory }});
		# NOTE(review): the line below duplicates the log call above; looks like a copy/paste
		#               left-over. Also, opendir's return value isn't checked — if the directory
		#               is missing, the readdir loop silently does nothing. TODO: confirm intent.
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { directory => $directory }});
		local(*DIRECTORY);
		opendir(DIRECTORY, $directory);
		while(my $file = readdir(DIRECTORY))
		{
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { file => $file }});
			next if $file eq ".";
			next if $file eq "..";
			next if $file =~ /^\./; # This is files being rsync'ed still
			my $full_path = $directory."/".$file;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { full_path => $full_path }});
			
			# Skip anything that is not a file.
			next if not -f $full_path;
			
			# Is this file already in the DB?
			my $query = "SELECT file_uuid FROM files WHERE file_name = ".$anvil->Database->quote($file).";";
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }});
			
			my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
			my $count   = @{$results};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				results => $results, 
				count   => $count, 
			}});
			if (not $count)
			{
				# Add it to the database.
				my $size           = (stat($full_path))[7];
				my $say_size_human = $anvil->Convert->bytes_to_human_readable({'bytes' => $size});
				my $say_size_comma = $anvil->Convert->add_commas({number => $size});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					size           => $size, 
					say_size_human => $say_size_human, 
					say_size_comma => $say_size_comma, 
				}});
				
				# Register a job to call anvil-sync-shared
				my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
					file            => $THIS_FILE, 
					line            => __LINE__, 
					job_command     => $anvil->data->{path}{exe}{'anvil-sync-shared'}, 
					job_data        => "file=".$full_path, 
					job_name        => "storage::move_incoming", 
					job_title       => "job_0132", 
					job_description => "job_0133", 
					job_progress    => 0, 
					job_host_uuid   => $anvil->data->{sys}{host_uuid}, 
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
			}
		}
		closedir(DIRECTORY);
	}
	
	return(0);
}

# This calls striker-manage-install-target to see if the dhcpd is running or not. If it is or isn't, the config
# variable 'install-target::enabled' is set/updated. On non-Striker hosts, this simply returns without doing
# anything. Always returns 0.
sub check_install_target
{
	my ($anvil) = @_;
	
	my $system_type = $anvil->Get->host_type();
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { system_type => $system_type }});
	if ($system_type ne "striker")
	{
		# Not a dashboard, nothing to do.
		return(0);
	}
	
	# Parse the 'status=<0|1>' line out of the helper's output; anything else leaves 'unavailable'.
	my $status = "unavailable";
	my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{'striker-manage-install-target'}." --status --check --no-refresh".$anvil->Log->switches});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output => $output }});
	foreach my $line (split/\n/, $output)
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { line => $line }});
		if ($line =~ /status=(\d)/)
		{
			my $digit = $1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { digit => $digit }});
			if ($digit == 0)
			{
				$status = "disabled";
			}
			elsif ($digit == 1)
			{
				$status = "enabled";
			}
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { status => $status }});
			last;
		}
	}
	
	# Record the status
	$anvil->Database->insert_or_update_variables({
		variable_name         => "install-target::enabled", 
		variable_source_uuid  => $anvil->Get->host_uuid, 
		variable_source_table => "hosts", 
		variable_value        => $status, 
		variable_default      => "unavailable", 
		variable_description  => "striker_0110", 
		variable_section      => "system", 
	});
	
	return(0);
}

# These are tools that don't need to constantly run. They'll typically run when the server starts up or the
# daemon is restarted or reloaded. May exit the process if '--startup-only' was given; otherwise returns 0.
sub run_once
{
	my ($anvil) = @_;
	
	# Check that the database is ready.
	prep_database($anvil);
	
	# Check to see if we need to do boot-time tasks. We only run these if we've just booted
	boot_time_tasks($anvil);
	
	# Check the ssh stuff.
	# NOTE: This actually runs again in the minutes tasks, but needs to run on boot as well.
	$anvil->System->check_ssh_keys({debug => 2});
	
	# Check setuid wrappers
	check_setuid_wrappers($anvil);
	
	# Check journald is configured for persistent storage.
	check_journald($anvil);
	
	if ($anvil->data->{switches}{'startup-only'})
	{
		$anvil->nice_exit({exit_code => 0});
	}
	
	return(0);
}

sub check_journald
{
	my ($anvil) = @_;
	
	# Check the journald.conf to ensure logging is configured to be persistent.
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 'path::configs::journald.conf' => $anvil->data->{path}{configs}{'journald.conf'} }}); my $peristent_seen = 0; my $change_storage = 0; my $old_journald_conf = $anvil->Storage->read_file({file => $anvil->data->{path}{configs}{'journald.conf'}}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { old_journald_conf => $old_journald_conf }}); foreach my $line (split/\n/, $old_journald_conf) { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { line => $line }}); if ($line =~ /^Storage=(.*)$/) { my $value = $1; if ($value eq "persistent") { $peristent_seen = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { peristent_seen => $peristent_seen }}); } else { $change_storage = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { change_storage => $change_storage }}); } } } # Make sure the journald directory $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 'path::directories::journald' => $anvil->data->{path}{directories}{journald} }}); if (not -d $anvil->data->{path}{directories}{journald}) { $anvil->Storage->make_directory({ debug => 2, directory => $anvil->data->{path}{directories}{journald}, }); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0248", variables => { directory => $anvil->data->{path}{directories}{journald} }}); } # Make sure the journald is configured for persistent storage. 
if (not $peristent_seen) { my $storage_added = 0; my $new_journald_conf = ""; foreach my $line (split/\n/, $old_journald_conf) { if (($line =~ /^Storage=/) && ($change_storage)) { if (not $storage_added) { $storage_added = 1; $new_journald_conf .= "Storage=persistent\n"; } next; } if (($line =~ /^#Storage=/) && (not $storage_added)) { $storage_added = 1; $new_journald_conf .= "Storage=persistent\n"; } $new_journald_conf .= $line."\n"; } if (not $storage_added) { $new_journald_conf .= "Storage=persistent\n"; } $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_journald_conf => $new_journald_conf }}); $anvil->Storage->write_file({ debug => 3, secure => 0, file => $anvil->data->{path}{configs}{'journald.conf'}, body => $new_journald_conf, mode => "0644", overwrite => 1, }); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0013", variables => { file => $anvil->data->{path}{configs}{'journald.conf'} }}); # Restart the journald service. my $shell_call = $anvil->data->{path}{exe}{systemctl}." restart systemd-journald.service"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my ($output, $return_code) = $anvil->System->call({debug => 3, shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code, }}); } return(0); } # This creates, as needed, the setuid wrappers used by apache to make certain system calls. sub check_setuid_wrappers { my ($anvil) = @_; my $host_type = $anvil->Get->host_type(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_type => $host_type }}); if ($host_type ne "striker") { # Not a dashboard, setuid scripts aren't needed. return(0); } # Does the call_striker-get-peer-data wrapper exist yet? 
if (-e $anvil->data->{path}{exe}{'call_striker-get-peer-data'}) { # Exists, skipping. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0436", variables => { wrapper => $anvil->data->{path}{exe}{'call_striker-get-peer-data'} }}); } else { # What is the admin user and group ID? my $admin_uid = getpwnam('admin'); my $admin_gid = getgrnam('admin'); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { admin_uid => $admin_uid, admin_gid => $admin_gid, }}); next if not $admin_uid; next if not $admin_gid; # Write the body out my $call_striker_get_peer_data_body = "#define REAL_PATH \"".$anvil->data->{path}{exe}{'striker-get-peer-data'}."\"\n"; $call_striker_get_peer_data_body .= "main(ac, av)\n"; $call_striker_get_peer_data_body .= "char **av;\n"; $call_striker_get_peer_data_body .= "{\n"; $call_striker_get_peer_data_body .= " setuid(".$admin_uid.");\n"; $call_striker_get_peer_data_body .= " setgid(".$admin_gid.");\n"; $call_striker_get_peer_data_body .= " execv(REAL_PATH, av);\n"; $call_striker_get_peer_data_body .= "}\n"; my $error = $anvil->Storage->write_file({ debug => 3, file => $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c", body => $call_striker_get_peer_data_body, mode => '644', overwrite => 1, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { error => $error }}); # If it wrote out, compile it. if (not -e $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c") { # Failed to write. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "error_0071", variables => { file => $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c" }}); } else { # Compile it my ($output, $return_code) = $anvil->System->call({ debug => 3, shell_call => $anvil->data->{path}{exe}{gcc}." -o ".$anvil->data->{path}{exe}{'call_striker-get-peer-data'}." 
".$anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c", }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output => $output, return_code => $return_code, }}); # If it compiled, setuid it. if (not -e $anvil->data->{path}{exe}{'call_striker-get-peer-data'}) { # Something went wrong compiling it. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "error_0072", variables => { file => $anvil->data->{path}{exe}{'call_striker-get-peer-data'}.".c" }}); } else { $anvil->Storage->change_owner({ debug => 3, path => $anvil->data->{path}{exe}{'call_striker-get-peer-data'}, user => 'root', group => 'root', }); $anvil->Storage->change_mode({ debug => 3, path => $anvil->data->{path}{exe}{'call_striker-get-peer-data'}, mode => '4755', }); } } } return(0); } # Configure/update the firewall. sub check_firewall { my ($anvil) = @_; # Don't call this if we're not configured yet. my $configured = $anvil->System->check_if_configured({debug => 3}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { configured => $configured }}); # Check the firewall needs to be updated. if ($configured) { my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{'anvil-manage-firewall'}.$anvil->Log->switches}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output => $output, return_code => $return_code }}); } return(0); } # This handles tasks that need to run on boot (if any) sub boot_time_tasks { my ($anvil) = @_; # If the uptime is less than ten minutes, clear the reboot flag. my $uptime = $anvil->Get->uptime; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime }}); # Now find out if a reboot is listed as needed and when it was last changed. 
my $reboot_needed = 0; my $changed_seconds_ago = 0; my $query = " SELECT variable_value, (SELECT extract(epoch from now()) - extract(epoch from modified_date)) AS changed_seconds_ago FROM variables WHERE variable_source_table = 'hosts' AND variable_source_uuid = ".$anvil->Database->quote($anvil->Get->host_uuid)." AND variable_name = 'reboot::needed' ;"; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0124", variables => { query => $query }}); my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__}); my $count = @{$results}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { results => $results, count => $count, }}); if ($count) { $reboot_needed = $results->[0]->[0]; $changed_seconds_ago = $results->[0]->[1]; $changed_seconds_ago =~ s/^(\d+)\..*$/$1/; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed, changed_seconds_ago => $changed_seconds_ago, }}); } ### TODO: This shouldn't be needed anymore. anvil-manage-power doesn't set the progress to '50' prior ### to reboot anymore. # If a reboot is needed, see if the uptime is less than the time since the reboot needed flag was # set. If the uptime is less, then the system rebooted since it was requested so clear it. h/t to # Lisa Seelye (@thedoh) for this idea! my $difference = ($changed_seconds_ago - $uptime); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "s1:reboot_needed" => $reboot_needed, "s2:changed_seconds_ago" => $changed_seconds_ago, "s3:uptime" => $uptime, "s4:difference" => $difference, }}); if ($reboot_needed) { if ($uptime < $changed_seconds_ago) { # Clear the reboot request. 
$reboot_needed = $anvil->System->reboot_needed({debug => 2, set => 0}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }}); # Check to see if there was a reboot job in progress. If so, finish it off. my $job_uuid = $anvil->Job->get_job_uuid({ debug => 2, program => "anvil-manage-power", incomplete => 1, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); if ($job_uuid) { # Update the percentage to '100' and then clear the old PID. my $date_time = $anvil->Get->date_and_time(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { date_time => $date_time }}); $anvil->Job->update_progress({ progress => 100, message => "message_0064,!!date_and_time!".$date_time."!!", job_uuid => $job_uuid, picked_up_by => 0, }); } } } else { # Update our status $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0572"}); $anvil->Database->get_hosts({debug => 2}); my $host_uuid = $anvil->Get->host_uuid({debug => 2}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_uuid => $host_uuid }}); $anvil->Database->insert_or_update_hosts({ debug => 2, host_ipmi => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_ipmi}, host_key => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_key}, host_name => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_name}, host_type => $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}, host_uuid => $host_uuid, host_status => "online", }); # Make sure our stop reason is cleared. 
my $variable_uuid = $anvil->Database->insert_or_update_variables({ variable_name => 'system::stop_reason', variable_value => '', variable_default => '', variable_description => 'striker_0279', variable_section => 'system', variable_source_uuid => $host_uuid, variable_source_table => 'hosts', }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { variable_uuid => $variable_uuid }}); } # Make sure /etc/hosts is updated. $anvil->System->update_hosts(); # This handles weird bits for things like bug work-arounds. handle_special_cases($anvil); # Now look for jobs that have a job status of 'anvil_startup' run_jobs($anvil, 1); # Check the firewall needs to be updated. check_firewall($anvil); # If we're a striker, check apache my $host_type = $anvil->Get->host_type; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }}); if ($host_type eq "striker") { $anvil->Striker->check_httpd_conf({debug => 2}); } return(0); } # This handles weird bits for things like bug work-arounds. sub handle_special_cases { my ($anvil) = @_; my $host_type = $anvil->Get->host_type(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }}); if ($host_type ne "striker") { ### TODO: Test that this is fixed. The bug is now ERRATA # RHBZ #1961562 - https://bugzilla.redhat.com/show_bug.cgi?id=1961562#c16 # We're a node or DR host. We need to touch this file. my $work_around_file = "/etc/qemu/firmware/50-edk2-ovmf-cc.json"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { work_around_file => $work_around_file }}); if (not -e $work_around_file) { $anvil->Storage->write_file({ debug => 2, file => $work_around_file, body => "", overwrite => 0, backup => 0, mode => "0644", user => "root", group => "root", }); } # Make sure DRBD compiled after a kernel upgrade. 
$anvil->DRBD->_initialize_kmod({debug => 2}); } return(0); } # Configure the local database, if needed. sub prep_database { my ($anvil) = @_; # If there's a backup file, we're configured and possibly just off. my $prep_database = 1; foreach my $uuid (keys %{$anvil->data->{database}}) { my $dump_file = $anvil->data->{path}{directories}{pgsql}."/anvil_db_dump.".$uuid.".sql"; $dump_file =~ s/\/\//\//g; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { dump_file => $dump_file }}); if (-e $dump_file) { # No need to prepare. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0665", variables => { file => $dump_file }}); $prep_database = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { prep_database => $prep_database }}); } } # Only run this if we're a dashboard. my $host_type = $anvil->Get->host_type(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }}); if ($host_type eq "striker") { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { prep_database => $prep_database, "sys::database::connections" => $anvil->data->{sys}{database}{connections}, }}); if ($prep_database) { ### NOTE: This failed once, in case / until it happens again, we'll force log level 2 and secure logging. my $shell_call = $anvil->data->{path}{exe}{'striker-prep-database'}." -vv --log-secure"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my ($database_output, $return_code) = $anvil->System->call({debug => 2, shell_call => $shell_call, source => $THIS_FILE, line => __LINE__ }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { database_output => $database_output, return_code => $return_code, }}); } elsif (not $anvil->data->{sys}{database}{connections}) { # Start the daemon locally, if needed. 
my $running = $anvil->System->check_daemon({daemon => "postgresql"}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { running => $running }}); if ($running == 2) { # Not installed, nothing to do. } elsif (not $running) { # Start it. my $return_code = $anvil->System->start_daemon({daemon => "postgresql"}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { return_code => $return_code }}); } } } return(0); } # These are tools that need to keep running. sub keep_running { my ($anvil) = @_; # Check for jobs that were running and now exited. if ((not $anvil->data->{sys}{mapping_network}) && (exists $anvil->data->{processes})) { foreach my $job_uuid (%{$anvil->data->{jobs}{handles}}) { # If it's not a handle, delete it. my $running = $anvil->data->{jobs}{handles}{$job_uuid}->poll(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid}, running => $running, }}); # If it's not running, update the table to clear the 'job_picked_up_by' column. if (not $running) { my $exit_status = $anvil->data->{jobs}{handles}{$job_uuid}->exit_status(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { job_uuid => $job_uuid, exit_status => $exit_status, }}); # Free up memory $anvil->data->{jobs}{handles}{$job_uuid}->cleanup(); $anvil->Job->clear({job_uuid => $job_uuid}); } } } # If we're configured, write out the status JSON file. If we're not configured, Update hardware state files. my $configured = $anvil->System->check_if_configured; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { configured => $configured }}); if ((not $anvil->data->{sys}{mapping_network}) && ($configured)) { # Write out state information for all known Anvil! 
systems and the information from # unconfigured nods and DR hosts, using just database data (hence, fast enough to run # constantly). $anvil->System->generate_state_json({debug => 3}); } else { # Run this to monitor the network in real time. update_state_file($anvil); } # Run any pending jobs by calling 'anvil-jobs' with the 'job_uuid' as a background process. run_jobs($anvil, 0) if not $anvil->data->{sys}{mapping_network}; return(0); } # This will check for any jobs that aren't at 100%. For each found, if 'picked_up_by' is set, a check is made # to see if the PID is still alive. If it isn't, or if 'picked_up_by' is not set, the appropriate tool is # invoked to handle it. sub run_jobs { my ($anvil, $startup) = @_; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { startup => $startup }}); # This will be set to 1 if any jobs are not complete, preventing a restart of the daemon if it's # changed on disk. $anvil->data->{sys}{jobs_running} = 0; # We'll also update the jobs.json file. my $jobs_file = "{\"jobs\":[\n"; # Get a list of pending or incomplete jobs. my $ended_within = $startup ? 1 : 300; my $return = $anvil->Database->get_jobs({ended_within => $ended_within}); my $count = @{$return}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { ended_within => $ended_within, 'return' => $return, count => $count, }}); foreach my $hash_ref (@{$return}) { my $job_uuid = $hash_ref->{job_uuid}; my $job_command = $hash_ref->{job_command}; my $job_data = $hash_ref->{job_data}; my $job_picked_up_by = $hash_ref->{job_picked_up_by}; my $job_picked_up_at = $hash_ref->{job_picked_up_at}; my $job_updated = $hash_ref->{job_updated}; my $job_name = $hash_ref->{job_name}; my $job_progress = $hash_ref->{job_progress}; my $job_title = $hash_ref->{job_title}; my $job_description = $hash_ref->{job_description}; my $job_status = $hash_ref->{job_status}; my $started_seconds_ago = $job_picked_up_at ? 
(time - $job_picked_up_at) : 0; my $updated_seconds_ago = $job_updated ? (time - $job_updated) : 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid, job_command => $job_command, job_data => $job_data, job_picked_up_by => $job_picked_up_by, job_picked_up_at => $job_picked_up_at, job_updated => $job_updated, job_name => $job_name, job_progress => $job_progress, job_title => $job_title, job_description => $job_description, job_status => $job_status, started_seconds_ago => $started_seconds_ago, updated_seconds_ago => $updated_seconds_ago, }}); # If this is a start-up call, only start jobs whose status is 'anvil_startup'. if (($startup) && ($job_status ne "anvil_startup")) { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0639", variables => { job_uuid => $job_uuid, job_command => $job_command, }}); next; } if ($job_progress ne "100") { $anvil->data->{sys}{jobs_running} = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "sys::jobs_running" => $anvil->data->{sys}{jobs_running} }}); } # See if the job was picked up by a now-dead instance. if ($job_picked_up_by) { # Check if the PID is still active. $anvil->System->pids({ignore_me => 1}); ### TODO: Add a check to verify the job isn't hung. # Skip if this job is in progress. if (not exists $anvil->data->{pids}{$job_picked_up_by}) { # If the job is done, just clear the 'job_picked_up_by' and be done. if ($job_progress ne "100") { # It's possible that the job updated to 100% and exited after we # gathered the job data, so we won't restart until we've seen it not # running and not at 100% after 5 loops. 
if ((not exists $anvil->data->{lost_job_count}{$job_uuid}) or (not defined $anvil->data->{lost_job_count}{$job_uuid})) { $anvil->data->{lost_job_count}{$job_uuid} = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }}); } if ($anvil->data->{lost_job_count}{$job_uuid} > 5) { # The previous job is gone, but the job isn't finished. Start it again. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0007", variables => { command => $job_command, pid => $job_picked_up_by, percent => $job_progress, }}); # Clear some variables. $job_progress = 0; $job_status = "message_0056"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_progress => $job_progress, job_status => $job_status, }}); # Clear the job. $anvil->Job->clear({debug => 2, job_uuid => $job_uuid}); $anvil->data->{lost_job_count}{$job_uuid} = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }}); } else { $anvil->data->{lost_job_count}{$job_uuid}++; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "lost_job_count::${job_uuid}" => $anvil->data->{lost_job_count}{$job_uuid} }}); } } # Clear the PID $job_picked_up_by = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_picked_up_by => $job_picked_up_by }}); } } # Convert the double-banged strings into a proper message. my $say_title = $job_title ? $anvil->Words->parse_banged_string({key_string => $job_title}) : ""; my $say_description = $job_description ? $anvil->Words->parse_banged_string({key_string => $job_description}) : ""; my $say_status = $job_status ? 
$anvil->Words->parse_banged_string({key_string => $job_status}) : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { job_title => $job_title, say_description => $say_description, say_status => $say_status, }}); # Make the status HTML friendly. Strip any embedded HTML then encode the text string. if ($say_status) { my $html_strip = HTML::Strip->new(); $say_status = $html_strip->parse($say_status); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { say_status => $say_status }}); # Now make the resulting text string HTML friendly my $text_to_html = HTML::FromText->new({ urls => 1, email => 1, lines => 1, }); $say_status = $text_to_html->parse($say_status); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { say_status => $say_status }}); } # Add this to the jobs.json file my $json_string = to_json ({ job_uuid => $job_uuid, job_command => $job_command, job_data => $job_data, job_picked_up_at => $job_picked_up_at, job_updated => $job_updated, job_name => $job_name, job_progress => $job_progress, job_title => $say_title, job_description => $say_description, job_status => $say_status, started_seconds_ago => $started_seconds_ago, updated_seconds_ago => $updated_seconds_ago, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { json_string => $json_string }}); $jobs_file .= $json_string.",\n"; # If the job is done, move on. next if $job_progress eq "100"; next if $anvil->data->{switches}{'no-start'}; # If 'startup' is set, we only care if 'job_status' is 'anvil_startup' if ((not $startup) && ($say_status eq "anvil_startup")) { # Skip this, it will run next time anvil-daemon restarts. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0593", variables => { command => $job_command, job_uuid => $job_uuid, }}); next; } # If the job is not running, start it. 
if (not $job_picked_up_by) { my $command = $job_command." --job-uuid ".$job_uuid; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0210", variables => { command => $command }}); # Have we started this job recently? if (exists $anvil->data->{jobs}{$job_uuid}{started}) { my $last_start = time - $anvil->data->{jobs}{$job_uuid}{started}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { last_start => $last_start }}); if ($last_start < 60) { # Skip, Started too recently. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0578", variables => { command => $command, last_start => $last_start, }}); next; } } # Start the job, appending '--job-uuid' to the command. ($anvil->data->{jobs}{handles}{$job_uuid}, my $return_code) = $anvil->System->call({ background => 1, stdout_file => "/tmp/anvil.job.".$job_uuid.".stdout", stderr_file => "/tmp/anvil.job.".$job_uuid.".stderr", shell_call => $command, source => $THIS_FILE, line => __LINE__, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs::handles::${job_uuid}" => $anvil->data->{jobs}{handles}{$job_uuid}, return_code => $return_code, }}); # Log the PID (the job should update the database). my $pid = $anvil->data->{jobs}{handles}{$job_uuid}->pid(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pid => $pid }}); # Record that we've tried to start this job, so that we don't try to restart it for any reason for at least a minute. $anvil->data->{jobs}{$job_uuid}{started} = time; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'jobs::$job_uuid::started' => $anvil->data->{jobs}{$job_uuid}{started} }}); } } # Close the jobs file. 
$jobs_file =~ s/,\n$/\n/ms; $jobs_file .= "]}\n"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { jobs_file => $jobs_file }}); # Write the JSON file my $output_json = $anvil->data->{path}{directories}{html}."/status/jobs.json"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { output_xml => $output_json }}); $anvil->Storage->write_file({ file => $output_json, body => $jobs_file, overwrite => 1, backup => 0, mode => "0644", user => "apache", group => "apache", }); return(0); } # This calls 'anvil-update-states' which will scan the local machine's state (hardware and software) and # record write it out to an HTML file sub update_state_file { my ($anvil) = @_; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0480"}); #my $shell_call = $anvil->data->{path}{exe}{'anvil-update-states'}.$anvil->Log->switches; my $shell_call = $anvil->data->{path}{exe}{'anvil-update-states'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }}); my ($states_output, $return_code) = $anvil->System->call({debug => 3, shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { states_output => $states_output, return_code => $return_code, }}); return(0); }