The main change in this commit deals with anvil-daemon startup. During OS updates, it would pick up the queued update job and run it while the other --no-db one was still running. This could become an issue for other tasks in the future, so updated anvil-daemon to not run any jobs for the first minute after startup. Also updated it to see if an OS update is underway (given how it can start mid-RPM update, before packages like kmod-drbd are ready to build). While doing this, implemented caching of daily tasks (like aging out data, archiving data, network scans, etc) to only run once per day, period. As it was before, they would always run on anvil-daemon startup, then wait 24 hours.

Note that work has started on reworking anvil-update-system, but it is incomplete (and broken) in this commit.

Signed-off-by: digimer <mkelly@alteeve.ca>
main
digimer 1 year ago
parent 714ccdb5b6
commit e278de4b5a
  1. 7
      share/words.xml
  2. 199
      tools/anvil-daemon
  3. 155
      tools/anvil-update-system

@ -2922,6 +2922,8 @@ Proceed? [y/N]</key>
<key name="message_0322">Installing the latest DRBD kmod RPM now.</key>
<key name="message_0323">Retrying the OS update now.</key>
<key name="message_0324">Update almost complete. Picked this job up after a '--no-db' run, and now we have database access again.</key>
<key name="message_0325">[ Note ] - It looks like 'dnf' (pid(s): [#!variable!pids!#]) is running, holding our start up until it's done (in case the system is being updated now).</key>
<key name="message_0326">This daemon just started. Holding off starting jobs for another: [#!variable!will_start_in!#] second(s).</key>
<!-- Translate names (protocols, etc) -->
<key name="name_0001">Normal Password</key> <!-- none in mail-server -->
@ -3268,6 +3270,11 @@ If you are comfortable that the target has changed for a known reason, you can s
<key name="striker_0299">Migration Network link #!variable!number!#</key>
<key name="striker_0300">This is where you configure the optional network dedicated to RAM-copy during live migrations.</key>
<key name="striker_0301">This puts a temporary hold on a DRBD minor number or TCP port so that it isn't used again in the time between when it was queried as the next free number, and before it can be used.</key>
<key name="striker_0302">This indicates when, in unix time, the database was last aged-out.</key>
<key name="striker_0303">This indicates when, in unix time, the database was last archived.</key>
<key name="striker_0304">This indicates when, in unix time, the local install target data was updated.</key>
<key name="striker_0305">This indicates when, in unix time, the OUI data was last updated. The OUI data is a list of MAC address prefixes and which companies they've been assigned to.</key>
<key name="striker_0306">This indicates when, in unix time, the network was last scanned. This is done to determine what IPs are used by servers on the Anvil! cluster, and to try to identify foundation pack devices on the network. These scans are simple ping sweeps used to get the MAC addresses of devices with IPs.</key>
<!-- These are generally units and appended to numbers -->
<key name="suffix_0001">#!variable!number!#/sec</key>

@ -67,6 +67,9 @@ if (($< != 0) && ($> != 0))
# If, for some reason, anvil.conf is lost, create it.
$anvil->System->_check_anvil_conf();
# If dnf is running, hold.
wait_on_dnf($anvil);
# Connect to the database(s). If we have no connections, we'll proceed anyway as one of the 'run_once' tasks
# is to setup the database server.
$anvil->Database->connect({
@ -117,7 +120,13 @@ if (not $anvil->data->{sys}{database}{connections})
}
# Read switches
$anvil->Get->switches({list => ["clear-mapping", "refresh-json", "run-once", "main-loop-only", "no-start", "startup-only"], man => $THIS_FILE});
$anvil->Get->switches({list => [
"clear-mapping",
"refresh-json",
"run-once",
"main-loop-only",
"no-start",
"startup-only"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});
@ -136,6 +145,10 @@ if ($anvil->data->{switches}{'refresh-json'})
# This is used to track initial checks / repairs of network issues.
$anvil->data->{sys}{network}{initial_checks} = 0;
# We use this to delay starting jobs for a short time.
our $start_time = time;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { start_time => $start_time }});
# There are some things we only want to run on (re)start and don't need to always run.
run_once($anvil) if not $anvil->data->{switches}{'main-loop-only'};
@ -239,6 +252,49 @@ $anvil->nice_exit({exit_code => 0});
# Functions #
#############################################################################################################
# Startup guard: if any 'dnf' process is found when this daemon starts (as happens when the daemon is
# (re)started in the middle of an OS update), block here until no 'dnf' process remains.
# 
# While blocked, the process list is re-checked every 10 seconds, and 'message_0325' (listing the dnf
# PIDs) is logged and printed at most once every 60 seconds.
# 
# Parameters;
#   $anvil - The shared handle used here to reach the 'Log' and 'System' methods.
# 
# There is no explicit return value; the caller invokes this in void context.
sub wait_on_dnf
{
	my ($anvil) = @_;
	
	# Start one second in the past so that the first pass through the loop always logs.
	my $next_log = time - 1;
	my $waiting  = 1;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		next_log => $next_log, 
		waiting  => $waiting, 
	}});
	
	until (not $waiting)
	{
		my $pids          = $anvil->System->pids({program_name => "dnf"});
		my $dnf_instances = scalar(@{$pids});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { dnf_instances => $dnf_instances }});
		
		# No dnf found; clear the flag and let the loop condition end the wait.
		if (not $dnf_instances)
		{
			$waiting = 0;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
			next;
		}
		
		# dnf is still running. Tell the user, but no more often than once a minute.
		if (time > $next_log)
		{
			my $say_pids = join(", ", @{$pids});
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "message_0325", variables => { pids => $say_pids }});
			
			$next_log = time + 60;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
		}
		
		# Pause before looking at the process list again.
		sleep 10;
	}
}
# If we're using too much ram, send an alert and exit.
sub check_ram
{
@ -667,18 +723,81 @@ sub handle_periodic_tasks
### don't use '--force' and let striker-manage-install-target skip the repo update if it happened
### recently enough.
if ($host_type eq "striker")
{
### TODO: This is here only to handle the period of time where we disabled postgres
### on boot. This should be removed sometime after 2022-08-01
#$anvil->System->enable_daemon({daemon => $anvil->data->{sys}{daemon}{postgresql}});
# Record a job, don't call it directly. It takes too long to run.
my $host_uuid = $anvil->Get->host_uuid();
my ($last_age_out, undef, undef) = $anvil->Database->read_variable({variable_name => "database::".$host_uuid."::aged-out"});
my $time_since_last_age_out = $last_age_out =~ /^\d+$/ ? time - $last_age_out : 100000;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:host_uuid' => $host_uuid,
's2:last_age_out' => $last_age_out,
's3:time_since_last_age_out' => $time_since_last_age_out,
}});
# Run an age-out?
if ($time_since_last_age_out > 86400)
{
# Age out old data. This takes up to a minute.
my $variable_uuid = $anvil->Database->insert_or_update_variables({
variable_name => "database::".$host_uuid."::aged-out",
variable_value => time,
variable_default => "0",
variable_description => "striker_0302",
variable_section => "database",
variable_source_uuid => "NULL",
variable_source_table => "",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
$anvil->Database->_age_out_data();
}
# Run an archive?
my ($last_archive, undef, undef) = $anvil->Database->read_variable({variable_name => "database::".$host_uuid."::archived"});
my $time_since_last_archive = $last_archive =~ /^\d+$/ ? time - $last_archive : 100000;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:last_archive' => $last_archive,
's2:time_since_last_archive' => $time_since_last_archive,
}});
if ($time_since_last_archive > 86400)
{
# Archive old data
my $variable_uuid = $anvil->Database->insert_or_update_variables({
variable_name => "database::".$host_uuid."::archived",
variable_value => time,
variable_default => "0",
variable_description => "striker_0303",
variable_section => "database",
variable_source_uuid => "NULL",
variable_source_table => "",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
$anvil->Database->archive_database();
}
### TODO: This is here only to handle the period of time where we disabled postgres
### on boot. This should be removed sometime after 2022-08-01
$anvil->System->enable_daemon({daemon => $anvil->data->{sys}{daemon}{postgresql}});
# Record a job, don't call it directly. It takes too long to run.
# Run the install target update?
my ($last_mit, undef, undef) = $anvil->Database->read_variable({variable_name => "jobs::last-ran::".$host_uuid."::manage-install-target"});
my $time_since_last_mit = $last_mit =~ /^\d+$/ ? time - $last_mit : 100000;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:last_mit' => $last_mit,
's2:time_since_last_mit' => $time_since_last_mit,
}});
if ($time_since_last_mit > 86400)
{
# Update the local install target data.
my $variable_uuid = $anvil->Database->insert_or_update_variables({
variable_name => "jobs::last-ran::".$host_uuid."::manage-install-target",
variable_value => time,
variable_default => "0",
variable_description => "striker_0304",
variable_section => "jobs",
variable_source_uuid => "NULL",
variable_source_table => "",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
file => $THIS_FILE,
line => __LINE__,
@ -690,9 +809,29 @@ sub handle_periodic_tasks
job_progress => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { job_uuid => $job_uuid }});
}
# Update the OUI data.
($job_uuid) = $anvil->Database->insert_or_update_jobs({
# Update the OUI data?
my ($last_parse_oui, undef, undef) = $anvil->Database->read_variable({variable_name => "jobs::last-ran::striker-parse-oui"});
my $time_since_last_parse_oui = $last_parse_oui =~ /^\d+$/ ? time - $last_parse_oui : 100000;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:last_parse_oui' => $last_parse_oui,
's2:time_since_last_parse_oui' => $time_since_last_parse_oui,
}});
if ($time_since_last_parse_oui > 86400)
{
# Yup.
my $variable_uuid = $anvil->Database->insert_or_update_variables({
variable_name => "jobs::last-ran::striker-parse-oui",
variable_value => time,
variable_default => "0",
variable_description => "striker_0305",
variable_section => "jobs",
variable_source_uuid => "NULL",
variable_source_table => "",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
file => $THIS_FILE,
line => __LINE__,
job_command => $anvil->data->{path}{exe}{'striker-parse-oui'}.$anvil->Log->switches,
@ -703,9 +842,29 @@ sub handle_periodic_tasks
job_progress => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
}
# Scan the networks
($job_uuid) = $anvil->Database->insert_or_update_jobs({
# Scan the network?
my ($last_network_scan, undef, undef) = $anvil->Database->read_variable({variable_name => "jobs::last-ran::striker-scan-network"});
my $time_since_last_network_scan = $last_network_scan =~ /^\d+$/ ? time - $last_network_scan : 100000;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:last_network_scan' => $last_network_scan,
's2:time_since_last_network_scan' => $time_since_last_network_scan,
}});
# Gate the network scan on its own timer. This previously tested the OUI-parse timer (a copy/paste
# slip), which left '$time_since_last_network_scan' computed above but never used.
if ($time_since_last_network_scan > 86400)
{
# Yup.
my $variable_uuid = $anvil->Database->insert_or_update_variables({
variable_name => "jobs::last-ran::striker-scan-network",
variable_value => time,
variable_default => "0",
variable_description => "striker_0306",
variable_section => "jobs",
variable_source_uuid => "NULL",
variable_source_table => "",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
file => $THIS_FILE,
line => __LINE__,
job_command => $anvil->data->{path}{exe}{'striker-scan-network'}.$anvil->Log->switches,
@ -717,6 +876,7 @@ sub handle_periodic_tasks
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
}
}
# Update the next check time.
$anvil->data->{timing}{next_daily_check} = $now_time + $anvil->data->{timing}{daily_checks};
@ -1402,7 +1562,24 @@ sub keep_running
sub run_jobs
{
my ($anvil, $startup) = @_;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { startup => $startup }});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup => $startup }});
# Don't start jobs for 60 seconds after startup.
if (not $startup)
{
my $time_since_start = time - $start_time;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
time_since_start => $time_since_start,
start_time => $start_time,
}});
if ($time_since_start < 60)
{
# Log that we'll start jobs in X seconds.
my $will_start_in = 60 - $time_since_start;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "message_0326", variables => { will_start_in => $will_start_in }});
return(0);
}
}
# This will be set to 1 if any jobs are not complete, preventing a restart of the daemon if it's
# changed on disk.

@ -116,7 +116,7 @@ update_progress($anvil, 2, "message_0033");
$anvil->System->maintenance_mode({set => 1}) if $anvil->data->{sys}{database}{connections};
# Run the update
run_os_update($anvil, 1, 3);
run_os_update($anvil, 3);
# If we had no database, try to reconnect now tha
if (not $anvil->data->{sys}{database}{connections})
@ -259,7 +259,7 @@ sub update_progress
# This updates the OS.
sub run_os_update
{
my ($anvil, $try, $progress) = @_;
my ($anvil, $progress) = @_;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
try => $try,
progress => $progress,
@ -333,6 +333,75 @@ WHERE
update_progress($anvil, 5, "message_0316");
}
# Before we start, do we need to remove our locally built DRBD kernel modules?
my $remove_drbd_kmod = 0;
my $shell_call = $anvil->data->{path}{exe}{dnf}." check-update";
open (my $file_handle, $shell_call." 2>&1 |") or $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, priority => "err", key => "log_0014", variables => { shell_call => $shell_call, error => $! }});
while(<$file_handle>)
{
chomp;
my $line = $_;
$output .= $line."\n";
$line = $anvil->Words->clean_spaces({string => $line});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }});
if (($line =~ /kmod-drbd/) or ($line =~ /kernel/))
{
# Looks like it.
$remove_drbd_kmod = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { remove_drbd_kmod => $remove_drbd_kmod }});
last;
}
}
close $file_handle;
# So, shall we?
if ($remove_drbd_kmod)
{
update_progress($anvil, $progress++, "message_0320");
my $versions_to_remove = "";
my $shell_call = $anvil->data->{path}{exe}{dnf}." list installed";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
foreach my $line (split/\n/, $output)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }});
if ($line =~ /(kmod-drbd-\d+.*?)\s/)
{
$versions_to_remove .= $1." ";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { versions_to_remove => $versions_to_remove }});
}
}
# Now remove those packages.
update_progress($anvil, $progress++, "message_0321");
$shell_call = $anvil->data->{path}{exe}{dnf}." -y remove ".$versions_to_remove;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
# # Now install the new packages.
# update_progress($anvil, $progress++, "message_0322");
# $shell_call = $anvil->data->{path}{exe}{dnf}." -y install kmod-drbd";
# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
#
# ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
# output => $output,
# return_code => $return_code,
# }});
}
# NOTE: We run this directly to better monitor progress and update the progress.
my $package_changes = 0;
my $transaction_shown = 0;
@ -343,9 +412,9 @@ WHERE
my $next_step = 0;
my $verifying = 0;
my $output = "";
my $shell_call = $anvil->data->{path}{exe}{dnf}." -y update; ".$anvil->data->{path}{exe}{echo}." return_code:\$?";
$shell_call = $anvil->data->{path}{exe}{dnf}." -y update; ".$anvil->data->{path}{exe}{echo}." return_code:\$?";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }});
open (my $file_handle, $shell_call." 2>&1 |") or $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, priority => "err", key => "log_0014", variables => { shell_call => $shell_call, error => $! }});
open ($file_handle, $shell_call." 2>&1 |") or $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, priority => "err", key => "log_0014", variables => { shell_call => $shell_call, error => $! }});
while(<$file_handle>)
{
chomp;
@ -469,76 +538,6 @@ WHERE
}
close $file_handle;
# If this is the first try and it failed, see if it's a DRBD issue.
if ((not $success) && ($try == 1))
{
# Is this the DRBD kmod issue?
my $remove_drbd_kmod = 0;
foreach my $line (split/\n/, $output)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }});
if ($line =~ /kmod-drbd/)
{
# Looks like it.
$remove_drbd_kmod = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { remove_drbd_kmod => $remove_drbd_kmod }});
last;
}
}
# Clear the old kmod and try the update again.
if ($remove_drbd_kmod)
{
update_progress($anvil, $progress++, "message_0320");
my $versions_to_remove = "";
my $shell_call = $anvil->data->{path}{exe}{dnf}." list installed";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
foreach my $line (split/\n/, $output)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }});
if ($line =~ /(kmod-drbd-\d+.*?)\s/)
{
$versions_to_remove .= $1." ";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { versions_to_remove => $versions_to_remove }});
}
}
# Now remove those packages.
update_progress($anvil, $progress++, "message_0321");
$shell_call = $anvil->data->{path}{exe}{dnf}." -y remove ".$versions_to_remove;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
# Now install the new packages.
update_progress($anvil, $progress++, "message_0322");
$shell_call = $anvil->data->{path}{exe}{dnf}." -y install kmod-drbd";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
# Now try again.
update_progress($anvil, $progress++, "message_0323");
run_os_update($anvil, 2, $progress);
return(0);
}
}
# Reload daemons to pick up any changed systemctl daemons.
my ($systemctl_output, $return_code) = $anvil->System->call({debug => 3, shell_call => $anvil->data->{path}{exe}{systemctl}." daemon-reload", source => $THIS_FILE, line => __LINE__});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { systemctl_output => $systemctl_output, return_code => $return_code }});
@ -569,18 +568,16 @@ WHERE
if ($installed_kernel ne $active_kernel)
{
# Reboot needed
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0687", variables => { reason => "#!string!log_0690!#" }});
my $reboot_needed = $anvil->System->reboot_needed({set => 1});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }});
$anvil->data->{sys}{reboot} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'sys::reboot' => $anvil->data->{sys}{reboot} }});
}
# If we installed any packages, and '--reboot' was given, reboot anyway.
if (($package_changes) && ($anvil->data->{switches}{reboot}))
{
# Reboot needed
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0687", variables => { reason => "#!string!log_0690!#" }});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'sys::reboot' => $anvil->data->{sys}{reboot} }});
my $reboot_needed = $anvil->System->reboot_needed({set => 1});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }});
}
# Did it work?

Loading…
Cancel
Save