2023-07-07 21:54:57 +00:00
#!/usr/bin/perl
#
# This program will disable our daemons on all machines, then update each striker. It then walks through all
# DR hosts and Anvil! nodes. With nodes, it migrates servers to the peer, takes the node out of the cluster,
# updates it, reboots if the kernel was updated, and then rejoins the cluster, migrates the VMs and then does
# the same process on the peer sub-node.
#
# Exit codes;
# 0 = Normal exit.
# 1 = No database connection.
#
# TODO:
#
# USAGE:
#
use strict;
use warnings;
use Anvil::Tools;
require POSIX;
use Term::Cap;
use Text::Diff;
use Data::Dumper;
# Name of this program with any leading directory removed; it is used as the 'source' in log entries. When
# '$0' contains no '/' (for example when run as 'perl <program>' from the program's own directory), the
# match fails, so fall back to '$0' itself rather than leaving this undefined.
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
$THIS_FILE = $0 if not defined $THIS_FILE;
# Directory portion of '$0'. The program name is wrapped in '\Q...\E' so that any regex metacharacters in it
# are matched literally. With no directory in '$0', treat the current directory ('.') as the location.
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0];
$running_directory = "." if not defined $running_directory;
# A relative directory (leading '.') is anchored to the caller's working directory when 'PWD' is set.
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
my $anvil = Anvil::Tools->new();
# Read the command line switches this program accepts.
2023-07-15 02:29:07 +00:00
$anvil->Get->switches({list => [
"clear-cache",
"force",
"no-reboot",
2023-07-17 00:45:47 +00:00
"reboot",
2023-07-15 02:29:07 +00:00
"reboot-self",
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
"timeout",
2023-07-15 02:29:07 +00:00
"y",
"yes"], man => $THIS_FILE});
2023-07-07 21:54:57 +00:00
# Log the switches we were called with.
$anvil->Log->variables({
	source => $THIS_FILE,
	line   => __LINE__,
	level  => 2,
	list   => $anvil->data->{switches},
});
# Log entry 'log_0115', passing this program's name.
$anvil->Log->entry({
	source    => $THIS_FILE,
	line      => __LINE__,
	level     => 2,
	key       => "log_0115",
	variables => { program => $THIS_FILE },
});

# Connect to the database(s). At least one connection is required here; without one we report the error,
# pause for ten seconds and exit with return code '1'.
$anvil->Database->connect();
$anvil->Log->entry({
	source => $THIS_FILE,
	line   => __LINE__,
	level  => 3,
	secure => 0,
	key    => "log_0132",
});
unless ($anvil->data->{sys}{database}{connections})
{
	# No database connections; print and log the error, wait, then exit.
	$anvil->Log->entry({
		source   => $THIS_FILE,
		line     => __LINE__,
		'print'  => 1,
		level    => 0,
		priority => "err",
		key      => "error_0305",
	});
	sleep 10;
	$anvil->nice_exit({exit_code => 1});
}

# We must be 'root'; '$<' is the real UID and '$>' is the effective UID. Either one being '0' is enough.
unless (($< == 0) or ($> == 0))
{
	# Neither UID is root.
	print $anvil->Words->string({key => "error_0005"})."\n";
	$anvil->nice_exit({exit_code => 1});
}

# This program only runs on a Striker dashboard.
unless ($anvil->Get->host_type eq "striker")
{
	print "This has to be run on a Striker dashboard.\n";
	$anvil->nice_exit({exit_code => 1});
}
2023-07-16 02:23:30 +00:00
# Progress value passed to each job update below; it is post-incremented every time it is used.
$anvil->data->{sys}{progress} = 0;

# If we were given a job UUID, load that job's details and record that this process picked it up.
if ($anvil->data->{switches}{'job-uuid'})
{
	$anvil->Job->clear();
	$anvil->Job->get_job_details({debug => 2});
	$anvil->Job->update_progress({
		message          => "message_0319",
		'print'          => 1,
		job_picked_up_at => time,
		job_picked_up_by => $$,
		progress         => $anvil->data->{sys}{progress}++,
	});
}

# Update beginning. Verifying all known machines are accessible...
$anvil->Job->update_progress({
	message  => "job_0469",
	progress => $anvil->data->{sys}{progress}++,
	'print'  => 1,
});
2023-07-07 21:54:57 +00:00
my $all_access = verify_access($anvil);
2023-07-15 02:29:07 +00:00
if ((not $all_access) && (not $anvil->data->{switches}{force}))
2023-07-07 21:54:57 +00:00
{
print "[ Error ] - Not all systems are accessible. Update aborted!\n";
$anvil->nice_exit({exit_code => 1});
}
print "Success!\n";
2023-07-15 02:29:07 +00:00
if (($anvil->data->{switches}{y}) or ($anvil->data->{switches}{yes}))
2023-07-07 21:54:57 +00:00
{
2023-07-15 02:29:07 +00:00
print "[ Note ] - Proceeding without confirmation, '-y' or '--yes' used.\n";
2023-07-07 21:54:57 +00:00
}
else
{
2023-07-15 02:29:07 +00:00
# Explain what the update will do, then ask the user to confirm before anything is changed.
print "[ Note ] - All nodes need to be up and running for the update to run on nodes.
[ Note ] - Any out-of-sync storage needs to complete before a node can be updated.
[ Warning ] - Servers will be migrated between subnodes, which can cause reduced performance during
[ Warning ] - the these migrations. If a sub-node is not active, it will be activated as part of the
[ Warning ] - upgrade process.\n";
print "\n".$anvil->Words->string({key => "message_0021"})."\n";
# '<STDIN>' returns 'undef' at end-of-file (input closed or redirected from an empty source). Treat that as
# an empty answer, which declines the update below, rather than letting 'chomp' and the regex test warn about
# an uninitialized value.
my $answer = <STDIN>;
$answer = "" if not defined $answer;
chomp $answer;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { answer => $answer }});
if ($answer =~ /^y/i)
{
	# Anything starting with 'y' or 'Y' is taken as consent to proceed.
	print $anvil->Words->string({key => "message_0175"})."\n";
}
else
{
	# Any other answer aborts. This is a clean exit (return code '0'), not an error.
	print $anvil->Words->string({key => "message_0022"})."\n";
	$anvil->nice_exit({exit_code => 0});
}
2023-07-07 21:54:57 +00:00
}
2023-07-15 02:29:07 +00:00
manage_daemons($anvil, "stop");
2023-07-07 21:54:57 +00:00
# Update the Striker dashboards and DR hosts.
update_strikers_and_dr($anvil);
# Update the Anvil! nodes.
update_nodes($anvil);
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
manage_daemons($anvil, "start");
2023-07-15 02:29:07 +00:00
print "Updates complete!\n";

# Look up our own host UUID and short host name; they are only recorded in the log here.
my $host_uuid = $anvil->Get->host_uuid;
my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
	's1:host_uuid' => $host_uuid,
	's2:short_host_name' => $short_host_name,
}});

# If 'sys::reboot_needed' is set, either reboot this machine ('--reboot-self') or tell the user to do so.
# NOTE(review): the code that sets 'sys::reboot_needed' is not in this part of the file - presumably the
# local Striker update; confirm.
if ($anvil->data->{sys}{reboot_needed})
{
	if ($anvil->data->{switches}{'reboot-self'})
	{
		print "[ Note ] - The local system needs to be rebooted, and '--reboot-self' was used. Rebooting in 60 seconds! Use ctrl+c to abort!\n";
		# Count down from 60 in five-second steps so the user has time to abort.
		my $waiting = 60;
		while ($waiting > 0)
		{
			print $waiting.", ";
			sleep 5;
			$waiting -= 5;
			# Log the remaining time under its own name (this was previously mislabelled 'shell_call').
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
		}
		print "\nRebooting now!\n";

		# Ask systemd to reboot this machine, and log what the call returned.
		my $shell_call = $anvil->data->{path}{exe}{systemctl}." reboot";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});

		my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code }});
		print "Reboot requested, exiting.\n";
	}
	else
	{
		# The update is already done at this point; what the user still needs to do is reboot.
		print "[ Note ] - This host needs to be rebooted to activate the new kernel. Please reboot as soon as you can.\n";
	}
}
2023-07-07 21:54:57 +00:00
$anvil->nice_exit({exit_code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
sub update_nodes
{
my ($anvil) = @_;
# Here, we loop through anvil systems, and find which sub nodes will be updated first, and which will
# be updated second.
foreach my $anvil_name (sort {$a cmp $b} keys %{$anvil->data->{anvils}{anvil_name}})
{
my $anvil_uuid = $anvil->data->{anvils}{anvil_name}{$anvil_name}{anvil_uuid};
my $anvil_description = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_description};
my $anvil_node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid};
my $anvil_node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid};
my $primary_host_uuid = $anvil->Cluster->get_primary_host_uuid({anvil_uuid => $anvil_uuid});
2023-07-24 23:00:19 +00:00
$primary_host_uuid = $anvil_node1_host_uuid if not $primary_host_uuid;
2023-07-15 02:29:07 +00:00
my $secondary_host_uuid = $primary_host_uuid eq $anvil_node1_host_uuid ? $anvil_node2_host_uuid : $anvil_node1_host_uuid;
my $node1_short_host_name = $anvil->data->{hosts}{host_uuid}{$anvil_node1_host_uuid}{short_host_name};
my $node2_short_host_name = $anvil->data->{hosts}{host_uuid}{$anvil_node2_host_uuid}{short_host_name};
2023-07-07 21:54:57 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:anvil_name' => $anvil_name,
's2:anvil_uuid' => $anvil_uuid,
's3:anvil_description' => $anvil_description,
's4:anvil_node1_host_uuid' => $anvil_node1_host_uuid,
's5:anvil_node2_host_uuid' => $anvil_node2_host_uuid,
's6:primary_host_uuid' => $primary_host_uuid,
2023-07-15 02:29:07 +00:00
's7:secondary_host_uuid' => $secondary_host_uuid,
's8:node1_short_host_name' => $node1_short_host_name,
's9:node2_short_host_name' => $node2_short_host_name,
2023-07-07 21:54:57 +00:00
}});
2023-07-15 02:29:07 +00:00
# Before we proceed, are both nodes online? If so, great. If not, are both offline? If only
# one is online, abort. Check now in case things have changed since our first scan
print "Preparing to update the Anvil! node: [".$anvil_name."]. Verifying subnode access:\n";
foreach my $host_uuid ($secondary_host_uuid, $primary_host_uuid)
2023-07-07 21:54:57 +00:00
{
2023-07-15 02:29:07 +00:00
my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:host_uuid' => $host_uuid,
's2:short_host_name' => $short_host_name,
}});
print "- Verifying access to subnode: [".$short_host_name."]\n";
2023-07-16 02:23:30 +00:00
my $matches = $anvil->Network->find_access({
debug => 2,
target => $host_uuid,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }});
2023-07-15 02:29:07 +00:00
$anvil->data->{peer}{$short_host_name}{access}{ip} = "";
$anvil->data->{peer}{$short_host_name}{access}{network} = "";
2023-07-24 19:43:54 +00:00
foreach my $preferred_network ("bcn", "mn", "ifn", "sn", "any")
2023-07-15 02:29:07 +00:00
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }});
foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}})
{
2023-07-24 19:43:54 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { network_name => $network_name }});
if (($network_name !~ /^$preferred_network/) && ($preferred_network ne "any"))
{
next;
}
2023-07-15 02:29:07 +00:00
my $target_ip = $anvil->data->{network_access}{$network_name}{target_ip_address};
my $test_access = $anvil->Remote->test_access({target => $target_ip});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
2023-07-24 19:43:54 +00:00
's1:target_ip' => $target_ip,
's2:test_access' => $test_access,
2023-07-15 02:29:07 +00:00
}});
if ($test_access)
{
# We're good.
$anvil->data->{peer}{$short_host_name}{access}{ip} = $target_ip;
$anvil->data->{peer}{$short_host_name}{access}{network} = $network_name;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"s1:peer::${short_host_name}::access::ip" => $anvil->data->{peer}{$short_host_name}{access}{ip},
"s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network},
}});
2023-07-23 00:07:26 +00:00
print "- Access found over the: [".$network_name."] networking using the IP: [".$target_ip."]\n";
2023-07-15 02:29:07 +00:00
last;
}
}
}
if (not $anvil->data->{peer}{$short_host_name}{access}{ip})
{
print "[ Warning ] - Access not found!\n";
}
2023-07-07 21:54:57 +00:00
}
2023-07-15 02:29:07 +00:00
if ((($anvil->data->{peer}{$node1_short_host_name}{access}{ip}) && (not $anvil->data->{peer}{$node2_short_host_name}{access}{ip})) or
((not $anvil->data->{peer}{$node1_short_host_name}{access}{ip}) && ($anvil->data->{peer}{$node2_short_host_name}{access}{ip})))
2023-07-07 21:54:57 +00:00
{
2023-07-15 02:29:07 +00:00
# Only one node online, skip this Anvil node.
if ($anvil->data->{switches}{force})
{
# Skip this Anvil! system
print "[ Warning ] - '--force' used, skipping this node.\n";
print "[ NOTE ] - This node may not be able to communicate with the Striker dashboards until updated manually!\n";
next;
}
else
{
print "[ Error ] - Exiting update! Please bring the missing subnode back online and try again!\n";
$anvil->nice_exit({exit_code => 1});
}
2023-07-07 21:54:57 +00:00
}
2023-07-15 02:29:07 +00:00
# Update the secondary first, as it should have no VMs on it.
foreach my $host_uuid ($secondary_host_uuid, $primary_host_uuid)
2023-07-07 21:54:57 +00:00
{
2023-07-15 02:29:07 +00:00
# Withdraw the node from the cluster.
my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
my $peer_host_uuid = $host_uuid eq $primary_host_uuid ? $secondary_host_uuid : $primary_host_uuid;
my $peer_short_host_name = $anvil->data->{hosts}{host_uuid}{$peer_host_uuid}{short_host_name};
2023-07-07 21:54:57 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
2023-07-15 02:29:07 +00:00
's1:host_uuid' => $host_uuid,
's2:short_host_name' => $short_host_name,
's3:peer_host_uuid' => $peer_host_uuid,
's4:peer_short_host_name' => $peer_short_host_name,
2023-07-07 21:54:57 +00:00
}});
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:host_uuid' => $host_uuid,
's2:short_host_name' => $short_host_name,
}});
print "Preparing to update: [".$short_host_name."]. Withdrawing the subnode from the Anvil! node.\n";
print "- [ Note ] - If the node has servers that need to be migrated off, or if the node is SyncSource for storage,\n";
print "- [ Note ] - this could take some time to complete.\n";
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
# Make sure VMs are off, DRBD is down and the node is out of the cluster. Call this
# with nohup so it doesn't get killed by the loss of the SSH connection.
my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-stop'}." --no-db".$anvil->Log->switches()." >/dev/null 2>&1 &";
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
2023-07-15 02:29:07 +00:00
my ($output, $error, $return_code) = $anvil->Remote->call({
2023-07-07 21:54:57 +00:00
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
2023-07-15 02:29:07 +00:00
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
# Now wait for DRBD resources to stop (which requires VMs be off).
print "- Waiting for all DRBD resource (and the servers using them) to stop before proceeding.\n";
2023-07-25 23:13:41 +00:00
my $wait_until = time + $anvil->data->{switches}{timeout};
my $next_log = time + 60;
my $waiting = 1;
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
wait_until => $wait_until,
next_log => $next_log,
waiting => $waiting,
}});
2023-07-15 02:29:07 +00:00
while ($waiting)
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
my $drbd_up = 0;
my $pacemaker_up = 0;
$anvil->DRBD->get_status({
host => $short_host_name,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
# How may resources are up?
my $resource_count = keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_count => $resource_count }});
if ($resource_count)
{
# DRBD is still up.
$drbd_up = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { drbd_up => $drbd_up }});
}
# Is pacemaker down?
2023-07-15 02:29:07 +00:00
my $problem = $anvil->Cluster->parse_cib({target => $anvil->data->{peer}{$short_host_name}{access}{ip}});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
if (not $problem)
{
# Node is still in the cluster.
$pacemaker_up = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pacemaker_up => $pacemaker_up }});
}
if ((not $pacemaker_up) && (not $drbd_up))
2023-07-15 02:29:07 +00:00
{
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
if ($waiting)
2023-07-15 02:29:07 +00:00
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
# Log which resources are still up
2023-07-15 02:29:07 +00:00
if (time > $next_log)
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
my $say_time = $anvil->Get->date_and_time({time_only => 1});
if ($pacemaker_up)
2023-07-15 02:29:07 +00:00
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
print "[ Note ] - [".$say_time."] - The subnode is still in the cluster.\n";
2023-07-15 02:29:07 +00:00
}
else
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
print "[ Note ] - [".$say_time."] - The subnode is no longer in the cluster, good.\n";
}
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
{
print "[ Note ] - [".$say_time."] - The resource: [".$resource."] is still up.\n";
2023-07-15 02:29:07 +00:00
}
2023-07-25 23:13:41 +00:00
$next_log = time + 60;
my $time_left = $wait_until - time;
my $say_time_left = $anvil->Convert->time({
'time' => $time_left,
translate => 1,
long => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
next_log => $next_log,
time_left => $time_left,
say_time_left => $say_time_left,
}});
print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
2023-07-15 02:29:07 +00:00
}
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
if (time > $wait_until)
{
# Timeout.
print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to stop all DRBD resources nad leave the cluster. Aborting the update.\n";
$anvil->nice_exit({exit_code => 1});
}
sleep 10;
2023-07-15 02:29:07 +00:00
}
}
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
my $update_switches = "";
2023-07-15 02:29:07 +00:00
if ($anvil->data->{switches}{'no-reboot'})
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
$update_switches .= " --no-reboot";
2023-07-15 02:29:07 +00:00
}
2023-07-17 00:45:47 +00:00
if ($anvil->data->{switches}{reboot})
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
$update_switches .= " --reboot";
2023-07-17 00:45:47 +00:00
}
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update_switches => $update_switches }});
# We register a job, even though anvil-daemon isn't running. This will get picked up
# by 'anvil-update-system --no-db' towards the end of its run.
print "- Registering a job to update the subnode, which we can track to confirm when the update is done.\n";
$shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}.$update_switches.$anvil->Log->switches();
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
my $job_uuid = $anvil->Database->insert_or_update_jobs({
2023-07-15 02:29:07 +00:00
debug => 2,
job_command => $shell_call,
job_description => "job_0468",
job_host_uuid => $host_uuid,
job_name => "system::update-system",
job_progress => 0,
job_title => "job_0467"
2023-07-07 21:54:57 +00:00
});
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n";
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
# Now call anvil-update-system with --no-db and background it so we can close
# the DB connection without killing the process.
print "- Calling the no-database update of: [".$short_host_name."]\n";
$shell_call = $anvil->data->{path}{exe}{nohup}." ".$anvil->data->{path}{exe}{'anvil-update-system'}." --no-db".$update_switches;
if ($anvil->data->{switches}{'clear-cache'})
{
# We'll only call clear-cache on this one.
$shell_call .= " --clear-cache";
}
$shell_call .= $anvil->Log->switches()." >/dev/null 2>&1 &";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
($output, $error, $return_code) = $anvil->Remote->call({
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
# Record the start time so that we can be sure the subnode has rebooted (uptime is
# less than the current time minus this start time), if the host reboots as part of
# the update.
my $rebooted = 0;
my $reboot_time = time;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
rebooted => $rebooted,
reboot_time => $reboot_time,
short_host_name => $short_host_name,
}});
2023-07-15 02:29:07 +00:00
# Wait for the update job to report completion, or for the timeout to expire.
2023-07-25 23:13:41 +00:00
$wait_until = time + $anvil->data->{switches}{timeout};
$waiting = 1;
$next_log = time + 60;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
wait_until => $wait_until,
next_log => $next_log,
}});
2023-07-15 02:29:07 +00:00
while ($waiting)
{
2023-07-15 02:52:51 +00:00
$anvil->Job->get_job_details({job_uuid => $job_uuid});
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"jobs::job_progress" => $anvil->data->{jobs}{job_progress},
2023-07-15 02:52:51 +00:00
"jobs::job_data" => $anvil->data->{jobs}{job_data},
2023-07-15 02:29:07 +00:00
}});
if ($anvil->data->{jobs}{job_progress} == 100)
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
print "- Done! The subnode: [".$short_host_name."] has been updated\n";
2023-07-15 02:29:07 +00:00
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
# Did it reboot?
2023-07-15 02:52:51 +00:00
if ($anvil->data->{jobs}{job_data} eq "rebooted")
2023-07-15 02:29:07 +00:00
{
$rebooted = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }});
}
2023-07-15 20:19:21 +00:00
# Did it fail?
if ($anvil->data->{jobs}{job_data} eq "failed")
{
# Abort!
print "[ Error ] - There was a problem updating the subnode! Anvil! cluster update aborted.\n";
$anvil->nice_exit({exit_code => 1});
}
2023-07-15 02:29:07 +00:00
}
else
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
my $say_date = $anvil->Get->date_and_time({time_only => 1});
2023-07-25 23:13:41 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { say_date => $say_date }});
2023-07-15 02:29:07 +00:00
if (time > $next_log)
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
print "[ Note ] - [".$say_date."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n";
if ($anvil->data->{jobs}{job_progress} eq "0")
2023-07-15 02:29:07 +00:00
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
print "[ Note ] - [".$say_date."] - It is expected for the job to stay at '0' for a while.\n";
2023-07-15 02:29:07 +00:00
}
2023-07-25 23:13:41 +00:00
$next_log = time + 60;
my $time_left = $wait_until - time;
my $say_time_left = $anvil->Convert->time({
'time' => $time_left,
translate => 1,
long => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
next_log => $next_log,
time_left => $time_left,
say_time_left => $say_time_left,
}});
print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
}
if (time > $wait_until)
{
# Timeout.
print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to update. Aborting the update.\n";
$anvil->nice_exit({exit_code => 1});
2023-07-15 02:29:07 +00:00
}
sleep 5;
}
}
print "- Update completed successfully! Checking if a reboot is needed.\n";
my $run_anvil_safe_start = 0;
if ($rebooted)
{
print "- Rebooted! Will wait for it to come back up.\n";
wait_for_reboot($anvil, $host_uuid, $reboot_time);
}
else
{
print "- Reboot not needed, kernel appears to be up to date.\n";
$run_anvil_safe_start = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { run_anvil_safe_start => $run_anvil_safe_start }});
}
# Wait for the subnode to rejoin the cluster. As with the earlier waits, this loop
# gives up once the '--timeout' period has elapsed.
print "- Waiting for the subnode to rejoin the node.\n";
2023-07-25 23:13:41 +00:00
$wait_until = time + $anvil->data->{switches}{timeout};
2023-07-15 02:29:07 +00:00
$waiting = 1;
my $start_called = 0;
$next_log = time + 60;
my $manual_start = time + 60;
2023-07-07 21:54:57 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
2023-07-25 23:13:41 +00:00
's1:wait_until' => $wait_until,
's2:next_log' => $next_log,
's3:manual_start' => $manual_start,
2023-07-07 21:54:57 +00:00
}});
2023-07-15 02:29:07 +00:00
while($waiting)
{
# Should we call a start to the cluster?
if ((not $start_called) && ($run_anvil_safe_start))
{
print "- Calling 'anvil-safe-start' to rejoin the subnode to the node.\n";
$start_called = 1;
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-start'}.$anvil->Log->switches()." >/dev/null 2>&1 &";
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
start_called => $start_called,
shell_call => $shell_call,
}});
my ($output, $error, $return_code) = $anvil->Remote->call({
2023-07-15 20:19:21 +00:00
debug => 2,
2023-07-15 02:29:07 +00:00
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
}
# Pull the CIB and make sure both nodes are ready, and that DRBD resources
# are all UpToDate if this is the reboot from the first node.
my ($problem) = $anvil->Cluster->parse_cib({target => $anvil->data->{peer}{$short_host_name}{access}{ip}});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
# Are both nodes ready?
if (not $problem)
{
# Both nodes are in the cluster, but are they full members yet?
my $both_ready = 1;
my $node_count = 0;
foreach my $node_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}})
{
my $ready = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{ready};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
node_name => $node_name,
ready => $ready,
}});
if (not $ready)
{
$both_ready = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { both_ready => $both_ready }});
}
$node_count++;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { node_count => $node_count }});
}
# Did we see two nodes and are both ready?
if (($node_count == 2) && ($both_ready))
{
# Yes! If this is the first subnode, we need to wait for DRBD
# to be UpToDate. If it's the second, we just wait for the
# connections to be up.
# NOTE: We call the peer to get the DRBD data as it's got a
# better view of the storage
print "- Both subnodes are online, will now check replicated storage.\n";
$anvil->DRBD->get_status({
host => $peer_short_host_name,
target => $anvil->data->{peer}{$peer_short_host_name}{access}{ip},
});
if ($host_uuid eq $primary_host_uuid)
{
### NOTE: Should we wait for all connections
### to be up?
# This is the second node, we don't have to wait.
print "- This is the second node, no need to wait for replication to complete.\n";
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
else
{
# This is the first node. Wait for all volumes to be
# UpToDate.
if (time > $next_log)
{
print "- Waiting for all volumes to be UpToDate before updating the other subnode.\n";
}
my $all_uptodate = 1;
my $resources = 0;
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }});
foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}})
{
# We don't care about DR hosts for this upgrade
my $peer_uuid = $anvil->Get->host_uuid_from_name({host_name => $peer_name});
my $peer_type = $anvil->data->{hosts}{host_uuid}{$peer_uuid}{host_type};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:peer_name' => $peer_name,
's2:peer_uuid' => $peer_uuid,
's3:peer_type' => $peer_type,
}});
next if $peer_type ne "node";
foreach my $volume (sort {$a <=> $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}})
{
# This is this subnode's disk state,
# as the DRBD data was collected
# from the peer.
my $disk_state = $anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'peer-disk-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:volume' => $volume,
's2:disk_state' => $disk_state,
}});
if (lc($disk_state) ne "uptodate")
{
$all_uptodate = 0;
my $eta_in_seconds = $anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'estimated-seconds-to-finish'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
all_uptodate => $all_uptodate,
eta_in_seconds => $eta_in_seconds,
}});
if (time > $next_log)
{
if ($eta_in_seconds)
{
print "- The resource: [".$resource."/".$volume."] is not synced yet, ETA is: [".$eta_in_seconds."] to complete resync.\n";
}
else
{
print "- The resource: [".$resource."/".$volume."] is not yet UpToDate.\n";
}
}
}
} # End foreach volume
} # End foreach peer
} # End foreach resource
if ($all_uptodate)
{
print "- All resources appear to be ready,\n";
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
} # End if host is first or second subnode
} # End if both ready
elsif (time > $next_log)
{
print "- Both subnodes are not online yet, still waiting.\n";
}
} # End if CIB was parsed
elsif (time > $next_log)
{
print "- Unable to parse the node's cluster information base, will try again soon.\n";
}
if (time > $next_log)
{
2023-07-25 23:13:41 +00:00
my $say_time = $anvil->Get->date_and_time({time_only => 1});
$next_log = time + 60;
my $time_left = $wait_until - time;
my $say_time_left = $anvil->Convert->time({
'time' => $time_left,
translate => 1,
long => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:say_time' => $say_time,
's2:next_log' => $next_log,
's3:time_left' => $time_left,
's4:say_time_left' => $say_time_left,
}});
# Tell the user we're still waiting.
print "- [".$say_time."] - We're still waiting for the subnode: [".$short_host_name."] to reboot.\n";
print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
}
if (time > $wait_until)
{
# Timeout.
print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to join the subcluster. Aborting the update.\n";
$anvil->nice_exit({exit_code => 1});
2023-07-15 02:29:07 +00:00
}
if ($waiting)
{
sleep 5;
}
} # End while waiting for subnode to return
# Run anvil-version-changes
print "- Running 'anvil-version-changes'.\n";
$output = "";
$error = "";
$return_code = "";
$shell_call = $anvil->data->{path}{exe}{'anvil-version-changes'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
2023-07-07 21:54:57 +00:00
if ($host_uuid eq $anvil->Get->host_uuid)
{
2023-07-15 02:29:07 +00:00
($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
2023-07-07 21:54:57 +00:00
}
else
{
2023-07-15 02:29:07 +00:00
($output, $error, $return_code) = $anvil->Remote->call({
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
2023-07-07 21:54:57 +00:00
}
2023-07-15 02:29:07 +00:00
print "- Done!\n";
2023-07-07 21:54:57 +00:00
}
2023-07-15 02:29:07 +00:00
}
return(0);
}
sub update_strikers_and_dr
{
my ($anvil) = @_;
2023-07-25 23:13:41 +00:00
# Before we start, set the timeouts.
if ($anvil->data->{switches}{timeout})
{
if ($anvil->data->{switches}{timeout} =~ /^(\d+)h/i)
{
my $hours = $1;
$anvil->data->{switches}{timeout} = $hours * 3600;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
hours => $hours,
"switches::timeout" => $anvil->data->{switches}{timeout},
}});
}
elsif ($anvil->data->{switches}{timeout} =~ /^(\d+)m/i)
{
my $minutes = $1;
$anvil->data->{switches}{timeout} = $minutes * 60;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
minutes => $minutes,
"switches::timeout" => $anvil->data->{switches}{timeout},
}});
}
else
{
# Set the default.
print "[ Warning ] - The passed timeout: [".$anvil->data->{switches}{timeout}."] is invalid, setting it to 24 hours.\n";
$anvil->data->{switches}{timeout} = 86400;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"switches::timeout" => $anvil->data->{switches}{timeout},
}});
}
}
else
{
$anvil->data->{switches}{timeout} = 86400;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"switches::timeout" => $anvil->data->{switches}{timeout},
}});
}
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
# Make sure the timeout, if set, is valid.
if ($anvil->data->{switches}{timeout})
2023-07-15 02:29:07 +00:00
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
if ($anvil->data->{switches}{timeout} =~ /\D/)
2023-07-07 21:54:57 +00:00
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
# Invalid, error out.
print "The --timeout switch was used: [".$anvil->data->{switches}{timeout}."], but the value isn't a number of seconds.\n";
$anvil->nice_exit({exit_code => 1});
2023-07-07 21:54:57 +00:00
}
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
}
foreach my $host_type ("striker", "dr")
{
2023-07-15 02:29:07 +00:00
foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
2023-07-07 21:54:57 +00:00
{
2023-07-15 02:29:07 +00:00
my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name};
my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
2023-07-07 21:54:57 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
2023-07-15 02:29:07 +00:00
's1:host_name' => $host_name,
's2:host_uuid' => $host_uuid,
's3:short_host_name' => $short_host_name,
's4:this_host_type' => $this_host_type,
2023-07-07 21:54:57 +00:00
}});
2023-07-15 02:29:07 +00:00
next if $this_host_type ne $host_type;
if ($host_type eq "striker")
{
print "Starting the update of the Striker dashboard: [".$short_host_name."].\n";
}
else
{
print "Starting the update of the DR host: [".$short_host_name."].\n";
}
# If this is the local system, set the variable to track if we need to reboot.
# Otherwise, see if we have access to the peer.
if ($host_uuid eq $anvil->Get->host_uuid)
{
$anvil->data->{sys}{reboot_needed} = 0;
}
elsif(not $anvil->data->{peer}{$short_host_name}{access}{ip})
{
if ($host_type eq "striker")
{
print "- No access to the Striker dashboard: [".$short_host_name."], skipping.\n";
}
else
{
print "- No access to the DR host: [".$short_host_name."], skipping.\n";
}
next;
}
# Record the start time so that we can be sure the subnode has rebooted (uptime is
# less than the current time minus this start time), if the host reboots as part of
# the update.
my $reboot_time = time;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_time => $reboot_time }});
print "- Beginning OS update of: [".$short_host_name."]\n";
2023-07-17 00:45:47 +00:00
my $rebooted = 0;
if (($anvil->data->{switches}{'clear-cache'}) && ($host_uuid eq $anvil->Get->host_uuid))
2023-07-15 02:29:07 +00:00
{
2023-07-17 00:45:47 +00:00
my $shell_call = $anvil->data->{path}{exe}{dnf}." clean all";
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
2023-07-17 00:45:47 +00:00
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
2023-07-15 02:29:07 +00:00
print "- Cache cleared.\n";
}
print "- Calling update now.\n";
print "- NOTE: This can seem like it's hung! You can watch the progress using 'journalctl -f' on another terminal to\n";
print "- watch the progress via the system logs. You can also check wiht 'ps aux | grep dnf'.\n";
if ($host_uuid eq $anvil->Get->host_uuid)
{
2023-07-17 00:45:47 +00:00
my $shell_call = $anvil->data->{path}{exe}{dnf}." -y update";
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
if ($return_code)
{
print "[ Error ] - There was a problem updating the system! Expected a return code of '0', but got: [".$return_code."]\n";
print "[ Error [ - The output, if any, was\n";
print "==] Output [==\n";
print $output."\n";
print "==============\n";
}
2023-07-17 00:45:47 +00:00
# Loop through the output.
my $package_changes = 0;
foreach my $line (split/\n/, $output)
{
$line = $anvil->Words->clean_spaces({string => $line});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }});
if ($line =~ / (\d+) Packages$/i)
{
$package_changes += $1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { package_changes => $package_changes }});
}
}
# Did the user want to reboot on any update?
if (($package_changes) && ($anvil->data->{switches}{reboot}) && ($anvil->data->{switches}{'reboot-self'}))
{
# Reboot needed
print "- Updated: [".$package_changes."] packages, and '--reboot --reboot-self' used, reboot needed!\n";
$anvil->data->{sys}{reboot_needed} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"sys::reboot_needed" => $anvil->data->{sys}{reboot_needed},
}});
}
2023-07-15 02:29:07 +00:00
# Get the newest installed kernel
$shell_call = $anvil->data->{path}{exe}{rpm}." -q kernel | ".$anvil->data->{path}{exe}{'sort'}." | ".$anvil->data->{path}{exe}{tail}." -n 1";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
(my $installed_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
installed_kernel => $installed_kernel,
return_code => $return_code,
}});
$installed_kernel =~ s/^kernel-(\d+.\d+\.\d+-\d+)\..*$/$1/;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { installed_kernel => $installed_kernel }});
# Get the running kernel
$shell_call = $anvil->data->{path}{exe}{uname}." -r";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
(my $active_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
active_kernel => $active_kernel,
return_code => $return_code,
}});
$active_kernel =~ s/(\d+.\d+\.\d+-\d+)\..*$/$1/;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active_kernel => $active_kernel }});
if ($installed_kernel eq $active_kernel)
{
print "- The kernel has not been updated.\n";
}
else
{
print "- The kernel appears to have been upgraded, reboot needed!\n";
$anvil->data->{sys}{reboot_needed} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"sys::reboot_needed" => $anvil->data->{sys}{reboot_needed},
}});
}
}
else
{
# Call anvil-update-system and then wait.
print "- Beginning OS update of: [".$short_host_name."]\n";
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
if ($host_type eq "dr")
2023-07-15 02:29:07 +00:00
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
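# Make sure VMs are off and DRBD is down. The call is backgrounded ('&') with its
# output discarded so that it is not tied to our SSH session.
# NOTE(review): unlike the anvil-update-system call further down, this command is
# not prefixed with 'nohup' - confirm whether it should be.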
my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-stop'}." --no-db".$anvil->Log->switches()." >/dev/null 2>&1 &";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $error, $return_code) = $anvil->Remote->call({
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
# Now wait for DRBD resources to stop (which requires VMs be off).
print "- Waiting for all DRBD resource (and the servers using them) to stop before proceeding.\n";
2023-07-25 23:13:41 +00:00
my $wait_until = time + $anvil->data->{switches}{timeout};
my $next_log = time + 60;
my $waiting = 1;
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
wait_until => $wait_until,
next_log => $next_log,
waiting => $waiting,
}});
while ($waiting)
{
my $drbd_up = 0;
$anvil->DRBD->get_status({
host => $short_host_name,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
# How many resources are up?
my $resource_count = keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_count => $resource_count }});
if (not $resource_count)
{
# Done!
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
if ($waiting)
{
# Log which resources are still up
if (time > $next_log)
{
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
{
print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The resource: [".$resource."] is still up.\n";
}
2023-07-25 23:13:41 +00:00
$next_log = time + 60;
my $time_left = $wait_until - time;
my $say_time_left = $anvil->Convert->time({
'time' => $time_left,
translate => 1,
long => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
next_log => $next_log,
time_left => $time_left,
say_time_left => $say_time_left,
}});
print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
}
if (time > $wait_until)
{
# Timeout.
print "[ Error ] - Timed out while waiting for the DR host: [".$short_host_name."] to stop all DRBD resources. Aborting the update.\n";
$anvil->nice_exit({exit_code => 1});
}
sleep 10;
}
}
2023-07-15 02:29:07 +00:00
}
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
my $update_switches = "";
if ($anvil->data->{switches}{'no-reboot'})
2023-07-15 02:29:07 +00:00
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
$update_switches .= " --no-reboot";
2023-07-15 02:29:07 +00:00
}
2023-07-17 00:45:47 +00:00
if ($anvil->data->{switches}{reboot})
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
$update_switches .= " --reboot";
2023-07-17 00:45:47 +00:00
}
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update_switches => $update_switches }});
# We register a job, even though anvil-daemon isn't running. This will get
# picked up by 'anvil-update-system --no-db' towards the end of its run.
print "- Registering a job to update the system, which we can track to confirm when the update is done.\n";
my $shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}.$update_switches.$anvil->Log->switches();
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my $job_uuid = $anvil->Database->insert_or_update_jobs({
debug => 2,
job_command => $shell_call,
job_description => "job_0468",
job_host_uuid => $host_uuid,
job_name => "system::update-system",
job_progress => 0,
job_title => "job_0467"
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n";
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
# Now call anvil-update-system with --no-db and background it so we can close
# the DB connection without killing the process.
print "- Calling the no-database update of: [".$short_host_name."]\n";
$shell_call = $anvil->data->{path}{exe}{nohup}." ".$anvil->data->{path}{exe}{'anvil-update-system'}." --no-db".$update_switches;
if ($anvil->data->{switches}{'clear-cache'})
{
# We'll only call clear-cache on this one.
$shell_call .= " --clear-cache";
}
$shell_call .= $anvil->Log->switches()." >/dev/null 2>&1 &";
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
2023-07-15 02:29:07 +00:00
my ($output, $error, $return_code) = $anvil->Remote->call({
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
# Verify / wait until the update is done.
2023-07-25 23:13:41 +00:00
my $wait_until = time + $anvil->data->{switches}{timeout};
my $waiting = 1;
my $next_log = time + 60;
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
while ($waiting)
{
2023-07-15 02:52:51 +00:00
$anvil->Job->get_job_details({job_uuid => $job_uuid});
2023-07-15 02:29:07 +00:00
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"jobs::job_progress" => $anvil->data->{jobs}{job_progress},
2023-07-15 02:52:51 +00:00
"jobs::job_data" => $anvil->data->{jobs}{job_data},
2023-07-15 02:29:07 +00:00
}});
if ($anvil->data->{jobs}{job_progress} == 100)
{
print "- Done! The host: [".$short_host_name."] has been updated\n";
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
# Did it reboot?
2023-07-15 02:52:51 +00:00
if ($anvil->data->{jobs}{job_data} eq "rebooted")
2023-07-15 02:29:07 +00:00
{
$rebooted = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }});
}
2023-07-15 20:19:21 +00:00
# Did it fail?
if ($anvil->data->{jobs}{job_data} eq "failed")
{
# Abort!
print "[ Error ] - There was a problem updating the system! Anvil! cluster update aborted.\n";
$anvil->nice_exit({exit_code => 1});
}
2023-07-15 02:29:07 +00:00
}
else
{
if (time > $next_log)
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells them to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
my $say_date = $anvil->Get->date_and_time({time_only => 1});
print "[ Note ] - [".$say_date."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n";
2023-07-15 02:29:07 +00:00
if ($anvil->data->{jobs}{job_progress} == 0)
{
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
print "[ Note ] - [".$say_date."] - It is normal for the job to show '0' progress until the database access is restored.\n";
2023-07-15 02:29:07 +00:00
}
2023-07-25 23:13:41 +00:00
$next_log = time + 60;
my $time_left = $wait_until - time;
my $say_time_left = $anvil->Convert->time({
'time' => $time_left,
translate => 1,
long => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
next_log => $next_log,
time_left => $time_left,
say_time_left => $say_time_left,
}});
print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
2023-07-15 02:29:07 +00:00
}
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
if (time > $wait_until)
{
# Timeout.
print "[ Error ] - Timed out while waiting for the machine: [".$short_host_name."] to update the OS. Aborting the update.\n";
$anvil->nice_exit({exit_code => 1});
}
2023-07-15 02:29:07 +00:00
sleep 5;
}
}
}
if ($rebooted)
{
print "- Rebooted! Will wait for it to come back up.\n";
wait_for_reboot($anvil, $host_uuid, $reboot_time);
}
else
{
print "- Reboot not needed, kernel appears to be up to date.\n";
}
# Run anvil-version-changes
print "- Running 'anvil-version-changes' now.\n";
2023-07-17 00:45:47 +00:00
my $output = "";
my $error = "";
my $return_code = "";
2023-07-15 02:29:07 +00:00
my $shell_call = $anvil->data->{path}{exe}{'anvil-version-changes'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
if ($host_uuid eq $anvil->Get->host_uuid)
{
($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
}
else
{
($output, $error, $return_code) = $anvil->Remote->call({
Major thing in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on target machines. Similarly, they had to be able to work when the Striker DBs were not available. This is to account for cases where the Striker dashboards have updated, and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from being able to use the DB. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server had to be updated to use the new --no-db switch, which tells then to run without the database being available.
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.
Signed-off-by: digimer <mkelly@alteeve.ca>
2023-07-22 22:09:01 +00:00
'close' => 1,
no_cache => 1,
2023-07-15 02:29:07 +00:00
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
}
2023-07-07 21:54:57 +00:00
}
}
return(0);
}
2023-07-15 02:29:07 +00:00
# Waits for a host that was just rebooted to become reachable again, and returns once it is back.
#
# Parameters;
# anvil       = The Anvil::Tools handle.
# host_uuid   = The UUID of the host being waited on.
# reboot_time = Unix time used as the reference point for the reboot. The host is only treated as rebooted
#               once its reported uptime is lower than the time elapsed since this value.
#
# Returns '0' when the host is back up. If 'switches::timeout' seconds pass before that happens, an error
# is printed and 'nice_exit' is called with exit code '1'.
sub wait_for_reboot
{
	my ($anvil, $host_uuid, $reboot_time) = @_;
	
	my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		's1:host_uuid'       => $host_uuid,
		's2:short_host_name' => $short_host_name,
	}});
	
	# NOTE(review): The result is only logged here; presumably the call is kept for the access data it 
	#               refreshes - confirm against Network->find_access().
	my $access_matches = $anvil->Network->find_access({
		debug  => 2, 
		target => $host_uuid, 
	});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $access_matches }});
	
	# Wait until the node comes back up.
	print "- The target has been rebooted. We'll wait for the target to come back online.\n";
	
	# We poll every five seconds, report progress roughly once a minute, and give up once the user's 
	# timeout has been passed.
	my $wait_until = time + $anvil->data->{switches}{timeout};
	my $waiting    = 1;
	my $next_log   = time + 60;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
	POLL: while ($waiting)
	{
		# Can we log into the target yet?
		my $target = $anvil->data->{peer}{$short_host_name}{access}{ip};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			target          => $target,
			short_host_name => $short_host_name, 
		}});
		
		my $test_access = $anvil->Remote->test_access({target => $target});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_access => $test_access }});
		if ($test_access)
		{
			# Being reachable is not enough, the host may not have gone down yet. Only an uptime 
			# shorter than the time since the reboot reference proves that it restarted.
			my $uptime            = $anvil->Get->uptime({debug => 2, target => $target});
			my $time_since_reboot = time - $reboot_time;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				uptime            => $uptime,
				time_since_reboot => $time_since_reboot, 
				short_host_name   => $short_host_name, 
			}});
			
			my $is_back = (($uptime) && ($uptime < $time_since_reboot)) ? 1 : 0;
			if ($is_back)
			{
				# Rebooted!
				print "- Rebooted! Subnode is back up.\n";
				$waiting = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
				last POLL;
			}
		}
		
		# Still waiting. Is it time to update the user?
		if (time > $next_log)
		{
			my $say_time      = $anvil->Get->date_and_time({time_only => 1});
			   $next_log      = time + 60;
			my $time_left     = $wait_until - time;
			my $say_time_left = $anvil->Convert->time({
				'time'    => $time_left,
				translate => 1,
				long      => 0,
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:say_time'      => $say_time,
				's2:next_log'      => $next_log, 
				's3:time_left'     => $time_left, 
				's4:say_time_left' => $say_time_left, 
			}});
			
			# Tell the user we're still waiting.
			print "- [".$say_time."] - We're still waiting for the subnode: [".$short_host_name."] to reboot.\n";
			print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
		}
		
		# Have we waited too long?
		if (time > $wait_until)
		{
			# Timeout.
			print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to reboot. Aborting the update.\n";
			$anvil->nice_exit({exit_code => 1});
		}
		
		sleep 5;
	}
	
	return(0);
}
2023-07-15 02:29:07 +00:00
# Starts (and enables) or stops the 'anvil-daemon' and 'scancore' daemons on every known host, working 
# through DR hosts, then nodes, then Strikers.
#
# Parameters;
# anvil = The Anvil::Tools handle.
# task  = "start" to run 'systemctl enable --now' on each daemon. Any other value runs 'systemctl stop'.
#         If not set (or false), "start" is used.
#
# Hosts with no 'peer::<short_host_name>::access::ip' recorded (see verify_access()) are reported as 
# offline and skipped. The result of each daemon is printed, failures do not abort the run.
#
# Always returns '0'.
sub manage_daemons
{
	my ($anvil, $task) = @_;
	$task = "start" if not $task;
	
	# Only "start" enables the daemons, everything else has always resulted in a 'stop' call. The 
	# messages shown to the user are driven by the same flag so that they can never disagree with the 
	# command that is actually run.
	my $stopping = $task eq "start" ? 0 : 1;
	my $do_task  = $stopping ? "stop" : "enable --now";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		task     => $task,
		stopping => $stopping, 
		do_task  => $do_task,
	}});
	
	if ($stopping)
	{
		print "Disabling Anvil! daemons on all hosts...\n";
	}
	else
	{
		print "Enabling Anvil! daemons on all hosts...\n";
	}
	
	my $daemons = ["anvil-daemon", "scancore"];
	foreach my $host_type ("dr", "node", "striker")
	{
		foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
		{
			my $host_uuid       = $anvil->data->{sys}{hosts}{by_name}{$host_name};
			my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
			my $this_host_type  = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host_name'       => $host_name,
				's2:host_uuid'       => $host_uuid, 
				's3:short_host_name' => $short_host_name, 
				's4:this_host_type'  => $this_host_type, 
			}});
			next if $host_type ne $this_host_type;
			
			if ($stopping)
			{
				print "- Disabling daemons on: [".$short_host_name."]... ";
			}
			else
			{
				print "- Enabling daemons on: [".$short_host_name."]... ";
			}
			
			if (not $anvil->data->{peer}{$short_host_name}{access}{ip})
			{
				print "Offline! Skipping.\n";
				next;
			}
			
			# Handle each daemon in turn, locally or over ssh as appropriate.
			foreach my $daemon (@{$daemons})
			{
				my $shell_call = $anvil->data->{path}{exe}{systemctl}." ".$do_task." ".$daemon;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
				
				# The return code defaults to a failure value, so a call that returns nothing is 
				# reported as a failure.
				my $output      = "";
				my $error       = "";
				my $return_code = 999;
				if ($host_uuid eq $anvil->Get->host_uuid)
				{
					# Local
					($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						output      => $output,
						return_code => $return_code, 
					}});
				}
				else
				{
					# Remote, it'll be a while before we hit some clients, so close this 
					# connection so later access to the machines don't fail with ssh 
					# connection timeouts.
					($output, $error, $return_code) = $anvil->Remote->call({
						'close'    => 1, 
						no_cache   => 1, 
						shell_call => $shell_call, 
						target     => $anvil->data->{peer}{$short_host_name}{access}{ip},
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						output      => $output,
						error       => $error,
						return_code => $return_code, 
					}});
				}
				
				# A return code of '0' means systemctl succeeded.
				if (not $return_code)
				{
					if ($stopping)
					{
						print $daemon." stopped... ";
					}
					else
					{
						print $daemon." started... ";
					}
				}
				else
				{
					if ($stopping)
					{
						print $daemon." didn't stop!... ";
					}
					else
					{
						print $daemon." didn't start!... ";
					}
				}
			}
			print "Done!\n";
		}
	}
	
	return(0);
}
# Checks that we can reach every known host, and records how each one can be reached.
#
# Hosts are walked by type (DR hosts, then nodes, then Strikers). For each host, the networks found under
# 'network_access' after calling Network->find_access() are tested in order of preference; BCN, MN, IFN, 
# SN and then anything else. The first network that passes the access test is used, and its IP and name 
# are stored in;
# 
# peer::<short_host_name>::access::ip
# peer::<short_host_name>::access::network
# 
# Both are left as empty strings if the host could not be reached.
#
# Parameters;
# anvil = The Anvil::Tools handle.
#
# Returns '1' if every host was reachable, '0' if one or more hosts were not.
sub verify_access
{
	my ($anvil) = @_;
	
	# Load host and Anvil! data.
	$anvil->Database->get_hosts();
	
	# Make sure all are available before we start.
	my $all_access = 1;
	foreach my $host_type ("dr", "node", "striker")
	{
		foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
		{
			my $host_uuid       = $anvil->data->{sys}{hosts}{by_name}{$host_name};
			my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
			my $this_host_type  = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host_name'       => $host_name,
				's2:host_uuid'       => $host_uuid, 
				's3:short_host_name' => $short_host_name, 
				's4:this_host_type'  => $this_host_type, 
			}});
			next if $host_type ne $this_host_type;
			
			print "- Verifying access to: [".$short_host_name."]... ";
			
			# NOTE(review): The 'network_access' hash read below is presumably (re)built by this 
			#               call for the given target - confirm against Network->find_access().
			my $matches = $anvil->Network->find_access({
				debug  => 2, 
				target => $host_uuid, 
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }});
			
			$anvil->data->{peer}{$short_host_name}{access}{ip}      = "";
			$anvil->data->{peer}{$short_host_name}{access}{network} = "";
			
			# Walk the networks in order of preference and stop at the first one that works. 
			# Leaving both loops on success avoids needless ssh tests, and keeps a later network 
			# of the same class from replacing the one we just reported to the user.
			PREFERRED_NETWORK: foreach my $preferred_network ("bcn", "mn", "ifn", "sn", "any")
			{
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }});
				foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}})
				{
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { network_name => $network_name }});
					
					# "any" is the catch-all, otherwise the network name must start with 
					# the preferred prefix.
					if (($preferred_network ne "any") && ($network_name !~ /^\Q$preferred_network\E/))
					{
						next;
					}
					
					my $target_ip   = $anvil->data->{network_access}{$network_name}{target_ip_address};
					my $test_access = $anvil->Remote->test_access({
						'close' => 1, 
						target  => $target_ip,
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						's2:target_ip'   => $target_ip, 
						's3:test_access' => $test_access, 
					}});
					
					if ($test_access)
					{
						# We're good.
						print "Connected on: [".$target_ip."] via: [".$network_name."]\n";
						$anvil->data->{peer}{$short_host_name}{access}{ip}      = $target_ip;
						$anvil->data->{peer}{$short_host_name}{access}{network} = $network_name;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							"s1:peer::${short_host_name}::access::ip"      => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
							"s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network}, 
						}});
						last PREFERRED_NETWORK;
					}
				}
			}
			
			if (not $anvil->data->{peer}{$short_host_name}{access}{ip})
			{
				print "No access! Skipping.\n";
				$all_access = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_access => $all_access }});
			}
		}
	}
	
	return($all_access);
}