bd2e4c46ae
Signed-off-by: digimer <mkelly@alteeve.ca>
1530 lines
59 KiB
Perl
Executable File
1530 lines
59 KiB
Perl
Executable File
#!/usr/bin/perl
# 
# This program will disable our daemons on all machines, then update each striker. It then walks through all
# DR hosts and Anvil! nodes. With nodes, it migrates servers to the peer, takes the node out of the cluster,
# updates it, reboots if the kernel was updated, and then rejoins the cluster, migrates the VMs and then does
# the same process on the peer sub-node.
# 
# Exit codes;
# 0 = Normal exit.
# 1 = No database connection.
# 
# TODO: 
# 
# USAGE:
# 

use strict;
use warnings;
use Anvil::Tools;
require POSIX;
use Term::Cap;
use Text::Diff;
use Data::Dumper;

# Work out this program's file name and the directory it was invoked from.
my $THIS_FILE           =  ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory   =  ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	# Expand a relative './...' path to an absolute one using the caller's working directory.
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
|
|
|
|
my $anvil = Anvil::Tools->new();

# Read the command line switches. NOTE: 'job-uuid' is tested below but not declared in this list;
# presumably Get->switches() handles it as a built-in - TODO confirm against Anvil::Tools::Get.
$anvil->Get->switches({list => [
	"clear-cache", 
	"force", 
	"no-reboot", 
	"reboot", 
	"reboot-self", 
	"timeout", 
	"y", 
	"yes"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});

# Connect to the database(s). Without at least one connection, we can't register or track jobs, so we
# report the error, pause briefly (so whatever daemon restarts us doesn't hammer retries) and exit.
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
	# No databases, sleep for a bit and then exit. The daemon will pick it up and try again after we
	# exit.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0305"});
	sleep 10;
	$anvil->nice_exit({exit_code => 1});
}

# Make sure we're running as 'root'.
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0))
{
	# Not root
	print $anvil->Words->string({key => "error_0005"})."\n";
	$anvil->nice_exit({exit_code => 1});
}

# Make sure we're a striker; this program drives updates of all other machines from a dashboard.
if ($anvil->Get->host_type ne "striker")
{
	print "This has to be run on a Striker dashboard.\n";
	$anvil->nice_exit({exit_code => 1});
}

# If we were given a job-uuid, load the job details and mark the job as picked up.
$anvil->data->{sys}{progress} = 0;
if ($anvil->data->{switches}{'job-uuid'})
{
	# Load the job data.
	$anvil->Job->clear();
	$anvil->Job->get_job_details({debug => 2});
	$anvil->Job->update_progress({
		progress         => $anvil->data->{sys}{progress}++, 
		job_picked_up_by => $$, 
		job_picked_up_at => time, 
		'print'          => 1, 
		message          => "message_0319",
	});
}

# Update beginning. Verifying all known machines are accessible...
$anvil->Job->update_progress({
	'print'  => 1, 
	progress => $anvil->data->{sys}{progress}++, 
	message  => "job_0469",
});
my $all_access = verify_access($anvil);
if ((not $all_access) && (not $anvil->data->{switches}{force}))
{
	print "[ Error ] - Not all systems are accessible. Update aborted!\n";
	$anvil->nice_exit({exit_code => 1});
}
print "Success!\n";

# Ask the user to confirm before proceeding, unless '-y' / '--yes' was used.
if (($anvil->data->{switches}{y}) or ($anvil->data->{switches}{yes}))
{
	print "[ Note ] - Proceeding without confirmation, '-y' or '--yes' used.\n";
}
else
{
	print "[ Note ] - All nodes need to be up and running for the update to run on nodes.
[ Note ] - Any out-of-sync storage needs to complete before a node can be updated.
[ Warning ] - Servers will be migrated between subnodes, which can cause reduced performance during
[ Warning ] - these migrations. If a sub-node is not active, it will be activated as part of the
[ Warning ] - upgrade process.\n";
	print "\n".$anvil->Words->string({key => "message_0021"})."\n";
	my $answer = <STDIN>;
	chomp $answer;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { answer => $answer }});
	
	if ($answer =~ /^y/i)
	{
		print $anvil->Words->string({key => "message_0175"})."\n";
	}
	else
	{
		print $anvil->Words->string({key => "message_0022"})."\n";
		$anvil->nice_exit({exit_code => 0});
	}
}

# Stop our daemons everywhere so they don't interfere with the updates.
manage_daemons($anvil, "stop");

# Update the Striker dashboards and DR hosts.
update_strikers_and_dr($anvil);

# Update the Anvil! nodes (each subnode in turn, secondary first).
update_nodes($anvil);

# Restart the daemons now that the updates are done.
manage_daemons($anvil, "start");

print "Updates complete!\n";

# If the local system needs a new kernel, either reboot now ('--reboot-self') or tell the user to do it.
my $host_uuid       = $anvil->Get->host_uuid;
my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
	's1:host_uuid'       => $host_uuid, 
	's2:short_host_name' => $short_host_name, 
}});
if ($anvil->data->{sys}{reboot_needed})
{
	if ($anvil->data->{switches}{'reboot-self'})
	{
		print "[ Note ] - The local system needs to be rebooted, and '--reboot-self' was used. Rebooting in 60 seconds! Use ctrl+c to abort!\n";
		my $waiting = 60;
		while ($waiting)
		{
			# Count down in 5 second steps so the user has time to abort.
			print $waiting.", ";
			sleep 5;
			$waiting -= 5;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
		}
		print "\nRebooting now!\n";
		
		my $shell_call = $anvil->data->{path}{exe}{systemctl}." reboot";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
		
		my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code }});
		
		print "Reboot requested, exiting.\n";
	}
	else
	{
		print "[ Note ] - This host needs to be rebooted to activate the new kernel. Please update as soon as you can.\n";
	}
}

$anvil->nice_exit({exit_code => 0});
|
|
|
|
|
|
#############################################################################################################
|
|
# Functions #
|
|
#############################################################################################################
|
|
|
|
# This updates each Anvil! node by updating its two subnodes one at a time; the secondary (non-primary)
# subnode first, then the primary. For each subnode it: withdraws the subnode from the cluster
# (anvil-safe-stop), waits for DRBD and pacemaker to be down, registers and launches an
# 'anvil-update-system --no-db' job, waits for the job to hit 100%, waits through a reboot if one
# happened, rejoins the subnode to the cluster, waits for storage to be UpToDate (first subnode only),
# and finally runs 'anvil-version-changes'.
# 
# Parameters;
# - $anvil - The Anvil::Tools handle.
# 
# Returns; 0 always. Exits the program (exit code 1) on timeout, update failure, or a half-up node
# without '--force'.
sub update_nodes
{
	my ($anvil) = @_;
	
	# Here, we loop through anvil systems, and find which sub nodes will be updated first, and which will
	# be updated second.
	foreach my $anvil_name (sort {$a cmp $b} keys %{$anvil->data->{anvils}{anvil_name}})
	{
		my $anvil_uuid            = $anvil->data->{anvils}{anvil_name}{$anvil_name}{anvil_uuid};
		my $anvil_description     = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_description};
		my $anvil_node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid};
		my $anvil_node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid};
		my $primary_host_uuid     = $anvil->Cluster->get_primary_host_uuid({anvil_uuid => $anvil_uuid});
		   # If neither subnode is primary (cluster down), default to treating node 1 as primary.
		   $primary_host_uuid     = $anvil_node1_host_uuid if not $primary_host_uuid;
		my $secondary_host_uuid   = $primary_host_uuid eq $anvil_node1_host_uuid ? $anvil_node2_host_uuid : $anvil_node1_host_uuid;
		my $node1_short_host_name = $anvil->data->{hosts}{host_uuid}{$anvil_node1_host_uuid}{short_host_name};
		my $node2_short_host_name = $anvil->data->{hosts}{host_uuid}{$anvil_node2_host_uuid}{short_host_name};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			's1:anvil_name'            => $anvil_name, 
			's2:anvil_uuid'            => $anvil_uuid, 
			's3:anvil_description'     => $anvil_description, 
			's4:anvil_node1_host_uuid' => $anvil_node1_host_uuid, 
			's5:anvil_node2_host_uuid' => $anvil_node2_host_uuid, 
			's6:primary_host_uuid'     => $primary_host_uuid, 
			's7:secondary_host_uuid'   => $secondary_host_uuid, 
			's8:node1_short_host_name' => $node1_short_host_name, 
			's9:node2_short_host_name' => $node2_short_host_name, 
		}});
		
		# Before we proceed, are both nodes online? If so, great. If not, are both offline? If only
		# one is online, abort. Check now in case things have changed since our first scan.
		print "Preparing to update the Anvil! node: [".$anvil_name."]. Verifying subnode access:\n";
		foreach my $host_uuid ($secondary_host_uuid, $primary_host_uuid)
		{
			my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host_uuid'       => $host_uuid, 
				's2:short_host_name' => $short_host_name, 
			}});
			print "- Verifying access to subnode: [".$short_host_name."]\n";
			my $matches = $anvil->Network->find_access({
				debug  => 2,
				target => $host_uuid, 
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }});
			$anvil->data->{peer}{$short_host_name}{access}{ip}      = "";
			$anvil->data->{peer}{$short_host_name}{access}{network} = "";
			foreach my $preferred_network ("bcn", "mn", "ifn", "sn", "any")
			{
				# Stop once a match was found on a more-preferred network. Without this, a later
				# pass (including the catch-all 'any') would overwrite the preferred match.
				last if $anvil->data->{peer}{$short_host_name}{access}{ip};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }});
				foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}})
				{
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { network_name => $network_name }});
					if (($network_name !~ /^$preferred_network/) && ($preferred_network ne "any"))
					{
						next;
					}
					my $target_ip   = $anvil->data->{network_access}{$network_name}{target_ip_address};
					my $test_access = $anvil->Remote->test_access({target => $target_ip});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						's1:target_ip'   => $target_ip, 
						's2:test_access' => $test_access, 
					}});
					
					if ($test_access)
					{
						# We're good.
						$anvil->data->{peer}{$short_host_name}{access}{ip}      = $target_ip;
						$anvil->data->{peer}{$short_host_name}{access}{network} = $network_name;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							"s1:peer::${short_host_name}::access::ip"      => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
							"s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network}, 
						}});
						print "- Access found over the: [".$network_name."] network using the IP: [".$target_ip."]\n";
						last;
					}
				}
			}
			if (not $anvil->data->{peer}{$short_host_name}{access}{ip})
			{
				print "[ Warning ] - Access not found!\n";
			}
		}
		
		# If exactly one of the two subnodes is reachable, we can't safely proceed with this node.
		if ((($anvil->data->{peer}{$node1_short_host_name}{access}{ip}) && (not $anvil->data->{peer}{$node2_short_host_name}{access}{ip})) or 
		    ((not $anvil->data->{peer}{$node1_short_host_name}{access}{ip}) && ($anvil->data->{peer}{$node2_short_host_name}{access}{ip})))
		{
			# Only one node online, skip this Anvil node.
			if ($anvil->data->{switches}{force})
			{
				# Skip this Anvil! system
				print "[ Warning ] - '--force' used, skipping this node.\n";
				print "[ NOTE ] - This node may not be able to communicate with the Striker dashboards until updated manually!\n";
				next;
			}
			else
			{
				print "[ Error ] - Exiting update! Please bring the missing subnode back online and try again!\n";
				$anvil->nice_exit({exit_code => 1});
			}
		}
		
		# Update the secondary first, as it should have no VMs on it.
		foreach my $host_uuid ($secondary_host_uuid, $primary_host_uuid)
		{
			# Withdraw the node from the cluster.
			my $short_host_name      = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
			my $peer_host_uuid       = $host_uuid eq $primary_host_uuid ? $secondary_host_uuid : $primary_host_uuid;
			my $peer_short_host_name = $anvil->data->{hosts}{host_uuid}{$peer_host_uuid}{short_host_name};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host_uuid'            => $host_uuid, 
				's2:short_host_name'      => $short_host_name, 
				's3:peer_host_uuid'       => $peer_host_uuid, 
				's4:peer_short_host_name' => $peer_short_host_name, 
			}});
			
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host_uuid'       => $host_uuid, 
				's2:short_host_name' => $short_host_name, 
			}});
			
			print "Preparing to update: [".$short_host_name."]. Withdrawing the subnode from the Anvil! node.\n";
			print "- [ Note ] - If the node has servers that need to be migrated off, or if the node is SyncSource for storage,\n";
			print "- [ Note ] - this could take some time to complete.\n";
			
			# Make sure VMs are off, DRBD is down and the node is out of the cluster. Call this
			# backgrounded so it doesn't get killed by the loss of the SSH connection.
			my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-stop'}." --no-db".$anvil->Log->switches()." >/dev/null 2>&1 &";
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
			
			my ($output, $error, $return_code) = $anvil->Remote->call({
				shell_call => $shell_call, 
				target     => $anvil->data->{peer}{$short_host_name}{access}{ip},
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				output      => $output,
				error       => $error,
				return_code => $return_code, 
			}});
			
			# Now wait for DRBD resources to stop (which requires VMs be off).
			print "- Waiting for all DRBD resource (and the servers using them) to stop before proceeding.\n";
			my $wait_until = time + $anvil->data->{switches}{timeout};
			my $next_log   = time + 60;
			my $waiting    = 1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				wait_until => $wait_until, 
				next_log   => $next_log, 
				waiting    => $waiting, 
			}});
			while ($waiting)
			{
				my $drbd_up      = 0;
				my $pacemaker_up = 0;
				$anvil->DRBD->get_status({
					host   => $short_host_name, 
					target => $anvil->data->{peer}{$short_host_name}{access}{ip},
				});
				
				# How many resources are up?
				my $resource_count = keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_count => $resource_count }});
				
				if ($resource_count)
				{
					# DRBD is still up.
					$drbd_up = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { drbd_up => $drbd_up }});
				}
				
				# Is pacemaker down? parse_cib() returning a problem means the subnode is out of
				# the cluster (or pacemaker is stopped), which is what we're waiting for here.
				my $problem = $anvil->Cluster->parse_cib({target => $anvil->data->{peer}{$short_host_name}{access}{ip}});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
				
				if (not $problem)
				{
					# Node is still in the cluster.
					$pacemaker_up = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pacemaker_up => $pacemaker_up }});
				}
				
				if ((not $pacemaker_up) && (not $drbd_up))
				{
					$waiting = 0;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
				}
				
				if ($waiting)
				{
					# Log which resources are still up, at most once a minute.
					if (time > $next_log)
					{
						my $say_time = $anvil->Get->date_and_time({time_only => 1});
						if ($pacemaker_up)
						{
							print "[ Note ] - [".$say_time."] - The subnode is still in the cluster.\n";
						}
						else
						{
							print "[ Note ] - [".$say_time."] - The subnode is no longer in the cluster, good.\n";
						}
						
						foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
						{
							print "[ Note ] - [".$say_time."] - The resource: [".$resource."] is still up.\n";
						}
						$next_log         = time + 60;
						my $time_left     = $wait_until - time;
						my $say_time_left = $anvil->Convert->time({
							'time'    => $time_left, 
							translate => 1, 
							long      => 0,
						});
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							next_log      => $next_log, 
							time_left     => $time_left, 
							say_time_left => $say_time_left, 
						}});
						print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
					}
					if (time > $wait_until)
					{
						# Timeout.
						print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to stop all DRBD resources and leave the cluster. Aborting the update.\n";
						$anvil->nice_exit({exit_code => 1});
					}
					
					sleep 10;
				}
			}
			
			# Build the pass-through switches for anvil-update-system.
			my $update_switches = "";
			if ($anvil->data->{switches}{'no-reboot'})
			{
				$update_switches .= " --no-reboot";
			}
			if ($anvil->data->{switches}{reboot})
			{
				$update_switches .= " --reboot";
			}
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update_switches => $update_switches }});
			
			# We register a job, even though anvil-daemon isn't running. This will get picked up
			# by 'anvil-update-system --no-db' towards the end of its run, letting us track it.
			print "- Registering a job to update the subnode, which we can track to confirm when the update is done.\n";
			$shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}.$update_switches.$anvil->Log->switches();
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
			my $job_uuid = $anvil->Database->insert_or_update_jobs({
				debug           => 2, 
				job_command     => $shell_call, 
				job_description => "job_0468", 
				job_host_uuid   => $host_uuid, 
				job_name        => "system::update-system", 
				job_progress    => 0, 
				job_title       => "job_0467"
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
			print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n";
			
			# Now call anvil-update-system with --no-db and background it so we can close
			# the DB connection without killing the process.
			print "- Calling the no-database update of: [".$short_host_name."]\n";
			$shell_call = $anvil->data->{path}{exe}{nohup}." ".$anvil->data->{path}{exe}{'anvil-update-system'}." --no-db".$update_switches;
			if ($anvil->data->{switches}{'clear-cache'})
			{
				# We'll only call clear-cache on this one.
				$shell_call .= " --clear-cache";
			}
			$shell_call .= $anvil->Log->switches()." >/dev/null 2>&1 &";
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
			
			($output, $error, $return_code) = $anvil->Remote->call({
				shell_call => $shell_call, 
				target     => $anvil->data->{peer}{$short_host_name}{access}{ip},
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				output      => $output,
				error       => $error,
				return_code => $return_code, 
			}});
			
			# Record the start time so that we can be sure the subnode has rebooted (uptime is
			# less than the current time minus this start time), if the host reboots as part of
			# the update.
			my $rebooted    = 0;
			my $reboot_time = time;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				rebooted        => $rebooted, 
				reboot_time     => $reboot_time, 
				short_host_name => $short_host_name, 
			}});
			
			# Wait for the update job to report completion (progress == 100).
			$wait_until = time + $anvil->data->{switches}{timeout};
			$waiting    = 1;
			$next_log   = time + 60;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				wait_until => $wait_until, 
				next_log   => $next_log, 
			}});
			while ($waiting)
			{
				$anvil->Job->get_job_details({job_uuid => $job_uuid});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					"jobs::job_progress" => $anvil->data->{jobs}{job_progress},
					"jobs::job_data"     => $anvil->data->{jobs}{job_data}, 
				}});
				if ($anvil->data->{jobs}{job_progress} == 100)
				{
					print "- Done! The subnode: [".$short_host_name."] has been updated\n";
					$waiting = 0;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
					
					# Did it reboot?
					if ($anvil->data->{jobs}{job_data} eq "rebooted")
					{
						$rebooted = 1;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }});
					}
					
					# Did it fail?
					if ($anvil->data->{jobs}{job_data} eq "failed")
					{
						# Abort!
						print "[ Error ] - There was a problem updating the subnode! Anvil! cluster update aborted.\n";
						$anvil->nice_exit({exit_code => 1});
					}
				}
				else
				{
					my $say_date = $anvil->Get->date_and_time({time_only => 1});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { say_date => $say_date }});
					if (time > $next_log)
					{
						print "[ Note ] - [".$say_date."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n";
						if ($anvil->data->{jobs}{job_progress} eq "0")
						{
							print "[ Note ] - [".$say_date."] - It is expected for the job to stay at '0' for a while.\n";
						}
						$next_log         = time + 60;
						my $time_left     = $wait_until - time;
						my $say_time_left = $anvil->Convert->time({
							'time'    => $time_left, 
							translate => 1, 
							long      => 0,
						});
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							next_log      => $next_log, 
							time_left     => $time_left, 
							say_time_left => $say_time_left, 
						}});
						print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
					}
					if (time > $wait_until)
					{
						# Timeout.
						print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to update. Aborting the update.\n";
						$anvil->nice_exit({exit_code => 1});
					}
					sleep 5;
				}
			}
			
			print "- Update completed successfully! Checking if a reboot is needed.\n";
			my $run_anvil_safe_start = 0;
			if ($rebooted)
			{
				print "- Rebooted! Will wait for it to come back up.\n";
				wait_for_reboot($anvil, $host_uuid, $reboot_time);
			}
			else
			{
				# No reboot happened, so we need to call 'anvil-safe-start' ourselves below.
				print "- Reboot not needed, kernel appears to be up to date.\n";
				$run_anvil_safe_start = 1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { run_anvil_safe_start => $run_anvil_safe_start }});
			}
			
			# Wait for the subnode to rejoin the cluster.
			print "- Waiting for the subnode to rejoin the node.\n";
			$wait_until      = time + $anvil->data->{switches}{timeout};
			$waiting         = 1;
			my $start_called = 0;
			$next_log        = time + 60;
			# NOTE(review): $manual_start is only ever logged below; it doesn't gate anything.
			my $manual_start = time + 60;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:wait_until'   => $wait_until, 
				's2:next_log'     => $next_log, 
				's3:manual_start' => $manual_start, 
			}});
			while($waiting)
			{
				# Should we call a start to the cluster?
				if ((not $start_called) && ($run_anvil_safe_start))
				{
					print "- Calling 'anvil-safe-start' to rejoin the subnode to the node.\n";
					$start_called  = 1;
					my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-start'}.$anvil->Log->switches()." >/dev/null 2>&1 &";
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						start_called => $start_called, 
						shell_call   => $shell_call,
					}});
					
					my ($output, $error, $return_code) = $anvil->Remote->call({
						debug      => 2,
						shell_call => $shell_call, 
						target     => $anvil->data->{peer}{$short_host_name}{access}{ip},
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						output      => $output,
						error       => $error,
						return_code => $return_code, 
					}});
				}
				
				# Pull the CIB and make sure both nodes are ready, and that DRBD resources
				# are all UpToDate if this is the reboot from the first node.
				my ($problem) = $anvil->Cluster->parse_cib({target => $anvil->data->{peer}{$short_host_name}{access}{ip}});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
				
				# Are both nodes ready?
				if (not $problem)
				{
					# Both nodes are in the cluster, but are they full members yet?
					my $both_ready = 1;
					my $node_count = 0;
					foreach my $node_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}})
					{
						my $ready = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{ready};
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							node_name => $node_name, 
							ready     => $ready, 
						}});
						if (not $ready)
						{
							$both_ready = 0;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { both_ready => $both_ready }});
						}
						$node_count++;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { node_count => $node_count }});
					}
					
					# Did we see two nodes and are both ready?
					if (($node_count == 2) && ($both_ready))
					{
						# Yes! If this is the first subnode, we need to wait for DRBD
						# to be UpToDate. If it's the second, we just wait for the
						# connections to be up.
						# NOTE: We call the peer to get the DRBD data as it's got a
						#       better view of the storage.
						print "- Both subnodes are online, will now check replicated storage.\n";
						$anvil->DRBD->get_status({
							host   => $peer_short_host_name, 
							target => $anvil->data->{peer}{$peer_short_host_name}{access}{ip},
						});
						
						if ($host_uuid eq $primary_host_uuid)
						{
							### NOTE: Should we wait for all connections
							###       to be up?
							# This is the second node, we don't have to wait.
							print "- This is the second node, no need to wait for replication to complete.\n";
							$waiting = 0;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
						}
						else
						{
							# This is the first node. Wait for all volumes to be
							# UpToDate.
							if (time > $next_log)
							{
								print "- Waiting for all volumes to be UpToDate before updating the other subnode.\n";
							}
							my $all_uptodate = 1;
							my $resources    = 0;
							foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}})
							{
								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }});
								foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}})
								{
									# We don't care about DR hosts for this upgrade
									my $peer_uuid = $anvil->Get->host_uuid_from_name({host_name => $peer_name});
									my $peer_type = $anvil->data->{hosts}{host_uuid}{$peer_uuid}{host_type};
									$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
										's1:peer_name' => $peer_name, 
										's2:peer_uuid' => $peer_uuid, 
										's3:peer_type' => $peer_type, 
									}});
									next if $peer_type ne "node";
									foreach my $volume (sort {$a <=> $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}})
									{
										# This is this subnode's disk state,
										# as the DRBD data was collected
										# from the peer.
										my $disk_state = $anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'peer-disk-state'};
										$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
											's1:volume'     => $volume, 
											's2:disk_state' => $disk_state, 
										}});
										
										if (lc($disk_state) ne "uptodate")
										{
											$all_uptodate      = 0;
											my $eta_in_seconds = $anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'estimated-seconds-to-finish'};
											$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
												all_uptodate   => $all_uptodate, 
												eta_in_seconds => $eta_in_seconds, 
											}});
											if (time > $next_log)
											{
												if ($eta_in_seconds)
												{
													print "- The resource: [".$resource."/".$volume."] is not synced yet, ETA is: [".$eta_in_seconds."] to complete resync.\n";
												}
												else
												{
													print "- The resource: [".$resource."/".$volume."] is not yet UpToDate.\n";
												}
											}
										}
									} # End foreach volume
								} # End foreach peer
							} # End foreach resource
							
							if ($all_uptodate)
							{
								print "- All resources appear to be ready.\n";
								$waiting = 0;
								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
							}
						} # End if host is first or second subnode
					} # End if both ready
					elsif (time > $next_log)
					{
						print "- Both subnodes are not online yet, still waiting.\n";
					}
				} # End if CIB was parsed
				elsif (time > $next_log)
				{
					print "- Unable to parse the node's cluster information base, will try again soon.\n";
				}
				
				if (time > $next_log)
				{
					my $say_time      = $anvil->Get->date_and_time({time_only => 1});
					$next_log         = time + 60;
					my $time_left     = $wait_until - time;
					my $say_time_left = $anvil->Convert->time({
						'time'    => $time_left, 
						translate => 1, 
						long      => 0,
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						's1:say_time'      => $say_time, 
						's2:next_log'      => $next_log, 
						's3:time_left'     => $time_left, 
						's4:say_time_left' => $say_time_left, 
					}});
					
					# Tell the user we're still waiting.
					print "- [".$say_time."] - We're still waiting for the subnode: [".$short_host_name."] to reboot.\n";
					print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
				}
				if (time > $wait_until)
				{
					# Timeout.
					print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to join the subcluster. Aborting the update.\n";
					$anvil->nice_exit({exit_code => 1});
				}
				
				if ($waiting)
				{
					sleep 5;
				}
			} # End while waiting for subnode to return
			
			# Run anvil-version-changes
			print "- Running 'anvil-version-changes'.\n";
			$output      = "";
			$error       = "";
			$return_code = "";
			$shell_call  = $anvil->data->{path}{exe}{'anvil-version-changes'};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
			if ($host_uuid eq $anvil->Get->host_uuid)
			{
				($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					output      => $output,
					return_code => $return_code, 
				}});
			}
			else
			{
				($output, $error, $return_code) = $anvil->Remote->call({
					shell_call => $shell_call, 
					target     => $anvil->data->{peer}{$short_host_name}{access}{ip},
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					output      => $output,
					error       => $error,
					return_code => $return_code, 
				}});
			}
			print "- Done!\n";
		}
	}
	
	return(0);
}
|
|
|
|
sub update_strikers_and_dr
{
	# This updates the OS on all Striker dashboards and then all DR hosts (Anvil! nodes are handled
	# elsewhere). For each machine; the package cache is optionally cleared, the OS is updated, the
	# machine is rebooted if needed (and allowed by the switches), and 'anvil-version-changes' is
	# run to report what changed.
	#
	# Parameters;
	# - $anvil - The Anvil::Tools handle.
	#
	# This exits via 'nice_exit' on timeout or update failure. Otherwise, returns 0.
	my ($anvil) = @_;
	
	# Before we start, normalize the '--timeout' switch into seconds. The suffixes 'h' (hours) and
	# 'm' (minutes) are supported, and a bare number is taken to be seconds already. Anything else
	# is invalid and is replaced with the default of 24 hours.
	if ($anvil->data->{switches}{timeout})
	{
		if ($anvil->data->{switches}{timeout} =~ /^(\d+)h/i)
		{
			my $hours = $1;
			$anvil->data->{switches}{timeout} = $hours * 3600;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				hours               => $hours, 
				"switches::timeout" => $anvil->data->{switches}{timeout}, 
			}});
		}
		elsif ($anvil->data->{switches}{timeout} =~ /^(\d+)m/i)
		{
			my $minutes = $1;
			$anvil->data->{switches}{timeout} = $minutes * 60;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				minutes             => $minutes, 
				"switches::timeout" => $anvil->data->{switches}{timeout}, 
			}});
		}
		elsif ($anvil->data->{switches}{timeout} !~ /^\d+$/)
		{
			# Not hours, minutes or a plain number of seconds, set the default. (Previously, a
			# plain number of seconds also landed here and was clobbered to the default.)
			print "[ Warning ] - The passed timeout: [".$anvil->data->{switches}{timeout}."] is invalid, setting it to 24 hours.\n";
			$anvil->data->{switches}{timeout} = 86400;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				"switches::timeout" => $anvil->data->{switches}{timeout}, 
			}});
		}
	}
	else
	{
		# No timeout requested, default to 24 hours.
		$anvil->data->{switches}{timeout} = 86400;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			"switches::timeout" => $anvil->data->{switches}{timeout}, 
		}});
	}
	
	# Make sure the timeout is now a pure number of seconds. This should always pass after the
	# normalization above; it's kept as a safety net.
	if ($anvil->data->{switches}{timeout})
	{
		if ($anvil->data->{switches}{timeout} =~ /\D/)
		{
			# Invalid, error out.
			print "The --timeout switch was used: [".$anvil->data->{switches}{timeout}."], but the value isn't a number of seconds.\n";
			$anvil->nice_exit({exit_code => 1});
		}
	}
	
	# Strikers are updated before DR hosts.
	foreach my $host_type ("striker", "dr")
	{
		foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
		{
			my $host_uuid       = $anvil->data->{sys}{hosts}{by_name}{$host_name};
			my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
			my $this_host_type  = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host_name'       => $host_name, 
				's2:host_uuid'       => $host_uuid, 
				's3:short_host_name' => $short_host_name, 
				's4:this_host_type'  => $this_host_type, 
			}});
			next if $this_host_type ne $host_type;
			
			if ($host_type eq "striker")
			{
				print "Starting the update of the Striker dashboard: [".$short_host_name."].\n";
			}
			else
			{
				print "Starting the update of the DR host: [".$short_host_name."].\n";
			}
			
			# If this is the local system, set the variable to track if we need to reboot.
			# Otherwise, see if we have access to the peer.
			if ($host_uuid eq $anvil->Get->host_uuid)
			{
				$anvil->data->{sys}{reboot_needed} = 0;
			}
			elsif (not $anvil->data->{peer}{$short_host_name}{access}{ip})
			{
				if ($host_type eq "striker")
				{
					print "- No access to the Striker dashboard: [".$short_host_name."], skipping.\n";
				}
				else
				{
					print "- No access to the DR host: [".$short_host_name."], skipping.\n";
				}
				next;
			}
			
			# Record the start time so that we can be sure the subnode has rebooted (uptime is
			# less than the current time minus this start time), if the host reboots as part of
			# the update.
			my $reboot_time = time;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_time => $reboot_time }});
			
			print "- Beginning OS update of: [".$short_host_name."]\n";
			my $rebooted = 0;
			if (($anvil->data->{switches}{'clear-cache'}) && ($host_uuid eq $anvil->Get->host_uuid))
			{
				# Clear the local dnf cache. Remote machines get '--clear-cache' passed to
				# 'anvil-update-system' below instead.
				my $shell_call = $anvil->data->{path}{exe}{dnf}." clean all";
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
				my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					output      => $output, 
					return_code => $return_code, 
				}});
				print "- Cache cleared.\n";
			}
			print "- Calling update now.\n";
			print "- NOTE: This can seem like it's hung! You can watch the progress using 'journalctl -f' on another terminal to\n";
			print "- watch the progress via the system logs. You can also check with 'ps aux | grep dnf'.\n";
			if ($host_uuid eq $anvil->Get->host_uuid)
			{
				# This is the local machine, update it directly.
				my $shell_call = $anvil->data->{path}{exe}{dnf}." -y update";
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
				my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					output      => $output, 
					return_code => $return_code, 
				}});
				if ($return_code)
				{
					print "[ Error ] - There was a problem updating the system! Expected a return code of '0', but got: [".$return_code."]\n";
					print "[ Error ] - The output, if any, was\n";
					print "==] Output [==\n";
					print $output."\n";
					print "==============\n";
				}
				
				# Count how many packages changed, from dnf's '... N Packages' summary lines.
				my $package_changes = 0;
				foreach my $line (split/\n/, $output)
				{
					$line = $anvil->Words->clean_spaces({string => $line});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }});
					
					if ($line =~ / (\d+) Packages$/i)
					{
						$package_changes += $1;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { package_changes => $package_changes }});
					}
				}
				
				# Did the user want to reboot on any update?
				if (($package_changes) && ($anvil->data->{switches}{reboot}) && ($anvil->data->{switches}{'reboot-self'}))
				{
					# Reboot needed
					print "- Updated: [".$package_changes."] packages, and '--reboot --reboot-self' used, reboot needed!\n";
					$anvil->data->{sys}{reboot_needed} = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						"sys::reboot_needed" => $anvil->data->{sys}{reboot_needed}, 
					}});
				}
				
				# Get the newest installed kernel
				$shell_call = $anvil->data->{path}{exe}{rpm}." -q kernel | ".$anvil->data->{path}{exe}{'sort'}." | ".$anvil->data->{path}{exe}{tail}." -n 1";
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
				(my $installed_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					installed_kernel => $installed_kernel, 
					return_code      => $return_code, 
				}});
				# Reduce 'kernel-X.Y.Z-R.<arch>...' to 'X.Y.Z-R'. (Dots are now escaped so the
				# pattern can't match stray characters.)
				$installed_kernel =~ s/^kernel-(\d+\.\d+\.\d+-\d+)\..*$/$1/;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { installed_kernel => $installed_kernel }});
				
				# Get the running kernel
				$shell_call = $anvil->data->{path}{exe}{uname}." -r";
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
				(my $active_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					active_kernel => $active_kernel, 
					return_code   => $return_code, 
				}});
				$active_kernel =~ s/(\d+\.\d+\.\d+-\d+)\..*$/$1/;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active_kernel => $active_kernel }});
				
				if ($installed_kernel eq $active_kernel)
				{
					print "- The kernel has not been updated.\n";
				}
				else
				{
					print "- The kernel appears to have been upgraded, reboot needed!\n";
					$anvil->data->{sys}{reboot_needed} = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						"sys::reboot_needed" => $anvil->data->{sys}{reboot_needed}, 
					}});
				}
			}
			else
			{
				# Call anvil-update-system and then wait.
				print "- Beginning OS update of: [".$short_host_name."]\n";
				if ($host_type eq "dr")
				{
					# Make sure VMs are off and DRBD is down. Call this with nohup so it
					# doesn't get killed by the loss of the SSH connection.
					my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-stop'}." --no-db".$anvil->Log->switches()." >/dev/null 2>&1 &";
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
					
					my ($output, $error, $return_code) = $anvil->Remote->call({
						shell_call => $shell_call, 
						target     => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						output      => $output, 
						error       => $error, 
						return_code => $return_code, 
					}});
					
					# Now wait for DRBD resources to stop (which requires VMs be off).
					print "- Waiting for all DRBD resource (and the servers using them) to stop before proceeding.\n";
					my $wait_until = time + $anvil->data->{switches}{timeout};
					my $next_log   = time + 60;
					my $waiting    = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						wait_until => $wait_until, 
						next_log   => $next_log, 
						waiting    => $waiting, 
					}});
					while ($waiting)
					{
						$anvil->DRBD->get_status({
							host   => $short_host_name, 
							target => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
						});
						
						# How many resources are up?
						my $resource_count = keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}};
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_count => $resource_count }});
						
						if (not $resource_count)
						{
							# Done!
							$waiting = 0;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
						}
						
						if ($waiting)
						{
							# Log which resources are still up, once a minute.
							if (time > $next_log)
							{
								foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
								{
									print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The resource: [".$resource."] is still up.\n";
								}
								$next_log         = time + 60;
								my $time_left     = $wait_until - time;
								my $say_time_left = $anvil->Convert->time({
									'time'    => $time_left, 
									translate => 1, 
									long      => 0,
								});
								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
									next_log      => $next_log, 
									time_left     => $time_left, 
									say_time_left => $say_time_left, 
								}});
								print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
							}
							if (time > $wait_until)
							{
								# Timeout.
								print "[ Error ] - Timed out while waiting for the DR host: [".$short_host_name."] to stop all DRBD resources. Aborting the update.\n";
								$anvil->nice_exit({exit_code => 1});
							}
							
							sleep 10;
						}
					}
				}
				
				# Pass the reboot policy switches through to 'anvil-update-system'.
				my $update_switches = "";
				if ($anvil->data->{switches}{'no-reboot'})
				{
					$update_switches .= " --no-reboot";
				}
				if ($anvil->data->{switches}{reboot})
				{
					$update_switches .= " --reboot";
				}
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update_switches => $update_switches }});
				
				# We register a job, even though anvil-daemon isn't running. This will get
				# picked up by 'anvil-update-systems --no-db' towards the end of it's run.
				print "- Registering a job to update the system, which we can track to confirm when the update is done.\n";
				my $shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}.$update_switches.$anvil->Log->switches();
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
				my $job_uuid = $anvil->Database->insert_or_update_jobs({
					debug           => 2, 
					job_command     => $shell_call, 
					job_description => "job_0468", 
					job_host_uuid   => $host_uuid, 
					job_name        => "system::update-system", 
					job_progress    => 0, 
					job_title       => "job_0467"
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
				print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n";
				
				# Now call anvil-update-system with --no-db and background it so we can close
				# the DB connection without killing the process.
				print "- Calling the no-database update of: [".$short_host_name."]\n";
				$shell_call = $anvil->data->{path}{exe}{nohup}." ".$anvil->data->{path}{exe}{'anvil-update-system'}." --no-db".$update_switches;
				if ($anvil->data->{switches}{'clear-cache'})
				{
					# We'll only call clear-cache on this one.
					$shell_call .= " --clear-cache";
				}
				$shell_call .= $anvil->Log->switches()." >/dev/null 2>&1 &";
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
				
				my ($output, $error, $return_code) = $anvil->Remote->call({
					shell_call => $shell_call, 
					target     => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					output      => $output, 
					error       => $error, 
					return_code => $return_code, 
				}});
				
				# Verify / wait until the update is done (job progress hits 100).
				my $wait_until = time + $anvil->data->{switches}{timeout};
				my $waiting    = 1;
				my $next_log   = time + 60;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
				while ($waiting)
				{
					$anvil->Job->get_job_details({job_uuid => $job_uuid});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						"jobs::job_progress" => $anvil->data->{jobs}{job_progress}, 
						"jobs::job_data"     => $anvil->data->{jobs}{job_data}, 
					}});
					if ($anvil->data->{jobs}{job_progress} == 100)
					{
						print "- Done! The host: [".$short_host_name."] has been updated\n";
						$waiting = 0;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
						
						# Did it reboot?
						if ($anvil->data->{jobs}{job_data} eq "rebooted")
						{
							$rebooted = 1;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }});
						}
						
						# Did it fail?
						if ($anvil->data->{jobs}{job_data} eq "failed")
						{
							# Abort!
							print "[ Error ] - There was a problem updating the system! Anvil! cluster update aborted.\n";
							$anvil->nice_exit({exit_code => 1});
						}
					}
					else
					{
						if (time > $next_log)
						{
							my $say_date = $anvil->Get->date_and_time({time_only => 1});
							print "[ Note ] - [".$say_date."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n";
							if ($anvil->data->{jobs}{job_progress} == 0)
							{
								print "[ Note ] - [".$say_date."] - It is normal for the job to show '0' progress until the database access is restored.\n";
							}
							$next_log         = time + 60;
							my $time_left     = $wait_until - time;
							my $say_time_left = $anvil->Convert->time({
								'time'    => $time_left, 
								translate => 1, 
								long      => 0,
							});
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
								next_log      => $next_log, 
								time_left     => $time_left, 
								say_time_left => $say_time_left, 
							}});
							print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
						}
						if (time > $wait_until)
						{
							# Timeout.
							print "[ Error ] - Timed out while waiting for the machine: [".$short_host_name."] to update the OS. Aborting the update.\n";
							$anvil->nice_exit({exit_code => 1});
						}
						sleep 5;
					}
				}
			}
			
			if ($rebooted)
			{
				print "- Rebooted! Will wait for it to come back up.\n";
				wait_for_reboot($anvil, $host_uuid, $reboot_time);
			}
			else
			{
				print "- Reboot not needed, kernel appears to be up to date.\n";
			}
			
			# Run anvil-version-changes
			print "- Running 'anvil-version-changes' now.\n";
			my $output      = "";
			my $error       = "";
			my $return_code = "";
			my $shell_call  = $anvil->data->{path}{exe}{'anvil-version-changes'};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
			if ($host_uuid eq $anvil->Get->host_uuid)
			{
				($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					output      => $output, 
					return_code => $return_code, 
				}});
			}
			else
			{
				($output, $error, $return_code) = $anvil->Remote->call({
					'close'    => 1, 
					no_cache   => 1, 
					shell_call => $shell_call, 
					target     => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					output      => $output, 
					error       => $error, 
					return_code => $return_code, 
				}});
			}
		}
	}
	
	return(0);
}
|
|
|
|
sub wait_for_reboot
{
	# This waits for a rebooting machine to come back online. It repeatedly tests SSH access and,
	# once reachable, compares the machine's uptime against the time since '$reboot_time'; the
	# reboot is only considered complete when the uptime is *less* than that span (proving the
	# machine actually went down and came back, rather than never rebooting at all).
	#
	# Parameters;
	# - $anvil       - The Anvil::Tools handle.
	# - $host_uuid   - The host_uuid of the machine we're waiting on.
	# - $reboot_time - The unix timestamp recorded just before the reboot was triggered.
	#
	# This exits via 'nice_exit' if 'switches::timeout' seconds pass first. Otherwise, returns 0.
	my ($anvil, $host_uuid, $reboot_time) = @_;
	my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		's1:host_uuid'       => $host_uuid, 
		's2:short_host_name' => $short_host_name, 
	}});
	
	# Refresh how we can reach the target (this repopulates the network access data used below).
	my $matches = $anvil->Network->find_access({
		debug  => 2, 
		target => $host_uuid, 
	});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }});
	
	# Wait until the node comes back up.
	print "- The target has been rebooted. We'll wait for the target to come back online.\n";
	
	# Loop until the machine is back, or until 'switches::timeout' seconds have passed.
	my $wait_until = time + $anvil->data->{switches}{timeout};
	my $waiting    = 1;
	my $next_log   = time + 60;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
	while($waiting)
	{
		# Test access
		my $target = $anvil->data->{peer}{$short_host_name}{access}{ip};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			target          => $target, 
			short_host_name => $short_host_name, 
		}});
		my $test_access = $anvil->Remote->test_access({target => $target});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_access => $test_access }});
		
		if ($test_access)
		{
			# What's the machine's uptime?
			my $uptime            = $anvil->Get->uptime({debug => 2, target => $anvil->data->{peer}{$short_host_name}{access}{ip}});
			my $time_since_reboot = time - $reboot_time;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				uptime            => $uptime, 
				time_since_reboot => $time_since_reboot, 
				short_host_name   => $short_host_name, 
			}});
			
			# An uptime shorter than the time since we started means the machine went down
			# and came back up.
			if (($uptime) && ($uptime < $time_since_reboot))
			{
				# Rebooted!
				print "- Rebooted! Subnode is back up.\n";
				
				$waiting = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
			}
		}
		
		if ($waiting)
		{
			# Update the user once a minute.
			if (time > $next_log)
			{
				my $say_time      = $anvil->Get->date_and_time({time_only => 1});
				$next_log         = time + 60;
				my $time_left     = $wait_until - time;
				my $say_time_left = $anvil->Convert->time({
					'time'    => $time_left, 
					translate => 1, 
					long      => 0,
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					's1:say_time'      => $say_time, 
					's2:next_log'      => $next_log, 
					's3:time_left'     => $time_left, 
					's4:say_time_left' => $say_time_left, 
				}});
				
				# Tell the user we're still waiting.
				print "- [".$say_time."] - We're still waiting for the subnode: [".$short_host_name."] to reboot.\n";
				print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
			}
			if (time > $wait_until)
			{
				# Timeout.
				print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to reboot. Aborting the update.\n";
				$anvil->nice_exit({exit_code => 1});
			}
			
			sleep 5;
		}
	}
	
	return(0);
}
|
|
|
|
sub manage_daemons
{
	# This starts ('systemctl enable --now') or stops ('systemctl stop') the Anvil! daemons
	# ('anvil-daemon' and 'scancore') on all known hosts, in the order; DR hosts, nodes, Strikers.
	# Hosts we have no access to are skipped.
	#
	# Parameters;
	# - $anvil - The Anvil::Tools handle.
	# - $task  - Either "start" or "stop". Anything false defaults to "start".
	#
	# Returns; 0
	my ($anvil, $task) = @_;
	
	$task = "start" if not $task;
	
	# "start" enables the units persistently as well as starting them now.
	my $do_task = $task eq "start" ? "enable --now" : "stop";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { do_task => $do_task }});
	
	if ($task eq "stop")
	{
		print "Disabling Anvil! daemons on all hosts...\n";
	}
	else
	{
		print "Enabling Anvil! daemons on all hosts...\n";
	}
	my $daemons = ["anvil-daemon", "scancore"];
	foreach my $host_type ("dr", "node", "striker")
	{
		foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
		{
			my $host_uuid       = $anvil->data->{sys}{hosts}{by_name}{$host_name};
			my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
			my $this_host_type  = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host_name'       => $host_name, 
				's2:host_uuid'       => $host_uuid, 
				's3:short_host_name' => $short_host_name, 
				's4:this_host_type'  => $this_host_type, 
			}});
			next if $host_type ne $this_host_type;
			
			if ($task eq "stop")
			{
				print "- Disabling daemons on: [".$short_host_name."]... ";
			}
			else
			{
				print "- Enabling daemons on: [".$short_host_name."]... ";
			}
			if (not $anvil->data->{peer}{$short_host_name}{access}{ip})
			{
				print "Offline! Skipping.\n";
				next;
			}
			
			# Apply the task to each daemon, locally or over SSH depending on the host.
			foreach my $daemon (@{$daemons})
			{
				my $shell_call = $anvil->data->{path}{exe}{systemctl}." ".$do_task." ".$daemon;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
				
				my $output      = "";
				my $error       = "";
				my $return_code = 999;
				if ($host_uuid eq $anvil->Get->host_uuid)
				{
					# Local
					($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						output      => $output, 
						return_code => $return_code, 
					}});
				}
				else
				{
					# Remote, it'll be a while before we hit some clients, so close this
					# connection so later access to the machines don't fail with ssh
					# connection timeouts.
					($output, $error, $return_code) = $anvil->Remote->call({
						'close'    => 1, 
						no_cache   => 1, 
						shell_call => $shell_call, 
						target     => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						output      => $output, 
						error       => $error, 
						return_code => $return_code, 
					}});
				}
				# A zero return code from systemctl means the task succeeded.
				if (not $return_code)
				{
					if ($task eq "stop")
					{
						print $daemon." stopped... ";
					}
					else
					{
						print $daemon." started... ";
					}
				}
				else
				{
					if ($task eq "stop")
					{
						print $daemon." didn't stop!... ";
					}
					else
					{
						print $daemon." didn't start!... ";
					}
				}
			}
			print "Done!\n";
		}
	}
	
	return(0);
}
|
|
|
|
sub verify_access
{
	# This confirms we have SSH access to every known host, preferring networks in the order;
	# BCN, MN, IFN, SN, then any. The first IP that answers is stored in;
	# - peer::<short_host_name>::access::ip
	# - peer::<short_host_name>::access::network
	#
	# Parameters;
	# - $anvil - The Anvil::Tools handle.
	#
	# Returns; 1 if all hosts are accessible, 0 if any host could not be reached.
	my ($anvil) = @_;
	
	# Load host and Anvil! data.
	$anvil->Database->get_hosts();
	
	# Make sure all are available before we start.
	my $all_access = 1;
	foreach my $host_type ("dr", "node", "striker")
	{
		foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
		{
			my $host_uuid       = $anvil->data->{sys}{hosts}{by_name}{$host_name};
			my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
			my $this_host_type  = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host_name'       => $host_name, 
				's2:host_uuid'       => $host_uuid, 
				's3:short_host_name' => $short_host_name, 
				's4:this_host_type'  => $this_host_type, 
			}});
			next if $host_type ne $this_host_type;
			
			print "- Verifying access to: [".$short_host_name."]... ";
			my $matches = $anvil->Network->find_access({
				debug  => 2, 
				target => $host_uuid, 
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }});
			
			$anvil->data->{peer}{$short_host_name}{access}{ip}      = "";
			$anvil->data->{peer}{$short_host_name}{access}{network} = "";
			foreach my $preferred_network ("bcn", "mn", "ifn", "sn", "any")
			{
				next if $anvil->data->{peer}{$short_host_name}{access}{ip};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }});
				foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}})
				{
					# Stop probing once we've found a working IP, so we don't waste SSH
					# connections or overwrite the first match with a later one.
					next if $anvil->data->{peer}{$short_host_name}{access}{ip};
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { network_name => $network_name }});
					if (($network_name !~ /^$preferred_network/) && ($preferred_network ne "any"))
					{
						next;
					}
					
					my $target_ip   = $anvil->data->{network_access}{$network_name}{target_ip_address};
					my $test_access = $anvil->Remote->test_access({
						'close' => 1, 
						target  => $target_ip, 
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						's1:target_ip'   => $target_ip, 
						's2:test_access' => $test_access, 
					}});
					
					if ($test_access)
					{
						# We're good.
						print "Connected on: [".$target_ip."] via: [".$network_name."]\n";
						$anvil->data->{peer}{$short_host_name}{access}{ip}      = $target_ip;
						$anvil->data->{peer}{$short_host_name}{access}{network} = $network_name;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							"s1:peer::${short_host_name}::access::ip"      => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
							"s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network}, 
						}});
					}
				}
			}
			
			if (not $anvil->data->{peer}{$short_host_name}{access}{ip})
			{
				print "No access! Skipping.\n";
				$all_access = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_access => $all_access }});
			}
		}
	}
	
	return($all_access);
}
|