#!/usr/bin/perl
# 
# This program will disable our daemons on all machines, then update each striker. It then walks through all
# DR hosts and Anvil! nodes. With nodes, it migrates servers to the peer, takes the node out of the cluster, 
# updates it, reboots if the kernel was updated, and then rejoins the cluster, migrates the VMs and then does
# the same process on the peer sub-node.
# 
# Exit codes;
# 0 = Normal exit.
# 1 = Any error; no database connection, not run as root, not run on a Striker, hosts not accessible, a 
#     timeout while waiting on a subnode, or a failed subnode update.
# 
# TODO: 
# 
# USAGE:
# 

use strict;
use warnings;
use Anvil::Tools;
require POSIX;
use Term::Cap;
use Text::Diff;
use Data::Dumper;

my $THIS_FILE         = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $anvil = Anvil::Tools->new();

# Read the command line switches.
$anvil->Get->switches({list => [
	"clear-cache", 
	"force", 
	"no-reboot", 
	"reboot", 
	"reboot-self", 
	"timeout", 
	"y", 
	"yes"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});

# Connect to the database(s). We can't do anything useful without at least one connection.
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
	# No databases, log the error, sleep for a bit and then exit. The daemon will pick it up and try
	# again after we exit.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0305"});
	sleep 10;
	$anvil->nice_exit({exit_code => 1});
}

# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0))
{
	# Not root
	print $anvil->Words->string({key => "error_0005"})."\n";
	$anvil->nice_exit({exit_code => 1});
}

# Make sure we're a striker.
if ($anvil->Get->host_type ne "striker")
{
	print "This has to be run on a Striker dashboard.\n";
	$anvil->nice_exit({exit_code => 1});
}

# If we were given a job-uuid, load the job and mark it as picked up. Otherwise we run interactively.
$anvil->data->{sys}{progress} = 0;
if ($anvil->data->{switches}{'job-uuid'})
{
	# Load the job data.
	$anvil->Job->clear();
	$anvil->Job->get_job_details({debug => 2});
	$anvil->Job->update_progress({
		progress         => $anvil->data->{sys}{progress}++,
		job_picked_up_by => $$, 
		job_picked_up_at => time, 
		'print'          => 1, 
		message          => "message_0319", 
	});
}

# Update beginning. Verifying all known machines are accessible...
$anvil->Job->update_progress({
	'print'  => 1, 
	progress => $anvil->data->{sys}{progress}++,
	message  => "job_0469", 
});
my $all_access = verify_access($anvil);
if ((not $all_access) && (not $anvil->data->{switches}{force}))
{
	print "[ Error ] - Not all systems are accessible. Update aborted!\n";
	$anvil->nice_exit({exit_code => 1});
}
print "Success!\n";

if (($anvil->data->{switches}{y}) or ($anvil->data->{switches}{yes}))
{
	print "[ Note ] - Proceeding without confirmation, '-y' or '--yes' used.\n";
}
else
{
	print "[ Note ] - All nodes need to be up and running for the update to run on nodes.\n".
	      "[ Note ] - Any out-of-sync storage needs to complete before a node can be updated.\n".
	      "[ Warning ] - Servers will be migrated between subnodes, which can cause reduced performance during\n".
	      "[ Warning ] - the these migrations. If a sub-node is not active, it will be activated as part of the\n".
	      "[ Warning ] - upgrade process.\n";
	print "\n".$anvil->Words->string({key => "message_0021"})."\n";
	
	# Read the user's answer. If STDIN is closed, treat it as a "no" instead of warning on undef.
	my $answer = <STDIN>;
	   $answer = "" if not defined $answer;
	chomp $answer;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { answer => $answer }});
	
	if ($answer =~ /^y/i)
	{
		print $anvil->Words->string({key => "message_0175"})."\n";
	}
	else
	{
		print $anvil->Words->string({key => "message_0022"})."\n";
		$anvil->nice_exit({exit_code => 0});
	}
}

# Stop our daemons everywhere before touching any packages.
manage_daemons($anvil, "stop");

# Update the Striker dashboards and DR hosts. This also parses/validates the '--timeout' switch, so it must 
# run before 'update_nodes()'.
update_strikers_and_dr($anvil);

# Update the Anvil! nodes, one subnode at a time.
update_nodes($anvil);

manage_daemons($anvil, "start");

print "Updates complete!\n";

my $host_uuid       = $anvil->Get->host_uuid;
my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
	's1:host_uuid'       => $host_uuid,
	's2:short_host_name' => $short_host_name, 
}});
if ($anvil->data->{sys}{reboot_needed})
{
	if ($anvil->data->{switches}{'reboot-self'})
	{
		print "[ Note ] - The local system needs to be rebooted, and '--reboot-self' was used. Rebooting in 60 seconds! Use ctrl+c to abort!\n";
		my $waiting = 60;
		while ($waiting > 0)
		{
			print $waiting.", ";
			sleep 5;
			$waiting -= 5;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
		}
		print "\nRebooting now!\n";
		
		my $shell_call = $anvil->data->{path}{exe}{systemctl}." reboot";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
		my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			output      => $output, 
			return_code => $return_code, 
		}});
		print "Reboot requested, exiting.\n";
	}
	else
	{
		print "[ Note ] - This host needs to be rebooted to activate the new kernel. Please reboot as soon as you can.\n";
	}
}

$anvil->nice_exit({exit_code => 0});


#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################

# This updates the subnodes of every known Anvil! node, one subnode at a time. 
# 
# For each Anvil! node, access to both subnodes is (re)verified. Then, secondary subnode first, each subnode 
# is withdrawn from the cluster ('anvil-safe-stop'), updated ('anvil-update-system --no-db', tracked via a 
# registered job), optionally rebooted, rejoined to the cluster and finally 'anvil-version-changes' is run.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle. 'switches::timeout' must already be a number of seconds.
# 
# Returns '0'. On a timeout, a failed update, or when only one subnode is reachable and '--force' is not 
# used, this exits the program with exit code '1'.
sub update_nodes
{
	my ($anvil) = @_;
	
	# Here, we loop through anvil systems, and find which sub nodes will be updated first, and which will
	# be updated second.
	foreach my $anvil_name (sort {$a cmp $b} keys %{$anvil->data->{anvils}{anvil_name}})
	{
		my $anvil_uuid            = $anvil->data->{anvils}{anvil_name}{$anvil_name}{anvil_uuid};
		my $anvil_description     = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_description};
		my $anvil_node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid};
		my $anvil_node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid};
		my $primary_host_uuid     = $anvil->Cluster->get_primary_host_uuid({anvil_uuid => $anvil_uuid});
		   $primary_host_uuid     = $anvil_node1_host_uuid if not $primary_host_uuid;
		my $secondary_host_uuid   = $primary_host_uuid eq $anvil_node1_host_uuid ? $anvil_node2_host_uuid : $anvil_node1_host_uuid;
		my $node1_short_host_name = $anvil->data->{hosts}{host_uuid}{$anvil_node1_host_uuid}{short_host_name};
		my $node2_short_host_name = $anvil->data->{hosts}{host_uuid}{$anvil_node2_host_uuid}{short_host_name};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			's1:anvil_name'            => $anvil_name,
			's2:anvil_uuid'            => $anvil_uuid, 
			's3:anvil_description'     => $anvil_description, 
			's4:anvil_node1_host_uuid' => $anvil_node1_host_uuid, 
			's5:anvil_node2_host_uuid' => $anvil_node2_host_uuid, 
			's6:primary_host_uuid'     => $primary_host_uuid, 
			's7:secondary_host_uuid'   => $secondary_host_uuid, 
			's8:node1_short_host_name' => $node1_short_host_name, 
			's9:node2_short_host_name' => $node2_short_host_name, 
		}});
		
		# Before we proceed, are both nodes online? If so, great. If not, are both offline? If only
		# one is online, abort. Check now in case things have changed since our first scan
		print "Preparing to update the Anvil! node: [".$anvil_name."]. Verifying subnode access:\n";
		foreach my $host_uuid ($secondary_host_uuid, $primary_host_uuid)
		{
			my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host_uuid'       => $host_uuid,
				's2:short_host_name' => $short_host_name, 
			}});
			
			print "- Verifying access to subnode: [".$short_host_name."]\n";
			my $matches = $anvil->Network->find_access({
				debug  => 2, 
				target => $host_uuid,
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }});
			
			$anvil->data->{peer}{$short_host_name}{access}{ip}      = "";
			$anvil->data->{peer}{$short_host_name}{access}{network} = "";
			
			# Walk the networks in order of preference and stop at the first one that works. The 
			# label matters; a bare 'last' would only leave the inner loop, and the later 
			# preferences (ending with 'any') would then re-test and overwrite the preferred match.
			NETWORK: foreach my $preferred_network ("bcn", "mn", "ifn", "sn", "any")
			{
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }});
				foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}})
				{
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { network_name => $network_name }});
					if (($network_name !~ /^$preferred_network/) && ($preferred_network ne "any"))
					{
						next;
					}
					
					my $target_ip   = $anvil->data->{network_access}{$network_name}{target_ip_address};
					my $test_access = $anvil->Remote->test_access({target => $target_ip});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						's1:target_ip'   => $target_ip, 
						's2:test_access' => $test_access, 
					}});
					
					if ($test_access)
					{
						# We're good.
						$anvil->data->{peer}{$short_host_name}{access}{ip}      = $target_ip;
						$anvil->data->{peer}{$short_host_name}{access}{network} = $network_name;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							"s1:peer::${short_host_name}::access::ip"      => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
							"s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network}, 
						}});
						print "- Access found over the: [".$network_name."] network using the IP: [".$target_ip."]\n";
						last NETWORK;
					}
				}
			}
			
			if (not $anvil->data->{peer}{$short_host_name}{access}{ip})
			{
				print "[ Warning ] - Access not found!\n";
			}
		}
		
		my $node1_access_ip = $anvil->data->{peer}{$node1_short_host_name}{access}{ip};
		my $node2_access_ip = $anvil->data->{peer}{$node2_short_host_name}{access}{ip};
		if ((not $node1_access_ip) && (not $node2_access_ip))
		{
			# Neither subnode is reachable, so there is nothing we can update here. Proceeding 
			# would issue remote calls against an empty target and then wait for the full timeout.
			print "[ Warning ] - Neither subnode is accessible, skipping this node.\n";
			next;
		}
		elsif ((not $node1_access_ip) or (not $node2_access_ip))
		{
			# Only one node online, skip this Anvil node.
			if ($anvil->data->{switches}{force})
			{
				# Skip this Anvil! system
				print "[ Warning ] - '--force' used, skipping this node.\n";
				print "[ NOTE ] - This node may not be able to communicate with the Striker dashboards until updated manually!\n";
				next;
			}
			else
			{
				print "[ Error ] - Exiting update! Please bring the missing subnode back online and try again!\n";
				$anvil->nice_exit({exit_code => 1});
			}
		}
		
		# Update the secondary first, as it should have no VMs on it.
		foreach my $host_uuid ($secondary_host_uuid, $primary_host_uuid)
		{
			# Withdraw the node from the cluster.
			my $short_host_name      = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
			my $peer_host_uuid       = $host_uuid eq $primary_host_uuid ? $secondary_host_uuid : $primary_host_uuid;
			my $peer_short_host_name = $anvil->data->{hosts}{host_uuid}{$peer_host_uuid}{short_host_name};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host_uuid'            => $host_uuid,
				's2:short_host_name'      => $short_host_name, 
				's3:peer_host_uuid'       => $peer_host_uuid, 
				's4:peer_short_host_name' => $peer_short_host_name, 
			}});
			
			print "Preparing to update: [".$short_host_name."]. Withdrawing the subnode from the Anvil! node.\n";
			print "- [ Note ] - If the node has servers that need to be migrated off, or if the node is SyncSource for storage,\n";
			print "- [ Note ] - this could take some time to complete.\n";
			
			# Make sure VMs are off, DRBD is down and the node is out of the cluster. The call is 
			# backgrounded on the target so that we don't block on it; we poll for completion below.
			# NOTE(review): The original comment said this is called with 'nohup', but the command 
			#               as built does not use it - confirm whether it should.
			my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-stop'}." --no-db".$anvil->Log->switches()." >/dev/null 2>&1 &";
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
			my ($output, $error, $return_code) = $anvil->Remote->call({
				shell_call => $shell_call, 
				target     => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				output      => $output, 
				error       => $error,
				return_code => $return_code, 
			}});
			
			# Now wait for DRBD resources to stop (which requires VMs be off).
			print "- Waiting for all DRBD resource (and the servers using them) to stop before proceeding.\n";
			my $wait_until = time + $anvil->data->{switches}{timeout};
			my $next_log   = time + 60;
			my $waiting    = 1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				wait_until => $wait_until, 
				next_log   => $next_log, 
				waiting    => $waiting, 
			}});
			while ($waiting)
			{
				my $drbd_up      = 0;
				my $pacemaker_up = 0;
				$anvil->DRBD->get_status({
					host   => $short_host_name,
					target => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
				});
				
				# How may resources are up?
				my $resource_count = keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_count => $resource_count }});
				if ($resource_count)
				{
					# DRBD is still up.
					$drbd_up = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { drbd_up => $drbd_up }});
				}
				
				# Is pacemaker down? A parse problem is taken to mean it is.
				my $problem = $anvil->Cluster->parse_cib({target => $anvil->data->{peer}{$short_host_name}{access}{ip}});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
				if (not $problem)
				{
					# Node is still in the cluster.
					$pacemaker_up = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pacemaker_up => $pacemaker_up }});
				}
				
				if ((not $pacemaker_up) && (not $drbd_up))
				{
					$waiting = 0;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
				}
				
				if ($waiting)
				{
					# Log which resources are still up
					if (time > $next_log)
					{
						my $say_time = $anvil->Get->date_and_time({time_only => 1});
						if ($pacemaker_up)
						{
							print "[ Note ] - [".$say_time."] - The subnode is still in the cluster.\n";
						}
						else
						{
							print "[ Note ] - [".$say_time."] - The subnode is no longer in the cluster, good.\n";
						}
						foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
						{
							print "[ Note ] - [".$say_time."] - The resource: [".$resource."] is still up.\n";
						}
						
						$next_log = time + 60;
						my $time_left     = $wait_until - time;
						my $say_time_left = $anvil->Convert->time({
							'time'    => $time_left,
							translate => 1,
							long      => 0,
						});
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							next_log      => $next_log, 
							time_left     => $time_left, 
							say_time_left => $say_time_left, 
						}});
						print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
					}
					if (time > $wait_until)
					{
						# Timeout.
						print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to stop all DRBD resources nad leave the cluster. Aborting the update.\n";
						$anvil->nice_exit({exit_code => 1});
					}
					sleep 10;
				}
			}
			
			my $update_switches = "";
			if ($anvil->data->{switches}{'no-reboot'})
			{
				$update_switches .= " --no-reboot";
			}
			if ($anvil->data->{switches}{reboot})
			{
				$update_switches .= " --reboot";
			}
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update_switches => $update_switches }});
			
			# We register a job, even though anvil-daemon isn't running. This will get picked up
			# by 'anvil-update-systems --no-db' towards the end of it's run.
			print "- Registering a job to update the subnode, which we can track to confirm when the update is done.\n";
			$shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}.$update_switches.$anvil->Log->switches();
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
			my $job_uuid = $anvil->Database->insert_or_update_jobs({
				debug           => 2, 
				job_command     => $shell_call, 
				job_description => "job_0468", 
				job_host_uuid   => $host_uuid, 
				job_name        => "system::update-system", 
				job_progress    => 0, 
				job_title       => "job_0467"
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
			print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n";
			
			# Now call anvil-update-system with --no-db and background it so we can close
			# the DB connection without killing the process.
			print "- Calling the no-database update of: [".$short_host_name."]\n";
			$shell_call = $anvil->data->{path}{exe}{nohup}." ".$anvil->data->{path}{exe}{'anvil-update-system'}." --no-db".$update_switches;
			if ($anvil->data->{switches}{'clear-cache'})
			{
				# We'll only call clear-cache on this one.
				$shell_call .= " --clear-cache";
			}
			$shell_call .= $anvil->Log->switches()." >/dev/null 2>&1 &";
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
			($output, $error, $return_code) = $anvil->Remote->call({
				shell_call => $shell_call, 
				target     => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				output      => $output, 
				error       => $error,
				return_code => $return_code, 
			}});
			
			# Record the start time so that we can be sure the subnode has rebooted (uptime is
			# less than the current time minus this start time), if the host reboots as part of
			# the update.
			my $rebooted    = 0;
			my $reboot_time = time;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				rebooted        => $rebooted, 
				reboot_time     => $reboot_time, 
				short_host_name => $short_host_name, 
			}});
			
			# Wait for the registered update job to reach 100%.
			$wait_until = time + $anvil->data->{switches}{timeout};
			$waiting    = 1;
			$next_log   = time + 60;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				wait_until => $wait_until, 
				next_log   => $next_log, 
			}});
			while ($waiting)
			{
				$anvil->Job->get_job_details({job_uuid => $job_uuid});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					"jobs::job_progress" => $anvil->data->{jobs}{job_progress}, 
					"jobs::job_data"     => $anvil->data->{jobs}{job_data}, 
				}});
				if ($anvil->data->{jobs}{job_progress} == 100)
				{
					# Did it fail? Check this first so we never report success for a failed 
					# update.
					if ($anvil->data->{jobs}{job_data} eq "failed")
					{
						# Abort!
						print "[ Error ] - There was a problem updating the subnode! Anvil! cluster update aborted.\n";
						$anvil->nice_exit({exit_code => 1});
					}
					
					print "- Done! The subnode: [".$short_host_name."] has been updated\n";
					$waiting = 0;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
					
					# Did it reboot?
					if ($anvil->data->{jobs}{job_data} eq "rebooted")
					{
						$rebooted = 1;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }});
					}
				}
				else
				{
					my $say_date = $anvil->Get->date_and_time({time_only => 1});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { say_date => $say_date }});
					if (time > $next_log)
					{
						print "[ Note ] - [".$say_date."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n";
						if ($anvil->data->{jobs}{job_progress} eq "0")
						{
							print "[ Note ] - [".$say_date."] - It is expected for the job to stay at '0' for a while.\n";
						}
						
						$next_log = time + 60;
						my $time_left     = $wait_until - time;
						my $say_time_left = $anvil->Convert->time({
							'time'    => $time_left,
							translate => 1,
							long      => 0,
						});
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							next_log      => $next_log, 
							time_left     => $time_left, 
							say_time_left => $say_time_left, 
						}});
						print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
					}
					if (time > $wait_until)
					{
						# Timeout.
						print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to update. Aborting the update.\n";
						$anvil->nice_exit({exit_code => 1});
					}
					sleep 5;
				}
			}
			
			print "- Update completed successfully! Checking if a reboot is needed.\n";
			my $run_anvil_safe_start = 0;
			if ($rebooted)
			{
				print "- Rebooted! Will wait for it to come back up.\n";
				wait_for_reboot($anvil, $host_uuid, $reboot_time);
			}
			else
			{
				# Nothing will restart the cluster for us, so we need to ask for it below.
				print "- Reboot not needed, kernel appears to be up to date.\n";
				$run_anvil_safe_start = 1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { run_anvil_safe_start => $run_anvil_safe_start }});
			}
			
			# Wait for the node to rejoin the cluster. This wait is bounded by 'switches::timeout'.
			print "- Waiting for the subnode to rejoin the node.\n";
			$wait_until = time + $anvil->data->{switches}{timeout};
			$waiting    = 1;
			$next_log   = time + 60;
			my $start_called = 0;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:wait_until' => $wait_until, 
				's2:next_log'   => $next_log, 
			}});
			while ($waiting)
			{
				# Should we call a start to the cluster?
				if ((not $start_called) && ($run_anvil_safe_start))
				{
					print "- Calling 'anvil-safe-start' to rejoin the subnode to the node.\n";
					$start_called = 1;
					my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-start'}.$anvil->Log->switches()." >/dev/null 2>&1 &";
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						start_called => $start_called, 
						shell_call   => $shell_call, 
					}});
					my ($output, $error, $return_code) = $anvil->Remote->call({
						debug      => 2, 
						shell_call => $shell_call, 
						target     => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						output      => $output, 
						error       => $error,
						return_code => $return_code, 
					}});
				}
				
				# Pull the CIB and make sure both nodes are ready, and that DRBD resources
				# are all UpToDate if this is the reboot from the first node.
				my ($problem) = $anvil->Cluster->parse_cib({target => $anvil->data->{peer}{$short_host_name}{access}{ip}});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
				
				# Are both nodes ready?
				if (not $problem)
				{
					# Both nodes are in the cluster, but are they full members yet?
					my $both_ready = 1;
					my $node_count = 0;
					foreach my $node_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}})
					{
						my $ready = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{ready};
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							node_name => $node_name, 
							ready     => $ready, 
						}});
						if (not $ready)
						{
							$both_ready = 0;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { both_ready => $both_ready }});
						}
						$node_count++;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { node_count => $node_count }});
					}
					
					# Did we see two nodes and are both ready?
					if (($node_count == 2) && ($both_ready))
					{
						# Yes! If this is the first subnode, we need to wait for DRBD
						# to be UpToDate. If it's the second, we just wait for the
						# connections to be up.
						# NOTE: We call the peer to get the DRBD data as it's got a
						#       better view of the storage
						print "- Both subnodes are online, will now check replicated storage.\n";
						$anvil->DRBD->get_status({
							host   => $peer_short_host_name,
							target => $anvil->data->{peer}{$peer_short_host_name}{access}{ip}, 
						});
						
						if ($host_uuid eq $primary_host_uuid)
						{
							### NOTE: Should we wait for all connections
							###       to be up?
							# This is the second node, we don't have to wait.
							print "- This is the second node, no need to wait for replication to complete.\n";
							$waiting = 0;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
						}
						else
						{
							# This is the first node. Wait for all volumes to be
							# UpToDate.
							if (time > $next_log)
							{
								print "- Waiting for all volumes to be UpToDate before updating the other subnode.\n";
							}
							my $all_uptodate = 1;
							my $resources    = 0;
							foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}})
							{
								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }});
								foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}})
								{
									# We don't care about DR hosts for this upgrade
									my $peer_uuid = $anvil->Get->host_uuid_from_name({host_name => $peer_name});
									my $peer_type = $anvil->data->{hosts}{host_uuid}{$peer_uuid}{host_type};
									$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
										's1:peer_name' => $peer_name, 
										's2:peer_uuid' => $peer_uuid, 
										's3:peer_type' => $peer_type, 
									}});
									next if $peer_type ne "node";
									
									foreach my $volume (sort {$a <=> $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}})
									{
										# This is this subnode's disk state,
										# as the DRBD data was collected
										# from the peer.
										my $disk_state = $anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'peer-disk-state'};
										$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
											's1:volume'     => $volume, 
											's2:disk_state' => $disk_state, 
										}});
										
										if (lc($disk_state) ne "uptodate")
										{
											$all_uptodate = 0;
											my $eta_in_seconds = $anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'estimated-seconds-to-finish'};
											$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
												all_uptodate   => $all_uptodate, 
												eta_in_seconds => $eta_in_seconds, 
											}});
											if (time > $next_log)
											{
												if ($eta_in_seconds)
												{
													print "- The resource: [".$resource."/".$volume."] is not synced yet, ETA is: [".$eta_in_seconds."] to complete resync.\n";
												}
												else
												{
													print "- The resource: [".$resource."/".$volume."] is not yet UpToDate.\n";
												}
											}
										}
									} # End foreach volume
								} # End foreach peer
							} # End foreach resource
							
							if ($all_uptodate)
							{
								print "- All resources appear to be ready,\n";
								$waiting = 0;
								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
							}
						} # End if host is first or second subnode
					} # End if both ready
					elsif (time > $next_log)
					{
						print "- Both subnodes are not online yet, still waiting.\n";
					}
				} # End if CIB was parsed
				elsif (time > $next_log)
				{
					print "- Unable to parse the node's cluster information base, will try again soon.\n";
				}
				
				# Once we're done waiting, skip the periodic notice, the timeout check and the
				# sleep; otherwise a rejoin that completes right at the deadline would abort.
				last if not $waiting;
				
				if (time > $next_log)
				{
					my $say_time = $anvil->Get->date_and_time({time_only => 1});
					$next_log = time + 60;
					my $time_left     = $wait_until - time;
					my $say_time_left = $anvil->Convert->time({
						'time'    => $time_left,
						translate => 1,
						long      => 0,
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						's1:say_time'      => $say_time, 
						's2:next_log'      => $next_log, 
						's3:time_left'     => $time_left, 
						's4:say_time_left' => $say_time_left, 
					}});
					
					# Tell the user we're still waiting.
					print "- [".$say_time."] - We're still waiting for the subnode: [".$short_host_name."] to rejoin the node.\n";
					print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
				}
				if (time > $wait_until)
				{
					# Timeout.
					print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to join the subcluster. Aborting the update.\n";
					$anvil->nice_exit({exit_code => 1});
				}
				sleep 5;
			} # End while waiting for subnode to return
			
			# Run anvil-version-change
			print "- Running 'anvil-version-changes'.\n";
			$output      = "";
			$error       = "";
			$return_code = "";
			$shell_call  = $anvil->data->{path}{exe}{'anvil-version-changes'};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
			if ($host_uuid eq $anvil->Get->host_uuid)
			{
				($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					output      => $output, 
					return_code => $return_code, 
				}});
			}
			else
			{
				($output, $error, $return_code) = $anvil->Remote->call({
					shell_call => $shell_call, 
					target     => $anvil->data->{peer}{$short_host_name}{access}{ip}, 
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					output      => $output, 
					error       => $error,
					return_code => $return_code, 
				}});
			}
			print "- Done!\n";
		}
	}
	
	return(0);
}

sub update_strikers_and_dr
{
	my ($anvil) = @_;
	
	# Before we start, set the timeouts.
if ($anvil->data->{switches}{timeout}) { if ($anvil->data->{switches}{timeout} =~ /^(\d+)h/i) { my $hours = $1; $anvil->data->{switches}{timeout} = $hours * 3600; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { hours => $hours, "switches::timeout" => $anvil->data->{switches}{timeout}, }}); } elsif ($anvil->data->{switches}{timeout} =~ /^(\d+)m/i) { my $minutes = $1; $anvil->data->{switches}{timeout} = $minutes * 60; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { minutes => $minutes, "switches::timeout" => $anvil->data->{switches}{timeout}, }}); } else { # Set the default. print "[ Warning ] - The passed timeout: [".$anvil->data->{switches}{timeout}."] is invalid, setting it to 24 hours.\n"; $anvil->data->{switches}{timeout} = 86400; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::timeout" => $anvil->data->{switches}{timeout}, }}); } } else { $anvil->data->{switches}{timeout} = 86400; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::timeout" => $anvil->data->{switches}{timeout}, }}); } # Make sure the timeout, if set, is valid. if ($anvil->data->{switches}{timeout}) { if ($anvil->data->{switches}{timeout} =~ /\D/) { # Invalid, error out. 
print "The --timeout switch was used: [".$anvil->data->{switches}{timeout}."], but the value isn't a number of seconds.\n"; $anvil->nice_exit({exit_code => 1}); } } foreach my $host_type ("striker", "dr") { foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) { my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:host_name' => $host_name, 's2:host_uuid' => $host_uuid, 's3:short_host_name' => $short_host_name, 's4:this_host_type' => $this_host_type, }}); next if $this_host_type ne $host_type; if ($host_type eq "striker") { print "Starting the update of the Striker dashboard: [".$short_host_name."].\n"; } else { print "Starting the update of the DR host: [".$short_host_name."].\n"; } # If this is the local system, set the variable to track if we need to reboot. # Otherwise, see if we have access to the peer. if ($host_uuid eq $anvil->Get->host_uuid) { $anvil->data->{sys}{reboot_needed} = 0; } elsif(not $anvil->data->{peer}{$short_host_name}{access}{ip}) { if ($host_type eq "striker") { print "- No access to the Striker dashboard: [".$short_host_name."], skipping.\n"; } else { print "- No access to the DR host: [".$short_host_name."], skipping.\n"; } next; } # Record the start time so that we can be sure the subnode has rebooted (uptime is # less than the current time minus this start time), if the host reboots as part of # the update. 
my $reboot_time = time; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_time => $reboot_time }}); print "- Beginning OS update of: [".$short_host_name."]\n"; my $rebooted = 0; if (($anvil->data->{switches}{'clear-cache'}) && ($host_uuid eq $anvil->Get->host_uuid)) { my $shell_call = $anvil->data->{path}{exe}{dnf}." clean all"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code, }}); print "- Cache cleared.\n"; } print "- Calling update now.\n"; print "- NOTE: This can seem like it's hung! You can watch the progress using 'journalctl -f' on another terminal to\n"; print "- watch the progress via the system logs. You can also check wiht 'ps aux | grep dnf'.\n"; if ($host_uuid eq $anvil->Get->host_uuid) { my $shell_call = $anvil->data->{path}{exe}{dnf}." -y update"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code, }}); if ($return_code) { print "[ Error ] - There was a problem updating the system! Expected a return code of '0', but got: [".$return_code."]\n"; print "[ Error [ - The output, if any, was\n"; print "==] Output [==\n"; print $output."\n"; print "==============\n"; } # Loop through the output. 
my $package_changes = 0; foreach my $line (split/\n/, $output) { $line = $anvil->Words->clean_spaces({string => $line}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); if ($line =~ / (\d+) Packages$/i) { $package_changes += $1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { package_changes => $package_changes }}); } } # Did the user want to reboot on any update? if (($package_changes) && ($anvil->data->{switches}{reboot}) && ($anvil->data->{switches}{'reboot-self'})) { # Reboot needed print "- Updated: [".$package_changes."] packages, and '--reboot --reboot-self' used, reboot needed!\n"; $anvil->data->{sys}{reboot_needed} = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "sys::reboot_needed" => $anvil->data->{sys}{reboot_needed}, }}); } # Get the newest installed kernel $shell_call = $anvil->data->{path}{exe}{rpm}." -q kernel | ".$anvil->data->{path}{exe}{'sort'}." | ".$anvil->data->{path}{exe}{tail}." -n 1"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); (my $installed_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { installed_kernel => $installed_kernel, return_code => $return_code, }}); $installed_kernel =~ s/^kernel-(\d+.\d+\.\d+-\d+)\..*$/$1/; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { installed_kernel => $installed_kernel }}); # Get the running kernel $shell_call = $anvil->data->{path}{exe}{uname}." 
-r"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); (my $active_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active_kernel => $active_kernel, return_code => $return_code, }}); $active_kernel =~ s/(\d+.\d+\.\d+-\d+)\..*$/$1/; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active_kernel => $active_kernel }}); if ($installed_kernel eq $active_kernel) { print "- The kernel has not been updated.\n"; } else { print "- The kernel appears to have been upgraded, reboot needed!\n"; $anvil->data->{sys}{reboot_needed} = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "sys::reboot_needed" => $anvil->data->{sys}{reboot_needed}, }}); } } else { # Call anvil-update-system and then wait. print "- Beginning OS update of: [".$short_host_name."]\n"; if ($host_type eq "dr") { # Make sure VMs are off and DRBD is down. Call this with nohup so it # doesn't get killed by the loss of the SSH connection. my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-stop'}." --no-db".$anvil->Log->switches()." >/dev/null 2>&1 &"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my ($output, $error, $return_code) = $anvil->Remote->call({ shell_call => $shell_call, target => $anvil->data->{peer}{$short_host_name}{access}{ip}, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, error => $error, return_code => $return_code, }}); # Now wait for DRBD resources to stop (which requires VMs be off). 
print "- Waiting for all DRBD resource (and the servers using them) to stop before proceeding.\n"; my $wait_until = time + $anvil->data->{switches}{timeout}; my $next_log = time + 60; my $waiting = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { wait_until => $wait_until, next_log => $next_log, waiting => $waiting, }}); while ($waiting) { my $drbd_up = 0; $anvil->DRBD->get_status({ host => $short_host_name, target => $anvil->data->{peer}{$short_host_name}{access}{ip}, }); # How may resources are up? my $resource_count = keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_count => $resource_count }}); if (not $resource_count) { # Done! $waiting = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); } if ($waiting) { # Log which resources are still up if (time > $next_log) { foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}}) { print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The resource: [".$resource."] is still up.\n"; } $next_log = time + 60; my $time_left = $wait_until - time; my $say_time_left = $anvil->Convert->time({ 'time' => $time_left, translate => 1, long => 0, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log, time_left => $time_left, say_time_left => $say_time_left, }}); print "- Waiting for another: [".$say_time_left."], will check again shortly.\n"; } if (time > $wait_until) { # Timeout. print "[ Error ] - Timed out while waiting for the DR host: [".$short_host_name."] to stop all DRBD resources. 
Aborting the update.\n"; $anvil->nice_exit({exit_code => 1}); } sleep 10; } } } my $update_switches = ""; if ($anvil->data->{switches}{'no-reboot'}) { $update_switches .= " --no-reboot"; } if ($anvil->data->{switches}{reboot}) { $update_switches .= " --reboot"; } $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update_switches => $update_switches }}); # We register a job, even though anvil-daemon isn't running. This will get # picked up by 'anvil-update-systems --no-db' towards the end of it's run. print "- Registering a job to update the system, which we can track to confirm when the update is done.\n"; my $shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}.$update_switches.$anvil->Log->switches(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my $job_uuid = $anvil->Database->insert_or_update_jobs({ debug => 2, job_command => $shell_call, job_description => "job_0468", job_host_uuid => $host_uuid, job_name => "system::update-system", job_progress => 0, job_title => "job_0467" }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n"; # Now call anvil-update-system with --no-db and background it so we can close # the DB connection without killing the process. print "- Calling the no-database update of: [".$short_host_name."]\n"; $shell_call = $anvil->data->{path}{exe}{nohup}." ".$anvil->data->{path}{exe}{'anvil-update-system'}." --no-db".$update_switches; if ($anvil->data->{switches}{'clear-cache'}) { # We'll only call clear-cache on this one. $shell_call .= " --clear-cache"; } $shell_call .= $anvil->Log->switches()." 
>/dev/null 2>&1 &"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my ($output, $error, $return_code) = $anvil->Remote->call({ shell_call => $shell_call, target => $anvil->data->{peer}{$short_host_name}{access}{ip}, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, error => $error, return_code => $return_code, }}); # Verify / wait until the update is done. my $wait_until = time + $anvil->data->{switches}{timeout}; my $waiting = 1; my $next_log = time + 60; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); while ($waiting) { $anvil->Job->get_job_details({job_uuid => $job_uuid}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs::job_progress" => $anvil->data->{jobs}{job_progress}, "jobs::job_data" => $anvil->data->{jobs}{job_data}, }}); if ($anvil->data->{jobs}{job_progress} == 100) { print "- Done! The host: [".$short_host_name."] has been updated\n"; $waiting = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); # Did it reboot? if ($anvil->data->{jobs}{job_data} eq "rebooted") { $rebooted = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }}); } # Did it fail? if ($anvil->data->{jobs}{job_data} eq "failed") { # Abort! print "[ Error ] - There was a problem updating the system! Anvil! 
cluster update aborted.\n"; $anvil->nice_exit({exit_code => 1}); } } else { if (time > $next_log) { my $say_date = $anvil->Get->date_and_time({time_only => 1}); print "[ Note ] - [".$say_date."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n"; if ($anvil->data->{jobs}{job_progress} == 0) { print "[ Note ] - [".$say_date."] - It is normal for the job to show '0' progress until the database access is restored.\n"; } $next_log = time + 60; my $time_left = $wait_until - time; my $say_time_left = $anvil->Convert->time({ 'time' => $time_left, translate => 1, long => 0, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log, time_left => $time_left, say_time_left => $say_time_left, }}); print "- Waiting for another: [".$say_time_left."], will check again shortly.\n"; } if (time > $wait_until) { # Timeout. print "[ Error ] - Timed out while waiting for the machine: [".$short_host_name."] to update the OS. Aborting the update.\n"; $anvil->nice_exit({exit_code => 1}); } sleep 5; } } } if ($rebooted) { print "- Rebooted! 
Will wait for it to come back up.\n";
				wait_for_reboot($anvil, $host_uuid, $reboot_time);
			}
			else
			{
				print "- Reboot not needed, kernel appears to be up to date.\n";
			}

			# Run anvil-version-change
			print "- Running 'anvil-version-changes' now.\n";
			my $output      = "";
			my $error       = "";
			my $return_code = "";
			my $shell_call  = $anvil->data->{path}{exe}{'anvil-version-changes'};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
			if ($host_uuid eq $anvil->Get->host_uuid)
			{
				($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					output      => $output,
					return_code => $return_code,
				}});
			}
			else
			{
				($output, $error, $return_code) = $anvil->Remote->call({
					'close'    => 1,
					no_cache   => 1,
					shell_call => $shell_call,
					target     => $anvil->data->{peer}{$short_host_name}{access}{ip},
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					output      => $output,
					error       => $error,
					return_code => $return_code,
				}});
			}
		}
	}

	return(0);
}

# Blocks until a host that was rebooted as part of its update is reachable again.
#
# Parameters;
#   $anvil       - The Anvil::Tools handle.
#   $host_uuid   - UUID of the host that was rebooted.
#   $reboot_time - Unix time recorded by the caller before the update (and so the reboot) started.
#
# The host counts as "back" once an access test to the IP stored in 'peer::<short_host_name>::access::ip'
# succeeds AND the uptime it reports is smaller than the time that has passed since '$reboot_time' (the
# comparison treats the uptime as a number of seconds). A status line is printed roughly once a minute.
#
# Returns '0' once the host is back. If the host is not back within '--timeout' seconds, an error is
# printed and the program exits with exit code '1'.
sub wait_for_reboot
{
	my ($anvil, $host_uuid, $reboot_time) = @_;

	my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		's1:host_uuid'       => $host_uuid,
		's2:short_host_name' => $short_host_name,
	}});

	# NOTE(review): The return value is only logged here. The call is kept in case callers depend on
	#               whatever data 'find_access' caches - confirm against Anvil::Tools::Network.
	my $matches = $anvil->Network->find_access({
		debug  => 2,
		target => $host_uuid,
	});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }});

	# Wait until the node comes back up.
	print "- The target has been rebooted. We'll wait for the target to come back online.\n";

	# This loop is bounded; we give up (and exit) once the '--timeout' window has passed.
	my $wait_until = time + $anvil->data->{switches}{timeout};
	my $waiting    = 1;
	my $next_log   = time + 60;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
	while($waiting)
	{
		# Test access
		my $target = $anvil->data->{peer}{$short_host_name}{access}{ip};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			target          => $target,
			short_host_name => $short_host_name,
		}});

		my $test_access = $anvil->Remote->test_access({target => $target});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_access => $test_access }});

		if ($test_access)
		{
			# Being reachable isn't enough, the host may simply not have gone down yet. Only an
			# uptime shorter than the time since we started proves that it actually rebooted.
			my $uptime            = $anvil->Get->uptime({debug => 2, target => $target});
			my $time_since_reboot = time - $reboot_time;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				uptime            => $uptime,
				time_since_reboot => $time_since_reboot,
				short_host_name   => $short_host_name,
			}});
			if (($uptime) && ($uptime < $time_since_reboot))
			{
				# Rebooted!
				print "- Rebooted! Subnode is back up.\n";

				$waiting = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
			}
		}

		if ($waiting)
		{
			if (time > $next_log)
			{
				my $say_time      = $anvil->Get->date_and_time({time_only => 1});
				   $next_log      = time + 60;
				my $time_left     = $wait_until - time;
				my $say_time_left = $anvil->Convert->time({
					'time'    => $time_left,
					translate => 1,
					long      => 0,
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					's1:say_time'      => $say_time,
					's2:next_log'      => $next_log,
					's3:time_left'     => $time_left,
					's4:say_time_left' => $say_time_left,
				}});

				# Tell the user we're still waiting.
				print "- [".$say_time."] - We're still waiting for the subnode: [".$short_host_name."] to reboot.\n";
				print "- Waiting for another: [".$say_time_left."], will check again shortly.\n";
			}
			if (time > $wait_until)
			{
				# Timeout.
				print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to reboot. Aborting the update.\n";
				$anvil->nice_exit({exit_code => 1});
			}
			sleep 5;
		}
	}

	return(0);
}

# Starts or stops the 'anvil-daemon' and 'scancore' daemons on every known host.
#
# Parameters;
#   $anvil - The Anvil::Tools handle.
#   $task  - 'start' (the default when empty) runs 'systemctl enable --now <daemon>'. Any other value is
#            handled as 'stop' and runs 'systemctl stop <daemon>'. Note that the stop path does not
#            'disable' the unit, even though the progress messages say "Disabling".
#
# Hosts are walked by type (DR, then nodes, then Strikers) and by name within each type. Hosts without an
# IP in 'peer::<short_host_name>::access::ip' are reported as offline and skipped. A non-zero return code
# from 'systemctl' is reported to the user, but is not fatal.
#
# Always returns '0'.
sub manage_daemons
{
	my ($anvil, $task) = @_;
	$task = "start" if not $task;

	# A single flag drives both the command and the messages, so what we tell the user always matches
	# what is actually run.
	my $starting = $task eq "start" ? 1 : 0;
	my $do_task  = $starting ? "enable --now" : "stop";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { do_task => $do_task }});

	if ($starting)
	{
		print "Enabling Anvil! daemons on all hosts...\n";
	}
	else
	{
		print "Disabling Anvil! daemons on all hosts...\n";
	}

	my $daemons = ["anvil-daemon", "scancore"];
	foreach my $host_type ("dr", "node", "striker")
	{
		foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
		{
			my $host_uuid       = $anvil->data->{sys}{hosts}{by_name}{$host_name};
			my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
			my $this_host_type  = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				's1:host_name'       => $host_name,
				's2:host_uuid'       => $host_uuid,
				's3:short_host_name' => $short_host_name,
				's4:this_host_type'  => $this_host_type,
			}});
			next if $host_type ne $this_host_type;

			if ($starting)
			{
				print "- Enabling daemons on: [".$short_host_name."]... ";
			}
			else
			{
				print "- Disabling daemons on: [".$short_host_name."]... ";
			}
			if (not $anvil->data->{peer}{$short_host_name}{access}{ip})
			{
				print "Offline! Skipping.\n";
				next;
			}

			# Run the task for each daemon, locally or over SSH depending on the host.
			foreach my $daemon (@{$daemons})
			{
				my $shell_call = $anvil->data->{path}{exe}{systemctl}." ".$do_task." ".$daemon;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});

				# Start from a non-zero return code so that a call which returns nothing is
				# reported as a failure.
				my $output      = "";
				my $error       = "";
				my $return_code = 999;
				if ($host_uuid eq $anvil->Get->host_uuid)
				{
					# Local
					($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						output      => $output,
						return_code => $return_code,
					}});
				}
				else
				{
					# Remote, it'll be a while before we hit some clients, so close this
					# connection so later access to the machines don't fail with ssh
					# connection timeouts.
					($output, $error, $return_code) = $anvil->Remote->call({
						'close'    => 1,
						no_cache   => 1,
						shell_call => $shell_call,
						target     => $anvil->data->{peer}{$short_host_name}{access}{ip},
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						output      => $output,
						error       => $error,
						return_code => $return_code,
					}});
				}

				if (not $return_code)
				{
					print $daemon.($starting ? " started... " : " stopped... ");
				}
				else
				{
					print $daemon.($starting ? " didn't start!... " : " didn't stop!... ");
				}
			}
			print "Done!\n";
		}
	}

	return(0);
}

# Checks that every known host can be reached, and records how to reach each one.
#
# Parameters;
#   $anvil - The Anvil::Tools handle.
#
# For each host (DR, then nodes, then Strikers; by name within each type), the networks listed in
# 'network_access' are tried in the preference order BCN, MN, IFN, SN and finally any network. The first
# address that answers the access test is stored in 'peer::<short_host_name>::access::ip', and the name of
# its network in 'peer::<short_host_name>::access::network'. Both are left as empty strings when the host
# can't be reached; other parts of this program use the empty IP to skip offline hosts.
#
# Returns '1' if all hosts were reachable, '0' if at least one was not.
sub verify_access
{
	my ($anvil) = @_;

	# Load host and Anvil! data.
	$anvil->Database->get_hosts();

	# Make sure all are available before we start.
	my $all_access = 1;
	foreach my $host_type ("dr", "node", "striker")
	{
		foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
		{
			my $host_uuid       = $anvil->data->{sys}{hosts}{by_name}{$host_name};
			my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
			my $this_host_type  = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				's1:host_name'       => $host_name,
				's2:host_uuid'       => $host_uuid,
				's3:short_host_name' => $short_host_name,
				's4:this_host_type'  => $this_host_type,
			}});
			next if $host_type ne $this_host_type;

			print "- Verifying access to: [".$short_host_name."]... ";
			# NOTE(review): 'find_access' is presumed to (re)populate the 'network_access' hash
			#               read below for this target - confirm against Anvil::Tools::Network.
			my $matches = $anvil->Network->find_access({
				debug  => 2,
				target => $host_uuid,
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }});

			$anvil->data->{peer}{$short_host_name}{access}{ip}      = "";
			$anvil->data->{peer}{$short_host_name}{access}{network} = "";
			foreach my $preferred_network ("bcn", "mn", "ifn", "sn", "any")
			{
				# Once we have a working address, there's no need to look at other networks.
				last if $anvil->data->{peer}{$short_host_name}{access}{ip};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }});
				foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}})
				{
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { network_name => $network_name }});
					if (($network_name !~ /^$preferred_network/) && ($preferred_network ne "any"))
					{
						next;
					}

					my $target_ip   = $anvil->data->{network_access}{$network_name}{target_ip_address};
					my $test_access = $anvil->Remote->test_access({
						'close' => 1,
						target  => $target_ip,
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						's2:target_ip'   => $target_ip,
						's3:test_access' => $test_access,
					}});

					if ($test_access)
					{
						# We're good.
						print "Connected on: [".$target_ip."] via: [".$network_name."]\n";
						$anvil->data->{peer}{$short_host_name}{access}{ip}      = $target_ip;
						$anvil->data->{peer}{$short_host_name}{access}{network} = $network_name;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
							"s1:peer::${short_host_name}::access::ip"      => $anvil->data->{peer}{$short_host_name}{access}{ip},
							"s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network},
						}});

						# Stop at the first working network. Without this, the other
						# networks with the same prefix (ie: 'bcn2' after 'bcn1') would
						# also be tested, each printing another "Connected on" line and
						# replacing the address we just stored.
						last;
					}
				}
			}

			if (not $anvil->data->{peer}{$short_host_name}{access}{ip})
			{
				print "No access! Skipping.\n";
				$all_access = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_access => $all_access }});
			}
		}
	}

	return($all_access);
}