#!/usr/bin/perl
# 
# This does shutdown-time tasks; migrate or stop servers, withdraw and power off the host.
# 
# The sequence is:
# 1. Read switches (or, when run as a job, the job data).
# 2. Migrate or stop any servers running on this host (process_servers).
# 3. Wait until this host is not SyncSource for any DRBD volume, then take DRBD down (wait_on_drbd).
# 4. Stop pacemaker (stop_cluster).
# 5. If '--power-off' was given, record the stop reason and power off the host.
# 
# Exit codes;
# 0 = Normal exit.
# 1 = Any problem that causes an early exit.
# 
# TODO: 
# 
# BUG:
# - --poweroff when the peer is offline tries to migrate anyway.
# 

use strict;
use warnings;
use Anvil::Tools;
require POSIX;
use Data::Dumper;

my $THIS_FILE         = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD})) {
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $anvil = Anvil::Tools->new();

# Read the command line switches this tool understands.
$anvil->Get->switches({list => [
	"no-db",
	"poweroff",
	"power-off",
	"stop-reason",
	"stop-servers"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});

# Let 'poweroff' work as a mis-spell of 'power-off'
if (($anvil->data->{switches}{'poweroff'}) && (not $anvil->data->{switches}{'power-off'})) {
	$anvil->data->{switches}{'power-off'} = $anvil->data->{switches}{'poweroff'};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		'switches::power-off' => $anvil->data->{switches}{'power-off'},
	}});
}

$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});

# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0)) {
	# Not root
	print $anvil->Words->string({key => "error_0005"})."\n";
	$anvil->nice_exit({exit_code => 1});
}

if ($anvil->data->{switches}{'no-db'}) {
	# The user asked us to run without a database, so we can't be running as a job either.
	$anvil->data->{sys}{database}{connections} = 0;
	$anvil->data->{switches}{'job-uuid'}       = "";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		'sys::database::connections' => $anvil->data->{sys}{database}{connections},
		'switches::job-uuid'         => $anvil->data->{switches}{'job-uuid'},
	}});
}
else {
	$anvil->Database->connect();
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 3, key => "log_0132"});
	if (not $anvil->data->{sys}{database}{connections}) {
		# No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try 
		# again after we exit.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0075"});
		sleep 10;
		$anvil->nice_exit({exit_code => 1});
	}
}

# If we were started as a job, mark the job as picked up and take our switches from the job data.
if ($anvil->data->{switches}{'job-uuid'}) {
	# Load the job data.
	$anvil->Job->clear();
	$anvil->Job->get_job_details();
	$anvil->Job->update_progress({
		progress         => 1,
		job_picked_up_by => $$,
		job_picked_up_at => time,
		message          => "message_0235",
	});
	
	# Pull out the job data. Each line is expected to look like 'name=value'.
	foreach my $line (split/\n/, $anvil->data->{jobs}{job_data}) {
		if ($line =~ /power-off=(.*?)$/) {
			$anvil->data->{switches}{'power-off'} = $1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				'switches::power-off' => $anvil->data->{switches}{'power-off'},
			}});
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0687", variables => { reason => "#!string!log_0691!#" }});
		}
		if ($line =~ /stop-reason=(.*?)$/) {
			$anvil->data->{switches}{'stop-reason'} = $1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				'switches::stop-reason' => $anvil->data->{switches}{'stop-reason'},
			}});
		}
		if ($line =~ /stop-servers=(.*?)$/) {
			$anvil->data->{switches}{'stop-servers'} = $1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				'switches::stop-servers' => $anvil->data->{switches}{'stop-servers'},
			}});
		}
	}
}

# Make sure we're a subnode or DR host
my $host_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }});
if (($host_type ne "node") && ($host_type ne "dr")) {
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0260"});
	$anvil->Job->update_progress({progress => 100, message => "error_0260"}) if $anvil->data->{switches}{'job-uuid'};
	$anvil->nice_exit({exit_code => 1});
}

# If no stop-reason was set, set it to 'user'
if (not $anvil->data->{switches}{'stop-reason'}) {
	$anvil->data->{switches}{'stop-reason'} = "user";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		'switches::stop-reason' => $anvil->data->{switches}{'stop-reason'},
	}});
}

# Migrate or stop the servers, if any servers are running here.
process_servers($anvil);

# This waits on DRBD if we're SyncSource
wait_on_drbd($anvil);

# This stops pacemaker
stop_cluster($anvil);

# Are we powering off?
if ($anvil->data->{switches}{'power-off'}) {
	# Yup
	$anvil->Database->update_host_status({
		debug       => 2,
		host_uuid   => $anvil->Get->host_uuid,
		host_status => "stopping",
	});
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0687", variables => { reason => "#!string!log_0692!#" }});
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0325"});
	$anvil->Job->update_progress({progress => 100, message => "job_0325"}) if $anvil->data->{switches}{'job-uuid'};
	
	# Set the stop reason. A reason of 'none' is recorded as an empty string.
	if ($anvil->data->{switches}{'stop-reason'}) {
		if ($anvil->data->{switches}{'stop-reason'} eq "none") {
			$anvil->data->{switches}{'stop-reason'} = "";
		}
		my $variable_uuid = $anvil->Database->insert_or_update_variables({
			variable_name         => 'system::stop_reason',
			variable_value        => $anvil->data->{switches}{'stop-reason'},
			variable_default      => '',
			variable_description  => 'striker_0279',
			variable_section      => 'system',
			variable_source_uuid  => $anvil->Get->host_uuid(),
			variable_source_table => 'hosts',
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { variable_uuid => $variable_uuid }});
	}
	
	my $shell_call = $anvil->data->{path}{exe}{systemctl}." poweroff";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
	my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
	
	# Unlikely we're still alive, but 'poweroff' does return once enqueued, so...
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		output      => $output,
		return_code => $return_code,
	}});
}
else {
	# We're not shutting down, so we're done
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0326"});
	# Only touch the job record when we were actually started as a job, as everywhere else in this file.
	$anvil->Job->update_progress({progress => 100, message => "job_0326"}) if $anvil->data->{switches}{'job-uuid'};
}

$anvil->nice_exit({exit_code => 0});


#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################

# This stops pacemaker on this host and waits until the cluster is seen to be down.
# 
# Each pass calls 'Cluster->parse_cib'; a true return is treated as "the cluster has stopped" and ends the 
# loop. Otherwise 'pcs cluster stop --force' is called once, and later passes just report that we're still
# waiting. Passes are five seconds apart.
# 
# Parameters;
#  $anvil - The Anvil::Tools handle.
# 
# Returns '0'.
sub stop_cluster
{
	my ($anvil) = @_;
	
	my $pacemaker_stopped = 0;	# Set once the stop call has been issued, so we only issue it once.
	my $waiting           = 1;
	while($waiting) {
		$waiting = 0;
		my $problem = $anvil->Cluster->parse_cib({debug => 2});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
		if ($problem) {
			# Cluster has stopped.
			# NOTE(review): Progress '5' here is lower than the 70/80 reported below and the 60 
			#               reported by wait_on_drbd() before us; looks unintended - confirm.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
			$anvil->Job->update_progress({progress => 5, message => "job_0313"}) if $anvil->data->{switches}{'job-uuid'};
		}
		else {
			$waiting = 1;
			if (not $pacemaker_stopped) {
				# Stop pacemaker now.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0323"});
				$anvil->Job->update_progress({progress => 70, message => "job_0323"}) if $anvil->data->{switches}{'job-uuid'};
				
				### NOTE: '--force' is needed or else sole-running nodes can't exit
				###       (complains about the loss of quorum)
				my $shell_call = $anvil->data->{path}{exe}{pcs_direct}." cluster stop --force";
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
				my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					output      => $output,
					return_code => $return_code,
				}});
				
				$pacemaker_stopped = 1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pacemaker_stopped => $pacemaker_stopped }});
			}
			else {
				# The stop was already requested, we're just waiting for it to finish.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0324"});
				$anvil->Job->update_progress({progress => 80, message => "job_0324"}) if $anvil->data->{switches}{'job-uuid'};
			}
		}
		if ($waiting) {
			sleep 5;
		}
	}
	
	return(0);
}

# This will migrate or stop the servers running on this host.
# 
# If '--stop-servers' was given, servers are shut down; via the cluster when it is up (falling back to virsh
# 120 seconds after the cluster call), or directly via virsh when the cluster is down. Without that switch, 
# servers are migrated to the peer, which is only possible when this is not a DR host and both subnodes are
# ready in the cluster. If servers would have to be stopped but the user didn't ask for that, this exits the
# program with exit code '1'.
# 
# Parameters;
#  $anvil - The Anvil::Tools handle.
# 
# Returns '0'.
sub process_servers
{
	my ($anvil) = @_;
	
	# Use virsh to check for servers, in case pacemaker lies to us.
	$anvil->Server->find();
	
	my $progress       = 10;
	my $waiting        = 1;
	my $first_try      = 0;	# Unix time of the first virsh shutdown pass (cluster-down path).
	my $second_try     = 0;	# Unix time after which the virsh shutdown would be retried.
	my $try_again      = 0;
	my $server_count   = keys %{$anvil->data->{server}{location}};
	my $progress_steps = $server_count ? int(35 / $server_count) : 70;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		's1:server_count'   => $server_count,
		's2:progress_steps' => $progress_steps,
	}});
	
	# If we have one or more local servers, we need to know if both subnodes are in the node's cluster. 
	# If we're not, or the peer isn't, we can't migrate.
	my $can_migrate = 1;
	if ($server_count) {
		if ($anvil->Get->host_type() eq "dr") {
			# No pacemaker, only stop servers.
			$can_migrate = 0;
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "job_0470"});
		}
		else {
			my $problem = $anvil->Cluster->parse_cib({debug => 2});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				's1:problem'                   => $problem,
				's2:cib::parsed::local::ready' => $anvil->data->{cib}{parsed}{'local'}{ready},
				's3:cib::parsed::peer::ready'  => $anvil->data->{cib}{parsed}{peer}{ready},
			}});
			if ($problem) {
				# We're not in the node's cluster, we can't migrate.
				$can_migrate = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }});
			}
			elsif ((not $anvil->data->{cib}{parsed}{'local'}{ready}) or (not $anvil->data->{cib}{parsed}{peer}{ready})) {
				# One of the subnodes is not in the cluster, so we can't migrate.
				$can_migrate = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }});
			}
		}
	}
	
	if ($anvil->data->{switches}{'stop-servers'}) {
		# Tell the user we're about to shut down servers.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0320"});
		$anvil->Job->update_progress({progress => 10, message => "job_0320"}) if $anvil->data->{switches}{'job-uuid'};
	}
	elsif ($can_migrate) {
		# Tell the user we're about to migrate servers.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0321"});
		$anvil->Job->update_progress({progress => 10, message => "job_0321"}) if $anvil->data->{switches}{'job-uuid'};
	}
	else {
		# We would have to stop the servers, and the user didn't tell us to do that, abort.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0372"});
		$anvil->Job->update_progress({progress => 100, message => "error_0372"}) if $anvil->data->{switches}{'job-uuid'};
		$anvil->nice_exit({exit_code => 1});
	}
	
	while ($waiting) {
		# Is the cluster up?
		$waiting = 0;
		my $problem = $anvil->Cluster->parse_cib({debug => 2});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
		if ($problem) {
			# Nope. Are we stopping servers?
			if ($anvil->data->{switches}{'stop-servers'}) {
				# Yes, are any servers running (check virsh)
				# NOTE(review): 'waiting' is only set in a pass that issues a shutdown call, and 
				#               'try_again' is only set in a pass where 'waiting' is set after 
				#               'first_try', so the "hit the power button again" path below 
				#               looks unreachable - confirm. The server data is also only 
				#               loaded once, by the Server->find() call above.
				foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{server}{location}}) {
					my $status    =  $anvil->data->{server}{location}{$server_name}{status};
					my $host_name =  $anvil->data->{server}{location}{$server_name}{host_name};
					   $progress  += $progress_steps;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						's1:server_name' => $server_name,
						's2:status'      => $status,
						's3:host_name'   => $host_name,
						's4:progress'    => $progress,
					}});
					
					if ($host_name eq $anvil->Get->host_name) {
						# Server is still running.
						if (($status eq "running") && (not $first_try)) {
							# It's running despite the cluster being down, stop it.
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "job_0419", variables => { server => $server_name }});
							$anvil->Job->update_progress({progress => $progress, message => "job_0419,!!server!".$server_name."!!"}) if $anvil->data->{switches}{'job-uuid'};
							$anvil->Server->shutdown_virsh({
								debug     => 2,
								server    => $server_name,
								wait_time => 1,
							});
							$waiting = 1;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
						}
						elsif (($status eq "in shutdown") && ($try_again)) {
							# Hit the power button again.
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "job_0420", variables => { server => $server_name }});
							$anvil->Job->update_progress({progress => $progress, message => "job_0420,!!server!".$server_name."!!"}) if $anvil->data->{switches}{'job-uuid'};
							$anvil->Server->shutdown_virsh({
								debug     => 2,
								server    => $server_name,
								wait_time => 1,
							});
							$waiting = 1;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
						}
					}
				}
				
				if ($waiting) {
					if (not $first_try) {
						# Record when we first asked, and when a retry becomes due.
						$first_try  = time;
						$second_try = $first_try + 120;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
							first_try  => $first_try,
							second_try => $second_try,
						}});
					}
					elsif ($try_again) {
						# The retry was just issued, don't issue it again.
						$try_again = 0;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { try_again => $try_again }});
					}
					elsif (($second_try) && (time > $second_try)) {
						# The retry is due, issue it on the next pass.
						$try_again  = 1;
						$second_try = 0;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
							second_try => $second_try,
							try_again  => $try_again,
						}});
					}
				}
			}
			
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
			$anvil->Job->update_progress({progress => 80, message => "job_0313"}) if $anvil->data->{switches}{'job-uuid'};
		}
		else {
			# Loop through the servers running here.
			my $local_name = $anvil->data->{cib}{parsed}{'local'}{name};
			my $peer_name  = $anvil->data->{cib}{parsed}{peer}{name};
			foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{server}}) {
				my $status    = $anvil->data->{cib}{parsed}{data}{server}{$server}{status};
				my $host_name = $anvil->data->{cib}{parsed}{data}{server}{$server}{host_name};
				my $role      = $anvil->data->{cib}{parsed}{data}{server}{$server}{role};
				my $active    = $anvil->data->{cib}{parsed}{data}{server}{$server}{active};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					's1:server'    => $server,
					's2:status'    => $status,
					's3:host_name' => $host_name,
					's4:role'      => $role,
					's5:active'    => $active,
				}});
				next if lc($role) eq "stopped";
				
				if (lc($role) eq "migrating") {
					# No matter what, if a server is migrating, we wait.
					$waiting = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
					
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0315", variables => { server => $server }});
					$anvil->Job->update_progress({progress => 20, message => "job_0315,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
				}
				elsif ($host_name eq $local_name) {
					# Something is running here.
					$waiting = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
					
					# This is ours. How shall we deal with it?
					if ($anvil->data->{switches}{'stop-servers'}) {
						# Have we tried to stop it already? If not, use pcs. If so, 
						# and if it's been more than 120 seconds, use virsh to try 
						# again.
						if (not exists $anvil->data->{server_shutdown}{$server}) {
							# Use PCS.
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0316", variables => { server => $server }});
							$anvil->Job->update_progress({progress => 20, message => "job_0316,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
							$anvil->Cluster->shutdown_server({
								debug  => 2,
								server => $server,
								'wait' => 0,
							});
							$anvil->data->{server_shutdown}{$server}{pcs_called}    = 1;
							$anvil->data->{server_shutdown}{$server}{virsh_called}  = 0;
							$anvil->data->{server_shutdown}{$server}{call_virsh_at} = time + 120;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
								"server_shutdown::${server}::pcs_called"    => $anvil->data->{server_shutdown}{$server}{pcs_called},
								"server_shutdown::${server}::virsh_called"  => $anvil->data->{server_shutdown}{$server}{virsh_called},
								"server_shutdown::${server}::call_virsh_at" => $anvil->data->{server_shutdown}{$server}{call_virsh_at},
							}});
						}
						elsif ((not $anvil->data->{server_shutdown}{$server}{virsh_called}) && (time > $anvil->data->{server_shutdown}{$server}{call_virsh_at})) {
							# Use virsh
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0317", variables => { server => $server }});
							$anvil->Job->update_progress({progress => 20, message => "job_0317,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
							$anvil->Server->shutdown_virsh({
								debug     => 2,
								server    => $server,
								wait_time => 1,
							});
							$anvil->data->{server_shutdown}{$server}{virsh_called} = 1;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
								"server_shutdown::${server}::virsh_called" => $anvil->data->{server_shutdown}{$server}{virsh_called},
							}});
						}
					}
					else {
						### TODO: Calculate how many gigs worth of RAM we'll migrate,
						###       and advance the "progress" by the percentage each 
						###       server's RAM represents of the total
						# Migrate the servers.
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0318", variables => {
							server => $server,
							node   => $peer_name,
						}});
						$anvil->Job->update_progress({progress => 20, message => "job_0318,!!server!".$server."!!,!!node!".$peer_name."!!"}) if $anvil->data->{switches}{'job-uuid'};
						$anvil->Cluster->migrate_server({
							server => $server,
							node   => $peer_name,
							'wait' => 1,
						});
					}
				}
			}
		}
		if ($waiting) {
			sleep 5;
		}
	}
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0319"});
	$anvil->Job->update_progress({progress => 30, message => "job_0319"}) if $anvil->data->{switches}{'job-uuid'};
	
	return(0);
}

# This watches DRBD and waits for us to not be SyncSource, then takes all DRBD resources down.
# 
# The DRBD status is re-read every ten seconds for as long as any volume on any connection of any local 
# resource reports a replication state matching 'SyncSource'. Once none do, 'drbdadm down all' is called.
# 
# Parameters;
#  $anvil - The Anvil::Tools handle.
# 
# Returns '0'.
sub wait_on_drbd
{
	my ($anvil) = @_;
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0322"});
	$anvil->Job->update_progress({progress => 40, message => "job_0322"}) if $anvil->data->{switches}{'job-uuid'};
	my $short_host_name = $anvil->Get->short_host_name();
	my $waiting         = 1;
	while ($waiting) {
		# (Re)fresh my view of the storage.
		$waiting = 0;
		$anvil->DRBD->get_status({debug => 2});
		
		# Now check to see if anything is sync'ing.
		foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}}) {
			foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}}) {
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_name => $peer_name }});
				# Iterate over the volume names only. Without 'keys', the hash's values (hash
				# references) would be walked as well, and the 'exists' test below would then
				# autovivify bogus 'HASH(0x...)' volume entries.
				foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}}) {
					next if not exists $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
					my $replication_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						volume            => $volume,
						replication_state => $replication_state,
					}});
					
					if ($replication_state =~ /SyncSource/i) {
						# A peer is still sync'ing from us, keep waiting.
						$waiting = 1;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
						
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0312", variables => {
							peer_host => $peer_name,
							resource  => $server_name,
							volume    => $volume,
						}});
						$anvil->Job->update_progress({progress => 50, message => "job_0312,!!peer_host!".$peer_name."!!,!!resource!".$server_name."!!,!!volume!".$volume."!!"}) if $anvil->data->{switches}{'job-uuid'};
					}
				}
			}
		}
		if ($waiting) {
			sleep 10;
		}
	}
	
	# All servers should be down now, so stop DRBD.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0314"});
	$anvil->Job->update_progress({progress => 60, message => "job_0314"}) if $anvil->data->{switches}{'job-uuid'};
	
	my $shell_call = $anvil->data->{path}{exe}{drbdadm}." down all";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
	my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		output      => $output,
		return_code => $return_code,
	}});
	
	return(0);
}