#!/usr/bin/perl
# 
# This does shutdown-time tasks; migrate or stop servers, withdraw and power off the host.
# 
# Exit codes;
# 0 = Normal exit.
# 1 = Any problem that causes an early exit.
# 
# TODO: 'stop-reason' is read and defaulted below, but nothing in this file acts on it afterwards. Confirm
#       where 'system::stop_reason' is meant to be recorded.
# 

use strict;
use warnings;
use Anvil::Tools;
require POSIX;
use Data::Dumper;

my $THIS_FILE         = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD})) {
    $running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $anvil = Anvil::Tools->new();

# Switch defaults; these are set before 'Get->switches' is called.
$anvil->data->{switches}{'job-uuid'}     = "";
$anvil->data->{switches}{'power-off'}    = "";    # By default, the node is withdrawn. With this switch, the node will power off as well.
$anvil->data->{switches}{'stop-reason'}  = "";    # Optionally used to set 'system::stop_reason' reason for this host. Valid values are 'user', 'power' and 'thermal'.
$anvil->data->{switches}{'stop-servers'} = "";    # Default behaviour is to migrate servers to the peer, if the peer is up. This overrides that and forces hosted servers to shut down.
$anvil->Get->switches;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
    'switches::job-uuid'     => $anvil->data->{switches}{'job-uuid'},
    'switches::power-off'    => $anvil->data->{switches}{'power-off'},
    'switches::stop-reason'  => $anvil->data->{switches}{'stop-reason'},
    'switches::stop-servers' => $anvil->data->{switches}{'stop-servers'},
}});

$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});

# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
# We only refuse to run when NEITHER of them is root.
if (($< != 0) && ($> != 0)) {
    # Not root
    print $anvil->Words->string({key => "error_0005"})."\n";
    $anvil->nice_exit({exit_code => 1});
}

$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 3, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections}) {
    # No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try 
    # again after we exit.
    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0075"});
    sleep 10;
    $anvil->nice_exit({exit_code => 1});
}

# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'}) {
    # Load the job data.
    $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}

# If we have a job UUID (passed in or found above), mark the job as picked up and let the job's data 
# override the switches. Without a job UUID, the command line switches are used as-is.
if ($anvil->data->{switches}{'job-uuid'}) {
    # Load the job data.
    $anvil->Job->clear();
    $anvil->Job->get_job_details();
    $anvil->Job->update_progress({
        progress         => 1,
        job_picked_up_by => $$,
        job_picked_up_at => time,
        message          => "message_0235",
    });

    # Pull out the job data. Each line is checked for each '<switch>=<value>' pair, in the same order
    # the switches are listed here.
    my $job_data = defined $anvil->data->{jobs}{job_data} ? $anvil->data->{jobs}{job_data} : "";
    foreach my $line (split/\n/, $job_data) {
        foreach my $switch ('power-off', 'stop-reason', 'stop-servers') {
            if ($line =~ /\Q$switch\E=(.*?)$/) {
                $anvil->data->{switches}{$switch} = $1;
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
                    "switches::".$switch => $anvil->data->{switches}{$switch},
                }});
            }
        }
    }
}

# Make sure we're in an Anvil!
$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid();
if (not $anvil->data->{sys}{anvil_uuid}) {
    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0260"});
    $anvil->Job->update_progress({progress => 100, message => "error_0260"});
    $anvil->nice_exit({exit_code => 1});
}

# If no stop-reason was set, set it to 'user'
if (not $anvil->data->{switches}{'stop-reason'}) {
    $anvil->data->{switches}{'stop-reason'} = "user";
}

# Migrate or stop the servers, if any servers are running here.
process_servers($anvil);

# This waits on DRBD if we're SyncSource
wait_on_drbd($anvil);

# This stops pacemaker
stop_cluster($anvil);

# Are we powering off?
if ($anvil->data->{switches}{'power-off'}) {
    # Yup
    $anvil->Database->update_host_status({
        debug       => 2,
        host_uuid   => $anvil->Get->host_uuid,
        host_status => "stopping",
    });

    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0325"});
    $anvil->Job->update_progress({progress => 100, message => "job_0325"});

    my $shell_call = $anvil->data->{path}{exe}{systemctl}." poweroff";
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
    my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});

    # Unlikely we're still alive, but 'poweroff' does return once enqueued, so...
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
        output      => $output,
        return_code => $return_code,
    }});
}
else {
    # We're not shutting down, so we're done
    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0326"});
    $anvil->Job->update_progress({progress => 100, message => "job_0326"});
}

$anvil->nice_exit({exit_code => 0});


#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################

# This stops pacemaker (withdrawing this node from the cluster) and waits until the CIB can no longer be 
# parsed, which is taken to mean the cluster has stopped.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
# 
# Returns '0' once the cluster is seen as stopped. This loops (sleeping 5 seconds between checks) until then.
sub stop_cluster {
    my ($anvil) = @_;

    # 'pcs cluster stop' is only called once; after that we just poll until the cluster is gone.
    my $pacemaker_stopped = 0;
    my $waiting           = 1;
    while ($waiting) {
        $waiting = 0;
        my $problem = $anvil->Cluster->parse_cib({debug => 2});
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
        if ($problem) {
            # Cluster has stopped. This is the last step before the final (100%) update, so the
            # progress is kept above the values used while stopping (70) and waiting (80).
            $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
            $anvil->Job->update_progress({progress => 90, message => "job_0313"});
        }
        else {
            # The cluster is still up, so we'll need to check again.
            $waiting = 1;
            if (not $pacemaker_stopped) {
                # Stop pacemaker now.
                $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0323"});
                $anvil->Job->update_progress({progress => 70, message => "job_0323"});

                ### NOTE: '--force' is needed or else sole-running nodes can't exit 
                ###       (complains about the loss of quorum)
                my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster stop --force";
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});

                my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
                    output      => $output,
                    return_code => $return_code,
                }});

                $pacemaker_stopped = 1;
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pacemaker_stopped => $pacemaker_stopped }});
            }
            else {
                # The stop was already requested, we're just waiting for it to finish.
                $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0324"});
                $anvil->Job->update_progress({progress => 80, message => "job_0324"});
            }
        }
        if ($waiting) {
            sleep 5;
        }
    }

    return(0);
}

# This will migrate (default) or stop (when 'switches::stop-servers' is set) the servers running on this 
# host, and loops until no server is running here or migrating.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
# 
# Returns '0' when there is nothing left to wait for, which includes the case where the CIB can't be parsed 
# (the cluster is not running).
sub process_servers {
    my ($anvil) = @_;

    if ($anvil->data->{switches}{'stop-servers'}) {
        # Tell the user we're about to shut down servers.
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0320"});
        $anvil->Job->update_progress({progress => 10, message => "job_0320"});
    }
    else {
        # Tell the user we're about to migrate servers.
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0321"});
        $anvil->Job->update_progress({progress => 10, message => "job_0321"});
    }

    my $waiting = 1;
    while ($waiting) {
        # Is the cluster up?
        $waiting = 0;
        my $problem = $anvil->Cluster->parse_cib({debug => 2});
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
        if ($problem) {
            # Nope. The progress stays within this function's 10 ~ 30 range so that it doesn't
            # jump ahead and then fall back when 'job_0319' is reported below.
            $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
            $anvil->Job->update_progress({progress => 20, message => "job_0313"});
        }
        else {
            # Loop through the servers running here.
            my $local_name = $anvil->data->{cib}{parsed}{'local'}{name};
            my $peer_name  = $anvil->data->{cib}{parsed}{peer}{name};
            foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{server}}) {
                my $status    = $anvil->data->{cib}{parsed}{data}{server}{$server}{status};
                my $host_name = $anvil->data->{cib}{parsed}{data}{server}{$server}{host_name};
                my $role      = $anvil->data->{cib}{parsed}{data}{server}{$server}{role};
                my $active    = $anvil->data->{cib}{parsed}{data}{server}{$server}{active};
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
                    's1:server'    => $server,
                    's2:status'    => $status,
                    's2:host_name' => $host_name,
                    's4:role'      => $role,
                    's5:active'    => $active,
                }});

                # Stopped servers need no attention.
                next if lc($role) eq "stopped";

                if (lc($role) eq "migrating") {
                    # No matter what, if a server is migrating, we wait.
                    $waiting = 1;
                    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});

                    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0315", variables => { server => $server }});
                    $anvil->Job->update_progress({progress => 20, message => "job_0315,!!server!".$server."!!"});
                }
                elsif ($host_name eq $local_name) {
                    # Something is running here.
                    $waiting = 1;
                    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});

                    # This is ours. How shall we deal with it?
                    if ($anvil->data->{switches}{'stop-servers'}) {
                        # Have we tried to stop it already? If not, use pcs. If so,
                        # and if it's been more than 120 seconds, use virsh to try
                        # again (once).
                        if (not exists $anvil->data->{server_shutdown}{$server}) {
                            # Use PCS.
                            $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0316", variables => { server => $server }});
                            $anvil->Job->update_progress({progress => 20, message => "job_0316,!!server!".$server."!!"});
                            $anvil->Cluster->shutdown_server({
                                debug  => 2,
                                server => $server,
                                'wait' => 0,
                            });
                            $anvil->data->{server_shutdown}{$server}{pcs_called}    = 1;
                            $anvil->data->{server_shutdown}{$server}{virsh_called}  = 0;
                            $anvil->data->{server_shutdown}{$server}{call_virsh_at} = time + 120;
                            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
                                "server_shutdown::${server}::pcs_called"    => $anvil->data->{server_shutdown}{$server}{pcs_called},
                                "server_shutdown::${server}::virsh_called"  => $anvil->data->{server_shutdown}{$server}{virsh_called},
                                "server_shutdown::${server}::call_virsh_at" => $anvil->data->{server_shutdown}{$server}{call_virsh_at},
                            }});
                        }
                        elsif ((not $anvil->data->{server_shutdown}{$server}{virsh_called}) && (time > $anvil->data->{server_shutdown}{$server}{call_virsh_at})) {
                            # Use virsh
                            $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0317", variables => { server => $server }});
                            $anvil->Job->update_progress({progress => 20, message => "job_0317,!!server!".$server."!!"});
                            $anvil->Server->shutdown_virsh({
                                debug     => 2,
                                server    => $server,
                                wait_time => 1,
                            });
                            $anvil->data->{server_shutdown}{$server}{virsh_called} = 1;
                            $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
                                "server_shutdown::${server}::virsh_called" => $anvil->data->{server_shutdown}{$server}{virsh_called},
                            }});
                        }
                    }
                    else {
                        ### TODO: Calculate how many gigs worth of RAM we'll migrate,
                        ###       and advance the "progress" by the percentage each
                        ###       server's RAM represents of the total
                        # Migrate the servers.
                        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0318", variables => {
                            server => $server,
                            node   => $peer_name,
                        }});
                        $anvil->Job->update_progress({progress => 20, message => "job_0318,!!server!".$server."!!,!!node!".$peer_name."!!"});
                        $anvil->Cluster->migrate_server({
                            server => $server,
                            node   => $peer_name,
                            'wait' => 1,
                        });
                    }
                }
            }
        }
        if ($waiting) {
            sleep 5;
        }
    }

    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0319"});
    $anvil->Job->update_progress({progress => 30, message => "job_0319"});

    return(0);
}

# This watches DRBD and waits for us to not be SyncSource for any volume, then takes all DRBD resources down.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
# 
# Returns '0' after 'drbdadm down all' has been called. This loops (sleeping 10 seconds between checks) for 
# as long as any volume's replication state matches 'SyncSource'.
sub wait_on_drbd {
    my ($anvil) = @_;

    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0322"});
    $anvil->Job->update_progress({progress => 40, message => "job_0322"});

    my $short_host_name = $anvil->Get->short_host_name();
    my $waiting         = 1;
    while ($waiting) {
        # (Re)fresh my view of the storage.
        $waiting = 0;
        $anvil->DRBD->get_status({debug => 2});

        # Now check to see if anything is sync'ing.
        foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}}) {
            foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}}) {
                $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_name => $peer_name }});

                my $connection = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name};

                # Only the hash keys are volume identifiers. Iterating the hash itself would
                # also walk its values, and the 'exists' test below would then create junk
                # entries keyed on the stringified value.
                foreach my $volume (sort {$a cmp $b} keys %{$connection->{volume}}) {
                    next if not exists $connection->{volume}{$volume}{'replication-state'};
                    my $replication_state = $connection->{volume}{$volume}{'replication-state'};
                    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
                        volume            => $volume,
                        replication_state => $replication_state,
                    }});

                    if ($replication_state =~ /SyncSource/i) {
                        # A peer is still reading from us, so we can't go down yet.
                        $waiting = 1;
                        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});

                        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0312", variables => {
                            peer_host => $peer_name,
                            resource  => $server_name,
                            volume    => $volume,
                        }});
                        $anvil->Job->update_progress({progress => 50, message => "job_0312,!!peer_host!".$peer_name."!!,!!resource!".$server_name."!!,!!volume!".$volume."!!"});
                    }
                }
            }
        }
        if ($waiting) {
            sleep 10;
        }
    }

    # All servers should be down (or migrated) now and nothing is sync'ing from us, so stop DRBD.
    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0314"});
    $anvil->Job->update_progress({progress => 60, message => "job_0314"});

    my $shell_call = $anvil->data->{path}{exe}{drbdadm}." down all";
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});

    my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
    $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
        output      => $output,
        return_code => $return_code,
    }});

    return(0);
}