anvil/tools/anvil-safe-stop
Digimer 4800f7181f * Updated ScanCore to boot a node that is off without a stop reason.
* Fixed a bug where anvil-safe-stop was not recording the stop-reason. Also made '--poweroff' an alias for '--power-off'.

Signed-off-by: Digimer <digimer@alteeve.ca>
2021-08-07 14:01:14 -04:00

464 lines
18 KiB
Perl
Executable File

#!/usr/bin/perl
#
# This does shutdown-time tasks; migrate or stop servers, withdraw and power off the host.
#
# Exit codes;
# 0 = Normal exit.
# 1 = Any problem that causes an early exit.
#
# TODO:
#
use strict;
use warnings;
use Anvil::Tools;
require POSIX;
use Data::Dumper;
# Work out this program's name and the directory it was run from. If '$0' has no directory component (ie:
# when called as 'perl anvil-safe-stop'), the patterns below don't match, so we fall back to '$0' and '.'
# rather than leaving these undefined (which would trip warnings here and leave 'source' empty in the logs).
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
$THIS_FILE = $0 if not defined $THIS_FILE;
# The file name is quoted so that any regex metacharacters in it are matched literally.
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0];
$running_directory = "." if not defined $running_directory;
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	# Turn a leading '.' into the directory we were called from.
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $anvil = Anvil::Tools->new();

# Set the default (empty) value of the switches we use, then read the switches that were passed in.
$anvil->data->{switches}{'job-uuid'} = "";
$anvil->data->{switches}{'poweroff'} = "";
$anvil->data->{switches}{'power-off'} = ""; # By default, the node is withdrawn. With this switch, the node will power off as well.
$anvil->data->{switches}{'stop-reason'} = ""; # Optionally used to set 'system::stop_reason' reason for this host. Valid values are 'user', 'power' and 'thermal'.
$anvil->data->{switches}{'stop-servers'} = ""; # Default behaviour is to migrate servers to the peer, if the peer is up. This overrides that and forces hosted servers to shut down.
$anvil->Get->switches;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
	'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'},
	'switches::poweroff' => $anvil->data->{switches}{'poweroff'},
	'switches::power-off' => $anvil->data->{switches}{'power-off'},
	'switches::stop-reason' => $anvil->data->{switches}{'stop-reason'},
	'switches::stop-servers' => $anvil->data->{switches}{'stop-servers'},
}});

# Let 'poweroff' work as a mis-spell of 'power-off'. If both were given, 'power-off' wins.
if (($anvil->data->{switches}{'poweroff'}) && (not $anvil->data->{switches}{'power-off'}))
{
	$anvil->data->{switches}{'power-off'} = $anvil->data->{switches}{'poweroff'};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		'switches::power-off' => $anvil->data->{switches}{'power-off'},
	}});
}

# Record that this program has started.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});
# Only 'root' may run this. '$<' is the real UID and '$>' is the effective UID; having either one be 0 is
# enough to carry on.
if (not (($< == 0) || ($> == 0)))
{
	# Neither UID is root's, tell the user and exit.
	print $anvil->Words->string({key => "error_0005"})."\n";
	$anvil->nice_exit({exit_code => 1});
}

# Connect to the database(s) and report that we tried.
$anvil->Database->connect();
$anvil->Log->entry({
	source  => $THIS_FILE, line => __LINE__,
	'print' => 1,
	level   => 3,
	key     => "log_0132",
});

if (not $anvil->data->{sys}{database}{connections})
{
	# No databases are available. Log the error, sleep for a bit and then exit. The daemon will pick
	# it up and try again after we exit.
	$anvil->Log->entry({
		source   => $THIS_FILE, line => __LINE__,
		'print'  => 1,
		level    => 0,
		priority => "err",
		key      => "error_0075",
	});
	sleep 10;
	$anvil->nice_exit({exit_code => 1});
}
# If we weren't given a job UUID, see if there is a job waiting for this program.
if (not $anvil->data->{switches}{'job-uuid'})
{
	$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}

# If we have a job UUID (passed in or found above), pick up the job and read our switches from its job
# data. Without one, we carry on with the switches given on the command line.
if ($anvil->data->{switches}{'job-uuid'})
{
	# Load the job data and mark the job as picked up by this process.
	$anvil->Job->clear();
	$anvil->Job->get_job_details();
	$anvil->Job->update_progress({
		progress => 1,
		job_picked_up_by => $$,
		job_picked_up_at => time,
		message => "message_0235",
	});
	
	# Guard against 'jobs::job_data' not being defined, so that 'split' is never handed 'undef'.
	my $job_data = defined $anvil->data->{jobs}{job_data} ? $anvil->data->{jobs}{job_data} : "";
	
	# Pull out the job data. Each line is checked for 'switch=value' for the switches that a job may
	# set, and a value found in the job data replaces the one from the command line.
	foreach my $line (split/\n/, $job_data)
	{
		foreach my $switch ("power-off", "stop-reason", "stop-servers")
		{
			if ($line =~ /\Q$switch\E=(.*?)$/)
			{
				$anvil->data->{switches}{$switch} = $1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					"switches::${switch}" => $anvil->data->{switches}{$switch},
				}});
			}
		}
	}
}
# This host must be a member of an Anvil!. Look up the Anvil! UUID and exit with an error if there isn't
# one.
$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid();
if (not $anvil->data->{sys}{anvil_uuid})
{
	$anvil->Log->entry({
		source   => $THIS_FILE, line => __LINE__,
		'print'  => 1,
		level    => 0,
		priority => "err",
		key      => "error_0260",
	});
	$anvil->Job->update_progress({
		progress => 100,
		message  => "error_0260",
	});
	$anvil->nice_exit({exit_code => 1});
}

# When neither the command line nor the job data gave a stop reason, default to 'user'.
if (not $anvil->data->{switches}{'stop-reason'})
{
	$anvil->data->{switches}{'stop-reason'} = "user";
	$anvil->Log->variables({
		source => $THIS_FILE, line => __LINE__,
		level  => 2,
		list   => {
			'switches::stop-reason' => $anvil->data->{switches}{'stop-reason'},
		},
	});
}
# The shutdown sequence; deal with any servers running here (migrate or stop them), wait until we're no
# longer SyncSource for any DRBD volume, then stop pacemaker.
process_servers($anvil);
wait_on_drbd($anvil);
stop_cluster($anvil);

# With the node withdrawn, power off if that was asked for.
if (not $anvil->data->{switches}{'power-off'})
{
	# We're not shutting down, so we're done
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0326"});
	$anvil->Job->update_progress({progress => 100, message => "job_0326"});
}
else
{
	# Mark this host as stopping and close out the job before the power goes away.
	$anvil->Database->update_host_status({
		debug => 2,
		host_uuid => $anvil->Get->host_uuid,
		host_status => "stopping",
	});
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0325"});
	$anvil->Job->update_progress({progress => 100, message => "job_0325"});
	
	# Record why this host is being stopped in the 'system::stop_reason' variable.
	if ($anvil->data->{switches}{'stop-reason'})
	{
		# A reason of 'none' is recorded as an empty string.
		$anvil->data->{switches}{'stop-reason'} = "" if $anvil->data->{switches}{'stop-reason'} eq "none";
		
		my $variable_uuid = $anvil->Database->insert_or_update_variables({
			variable_name => 'system::stop_reason',
			variable_value => $anvil->data->{switches}{'stop-reason'},
			variable_default => '',
			variable_description => 'striker_0279',
			variable_section => 'system',
			variable_source_uuid => $anvil->Get->host_uuid(),
			variable_source_table => 'hosts',
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { variable_uuid => $variable_uuid }});
	}
	
	# Now ask systemd to power the host off.
	my $shell_call = $anvil->data->{path}{exe}{systemctl}." poweroff";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
	my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
	
	# Unlikely we're still alive, but 'poweroff' does return once enqueued, so...
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		output => $output,
		return_code => $return_code,
	}});
}

$anvil->nice_exit({exit_code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
# This withdraws this node from the cluster. It calls 'pcs cluster stop --force' once, then re-checks the
# CIB every five seconds until it can no longer be parsed, which is taken to mean the cluster has stopped
# here.
#
# Parameters;
# anvil - The Anvil::Tools object.
#
# Returns '0'. Note that there is no timeout; this loops until the cluster is seen to be stopped.
sub stop_cluster
{
	my ($anvil) = @_;
	
	# 'pacemaker_stopped' tracks whether the stop has been requested yet, so that it's only called once.
	my $pacemaker_stopped = 0;
	my $waiting = 1;
	while ($waiting)
	{
		$waiting = 0;
		my $problem = $anvil->Cluster->parse_cib({debug => 2});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
		if ($problem)
		{
			# Cluster has stopped. This is the last step before the job is closed out, so the
			# progress must sit above the 70/80 reported while stopping (it used to drop to 5).
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
			$anvil->Job->update_progress({progress => 90, message => "job_0313"});
		}
		else
		{
			# The cluster is still up, so we need to check again after this pass.
			$waiting = 1;
			if (not $pacemaker_stopped)
			{
				# Stop pacemaker now.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0323"});
				$anvil->Job->update_progress({progress => 70, message => "job_0323"});
				
				### NOTE: '--force' is needed or else sole-running nodes can't exit
				###       (complains about the loss of quorum)
				my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster stop --force";
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
				my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					output => $output,
					return_code => $return_code,
				}});
				
				# The stop was requested; the return code is logged but not acted on.
				$pacemaker_stopped = 1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pacemaker_stopped => $pacemaker_stopped }});
			}
			else
			{
				# The stop was already requested, we're waiting for it to finish.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0324"});
				$anvil->Job->update_progress({progress => 80, message => "job_0324"});
			}
		}
		if ($waiting)
		{
			sleep 5;
		}
	}
	
	return(0);
}
# This deals with the servers running on this host. With '--stop-servers' set, they are shut down.
# Otherwise, they are migrated to the peer. It re-checks the CIB every five seconds until no server is
# migrating and none is running here, or until the CIB can't be parsed (the cluster isn't up).
#
# Parameters;
# anvil - The Anvil::Tools object.
#
# Returns '0'. Note that there is no timeout on the wait loop.
sub process_servers
{
	my ($anvil) = @_;
	
	if ($anvil->data->{switches}{'stop-servers'})
	{
		# Tell the user we're about to shut down servers.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0320"});
		$anvil->Job->update_progress({progress => 10, message => "job_0320"});
	}
	else
	{
		# Tell the user we're about to migrate servers.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0321"});
		$anvil->Job->update_progress({progress => 10, message => "job_0321"});
	}
	
	my $waiting = 1;
	while ($waiting)
	{
		# Is the cluster up?
		$waiting = 0;
		my $problem = $anvil->Cluster->parse_cib({debug => 2});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
		if ($problem)
		{
			# Nope, so there is nothing for us to stop or migrate. The progress stays below the
			# 30 reported when this function finishes (it used to jump to 80, then back to 30).
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
			$anvil->Job->update_progress({progress => 20, message => "job_0313"});
		}
		else
		{
			# Loop through the servers running here.
			my $local_name = $anvil->data->{cib}{parsed}{'local'}{name};
			my $peer_name = $anvil->data->{cib}{parsed}{peer}{name};
			foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{server}})
			{
				my $status = $anvil->data->{cib}{parsed}{data}{server}{$server}{status};
				my $host_name = $anvil->data->{cib}{parsed}{data}{server}{$server}{host_name};
				my $role = $anvil->data->{cib}{parsed}{data}{server}{$server}{role};
				my $active = $anvil->data->{cib}{parsed}{data}{server}{$server}{active};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					's1:server' => $server,
					's2:status' => $status,
					's3:host_name' => $host_name,
					's4:role' => $role,
					's5:active' => $active,
				}});
				
				# Stopped servers need nothing from us.
				next if lc($role) eq "stopped";
				
				if (lc($role) eq "migrating")
				{
					# No matter what, if a server is migrating, we wait.
					$waiting = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0315", variables => { server => $server }});
					$anvil->Job->update_progress({progress => 20, message => "job_0315,!!server!".$server."!!"});
					next;
				}
				
				# Servers running elsewhere are not our concern.
				next if $host_name ne $local_name;
				
				# Something is running here, so we will need to check again.
				$waiting = 1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
				
				# This is ours. How shall we deal with it?
				if (not $anvil->data->{switches}{'stop-servers'})
				{
					### TODO: Calculate how many gigs worth of RAM we'll migrate,
					###       and advance the "progress" by the percentage each
					###       server's RAM represents of the total
					# Migrate the server to the peer.
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0318", variables => {
						server => $server,
						node => $peer_name,
					}});
					$anvil->Job->update_progress({progress => 20, message => "job_0318,!!server!".$server."!!,!!node!".$peer_name."!!"});
					$anvil->Cluster->migrate_server({
						server => $server,
						node => $peer_name,
						'wait' => 1,
					});
					next;
				}
				
				# We're stopping servers. Have we tried to stop this one already? If not, use
				# pcs. If so, and if it's been more than 120 seconds since then, use virsh to
				# try again (once).
				if (not exists $anvil->data->{server_shutdown}{$server})
				{
					# Use PCS.
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0316", variables => { server => $server }});
					$anvil->Job->update_progress({progress => 20, message => "job_0316,!!server!".$server."!!"});
					$anvil->Cluster->shutdown_server({
						debug => 2,
						server => $server,
						'wait' => 0,
					});
					
					# Remember that pcs was called, and when to fall back to virsh.
					$anvil->data->{server_shutdown}{$server}{pcs_called} = 1;
					$anvil->data->{server_shutdown}{$server}{virsh_called} = 0;
					$anvil->data->{server_shutdown}{$server}{call_virsh_at} = time + 120;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						"server_shutdown::${server}::pcs_called" => $anvil->data->{server_shutdown}{$server}{pcs_called},
						"server_shutdown::${server}::virsh_called" => $anvil->data->{server_shutdown}{$server}{virsh_called},
						"server_shutdown::${server}::call_virsh_at" => $anvil->data->{server_shutdown}{$server}{call_virsh_at},
					}});
				}
				elsif ((not $anvil->data->{server_shutdown}{$server}{virsh_called}) && (time > $anvil->data->{server_shutdown}{$server}{call_virsh_at}))
				{
					# Use virsh
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0317", variables => { server => $server }});
					$anvil->Job->update_progress({progress => 20, message => "job_0317,!!server!".$server."!!"});
					$anvil->Server->shutdown_virsh({
						debug => 2,
						server => $server,
						wait_time => 1,
					});
					$anvil->data->{server_shutdown}{$server}{virsh_called} = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						"server_shutdown::${server}::virsh_called" => $anvil->data->{server_shutdown}{$server}{virsh_called},
					}});
				}
			}
		}
		if ($waiting)
		{
			sleep 5;
		}
	}
	
	# Nothing is running here (or the cluster isn't up), so we're done with the servers.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0319"});
	$anvil->Job->update_progress({progress => 30, message => "job_0319"});
	
	return(0);
}
# This watches DRBD and waits for us to not be SyncSource for any volume, re-checking every ten seconds.
# Once nothing is sync'ing from us, all DRBD resources are taken down with 'drbdadm down all'.
#
# Parameters;
# anvil - The Anvil::Tools object.
#
# Returns '0'. Note that there is no timeout on the wait loop.
sub wait_on_drbd
{
	my ($anvil) = @_;
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0322"});
	$anvil->Job->update_progress({progress => 40, message => "job_0322"});
	
	# The DRBD status data is stored under our short host name.
	my $short_host_name = $anvil->Get->short_host_name();
	my $waiting = 1;
	while ($waiting)
	{
		# (Re)fresh my view of the storage.
		$waiting = 0;
		$anvil->DRBD->get_status({debug => 2});
		
		# Now check to see if anything is sync'ing.
		foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
		{
			foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}})
			{
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_name => $peer_name }});
				
				# NOTE: 'keys' is required here. Without it, the hash's values (hash references)
				#       are walked as if they were volume names, and the 'exists' check below
				#       then autovivifies a junk 'HASH(0x...)' entry for each of them.
				foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}})
				{
					# Skip volumes that don't report a replication state.
					next if not exists $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
					my $replication_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						volume => $volume,
						replication_state => $replication_state,
					}});
					if ($replication_state =~ /SyncSource/i)
					{
						# The peer is sync'ing from us, so we have to wait.
						$waiting = 1;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0312", variables => {
							peer_host => $peer_name,
							resource => $server_name,
							volume => $volume,
						}});
						$anvil->Job->update_progress({progress => 50, message => "job_0312,!!peer_host!".$peer_name."!!,!!resource!".$server_name."!!,!!volume!".$volume."!!"});
					}
				}
			}
		}
		if ($waiting)
		{
			sleep 10;
		}
	}
	
	# All servers should be down now, so stop DRBD.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0314"});
	$anvil->Job->update_progress({progress => 60, message => "job_0314"});
	
	my $shell_call = $anvil->data->{path}{exe}{drbdadm}." down all";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
	my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		output => $output,
		return_code => $return_code,
	}});
	
	return(0);
}