anvil/tools/anvil-safe-stop
Digimer f833c311ba * To address issues with scancore debugging, we needed a tool to purge old anvils and hosts from the database. The 'test.pl' in this commit contains the new logic that will be merged into tools/striker-purge-host shortly.
* Created Database->find_host_uuid_columns() and ->_find_column() to create a list of tables and column names in the proper order to allow deletion of foreign keys to that deeply nested primary keys can be deleted. Specifically, this was meant for hosts -> host_uuid and anvils -> anvil_uuid, though it should work for other tables.
* Updated html/jquery-ui-1.12.1/package.json to address CVE-2020-7729
* Fixed a bug in the temperature table's history procedure where temperature_weight wasn't being copied.
* Updated anvil-provision-server to support '--anvil' that can take either the anvil-uuid or anvil-name.
* Updated anvil-safe-stop to default the stop-reason to 'user'.

Signed-off-by: Digimer <digimer@alteeve.ca>
2021-05-08 02:02:46 -04:00

431 lines
17 KiB
Perl
Executable File

#!/usr/bin/perl
#
# This does shutdown-time tasks; migrate or stop servers, withdraw and power off the host.
#
# Exit codes;
# 0 = Normal exit.
# 1 = Any problem that causes an early exit.
#
# TODO:
#
use strict;
use warnings;
use Anvil::Tools;
require POSIX;
use Data::Dumper;
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
$running_directory =~ s/^\./$ENV{PWD}/;
}
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
my $anvil = Anvil::Tools->new();
$anvil->data->{switches}{'job-uuid'} = "";
$anvil->data->{switches}{'power-off'} = ""; # By default, the node is withdrawn. With this switch, the node will power off as well.
$anvil->data->{switches}{'stop-reason'} = ""; # Optionally used to set 'system::stop_reason' reason for this host. Valid values are 'user', 'power' and 'thermal'.
$anvil->data->{switches}{'stop-servers'} = ""; # Default behaviour is to migrate servers to the peer, if the peer is up. This overrides that and forces hosted servers to shut down.
$anvil->Get->switches;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'},
'switches::power-off' => $anvil->data->{switches}{'power-off'},
'switches::stop-reason' => $anvil->data->{switches}{'stop-reason'},
'switches::stop-servers' => $anvil->data->{switches}{'stop-servers'},
}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});
# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0))
{
# Not root
print $anvil->Words->string({key => "error_0005"})."\n";
$anvil->nice_exit({exit_code => 1});
}
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 3, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try
# again after we exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0075"});
sleep 10;
$anvil->nice_exit({exit_code => 1});
}
# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
# If we still don't have a job-uuit, go into interactive mode.
if ($anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->Job->clear();
$anvil->Job->get_job_details();
$anvil->Job->update_progress({
progress => 1,
job_picked_up_by => $$,
job_picked_up_at => time,
message => "message_0235",
});
# Pull out the job data.
foreach my $line (split/\n/, $anvil->data->{jobs}{job_data})
{
if ($line =~ /power-off=(.*?)$/)
{
$anvil->data->{switches}{'power-off'} = $1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::power-off' => $anvil->data->{switches}{'power-off'},
}});
}
if ($line =~ /stop-reason=(.*?)$/)
{
$anvil->data->{switches}{'stop-reason'} = $1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::stop-reason' => $anvil->data->{switches}{'stop-reason'},
}});
}
if ($line =~ /stop-servers=(.*?)$/)
{
$anvil->data->{switches}{'stop-servers'} = $1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::stop-servers' => $anvil->data->{switches}{'stop-servers'},
}});
}
}
}
# Make sure we're in an Anvil!
$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid();
if (not $anvil->data->{sys}{anvil_uuid})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0260"});
$anvil->Job->update_progress({progress => 100, message => "error_0260"});
$anvil->nice_exit({exit_code => 1});
}
# If no stop-reason was set, set it to 'user'
if (not $anvil->data->{switches}{'stop-reason'})
{
$anvil->data->{switches}{'stop-reason'} = "user";
}
# Migrate or stop the servers, if any servers are running here.
process_servers($anvil);
# This waits on DRBD if we're SyncSource
wait_on_drbd($anvil);
# This stops pacemaker
stop_cluster($anvil);
# Are we powering off?
if ($anvil->data->{switches}{'power-off'})
{
# Yup
$anvil->Database->update_host_status({
debug => 2,
host_uuid => $anvil->Get->host_uuid,
host_status => "stopping",
});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0325"});
$anvil->Job->update_progress({progress => 100, message => "job_0325"});
my $shell_call = $anvil->data->{path}{exe}{systemctl}." poweroff";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
# Unlikely we're still alive, but 'poweroff' does return once enqueued, so...
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
}
else
{
# We're not shutting down, so we're done
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0326"});
$anvil->Job->update_progress({progress => 100, message => "job_0326"});
}
$anvil->nice_exit({exit_code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
# This takes down or migrates VMs, then withdraws from the cluster.
sub stop_cluster
{
my ($anvil) = @_;
# We need to rename the server in the cluster, and we need both nodes up to do it.
my $pacemaker_stopped = 0;
my $waiting = 1;
while($waiting)
{
$waiting = 0;
my $problem = $anvil->Cluster->parse_cib({debug => 2});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
if ($problem)
{
# Cluster has stopped.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
$anvil->Job->update_progress({progress => 5, message => "job_0313"});
}
else
{
$waiting = 1;
if (not $pacemaker_stopped)
{
# Stop pacemaker now.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0323"});
$anvil->Job->update_progress({progress => 70, message => "job_0323"});
### NOTE: '--force' is needed or else sole-running nodes can't exit
### (complains about the loss of quorum)
my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster stop --force";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
$pacemaker_stopped = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pacemaker_stopped => $pacemaker_stopped }});
}
else
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0324"});
$anvil->Job->update_progress({progress => 80, message => "job_0324"});
}
}
if ($waiting)
{
sleep 5;
}
}
return(0);
}
# This will migrate or stop
sub process_servers
{
my ($anvil) = @_;
if ($anvil->data->{switches}{'stop-servers'})
{
# Tell the user we're about to shut down servers.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0320"});
$anvil->Job->update_progress({progress => 10, message => "job_0320"});
}
else
{
# Tell the user we're about to migrate servers.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0321"});
$anvil->Job->update_progress({progress => 10, message => "job_0321"});
}
my $waiting = 1;
while ($waiting)
{
# Is the cluster up?
$waiting = 0;
my $problem = $anvil->Cluster->parse_cib({debug => 2});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
if ($problem)
{
# Nope.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
$anvil->Job->update_progress({progress => 80, message => "job_0313"});
}
else
{
# Loop through the servers running here.
my $local_name = $anvil->data->{cib}{parsed}{'local'}{name};
my $peer_name = $anvil->data->{cib}{parsed}{peer}{name};
foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{server}})
{
my $status = $anvil->data->{cib}{parsed}{data}{server}{$server}{status};
my $host_name = $anvil->data->{cib}{parsed}{data}{server}{$server}{host_name};
my $role = $anvil->data->{cib}{parsed}{data}{server}{$server}{role};
my $active = $anvil->data->{cib}{parsed}{data}{server}{$server}{active};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:server' => $server,
's2:status' => $status,
's2:host_name' => $host_name,
's4:role' => $role,
's5:active' => $active,
}});
next if lc($role) eq "stopped";
if (lc($role) eq "migrating")
{
# No matter what, if a server is migrating, we wait.
$waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0315", variables => { server => $server }});
$anvil->Job->update_progress({progress => 20, message => "job_0315,!!server!".$server."!!"});
}
elsif ($host_name eq $local_name)
{
# Something is running here.
$waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
# This is ours. How shall we deal with it?
if ($anvil->data->{switches}{'stop-servers'})
{
# Have we tried to stop it already? If not, use pcs. If so,
# and if it's been more that 60 seconds, use virsh to try
# again.
if (not exists $anvil->data->{server_shutdown}{$server})
{
# Use PCS.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0316", variables => { server => $server }});
$anvil->Job->update_progress({progress => 20, message => "job_0316,!!server!".$server."!!"});
$anvil->Cluster->shutdown_server({
debug => 2,
server => $server,
'wait' => 0,
});
$anvil->data->{server_shutdown}{$server}{pcs_called} = 1;
$anvil->data->{server_shutdown}{$server}{virsh_called} = 0;
$anvil->data->{server_shutdown}{$server}{call_virsh_at} = time + 120;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"server_shutdown::${server}::pcs_called" => $anvil->data->{server_shutdown}{$server}{pcs_called},
"server_shutdown::${server}::virsh_called" => $anvil->data->{server_shutdown}{$server}{virsh_called},
"server_shutdown::${server}::call_virsh_at" => $anvil->data->{server_shutdown}{$server}{call_virsh_at},
}});
}
elsif ((not $anvil->data->{server_shutdown}{$server}{virsh_called}) && (time > $anvil->data->{server_shutdown}{$server}{call_virsh_at}))
{
# Use virsh
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0317", variables => { server => $server }});
$anvil->Job->update_progress({progress => 20, message => "job_0317,!!server!".$server."!!"});
$anvil->Server->shutdown_virsh({
debug => 2,
server => $server,
wait_time => 1,
});
$anvil->data->{server_shutdown}{$server}{virsh_called} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"server_shutdown::${server}::virsh_called" => $anvil->data->{server_shutdown}{$server}{virsh_called},
}});
}
}
else
{
### TODO: Calculate how many gigs worth of RAM we'll migrate,
### and advance the "progress" by the percentage each
### server's RAM represents of the total
# Migrate the servers.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0318", variables => {
server => $server,
node => $peer_name,
}});
$anvil->Job->update_progress({progress => 20, message => "job_0318,!!server!".$server."!!,!!node!".$peer_name."!!"});
$anvil->Cluster->migrate_server({
server => $server,
node => $peer_name,
'wait' => 1,
});
}
}
}
}
if ($waiting)
{
sleep 5;
}
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0319"});
$anvil->Job->update_progress({progress => 30, message => "job_0319"});
return(0);
}
# This watches DRBD and waits for us to not be SyncSource.
sub wait_on_drbd
{
my ($anvil) = @_;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0322"});
$anvil->Job->update_progress({progress => 40, message => "job_0322"});
my $short_host_name = $anvil->Get->short_host_name();
my $waiting = 1;
while ($waiting)
{
# (Re)fresh my view of the storage.
$waiting = 0;
$anvil->DRBD->get_status({debug => 2});
# Now check to see if anything is sync'ing.
foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
{
foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_name => $peer_name }});
foreach my $volume (sort {$a cmp $b} %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}})
{
next if not exists $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
my $replication_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
volume => $volume,
replication_state => $replication_state,
}});
if ($replication_state =~ /SyncSource/i)
{
$waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0312", variables => {
peer_host => $peer_name,
resource => $server_name,
volume => $volume,
}});
$anvil->Job->update_progress({progress => 50, message => "job_0312,!!peer_host!".$peer_name."!!,!!resource!".$server_name."!!,!!volume!".$volume."!!"});
}
}
}
}
if ($waiting)
{
sleep 10;
}
}
# All servers should be down now, so stop DRBD.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0314"});
$anvil->Job->update_progress({progress => 60, message => "job_0314"});
my $shell_call = $anvil->data->{path}{exe}{drbdadm}." down all";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
return(0);
}