You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
430 lines
17 KiB
430 lines
17 KiB
#!/usr/bin/perl
#
# This does shutdown-time tasks; migrate or stop servers, withdraw and power off the host.
#
# Exit codes;
# 0 = Normal exit.
# 1 = Any problem that causes an early exit.
#
# TODO:
#
|
|
|
use strict; |
|
use warnings; |
|
use Anvil::Tools; |
|
require POSIX; |
|
use Data::Dumper; |
|
|
|
# Work out this program's file name and the directory it was started from.
#
# When '$0' has no '/' in it (for example, when started as 'perl <program>' from the program's own
# directory), the match below fails. In that case, fall back to '$0' itself for the name and to '.' for the
# directory so that neither variable is left undefined.
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
if (not defined $THIS_FILE)
{
	$THIS_FILE = $0;
}

# The file name is wrapped in '\Q...\E' so that characters like '.' in it are matched literally instead of
# being treated as regular expression metacharacters.
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0];
if (not defined $running_directory)
{
	$running_directory = ".";
}

# If we were called with a relative path, swap the leading '.' for the directory we were called from.
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}
|
|
|
# Unbuffer output so that the pinwheel is drawn while we wait for the SSH call(s) to finish.
$| = 1;

my $anvil = Anvil::Tools->new();

# Start every switch this program understands as an empty string, then let the command line fill them in.
#  job-uuid     - The UUID of the job to process, if any.
#  power-off    - By default, the node is withdrawn. With this switch, the node will power off as well.
#  stop-reason  - Optionally used to set 'system::stop_reason' reason for this host. Valid values are
#                 'user', 'power' and 'thermal'.
#  stop-servers - Default behaviour is to migrate servers to the peer, if the peer is up. This overrides
#                 that and forces hosted servers to shut down.
foreach my $switch ('job-uuid', 'power-off', 'stop-reason', 'stop-servers')
{
	$anvil->data->{switches}{$switch} = "";
}
$anvil->Get->switches;

# Record the switch values we ended up with.
$anvil->Log->variables({
	source => $THIS_FILE,
	line   => __LINE__,
	level  => 2,
	list   => {
		map { ('switches::'.$_ => $anvil->data->{switches}{$_}) } ('job-uuid', 'power-off', 'stop-reason', 'stop-servers')
	},
});

# Announce that this program has started.
$anvil->Log->entry({
	source    => $THIS_FILE,
	line      => __LINE__,
	'print'   => 1,
	level     => 2,
	key       => "log_0115",
	variables => { program => $THIS_FILE },
});
|
|
|
# We need to be 'root' to continue. '$<' is the real UID and '$>' is the effective UID; it is enough for
# either one of them to be '0'.
if (not (($< == 0) || ($> == 0)))
{
	# Not root, tell the user and bail out.
	my $not_root_message = $anvil->Words->string({key => "error_0005"});
	print $not_root_message."\n";
	$anvil->nice_exit({exit_code => 1});
}

# Connect to the database(s) and note that the attempt has been made.
$anvil->Database->connect();
$anvil->Log->entry({
	source  => $THIS_FILE,
	line    => __LINE__,
	'print' => 1,
	level   => 3,
	key     => "log_0132",
});
unless ($anvil->data->{sys}{database}{connections})
{
	# No database is available. Log the error, pause for ten seconds and then exit with an error code.
	# NOTE(review): The original comment says the daemon will pick the job up and try again after we
	#               exit; that behaviour lives outside of this file.
	$anvil->Log->entry({
		source   => $THIS_FILE,
		line     => __LINE__,
		'print'  => 1,
		level    => 0,
		priority => "err",
		key      => "error_0075",
	});
	sleep 10;
	$anvil->nice_exit({exit_code => 1});
}
|
|
|
# If we weren't given a job UUID, see if there is a job waiting for this program.
if (not $anvil->data->{switches}{'job-uuid'})
{
	$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}

# If we have a job UUID (given to us, or found above), mark the job as picked up and let the job's data
# override the switches. If there is still no job UUID, the switches from the command line are used as-is.
if ($anvil->data->{switches}{'job-uuid'})
{
	# Load the job data.
	$anvil->Job->clear();
	$anvil->Job->get_job_details();
	$anvil->Job->update_progress({
		progress         => 1,
		job_picked_up_by => $$,
		job_picked_up_at => time,
		message          => "message_0235",
	});

	# Pull out the job data. Each line is checked for 'switch=value' pairs for the switches that can be
	# set by a job. If no job data was loaded, treat it as empty so that we don't split an undefined
	# value.
	my $job_data = defined $anvil->data->{jobs}{job_data} ? $anvil->data->{jobs}{job_data} : "";
	foreach my $line (split/\n/, $job_data)
	{
		foreach my $switch ('power-off', 'stop-reason', 'stop-servers')
		{
			if ($line =~ /\Q$switch\E=(.*?)$/)
			{
				$anvil->data->{switches}{$switch} = $1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					'switches::'.$switch => $anvil->data->{switches}{$switch},
				}});
			}
		}
	}
}
|
|
|
# We can only do our work if this host is a member of an Anvil!. If it isn't, report the error, close out
# the job and exit.
$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid();
unless ($anvil->data->{sys}{anvil_uuid})
{
	$anvil->Log->entry({
		source   => $THIS_FILE,
		line     => __LINE__,
		'print'  => 1,
		level    => 0,
		priority => "err",
		key      => "error_0260",
	});
	$anvil->Job->update_progress({progress => 100, message => "error_0260"});
	$anvil->nice_exit({exit_code => 1});
}

# When no stop reason was given, default to 'user'.
$anvil->data->{switches}{'stop-reason'} ||= "user";

# Work through the shutdown steps in order;
#  1. Migrate or stop the servers, if any servers are running here.
#  2. Wait on DRBD if we're SyncSource, then take the storage down.
#  3. Stop pacemaker.
process_servers($anvil);
wait_on_drbd($anvil);
stop_cluster($anvil);

# Are we just withdrawing, or powering off as well?
if (not $anvil->data->{switches}{'power-off'})
{
	# We're not shutting down, so we're done.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0326"});
	$anvil->Job->update_progress({progress => 100, message => "job_0326"});
}
else
{
	# Powering off. Mark this host as stopping and close out the job before asking systemd to power
	# the host off.
	$anvil->Database->update_host_status({
		debug       => 2,
		host_uuid   => $anvil->Get->host_uuid,
		host_status => "stopping",
	});

	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0325"});
	$anvil->Job->update_progress({progress => 100, message => "job_0325"});

	my $shell_call = $anvil->data->{path}{exe}{systemctl}." poweroff";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
	my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});

	# Unlikely we're still alive, but 'poweroff' does return once enqueued, so record what came back.
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		output      => $output,
		return_code => $return_code,
	}});
}

$anvil->nice_exit({exit_code => 0});
|
|
|
|
|
#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################
|
|
|
# This stops pacemaker on this host and then waits until the CIB can no longer be parsed, which is taken to
# mean that the cluster has stopped here. Servers and DRBD are handled by 'process_servers' and
# 'wait_on_drbd', which the main program calls before this.
#
# Parameters;
#  $anvil - The Anvil::Tools handle, used for logging, job progress updates and system calls.
#
# Returns '0' once the cluster is seen to be stopped. There is no timeout; this loops, checking every five
# seconds, until then.
sub stop_cluster
{
	my ($anvil) = @_;

	# 'pcs cluster stop' is only called once. On later passes, we just wait for it to take effect.
	my $pacemaker_stopped = 0;
	while (1)
	{
		my $problem = $anvil->Cluster->parse_cib({debug => 2});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
		if ($problem)
		{
			# Cluster has stopped. The progress is set above the values used while stopping (70
			# and 80) so that the job's progress never moves backwards.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
			$anvil->Job->update_progress({progress => 90, message => "job_0313"});
			last;
		}

		if (not $pacemaker_stopped)
		{
			# Stop pacemaker now.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0323"});
			$anvil->Job->update_progress({progress => 70, message => "job_0323"});

			### NOTE: '--force' is needed or else sole-running nodes can't exit
			###       (complains about the loss of quorum)
			my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster stop --force";
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
			my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				output      => $output,
				return_code => $return_code,
			}});

			$pacemaker_stopped = 1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pacemaker_stopped => $pacemaker_stopped }});
		}
		else
		{
			# The stop was already requested, we're still waiting for the cluster to go away.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0324"});
			$anvil->Job->update_progress({progress => 80, message => "job_0324"});
		}

		# The cluster was still up on this pass, so give it a moment before checking again.
		sleep 5;
	}

	return(0);
}
|
|
|
# This migrates the servers running on this host to the peer or, if the 'stop-servers' switch is set, shuts
# them down. It loops, checking every five seconds, until the cluster shows no server running here and no
# server migrating. If the CIB can't be parsed, the cluster is treated as being down and there is nothing
# to do.
#
# Parameters;
#  $anvil - The Anvil::Tools handle, used for logging, job progress updates and cluster/server calls.
#
# Returns '0' when done. There is no timeout on the loop.
#
# NOTE(review): When migrating, the peer's name from the parsed CIB is used without checking that the peer
#               is actually available - confirm that 'Cluster->migrate_server' copes with that.
sub process_servers
{
	my ($anvil) = @_;

	# Tell the user whether we're about to shut down or migrate the servers.
	my $announce_key = $anvil->data->{switches}{'stop-servers'} ? "job_0320" : "job_0321";
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => $announce_key});
	$anvil->Job->update_progress({progress => 10, message => $announce_key});

	# How long to give a 'pcs' requested shutdown before asking 'virsh' to shut the server down as well.
	my $virsh_delay = 120;

	my $waiting = 1;
	while ($waiting)
	{
		# Is the cluster up?
		$waiting = 0;
		my $problem = $anvil->Cluster->parse_cib({debug => 2});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
		if ($problem)
		{
			# Nope, so there is nothing for us to migrate or stop. The progress is kept below
			# the '30' reported at the end of this function so that it never moves backwards.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
			$anvil->Job->update_progress({progress => 20, message => "job_0313"});
			last;
		}

		# Loop through the servers the cluster knows about.
		my $local_name = $anvil->data->{cib}{parsed}{'local'}{name};
		my $peer_name  = $anvil->data->{cib}{parsed}{peer}{name};
		foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{server}})
		{
			my $status    = $anvil->data->{cib}{parsed}{data}{server}{$server}{status};
			my $host_name = $anvil->data->{cib}{parsed}{data}{server}{$server}{host_name};
			my $role      = $anvil->data->{cib}{parsed}{data}{server}{$server}{role};
			my $active    = $anvil->data->{cib}{parsed}{data}{server}{$server}{active};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				's1:server'    => $server,
				's2:status'    => $status,
				's3:host_name' => $host_name,
				's4:role'      => $role,
				's5:active'    => $active,
			}});
			next if lc($role) eq "stopped";

			if (lc($role) eq "migrating")
			{
				# No matter what, if a server is migrating, we wait.
				$waiting = 1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});

				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0315", variables => { server => $server }});
				$anvil->Job->update_progress({progress => 20, message => "job_0315,!!server!".$server."!!"});
			}
			elsif ($host_name eq $local_name)
			{
				# Something is running here, so we'll need to check again after this pass.
				$waiting = 1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});

				# This is ours. How shall we deal with it?
				if ($anvil->data->{switches}{'stop-servers'})
				{
					# Have we tried to stop it already? If not, use pcs. If so, and if
					# it's been more than '$virsh_delay' seconds, use virsh to try
					# again (once).
					if (not exists $anvil->data->{server_shutdown}{$server})
					{
						# Use PCS.
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0316", variables => { server => $server }});
						$anvil->Job->update_progress({progress => 20, message => "job_0316,!!server!".$server."!!"});
						$anvil->Cluster->shutdown_server({
							debug  => 2,
							server => $server,
							'wait' => 0,
						});
						$anvil->data->{server_shutdown}{$server}{pcs_called}    = 1;
						$anvil->data->{server_shutdown}{$server}{virsh_called}  = 0;
						$anvil->data->{server_shutdown}{$server}{call_virsh_at} = time + $virsh_delay;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
							"server_shutdown::${server}::pcs_called"    => $anvil->data->{server_shutdown}{$server}{pcs_called},
							"server_shutdown::${server}::virsh_called"  => $anvil->data->{server_shutdown}{$server}{virsh_called},
							"server_shutdown::${server}::call_virsh_at" => $anvil->data->{server_shutdown}{$server}{call_virsh_at},
						}});
					}
					elsif ((not $anvil->data->{server_shutdown}{$server}{virsh_called}) && (time > $anvil->data->{server_shutdown}{$server}{call_virsh_at}))
					{
						# Use virsh
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0317", variables => { server => $server }});
						$anvil->Job->update_progress({progress => 20, message => "job_0317,!!server!".$server."!!"});
						$anvil->Server->shutdown_virsh({
							debug     => 2,
							server    => $server,
							wait_time => 1,
						});
						$anvil->data->{server_shutdown}{$server}{virsh_called} = 1;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
							"server_shutdown::${server}::virsh_called" => $anvil->data->{server_shutdown}{$server}{virsh_called},
						}});
					}
				}
				else
				{
					### TODO: Calculate how many gigs worth of RAM we'll migrate,
					###       and advance the "progress" by the percentage each
					###       server's RAM represents of the total
					# Migrate the server to the peer.
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0318", variables => {
						server => $server,
						node   => $peer_name,
					}});
					$anvil->Job->update_progress({progress => 20, message => "job_0318,!!server!".$server."!!,!!node!".$peer_name."!!"});
					$anvil->Cluster->migrate_server({
						server => $server,
						node   => $peer_name,
						'wait' => 1,
					});
				}
			}
		}

		if ($waiting)
		{
			sleep 5;
		}
	}

	# Nothing is running here (or migrating) anymore.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0319"});
	$anvil->Job->update_progress({progress => 30, message => "job_0319"});

	return(0);
}
|
|
|
# This watches DRBD and waits for us to not be SyncSource for any volume on any connection. Once nothing
# reports 'SyncSource', all DRBD resources are taken down with 'drbdadm down all'.
#
# Parameters;
#  $anvil - The Anvil::Tools handle, used for logging, job progress updates, DRBD status and system calls.
#
# Returns '0' when done. There is no timeout; the DRBD status is re-read every ten seconds for as long as
# any volume's replication state matches 'SyncSource'.
sub wait_on_drbd
{
	my ($anvil) = @_;

	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0322"});
	$anvil->Job->update_progress({progress => 40, message => "job_0322"});
	my $short_host_name = $anvil->Get->short_host_name();
	my $waiting         = 1;
	while ($waiting)
	{
		# (Re)fresh my view of the storage.
		$waiting = 0;
		$anvil->DRBD->get_status({debug => 2});

		# Now check to see if anything is sync'ing.
		foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
		{
			foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}})
			{
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_name => $peer_name }});

				# Walk the volume numbers only. 'keys' matters here; without it, the
				# hash's values would be looped over as well, and each of those would be
				# added to the volume hash as a bogus entry by the 'exists' check below.
				foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}})
				{
					next if not exists $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
					my $replication_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						volume            => $volume,
						replication_state => $replication_state,
					}});

					if ($replication_state =~ /SyncSource/i)
					{
						# We're SyncSource for this volume, so we need to keep
						# waiting.
						$waiting = 1;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});

						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0312", variables => {
							peer_host => $peer_name,
							resource  => $server_name,
							volume    => $volume,
						}});
						$anvil->Job->update_progress({progress => 50, message => "job_0312,!!peer_host!".$peer_name."!!,!!resource!".$server_name."!!,!!volume!".$volume."!!"});
					}
				}
			}
		}

		if ($waiting)
		{
			sleep 10;
		}
	}

	# All servers should be down now, so stop DRBD.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0314"});
	$anvil->Job->update_progress({progress => 60, message => "job_0314"});

	my $shell_call = $anvil->data->{path}{exe}{drbdadm}." down all";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
	my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		output      => $output,
		return_code => $return_code,
	}});

	return(0);
}
|
|
|