You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
381 lines
15 KiB
381 lines
15 KiB
#!/usr/bin/perl |
|
# |
|
# This does shutdown-time tasks; migrate or stop servers, withdraw and power off the host. |
|
# |
|
# Exit codes; |
|
# 0 = Normal exit. |
|
# 1 = Any problem that causes an early exit. |
|
# |
|
# TODO: |
|
# |
|
|
|
use strict; |
|
use warnings; |
|
use Anvil::Tools; |
|
require POSIX; |
|
use Data::Dumper; |
|
|
|
# Work out this program's file name and the directory it was invoked from. A leading '.' in the directory is
# expanded using the PWD environment variable, when that is set.
my $THIS_FILE         = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $anvil = Anvil::Tools->new();
$anvil->data->{switches}{'job-uuid'}     = "";
$anvil->data->{switches}{'power-off'}    = "";	# By default, the node is withdrawn. With this switch, the node will power off as well.
$anvil->data->{switches}{'stop-reason'}  = "";	# Optionally used to set 'system::stop_reason' reason for this host. Valid values are 'user', 'power' and 'thermal'.
$anvil->data->{switches}{'stop-servers'} = "";	# Default behaviour is to migrate servers to the peer, if the peer is up. This overrides that and forces hosted servers to shut down.
$anvil->Get->switches;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
	'switches::job-uuid'     => $anvil->data->{switches}{'job-uuid'},
	'switches::power-off'    => $anvil->data->{switches}{'power-off'},
	'switches::stop-reason'  => $anvil->data->{switches}{'stop-reason'},
	'switches::stop-servers' => $anvil->data->{switches}{'stop-servers'},
}});

$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});

# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0))
{
	# Not root
	print $anvil->Words->string({key => "error_0005"})."\n";
	$anvil->nice_exit({exit_code => 1});
}

$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
	# No databases; log the error, sleep for a bit and then exit. The daemon will pick the job up and
	# try again after we exit.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0075"});
	sleep 10;
	$anvil->nice_exit({exit_code => 1});
}

# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
	# Look for a job queued for this program.
	$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}

# If we have a job UUID (passed in or just found), mark the job as picked up and let the job data override
# the switches. Without one, the command line switches are used as-is.
if ($anvil->data->{switches}{'job-uuid'})
{
	# Load the job data.
	$anvil->Job->clear();
	$anvil->Job->get_job_details();
	$anvil->Job->update_progress({
		progress         => 1,
		job_picked_up_by => $$,
		job_picked_up_at => time,
		message          => "message_0235",
	});
	
	# Pull out the job data; one 'switch=value' pair per line. Guard against there being no job data so
	# that split() is never handed an undefined value.
	my $job_data = defined $anvil->data->{jobs}{job_data} ? $anvil->data->{jobs}{job_data} : "";
	foreach my $line (split/\n/, $job_data)
	{
		foreach my $switch ("power-off", "stop-reason", "stop-servers")
		{
			if ($line =~ /\Q$switch\E=(.*?)$/)
			{
				$anvil->data->{switches}{$switch} = $1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					"switches::${switch}" => $anvil->data->{switches}{$switch},
				}});
			}
		}
	}
}

# Make sure we're in an Anvil!
$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid();
if (not $anvil->data->{sys}{anvil_uuid})
{
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0260"});
	$anvil->Job->update_progress({progress => 100, message => "error_0260"});
	$anvil->nice_exit({exit_code => 1});
}

# Migrate or stop the servers, if any servers are running here.
process_servers($anvil);

# This waits on DRBD if we're SyncSource
wait_on_drbd($anvil);

# With the servers and storage dealt with, run the cluster step. There must be no bare 'exit' before this
# point, or the call below and the clean exit through nice_exit() are never reached.
stop_cluster($anvil);

$anvil->nice_exit({exit_code => 0});
|
|
|
############################################################################################################# |
|
# Functions # |
|
############################################################################################################# |
|
|
|
# This loops until no server is running on this host and no server is migrating. On each pass the CIB is
# re-parsed; any server found on this host is either shut down (when 'switches::stop-servers' is set) or
# migrated to the peer node.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
# 
# Returns '0' once nothing is left to wait for. If the CIB can't be parsed, the cluster is treated as not
# running and the loop ends immediately.
sub process_servers
{
	my ($anvil) = @_;
	
	# Seconds to give the first shutdown request before trying again, and seconds between CIB checks.
	my $retry_shutdown_after = 120;
	my $recheck_delay        = 5;
	
	my $waiting = 1;
	while ($waiting)
	{
		# Is the cluster up?
		$waiting = 0;
		my $problem = $anvil->Cluster->parse_cib({debug => 2});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
		if ($problem)
		{
			# Nope.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
			$anvil->Job->update_progress({progress => 90, message => "job_0313"});
		}
		else
		{
			# Loop through the servers running here.
			my $local_name = $anvil->data->{cib}{parsed}{'local'}{name};
			my $peer_name  = $anvil->data->{cib}{parsed}{peer}{name};
			foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{server}})
			{
				my $status    = $anvil->data->{cib}{parsed}{data}{server}{$server}{status};
				my $host_name = $anvil->data->{cib}{parsed}{data}{server}{$server}{host_name};
				my $role      = $anvil->data->{cib}{parsed}{data}{server}{$server}{role};
				my $active    = $anvil->data->{cib}{parsed}{data}{server}{$server}{active};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					's1:server'    => $server,
					's2:status'    => $status,
					's3:host_name' => $host_name,
					's4:role'      => $role,
					's5:active'    => $active,
				}});
				
				if (lc($role) eq "migrating")
				{
					# No matter what, if a server is migrating, we wait.
					$waiting = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
					
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0315", variables => { server => $server }});
					$anvil->Job->update_progress({progress => 30, message => "job_0315,!!server!".$server."!!"});
				}
				elsif ($host_name eq $local_name)
				{
					# Something is running here.
					$waiting = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
					
					# This is ours. How shall we deal with it?
					if ($anvil->data->{switches}{'stop-servers'})
					{
						# Have we tried to stop it already? If not, use pcs. If so, and if
						# it's been more than '$retry_shutdown_after' seconds, try again.
						if (not exists $anvil->data->{server_shutdown}{$server})
						{
							# Use PCS.
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0316", variables => { server => $server }});
							$anvil->Job->update_progress({progress => 30, message => "job_0316,!!server!".$server."!!"});
							$anvil->Cluster->shutdown_server({
								debug  => 2,
								server => $server,
								'wait' => 0,
							});
							$anvil->data->{server_shutdown}{$server}{pcs_called}    = 1;
							$anvil->data->{server_shutdown}{$server}{virsh_called}  = 0;
							$anvil->data->{server_shutdown}{$server}{call_virsh_at} = time + $retry_shutdown_after;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
								"server_shutdown::${server}::pcs_called"    => $anvil->data->{server_shutdown}{$server}{pcs_called},
								"server_shutdown::${server}::virsh_called"  => $anvil->data->{server_shutdown}{$server}{virsh_called},
								"server_shutdown::${server}::call_virsh_at" => $anvil->data->{server_shutdown}{$server}{call_virsh_at},
							}});
						}
						elsif ((not $anvil->data->{server_shutdown}{$server}{virsh_called}) && (time > $anvil->data->{server_shutdown}{$server}{call_virsh_at}))
						{
							# Second (and last) attempt.
							# NOTE(review): The log key and flag names say 'virsh', but
							#               this repeats the same Cluster->shutdown_server()
							#               call as the first attempt. Confirm whether a
							#               direct virsh shutdown was intended here.
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0317", variables => { server => $server }});
							$anvil->Job->update_progress({progress => 30, message => "job_0317,!!server!".$server."!!"});
							$anvil->Cluster->shutdown_server({
								debug  => 2,
								server => $server,
								'wait' => 0,
							});
							$anvil->data->{server_shutdown}{$server}{virsh_called} = 1;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
								"server_shutdown::${server}::virsh_called" => $anvil->data->{server_shutdown}{$server}{virsh_called},
							}});
						}
					}
					else
					{
						### TODO: Calculate how many gigs worth of RAM we'll migrate, and
						###       advance the "progress" by the percentage each server's
						###       RAM represents of the total
						# Migrate the servers.
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0318", variables => {
							server => $server,
							node   => $peer_name,
						}});
						$anvil->Job->update_progress({progress => 30, message => "job_0318,!!server!".$server."!!,!!node!".$peer_name."!!"});
						$anvil->Cluster->migrate_server({
							server => $server,
							node   => $peer_name,
							'wait' => 1,
						});
					}
				}
			}
		}
		if ($waiting)
		{
			sleep $recheck_delay;
		}
	}
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0319"});
	$anvil->Job->update_progress({progress => 30, message => "job_0319"});
	
	# Hand control back to the caller; exiting the program here would skip every later shutdown step.
	return(0);
}
|
|
|
# This blocks until the CIB can be parsed and both the local node and the peer node report 'ready' in it,
# re-checking every ten seconds and updating the job progress on each pass.
# 
# NOTE(review): Despite the name, nothing in this function stops pacemaker or withdraws the node yet; it only
#               waits. It will also wait forever if the cluster or the peer never becomes ready. Confirm
#               the intended behaviour before relying on it during a shutdown.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
# 
# Returns '0'.
sub stop_cluster
{
	my ($anvil) = @_;
	
	# Wait until both nodes are ready in the cluster.
	my $waiting = 1;
	while($waiting)
	{
		my $problem = $anvil->Cluster->parse_cib({debug => 2});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
		if (not $problem)
		{
			# Each node's state is stored under its own name, so the peer's state must be read
			# using the peer's name, not ours.
			my $local_name  = $anvil->data->{cib}{parsed}{'local'}{name};
			my $peer_name   = $anvil->data->{cib}{parsed}{peer}{name};
			my $local_ready = $anvil->data->{cib}{parsed}{data}{node}{$local_name}{node_state}{ready};
			my $peer_ready  = $anvil->data->{cib}{parsed}{data}{node}{$peer_name}{node_state}{ready};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				local_name  => $local_name,
				peer_name   => $peer_name,
				local_ready => $local_ready,
				peer_ready  => $peer_ready,
			}});
			if (($local_ready) && ($peer_ready))
			{
				# We're good.
				$waiting = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0295"});
				$anvil->Job->update_progress({progress => 15, message => "job_0295"});
			}
			else
			{
				# One or both nodes are not online yet.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0296", variables => {
					local_name  => $local_name,
					peer_name   => $peer_name,
					local_ready => $local_ready,
					peer_ready  => $peer_ready,
				}});
				$anvil->Job->update_progress({progress => 10, message => "job_0296,!!local_name!".$local_name."!!,!!peer_name!".$peer_name."!!,!!local_ready!".$local_ready."!!,!!peer_ready!".$peer_ready."!!"});
			}
		}
		else
		{
			# Cluster hasn't started.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0277"});
			$anvil->Job->update_progress({progress => 5, message => "job_0277"});
		}
		if ($waiting)
		{
			sleep 10;
		}
	}
	
	return(0);
}
|
|
|
# This watches DRBD and waits for us to not be SyncSource. The DRBD status is refreshed every ten seconds
# and every volume of every connection of every local resource is checked; as long as any of them has a
# replication state matching 'SyncSource', we keep waiting.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
# 
# Returns '0'.
sub wait_on_drbd
{
	my ($anvil) = @_;
	
	my $short_host_name = $anvil->Get->short_host_name();
	my $waiting         = 1;
	while ($waiting)
	{
		# (Re)fresh my view of the storage.
		$waiting = 0;
		$anvil->DRBD->get_status({debug => 2});
		
		# Now check to see if anything is sync'ing.
		foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
		{
			foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}})
			{
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_name => $peer_name }});
				
				# Walk the volume keys only. Sorting the bare hash would also feed the values
				# through the loop, and the 'exists' test below would then autovivify junk
				# entries in the volume hash.
				foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}})
				{
					next if not exists $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
					my $replication_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server_name}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						volume            => $volume,
						replication_state => $replication_state,
					}});
					
					if ($replication_state =~ /SyncSource/i)
					{
						# A peer is still copying data from us, so keep waiting.
						$waiting = 1;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
						
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0312", variables => {
							peer_host => $peer_name,
							resource  => $server_name,
							volume    => $volume,
						}});
						$anvil->Job->update_progress({progress => 30, message => "job_0312,!!peer_host!".$peer_name."!!,!!resource!".$server_name."!!,!!volume!".$volume."!!"});
					}
				}
			}
		}
		if ($waiting)
		{
			sleep 10;
		}
	}
	
	# All servers should be down now, so stop DRBD.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0314"});
	$anvil->Job->update_progress({progress => 50, message => "job_0314"});
	
	# NOTE(review): The 'drbdadm down all' call is assembled and logged here, but it is never actually
	#               run, so DRBD is left up. Confirm whether the call still needs to be executed.
	my $shell_call = $anvil->data->{path}{exe}{drbdadm}." down all";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
	
	return(0);
}
|
|
|
|