anvil/tools/anvil-boot-server
digimer 895f1ec262 This fixes a race condition when multiple servers are provisioned at (nearly) the same time.
* In DRBD->get_next_resource(), implemented a "hold" system where the DRBD minor and TCP port(s) returned are marked as being held for one minute. So subsequent calls won't use the same numbers.
* In anvil-daemon, added a check in run_jobs() where only one instance of a given job command will be started per 2-second loop. This should help reduce the chance of simultaneous race conditions in general.
* Removed from anvil-provision-server and most other tools the call to Job->get_job_uuid(). If the program is called without the job_uuid, don't try to find it. This allows a human (or script) to make repeated calls to a program without one of those calls running a pending job instead.

Signed-off-by: digimer <mkelly@alteeve.ca>
2023-04-28 00:19:53 -04:00

483 lines
19 KiB
Perl
Executable File

#!/usr/bin/perl
#
# This program boots a server. It can be called as either a job from the webui or directly from another
# program or a terminal.
#
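# Exit codes;
# 0 = Normal exit.
# 1 = Any error. This includes no database connection, as well as a missing or invalid server, an
#     unsupported host type, not being in an Anvil! system, or a failed boot.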
#
# TODO:
# - Add support for boot ordering.
# - Check which node we want to put on and set a location constraint to prefer that node before calling pcs.
#
use strict;
use warnings;
use Anvil::Tools;
# Work out this program's name and the directory it was called from. When '$0' has no directory component
# (ie: when run as 'perl <program>'), the first regex doesn't match, so fall back to '$0' itself and treat the
# directory as the current one. This keeps 'source => $THIS_FILE' defined in every log call below.
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
   $THIS_FILE = $0 if not defined $THIS_FILE;
# The file name is quoted so that any regex metacharacters in it are matched literally.
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0];
   $running_directory = "." if not defined $running_directory;
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $anvil = Anvil::Tools->new();

# Read the command line switches and log them.
$anvil->Get->switches({list => ["job-uuid", "no-wait", "server", "server-uuid", "wait"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});

$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
	# No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try
	# again after we exit.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0075"});
	sleep 10;
	$anvil->nice_exit({exit_code => 1});
}

# When run as a job, the server to boot comes from the job's data instead of the command line.
if ($anvil->data->{switches}{'job-uuid'})
{
	# Load the job data.
	$anvil->Job->clear();
	$anvil->Job->get_job_details();
	$anvil->Job->update_progress({
		progress         => 1,
		job_picked_up_by => $$,
		job_picked_up_at => time,
		message          => "job_0282",
	});
	
	# Pull out the job data. A job without data is treated as an empty string so that 'split' is never
	# handed an undefined value.
	my $job_data = defined $anvil->data->{jobs}{job_data} ? $anvil->data->{jobs}{job_data} : "";
	foreach my $line (split/\n/, $job_data)
	{
		if ($line =~ /server=(.*?)$/)
		{
			$anvil->data->{switches}{'server'} = $1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				'switches::server' => $anvil->data->{switches}{'server'},
			}});
		}
		if ($line =~ /server-uuid=(.*?)$/)
		{
			$anvil->data->{switches}{'server-uuid'} = $1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				'switches::server-uuid' => $anvil->data->{switches}{'server-uuid'},
			}});
		}
	}
}

# Now check that we have a server. If it's a server_uuid, read the server name.
if ($anvil->data->{switches}{'server-uuid'})
{
	# Convert the server_uuid to a server_name.
	my $query = "SELECT server_name FROM servers WHERE server_uuid = ".$anvil->Database->quote($anvil->data->{switches}{'server-uuid'}).";";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
	
	my $server_name = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__})->[0]->[0];
	   $server_name = "" if not defined $server_name;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_name => $server_name }});
	
	if ($server_name)
	{
		# The name found in the database replaces whatever 'server' held.
		$anvil->data->{switches}{'server'} = $server_name;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			'switches::server' => $anvil->data->{switches}{'server'},
		}});
	}
	else
	{
		# Invalid server UUID.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0269", variables => {
			server_uuid => $anvil->data->{switches}{'server-uuid'},
		}});
		$anvil->Job->update_progress({progress => 100, message => "error_0269,!!server_uuid!".$anvil->data->{switches}{'server-uuid'}."!!"});
		$anvil->nice_exit({exit_code => 1});
	}
}

# Do we have a server name?
if (not $anvil->data->{switches}{'server'})
{
	# Unable to proceed.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0257"});
	$anvil->Job->update_progress({progress => 100, message => "error_0257"});
	$anvil->nice_exit({exit_code => 1});
}

# Are we a node or DR host?
$anvil->data->{sys}{host_type} = $anvil->Get->host_type();
if (($anvil->data->{sys}{host_type} ne "node") && ($anvil->data->{sys}{host_type} ne "dr"))
{
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0258"});
	$anvil->Job->update_progress({progress => 100, message => "error_0258"});
	$anvil->nice_exit({exit_code => 1});
}

### TODO: Add DR support. For now, this only works on Nodes in a cluster
if ($anvil->data->{sys}{host_type} eq "dr")
{
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0265"});
	$anvil->Job->update_progress({progress => 100, message => "error_0265"});
	$anvil->nice_exit({exit_code => 1});
}

# Make sure that we're in an Anvil! system.
$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid();
if (not $anvil->data->{sys}{anvil_uuid})
{
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0260"});
	$anvil->Job->update_progress({progress => 100, message => "error_0260"});
	$anvil->nice_exit({exit_code => 1});
}

# Wait for pacemaker to be up.
wait_for_pacemaker($anvil);

# If 'server' is 'all', boot all servers.
if (lc($anvil->data->{switches}{'server'}) eq "all")
{
	boot_all_servers($anvil);
}
else
{
	# A single server is waited on unless '--no-wait' was used.
	my $wait = $anvil->data->{switches}{'no-wait'} ? 0 : 1;
	boot_server($anvil, $anvil->data->{switches}{'server'}, $wait, 50);
}

$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0281"});
$anvil->Job->update_progress({progress => 100, message => "job_0281"});

$anvil->nice_exit({exit_code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
# This holds the program until the parsed CIB shows the local node as ready.
#
# Parameters;
#   anvil - The Anvil::Tools handle.
#
# The CIB is parsed on each pass, with a ten second sleep between passes. The job's progress is set to 5
# while parse_cib() reports a problem, to 10 while the local node is not ready, and to 15 once it is ready.
# There is no timeout; this only returns once the node is ready.
#
# Returns '0'.
sub wait_for_pacemaker
{
	my ($anvil) = @_;
	
	my $still_waiting = 1;
	do
	{
		my $cib_problem = $anvil->Cluster->parse_cib({debug => 3});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $cib_problem }});
		
		if ($cib_problem)
		{
			# Cluster hasn't started.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0277"});
			$anvil->Job->update_progress({progress => 5, message => "job_0277"});
		}
		else
		{
			# The CIB was read, so look up the ready state of the local node.
			my $local_node = $anvil->data->{cib}{parsed}{'local'}{name};
			my $is_ready   = $anvil->data->{cib}{parsed}{data}{node}{$local_node}{node_state}{ready};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { ready => $is_ready }});
			
			if (not $is_ready)
			{
				# Node isn't ready yet.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0278"});
				$anvil->Job->update_progress({progress => 10, message => "job_0278"});
			}
			else
			{
				# We're good.
				$still_waiting = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $still_waiting }});
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0279"});
				$anvil->Job->update_progress({progress => 15, message => "job_0279"});
			}
		}
		
		# Only pause when another pass is needed.
		sleep 10 if $still_waiting;
	}
	while ($still_waiting);
	
	return(0);
}
# This boots a single server via the cluster.
#
# Parameters;
#   anvil    - The Anvil::Tools handle.
#   server   - The name of the server to boot.
#   wait     - Passed on to Cluster->boot_server() as 'wait'. It also selects which message is recorded after
#              the call; 'job_0280' when set, 'job_0287' when not.
#   progress - The job progress recorded along with the non-fatal messages.
#
# Returns '0' if the server was already running, or if the boot call reported no problem. On any error, the
# job is set to 100% with the error message and nice_exit() is called with exit code '1'.
sub boot_server
{
	my ($anvil, $server, $wait, $progress) = @_;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		server   => $server,
		'wait'   => $wait,
		progress => $progress,
	}});
	
	# The server's definition must exist in the shared definitions directory.
	my $xml_file = $anvil->data->{path}{directories}{shared}{definitions}."/".$server.".xml";
	if (not -e $xml_file)
	{
		# No XML, no boot
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0259", variables => { definition_file => $xml_file }});
		$anvil->Job->update_progress({progress => 100, message => "error_0259,!!definition_file!".$xml_file."!!"});
		$anvil->nice_exit({exit_code => 1});
	}
	
	# The server must also be in the parsed CIB.
	if (not exists $anvil->data->{cib}{parsed}{data}{server}{$server})
	{
		# XML exists, but it's not in the cluster.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0261", variables => { 
			server          => $server,
			definition_file => $xml_file, 
		}});
		$anvil->Job->update_progress({progress => 100, message => "error_0261,!!definition_file!".$xml_file."!!,!!server!".$server."!!"});
		$anvil->nice_exit({exit_code => 1});
	}
	
	my $state = $anvil->data->{cib}{parsed}{data}{server}{$server}{status};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { status => $state }});
	
	if ($state eq "running")
	{
		# Already running, so there is nothing to boot. This is not an error.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0548", variables => { server => $server }});
		$anvil->Job->update_progress({progress => $progress, message => "log_0548,!!server!".$server."!!"});
		return(0);
	}
	if ($state ne "off")
	{
		# It's in some other state, so it can't be booted.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0262", variables => { 
			server => $server,
			status => $state, 
		}});
		$anvil->Job->update_progress({progress => 100, message => "error_0262,!!status!".$state."!!,!!server!".$server."!!"});
		$anvil->nice_exit({exit_code => 1});
	}
	
	# Now boot.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0288", variables => { server => $server }});
	$anvil->Job->update_progress({progress => $progress, message => "job_0288,!!server!".$server."!!"});
	my $boot_failed = $anvil->Cluster->boot_server({
		debug  => 2, 
		server => $server, 
		'wait' => $wait,
	});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $boot_failed }});
	
	if ($boot_failed)
	{
		# Failed, abort.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0267", variables => { server => $server }});
		$anvil->Job->update_progress({progress => 100, message => "error_0267,!!server!".$server."!!"});
		$anvil->nice_exit({exit_code => 1});
	}
	else
	{
		# 'job_0280' is used when we waited on the boot, 'job_0287' when the boot was only requested.
		my $result_key = $wait ? "job_0280" : "job_0287";
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => $result_key, variables => { server => $server }});
		$anvil->Job->update_progress({progress => $progress, message => $result_key.",!!server!".$server."!!"});
	}
	
	return(0);
}
# This boots every server found in the parsed CIB, honouring each server's 'boot after' server and start
# delay as read from the database.
#
# Parameters;
#   anvil - The Anvil::Tools handle.
#
# Servers are walked in name order on each pass. A server is "processed" once it has been handed to
# boot_server(), found to be in a state other than 'off', or found to be configured to stay off. The time it
# was processed is stored in 'boot_server::<server_name>::processed' so that a server set to boot after it
# can time its start delay. Passes repeat, with a two second sleep and a fresh parse of the CIB between them,
# until a pass ends with no server left waiting.
#
# The progress passed to boot_server() starts at 15 and grows by 'int(70 / <server count>)' for each server
# that is booted.
#
# A 'boot after' server that is the server itself, or that is not in the parsed CIB, would never be processed
# by this loop. Such a dependency is ignored so that the loop can finish.
#
# NOTE: A circular 'boot after' chain between two or more servers is not detected here.
# NOTE: boot_server() calls nice_exit() itself if a server fails to boot.
#
# Returns '0'.
sub boot_all_servers
{
	my ($anvil) = @_;
	
	### TODO: Manage the boot order here.
	my $server_count = keys %{$anvil->data->{cib}{parsed}{data}{server}};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_count => $server_count }});
	if (not $server_count)
	{
		# No servers exist yet.
		return(0);
	}
	
	# Load information about the servers on this Anvil!.
	my $anvil_uuid = $anvil->data->{sys}{anvil_uuid};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { anvil_uuid => $anvil_uuid }});
	
	my $increment = int(70 / $server_count);
	my $percent   = 15;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { increment => $increment }});
	
	# Loop until all are processed.
	my $waiting = 1;
	while($waiting)
	{
		# Get a list of servers now.
		$anvil->Database->get_servers({debug => 3});
		
		# This will get set to 0 if any servers are waiting to boot.
		my $all_processed = 1;
		foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{server}})
		{
			my $status      = $anvil->data->{cib}{parsed}{data}{server}{$server_name}{status};
			my $host_name   = $anvil->data->{cib}{parsed}{data}{server}{$server_name}{host_name};
			my $role        = $anvil->data->{cib}{parsed}{data}{server}{$server_name}{role};
			my $active      = $anvil->data->{cib}{parsed}{data}{server}{$server_name}{active};
			# A server in the CIB might not be in the database. Use an empty UUID in that case so
			# that an undefined value is never used as a hash key below.
			my $server_uuid = $anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}{$server_name}{server_uuid};
			   $server_uuid = "" if not defined $server_uuid;
			my $boot_delay  = $server_uuid ? $anvil->data->{servers}{server_uuid}{$server_uuid}{server_start_delay} : 0;
			   $boot_delay  = 0 if not $boot_delay;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:server_name' => $server_name,
				's2:status'      => $status,
				's3:host_name'   => $host_name,
				's4:role'        => $role,
				's5:active'      => $active, 
				's6:server_uuid' => $server_uuid, 
				's7:boot_delay'  => $boot_delay, 
			}});
			
			if (not exists $anvil->data->{boot_server}{$server_name}{processed})
			{
				# This will get set to the boot time once we actually start it. This will let 
				# us time when servers that boot after this server can boot.
				$anvil->data->{boot_server}{$server_name}{processed} = 0;
			}
			elsif ($anvil->data->{boot_server}{$server_name}{processed})
			{
				# Already processed.
				next;
			}
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0719", variables => { server => $server_name }});
			
			my $boot_after_server_uuid = $server_uuid ? $anvil->data->{servers}{server_uuid}{$server_uuid}{server_start_after_server_uuid} : "";
			   $boot_after_server_uuid = "" if not defined $boot_after_server_uuid;
			   $boot_after_server_uuid = "" if $boot_after_server_uuid eq "NULL";
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { boot_after_server_uuid => $boot_after_server_uuid }});
			if ($boot_after_server_uuid)
			{
				if ($boot_after_server_uuid eq "00000000-0000-0000-0000-000000000000")
				{
					# This server is configured to stay off.
					$anvil->data->{boot_server}{$server_name}{processed} = time;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						"boot_server::${server_name}::processed" => $anvil->data->{boot_server}{$server_name}{processed},
					}});
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0723", variables => { server => $server_name }});
					next;
				}
				
				# What's the server's name.
				my $boot_after_server_name = $anvil->data->{servers}{server_uuid}{$boot_after_server_uuid}{server_name};
				   $boot_after_server_name = "" if not defined $boot_after_server_name;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { boot_after_server_name => $boot_after_server_name }});
				
				# Only servers in the parsed CIB are walked by this loop, so waiting on this 
				# server itself, or on a server that isn't in the CIB, would never end. Drop 
				# such a dependency and boot this server as if it had none.
				if (($boot_after_server_name) && 
				    (($boot_after_server_name eq $server_name) or (not exists $anvil->data->{cib}{parsed}{data}{server}{$boot_after_server_name})))
				{
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { 
						's1:server_name'               => $server_name,
						's2:ignored_boot_after_server' => $boot_after_server_name, 
					}});
					$boot_after_server_name = "";
				}
				
				# Has this server processed?
				if ($boot_after_server_name)
				{
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0717", variables => { 
						delay  => $boot_delay,
						server => $boot_after_server_name,
					}});
					if (not exists $anvil->data->{boot_server}{$boot_after_server_name})
					{
						$anvil->data->{boot_server}{$boot_after_server_name}{processed} = 0;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
							"boot_server::${boot_after_server_name}::processed" => $anvil->data->{boot_server}{$boot_after_server_name}{processed},
						}});
					}
					if ($anvil->data->{boot_server}{$boot_after_server_name}{processed})
					{
						my $processed_seconds_ago = time - $anvil->data->{boot_server}{$boot_after_server_name}{processed};
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { processed_seconds_ago => $processed_seconds_ago }});
						
						# The delay has passed once the elapsed time reaches it. Using 
						# '>=' means a delay of '0' doesn't cost an extra pass.
						if ($processed_seconds_ago >= $boot_delay)
						{
							# Ready to boot.
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0721", variables => { server => $server_name }});
						}
						else
						{
							# Not ready yet.
							$all_processed = 0;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_processed => $all_processed }});
							
							my $time_to_wait = $boot_delay - $processed_seconds_ago;
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0720", variables => { 
								boot_after_server => $boot_after_server_name, 
								this_server       => $server_name, 
								time_to_wait      => $time_to_wait,
							}});
							next;
						}
					}
					else
					{
						# The other server hasn't processed yet.
						$all_processed = 0;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_processed => $all_processed }});
						
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0718", variables => { 
							boot_after_server => $boot_after_server_name, 
							this_server       => $server_name, 
						}});
						next;
					}
				}
			}
			
			if ($status eq "off")
			{
				# Boot it. Unlike a single server boot, this only waits when '--wait' is used.
				my $wait    =  $anvil->data->{switches}{'wait'} ? 1 : 0;
				   $percent += $increment;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					'wait'  => $wait, 
					percent => $percent,
				}});
				boot_server($anvil, $server_name, $wait, $percent);
				
				# If we're here, the server processed.
				$anvil->data->{boot_server}{$server_name}{processed} = time;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					"boot_server::${server_name}::processed" => $anvil->data->{boot_server}{$server_name}{processed},
				}});
			}
			elsif (not $anvil->data->{boot_server}{$server_name}{processed})
			{
				# It may have booted before we ran.
				$anvil->data->{boot_server}{$server_name}{processed} = time;
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0722", variables => { server => $server_name }});
			}
		}
		
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_processed => $all_processed }});
		if ($all_processed)
		{
			# We're done!
			$waiting = 0;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
		}
		else
		{
			# Wait a bit, then refresh the CIB data so the next pass sees current server states.
			sleep 2;
			my $problem = $anvil->Cluster->parse_cib({debug => 3});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
		}
	}
	
	return(0);
}