anvil/tools/striker-boot-machine
digimer a35e790a4d Fixed a minor striker-boot-machine bug.
Divide by zero error when no hosts with IPMI found.

Signed-off-by: digimer <mkelly@alteeve.ca>
2023-10-20 11:38:15 -04:00

323 lines
13 KiB
Perl
Executable File

#!/usr/bin/perl
#
# This program will boot a target machine using either it's IPMI interface, if available, or one of the
# (non-PDU) fence methods, if the target is in an Anvil! and we have a manifest for it.
#
# Exit codes;
# 0 = Normal exit.
# 1 = No database connection.
#
# TODO:
#
use strict;
use warnings;
use Anvil::Tools;
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
$running_directory =~ s/^\./$ENV{PWD}/;
}
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
my $anvil = Anvil::Tools->new();
$anvil->Get->switches({list => [
"host",
"host-uuid",
"job-uuid"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try
# again after we exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0075"});
sleep 10;
$anvil->nice_exit({exit_code => 1});
}
if ($anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->Job->clear();
$anvil->Job->get_job_details();
$anvil->Job->update_progress({
progress => 1,
job_picked_up_by => $$,
job_picked_up_at => time,
message => "job_0283",
});
# Pull out the job data.
foreach my $line (split/\n/, $anvil->data->{jobs}{job_data})
{
if ($line =~ /host-uuid=(.*?)$/)
{
$anvil->data->{switches}{'host-uuid'} = $1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::host-uuid' => $anvil->data->{switches}{'host-uuid'},
}});
}
if ($line =~ /host=(.*?)$/)
{
$anvil->data->{switches}{host} = $1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::host' => $anvil->data->{switches}{host},
}});
}
}
}
# Get the host info (copy host-uuid to host)
$anvil->data->{switches}{'host-name'} = "" if not defined $anvil->data->{switches}{'host-name'};
if ((not $anvil->data->{switches}{host}) && ($anvil->data->{switches}{'host-uuid'}))
{
$anvil->data->{switches}{host} = $anvil->data->{switches}{'host-uuid'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::host' => $anvil->data->{switches}{host},
}});
}
# If the host is 'all', don't translate to the host_uuid. Otherwise, look for the host_uuid from the host string
if ((not $anvil->data->{switches}{'host-uuid'}) && ($anvil->data->{switches}{host} ne "all"))
{
$anvil->data->{switches}{'host-uuid'} = $anvil->Database->get_host_uuid_from_string({string => $anvil->data->{switches}{host}});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::host-uuid' => $anvil->data->{switches}{'host-uuid'},
}});
# host name not found
if (not $anvil->data->{switches}{'host-uuid'})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0291", variables => { host_name => $anvil->data->{switches}{'host-name'} }});
$anvil->Job->update_progress({progress => 100, message => "error_0291,!!host_name!".$anvil->data->{switches}{'host-name'}."!!"});
$anvil->nice_exit({exit_code => 1});
}
$anvil->data->{switches}{'host-name'} = $anvil->Get->host_name_from_uuid({host_uuid => $anvil->data->{switches}{'host-uuid'}});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::host-name' => $anvil->data->{switches}{'host-name'},
}});
}
find_boot_method($anvil);
$anvil->nice_exit({exit_code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
# This will try to boot the node with host_ipmi data, if available. If not, it will try to find a (non-PDU)
# fence method to try
sub find_boot_method
{
my ($anvil) = @_;
$anvil->Database->get_hosts_info({debug => 2});
my $hosts = [];
if ($anvil->data->{switches}{'host-uuid'})
{
push @{$hosts}, $anvil->data->{switches}{'host-name'};
}
elsif ($anvil->data->{switches}{host} eq "all")
{
$anvil->Database->get_hosts({debug => 2});
foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
{
my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name};
my $host_ipmi = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_ipmi};
my $host_key = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_key};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:host_name' => $host_name,
's2:host_uuid' => $host_uuid,
's3:host_ipmi' => $anvil->Log->is_secure($host_ipmi),
}});
next if $host_key eq "DELETED";
next if not $host_ipmi;
push @{$hosts}, $host_name;
}
}
my $host_count = @{$hosts};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_count => $host_count }});
if (not $host_count)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "message_0357"});
$anvil->Job->update_progress({progress => 100, message => "message_0357"});
$anvil->nice_exit({exit_code => 0});
}
my $steps = int((80 / $host_count) / 3);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { steps => $steps }});
$anvil->data->{sys}{progress} = 5;
foreach my $host_name (sort {$a cmp $b} @{$hosts})
{
my $host_uuid = $anvil->Database->get_host_uuid_from_string({string => $host_name});
my $host_ipmi = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_ipmi};
my $host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_ipmi};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
host_uuid => $host_uuid,
host_name => $host_name,
host_ipmi => $anvil->Log->is_secure($host_ipmi),
host_type => $host_type,
}});
$anvil->data->{sys}{progress} += $steps;
if ($host_ipmi)
{
# Got it.
$anvil->data->{sys}{progress} += $steps;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "job_0327", variables => { host_name => $host_name }});
$anvil->Job->update_progress({progress => $anvil->data->{sys}{progress}, message => "job_0327,!!host_name!".$host_name."!!"});
# First, is the node already on?
my $shell_call = $host_ipmi." -o status";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, secure => 1, list => { shell_call => $shell_call }});
$anvil->data->{sys}{progress} += $steps;
my $problem = call_fence_agent($anvil, $shell_call, $anvil->data->{sys}{progress});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, secure => 1, list => { problem => $problem }});
$anvil->data->{sys}{progress} += $steps;
if ($problem)
{
# If I am here, there is no IPMI. Can we boot it using another fence method?
# The machine would have to be in an Anvil! for this to work.
my $anvil_uuid = $anvil->Cluster->get_anvil_uuid({host_uuid => $host_uuid});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { anvil_uuid => $anvil_uuid }});
if ($anvil_uuid)
{
my $anvil_name = $anvil->Cluster->get_anvil_name({anvil_uuid => $anvil_uuid});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { anvil_name => $anvil_name }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "job_0331", variables => {
host_name => $host_name,
anvil_name => $anvil_name,
}});
$anvil->Job->update_progress({progress => $anvil->data->{sys}{progress}, message => "job_0331,!!host_name!".$host_name."!!,!!anvil_name!".$anvil_name."!!"});
$anvil->Cluster->get_fence_methods({host_uuid => $host_uuid});
foreach my $target_host_name (sort {$a cmp $b} keys %{$anvil->data->{fence_method}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { target_host_name => $target_host_name }});
foreach my $order (sort {$a cmp $b} keys %{$anvil->data->{fence_method}{$target_host_name}{order}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { order => $order }});
foreach my $method (sort {$a cmp $b} keys %{$anvil->data->{fence_method}{$target_host_name}{order}{$order}{method}})
{
next if $method =~ /pdu/;
my $shell_call = $anvil->data->{fence_method}{$target_host_name}{order}{$order}{method}{$method}{command};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
method => $method,
shell_call => $anvil->Log->is_secure($shell_call),
}});
my $problem = call_fence_agent($anvil, $shell_call, $anvil->data->{sys}{progress});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
if ($problem)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0297"});
$anvil->Job->update_progress({progress => $anvil->data->{sys}{progress}, message => "error_0297"});
}
}
}
}
}
else
{
# Nothing we can do to boot this machine.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0294"});
$anvil->Job->update_progress({progress => $anvil->data->{sys}{progress}, message => "error_0294"});
}
}
}
}
# Done.
$anvil->Job->update_progress({progress => 100, message => "message_0025"});
$anvil->nice_exit({exit_code => 0});
return(0);
}
# This calls a fence agent and exits if it successfully boots the target
sub call_fence_agent
{
my ($anvil, $shell_call, $progress) = @_;
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, secure => 1});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
if ($return_code eq "0")
{
# The machine is already on
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0328"});
$anvil->Job->update_progress({progress => 100, message => "job_0328"});
return(0);
}
elsif ($return_code eq "1")
{
# Unable to connect to the fence device.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0293", variables => { output => $output }});
$anvil->Job->update_progress({progress => 100, message => "error_0293,!!output!".$output."!!"});
return(1);
}
elsif ($return_code eq "2")
{
# The machine is off, try to start it.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "job_0329"});
$anvil->Job->update_progress({progress => $anvil->data->{sys}{progress}+=10, message => "job_0329"});
# First, is the node already on?
$shell_call .= " -o on";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, secure => 1, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, secure => 1});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
if ($return_code eq "0")
{
# Success!
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "job_0330"});
$anvil->Job->update_progress({progress => 100, message => "job_0330"});
# Update the host's status to 'booting' and exit
$anvil->Database->update_host_status({
debug => 2,
host_uuid => $anvil->data->{switches}{'host-uuid'},
host_status => "booting",
});
return(0);
}
else
{
# Failed.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0292", variables => { output => $output }});
$anvil->Job->update_progress({progress => 100, message => "error_0292,!!output!".$output."!!"});
return(1);
}
}
return(0);
}