#!/usr/bin/perl
# 
# This program will boot a target machine using either its IPMI interface, if available, or one of the
# (non-PDU) fence methods, if the target is in an Anvil! and we have a manifest for it.
# 
# Exit codes;
# 0 = Normal exit.
# 1 = No database connection, or the requested host was not found.
# 
# TODO: 
# 

use strict;
use warnings;
use Anvil::Tools;

# Work out the program name and the directory it was run from. When the program is invoked without a 
# directory component in '$0', the captures are empty, so fall back to '$0' itself and '.'.
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
   $THIS_FILE = $0 if not defined $THIS_FILE;
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0];
   $running_directory = "." if not defined $running_directory;
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $anvil = Anvil::Tools->new();

$anvil->Get->switches({list => [
	"host", 
	"host-uuid", 
	"job-uuid"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});

$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
	# No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try 
	# again after we exit.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0075"});
	sleep 10;
	$anvil->nice_exit({exit_code => 1});
}

if ($anvil->data->{switches}{'job-uuid'})
{
	# Load the job data.
	$anvil->Job->clear();
	$anvil->Job->get_job_details();
	$anvil->Job->update_progress({
		progress         => 1,
		job_picked_up_by => $$, 
		job_picked_up_at => time, 
		message          => "job_0283", 
	});
	
	# Pull out the job data. Each line may carry a 'host-uuid=<value>' and/or 'host=<value>' pair.
	foreach my $line (split/\n/, $anvil->data->{jobs}{job_data})
	{
		if ($line =~ /host-uuid=(.*?)$/)
		{
			$anvil->data->{switches}{'host-uuid'} = $1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				'switches::host-uuid' => $anvil->data->{switches}{'host-uuid'},
			}});
		}
		if ($line =~ /host=(.*?)$/)
		{
			$anvil->data->{switches}{host} = $1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				'switches::host' => $anvil->data->{switches}{host},
			}});
		}
	}
}

# Get the host info (copy host-uuid to host)
$anvil->data->{switches}{'host-name'} = "" if not defined $anvil->data->{switches}{'host-name'};
$anvil->data->{switches}{host}        = "" if not defined $anvil->data->{switches}{host};
if ((not $anvil->data->{switches}{host}) && ($anvil->data->{switches}{'host-uuid'}))
{
	$anvil->data->{switches}{host} = $anvil->data->{switches}{'host-uuid'};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		'switches::host' => $anvil->data->{switches}{host},
	}});
}

# If the host is 'all', don't translate to the host_uuid. Otherwise, look for the host_uuid from the host string
if ((not $anvil->data->{switches}{'host-uuid'}) && ($anvil->data->{switches}{host} ne "all"))
{
	$anvil->data->{switches}{'host-uuid'} = $anvil->Database->get_host_uuid_from_string({string => $anvil->data->{switches}{host}});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		'switches::host-uuid' => $anvil->data->{switches}{'host-uuid'},
	}});
	
	# host name not found
	if (not $anvil->data->{switches}{'host-uuid'})
	{
		# Report the string we were asked to find; 'switches::host-name' has not been resolved yet 
		# and would always be empty in the message.
		my $requested_host = $anvil->data->{switches}{host};
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0291", variables => { host_name => $requested_host }});
		$anvil->Job->update_progress({progress => 100, message => "error_0291,!!host_name!".$requested_host."!!"});
		$anvil->nice_exit({exit_code => 1});
	}
}

# Resolve the host name whenever we have a host UUID, whether it was looked up above or given directly via 
# '--host-uuid' / the job data. 'find_boot_method' queues the host by this name.
if ($anvil->data->{switches}{'host-uuid'})
{
	$anvil->data->{switches}{'host-name'} = $anvil->Get->host_name_from_uuid({host_uuid => $anvil->data->{switches}{'host-uuid'}});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		'switches::host-name' => $anvil->data->{switches}{'host-name'},
	}});
}

find_boot_method($anvil);

$anvil->nice_exit({exit_code => 0});


#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################

# This will try to boot the node with host_ipmi data, if available.
# If not, it will try to find a (non-PDU) fence method to try.
# 
# Parameters;
# anvil - The Anvil::Tools handle.
# 
# The hosts to boot are 'switches::host-name' when 'switches::host-uuid' is set or, when 'switches::host' is 
# 'all', every host in 'sys::hosts::by_name' that has 'host_ipmi' data and whose 'host_key' is not 'DELETED'.
# 
# This function does not return to the caller in practice; it calls 'nice_exit' with exit code 0 when done.
sub find_boot_method
{
	my ($anvil) = @_;
	
	$anvil->Database->get_hosts_info({debug => 2});
	
	my $hosts = [];
	if ($anvil->data->{switches}{'host-uuid'})
	{
		push @{$hosts}, $anvil->data->{switches}{'host-name'};
	}
	elsif ($anvil->data->{switches}{host} eq "all")
	{
		$anvil->Database->get_hosts({debug => 2});
		foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
		{
			my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name};
			my $host_ipmi = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_ipmi};
			my $host_key  = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_key};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host_name' => $host_name,
				's2:host_uuid' => $host_uuid, 
				's3:host_ipmi' => $anvil->Log->is_secure($host_ipmi), 
			}});
			
			# Skip deleted hosts and hosts we have no IPMI data for.
			next if ((defined $host_key) && ($host_key eq "DELETED"));
			next if not $host_ipmi;
			push @{$hosts}, $host_name;
		}
	}
	
	# With no hosts queued there is nothing to step through, and dividing by the count would be fatal.
	# NOTE(review): 'progress' is bumped by 'steps' four times per host below (and by 10 more in 
	#               call_fence_agent), so it can pass 100 before the final update; confirm this is OK.
	my $host_count = @{$hosts};
	my $steps      = $host_count ? int((80 / $host_count) / 3) : 0;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		host_count => $host_count,
		steps      => $steps, 
	}});
	
	$anvil->data->{sys}{progress} = 5;
	foreach my $host_name (sort {$a cmp $b} @{$hosts})
	{
		my $host_uuid = $anvil->Database->get_host_uuid_from_string({string => $host_name});
		my $host_ipmi = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_ipmi};
		my $host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			host_uuid => $host_uuid,
			host_name => $host_name, 
			host_ipmi => $anvil->Log->is_secure($host_ipmi), 
			host_type => $host_type, 
		}});
		
		$anvil->data->{sys}{progress} += $steps;
		
		# NOTE(review): A host without IPMI data is skipped entirely; the fence method fallback below 
		#               is only reached when the IPMI call reports a problem. Confirm this is intended.
		if ($host_ipmi)
		{
			# Got it.
			$anvil->data->{sys}{progress} += $steps;
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "job_0327", variables => { host_name => $host_name }});
			$anvil->Job->update_progress({progress => $anvil->data->{sys}{progress}, message => "job_0327,!!host_name!".$host_name."!!"});
			
			# First, is the node already on?
			my $shell_call = $host_ipmi." -o status";
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, secure => 1, list => { shell_call => $shell_call }});
			
			$anvil->data->{sys}{progress} += $steps;
			my $problem = call_fence_agent($anvil, $shell_call, $anvil->data->{sys}{progress}, $host_uuid);
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, secure => 1, list => { problem => $problem }});
			
			$anvil->data->{sys}{progress} += $steps;
			if ($problem)
			{
				# If I am here, IPMI did not work. Can we boot it using another fence method?
				# The machine would have to be in an Anvil! for this to work.
				my $anvil_uuid = $anvil->Cluster->get_anvil_uuid({host_uuid => $host_uuid});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { anvil_uuid => $anvil_uuid }});
				if ($anvil_uuid)
				{
					my $anvil_name = $anvil->Cluster->get_anvil_name({anvil_uuid => $anvil_uuid});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { anvil_name => $anvil_name }});
					
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "job_0331", variables => { 
						host_name  => $host_name,
						anvil_name => $anvil_name, 
					}});
					$anvil->Job->update_progress({progress => $anvil->data->{sys}{progress}, message => "job_0331,!!host_name!".$host_name."!!,!!anvil_name!".$anvil_name."!!"});
					
					# NOTE(review): This walks every target in 'fence_method', not just the 
					#               host being booted, and the stored command is run as-is 
					#               (no '-o status' is added as in the IPMI path). Verify 
					#               against Cluster->get_fence_methods.
					$anvil->Cluster->get_fence_methods({host_uuid => $host_uuid});
					foreach my $target_host_name (sort {$a cmp $b} keys %{$anvil->data->{fence_method}})
					{
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { target_host_name => $target_host_name }});
						foreach my $order (sort {$a cmp $b} keys %{$anvil->data->{fence_method}{$target_host_name}{order}})
						{
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { order => $order }});
							foreach my $method (sort {$a cmp $b} keys %{$anvil->data->{fence_method}{$target_host_name}{order}{$order}{method}})
							{
								# PDU methods are never used to boot a machine.
								next if $method =~ /pdu/;
								my $fence_call = $anvil->data->{fence_method}{$target_host_name}{order}{$order}{method}{$method}{command};
								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
									method     => $method,
									shell_call => $anvil->Log->is_secure($fence_call), 
								}});
								
								my $fence_problem = call_fence_agent($anvil, $fence_call, $anvil->data->{sys}{progress}, $host_uuid);
								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $fence_problem }});
								if ($fence_problem)
								{
									$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0297"});
									$anvil->Job->update_progress({progress => $anvil->data->{sys}{progress}, message => "error_0297"});
								}
							}
						}
					}
				}
				else
				{
					# Nothing we can do to boot this machine.
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0294"});
					$anvil->Job->update_progress({progress => $anvil->data->{sys}{progress}, message => "error_0294"});
				}
			}
		}
	}
	
	# Done.
	$anvil->Job->update_progress({progress => 100, message => "message_0025"});
	$anvil->nice_exit({exit_code => 0});
	
	return(0);
}

# This calls a fence agent to check the target's power state and, if the target is off, asks the agent to 
# power it on.
# 
# Parameters;
# anvil      - The Anvil::Tools handle.
# shell_call - The fence agent command to run. If the agent exits with '2', ' -o on' is appended and the 
#              command is run again.
# progress   - The caller's current job progress. It is accepted for compatibility but not used here; 
#              'sys::progress' is read and updated directly.
# host_uuid  - (optional) The UUID of the host to mark as 'booting' after a successful power on. When not 
#              given, 'switches::host-uuid' is used.
# 
# Returns '0' if the agent reported the target as on (exit code '0') or the power on call succeeded. Returns 
# '1' if the agent could not be reached (exit code '1'), the power on call failed, or the agent exited with 
# any other, unexpected code.
sub call_fence_agent
{
	my ($anvil, $shell_call, $progress, $host_uuid) = @_;
	
	# Older callers don't pass the host UUID; fall back to the one given on the command line / job.
	$host_uuid = $anvil->data->{switches}{'host-uuid'} if not $host_uuid;
	
	my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, secure => 1});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		output      => $output,
		return_code => $return_code, 
	}});
	
	# Treat a missing return code as an unexpected one rather than comparing an undefined value.
	$return_code = "" if not defined $return_code;
	if ($return_code eq "0")
	{
		# The machine is already on
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0328"});
		$anvil->Job->update_progress({progress => 100, message => "job_0328"});
		return(0);
	}
	elsif ($return_code eq "1")
	{
		# Unable to connect to the fence device.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0293", variables => { output => $output }});
		$anvil->Job->update_progress({progress => 100, message => "error_0293,!!output!".$output."!!"});
		return(1);
	}
	elsif ($return_code eq "2")
	{
		# The machine is off, try to start it.
		$anvil->data->{sys}{progress} += 10;
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "job_0329"});
		$anvil->Job->update_progress({progress => $anvil->data->{sys}{progress}, message => "job_0329"});
		
		# Ask the same agent to power the target on.
		$shell_call .= " -o on";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, secure => 1, list => { shell_call => $shell_call }});
		
		my ($boot_output, $boot_return_code) = $anvil->System->call({shell_call => $shell_call, secure => 1});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			output      => $boot_output,
			return_code => $boot_return_code, 
		}});
		
		$boot_return_code = "" if not defined $boot_return_code;
		if ($boot_return_code eq "0")
		{
			# Success!
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "job_0330"});
			$anvil->Job->update_progress({progress => 100, message => "job_0330"});
			
			# Update the host's status to 'booting' and return.
			$anvil->Database->update_host_status({
				debug       => 2,
				host_uuid   => $host_uuid,
				host_status => "booting",
			});
			return(0);
		}
		else
		{
			# Failed.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0292", variables => { output => $boot_output }});
			$anvil->Job->update_progress({progress => 100, message => "error_0292,!!output!".$boot_output."!!"});
			return(1);
		}
	}
	
	# Any other return code is not a state we understand, so report it as a problem instead of letting 
	# the caller believe the target is on.
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { 
		unexpected_return_code => $return_code,
	}});
	
	return(1);
}