Merge pull request #346 from ClusterLabs/anvil-tools-dev
* This adds the new tool 'striker-check-machines' which simply walks …main
commit
169b8328e6
5 changed files with 249 additions and 7 deletions
@ -0,0 +1,29 @@ |
|||||||
|
.\" Manpage for the Anvil! machine power and access reporting tool. |
||||||
|
.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. |
||||||
|
.TH striker-check-machine "8" "June 20 2023" "Anvil! Intelligent Availability™ Platform" |
||||||
|
.SH NAME |
||||||
|
striker-check-machine \- This program tests access to each known machine and, when a machine is not reachable, checks its power state if out-of-band management is available.
||||||
|
.SH SYNOPSIS |
||||||
|
.B striker-check-machine |
||||||
|
\fI\,<command> \/\fR[\fI\,options\/\fR] |
||||||
|
.SH DESCRIPTION |
||||||
|
This loops through all known physical machines, Striker dashboards, Anvil! sub-nodes and DR hosts, and tries to connect to them. If a machine can't be reached over ssh, and if it has recorded IPMI information, an attempt is made to check whether the machine is powered on or off.
||||||
|
.TP |
||||||
|
.SH OPTIONS |
||||||
|
.TP |
||||||
|
\-?, \-h, \fB\-\-help\fR |
||||||
|
Show this man page. |
||||||
|
.TP |
||||||
|
\fB\-\-log-secure\fR |
||||||
|
When logging, record sensitive data, like passwords. |
||||||
|
.TP |
||||||
|
\-v, \-vv, \-vvv |
||||||
|
Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. |
||||||
|
.SS "Commands:" |
||||||
|
.TP |
||||||
|
This program takes no commands. |
||||||
|
.TP |
||||||
|
.SH AUTHOR |
||||||
|
Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. |
||||||
|
.SH "REPORTING BUGS" |
||||||
|
Report bugs to users@clusterlabs.org |
@ -0,0 +1,150 @@ |
|||||||
|
#!/usr/bin/perl |
||||||
|
|
||||||
|
use strict; |
||||||
|
use warnings; |
||||||
|
use Anvil::Tools; |
||||||
|
use Data::Dumper; |
||||||
|
use Text::Diff; |
||||||
|
use Term::Cap; |
||||||
|
use Time::Local; |
||||||
|
|
||||||
|
# Turn on autoflush for the currently selected output handle, so progress messages are not held in a buffer.
$| = 1;

# Work out this program's file name (used as the 'source' in log entries) and the directory it was
# invoked from, both taken from $0.
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
if (not defined $THIS_FILE)
{
	# $0 has no directory component (for example, when run as 'perl <program>' from the program's
	# own directory), so the whole of $0 is the file name.
	$THIS_FILE = $0;
}

# The file name is quoted with \Q..\E so that any regex metacharacters in it are matched literally.
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0];
if (not defined $running_directory)
{
	# No directory component in $0 means we were started from the current directory.
	$running_directory = ".";
}

# Turn a leading '.' into the absolute working directory, when the environment provides one.
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}
||||||
|
|
||||||
|
# Create the Anvil::Tools handle that every call below goes through.
my $anvil = Anvil::Tools->new();

# Read the command line switches (an empty switch list is passed) and log whatever ended up in
# the 'switches' data hash.
$anvil->Get->switches({list => [], debug => 3});
$anvil->Log->variables({
	source => $THIS_FILE,
	line   => __LINE__,
	level  => 2,
	list   => $anvil->data->{switches},
});

# Connect to the database(s), then record log entry 'log_0132' once the connect call returns.
$anvil->Database->connect;
$anvil->Log->entry({
	source => $THIS_FILE,
	line   => __LINE__,
	level  => 3,
	secure => 0,
	key    => "log_0132",
});

# With no database connections there is nothing to query; log 'error_0003' (with 'print' set) and
# exit with return code 1.
unless ($anvil->data->{sys}{database}{connections})
{
	$anvil->Log->entry({
		source  => $THIS_FILE,
		line    => __LINE__,
		level   => 0,
		secure  => 0,
		'print' => 1,
		key     => "error_0003",
	});
	$anvil->nice_exit({exit_code => 1});
}
||||||
|
|
||||||
|
# Main body: walk every known host, grouped by host type, and report whether each one can be
# reached. When a host can't be reached and it has IPMI information recorded, the recorded IPMI
# command is run with '-o status' and the result is reported instead.
#
# Note: An unused 'Term::Cap->Tgetent' call used to live here. It was removed because its result
#       was never read, and Tgetent is documented to croak on failure, which would abort the run.
print "Checking the state of all known machines. Please be patient.\n";

$anvil->Database->get_hosts();

# The local host's UUID is the same on every pass, so look it up once rather than once per host.
my $local_host_uuid = $anvil->Get->host_uuid;

foreach my $show_host_type ("striker", "node", "dr")
{
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { show_host_type => $show_host_type }});

	# Print the section header for this host type.
	if ($show_host_type eq "striker")
	{
		print "-=] Striker Dashboards;\n";
	}
	elsif ($show_host_type eq "node")
	{
		print "\n-=] Anvil! sub-nodes;\n";
	}
	elsif ($show_host_type eq "dr")
	{
		print "\n-=] DR Hosts\n";
	}

	foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
	{
		my $host_uuid       = $anvil->data->{sys}{hosts}{by_name}{$host_name};
		my $host_type       = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
		my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
		my $host_ipmi       = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_ipmi};
		my $access          = 0;
		my $say_uptime      = "";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			's1:host_name'       => $host_name,
			's2:short_host_name' => $short_host_name,
			's3:host_uuid'       => $host_uuid,
			's4:host_type'       => $host_type,
			's5:host_ipmi'       => $anvil->Log->is_secure($host_ipmi),
		}});

		# Only hosts of the type being shown on this pass, and never the local host.
		next if $host_type ne $show_host_type;
		next if $host_uuid eq $local_host_uuid;

		# NOTE(review): The loop below reads $anvil->data->{network_access}, which find_access()
		#               is presumed to (re)populate for this target - confirm that entries from
		#               the previous host can't be left behind.
		my $matches = $anvil->Network->find_access({
			debug  => 2,
			target => $host_name,
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }});

		# Try each network in turn and stop at the first one that gives us access.
		foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}})
		{
			my $target_ip   = $anvil->data->{network_access}{$network_name}{target_ip_address};
			my $test_access = $anvil->Remote->test_access({target => $target_ip});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				's1:network_name' => $network_name,
				's2:target_ip'    => $target_ip,
				's3:test_access'  => $test_access,
			}});

			if ($test_access)
			{
				# We're good. Get the uptime and convert it for display (0 is used when no
				# uptime was returned).
				$access = 1;
				my $uptime = $anvil->Get->uptime({debug => 2, target => $target_ip});
				$say_uptime = $anvil->Convert->time({
					debug     => 2,
					'time'    => $uptime ? $uptime : 0,
					translate => 1,
					long      => 1,
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					's1:access'     => $access,
					's2:uptime'     => $uptime,
					's3:say_uptime' => $say_uptime,
				}});
				last;
			}
		}

		if ($access)
		{
			print $short_host_name." is up and has been running for: [".$say_uptime."]\n";
		}
		else
		{
			# Can we check the power using IPMI?
			if ($host_ipmi)
			{
				# The recorded IPMI command can hold credentials, so it is logged and called as secure.
				my $shell_call = $host_ipmi." -o status";
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, secure => 1, list => { shell_call => $shell_call }});

				my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, secure => 1});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					output      => $output,
					return_code => $return_code,
				}});

				if ($return_code eq "0")
				{
					# The machine is on, we just can't reach it.
					print "The machine: [".$short_host_name."] appears to be powered ON, but we can't reach it. Is it booting?\n";
				}
				elsif ($return_code eq "1")
				{
					# Unable to connect to the fence device.
					print "The machine: [".$short_host_name."] can not be reached, and we can't check it's power status either. Has it been completely powered off?\n";
				}
				elsif ($return_code eq "2")
				{
					# The machine is off. This tool only reports; it does not try to start it.
					print "The machine: [".$short_host_name."] is confirmed to be powered OFF.\n";
				}
				else
				{
					# Any other return code is unexpected. Say so, rather than silently
					# reporting nothing for this machine.
					print "The machine: [".$short_host_name."] can not be reached, and the power check exited with the unexpected return code: [".$return_code."].\n";
				}
			}
			else
			{
				print "The machine: [".$short_host_name."] appears to be offline, and doesn't appear to have out-of-band management to check the power status.\n";
			}
		}
	}
}
print "\n Done!\n";

$anvil->nice_exit({exit_code => 0});
Loading…
Reference in new issue