* This adds the new tool 'striker-check-machines' which simply walks through all known physical machines and checks to see if they're accessible and powered on.

* Updated Get->uptime() to work on remote targets.

Signed-off-by: digimer <mkelly@alteeve.ca>
main
digimer 2 years ago
parent 4a439f23b6
commit 0aa72498db
  1. 75
      Anvil/Tools/Get.pm
  2. 1
      man/Makefile.am
  3. 29
      man/striker-check-machines.8
  4. 1
      tools/Makefile.am
  5. 150
      tools/striker-check-machines

@ -2607,10 +2607,25 @@ sub trusted_hosts
This returns, in seconds, how long the host has been up and running for. This returns, in seconds, how long the host has been up and running for.
This method takes no parameters. Parameters;
=head3 password (optional)
This is the password to use when connecting to a remote machine. If not set, but C<< target >> is, an attempt to connect without a password will be made.
=head3 port (optional)
This is the TCP port to use when connecting to a remote machine. If not set, but C<< target >> is, C<< 22 >> will be used.
=head3 remote_user (optional, default root)
If C<< target >> is set, this will be the user we connect to the remote machine as.
=head3 target (optional)
This is the IP or host name of the machine to read the uptime of. If this is not set, the local system's uptime is read.
=cut =cut
### TODO: Make this work on remote hosts
sub uptime sub uptime
{ {
my $self = shift; my $self = shift;
@ -2619,13 +2634,59 @@ sub uptime
my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Get->uptime()" }}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Get->uptime()" }});
my $uptime = $anvil->Storage->read_file({ my $password = defined $parameter->{password} ? $parameter->{password} : "";
force_read => 1, my $port = defined $parameter->{port} ? $parameter->{port} : "";
cache => 0, my $remote_user = defined $parameter->{remote_user} ? $parameter->{remote_user} : "root";
file => $anvil->data->{path}{proc}{uptime}, my $target = defined $parameter->{target} ? $parameter->{target} : "";
}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
password => $anvil->Log->is_secure($password),
port => $port,
remote_user => $remote_user,
target => $target,
}});
# Read the file
my $uptime = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uptime => $uptime }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uptime => $uptime }});
# Is this a local call or a remote call?
if ($anvil->Network->is_local({host => $target}))
{
# Local.
$uptime = $anvil->Storage->read_file({
debug => $debug,
force_read => 1,
cache => 0,
file => $anvil->data->{path}{proc}{uptime},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uptime => $uptime }});
}
else
{
# Remote, we have to cat the file.
my $shell_call = $anvil->data->{path}{exe}{cat}." ".$anvil->data->{path}{proc}{uptime};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
my ($output, $error, $return_code) = $anvil->Remote->call({
debug => $debug,
shell_call => $shell_call,
target => $target,
port => $port,
password => $password,
remote_user => $remote_user,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
if (not $return_code)
{
$uptime = $output;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uptime => $uptime }});
}
}
# Clean it up. We'll have gotten two numbers, the uptime in seconds (to two decimal places) and the # Clean it up. We'll have gotten two numbers, the uptime in seconds (to two decimal places) and the
# total idle time. We only care about the int number. # total idle time. We only care about the int number.
$uptime =~ s/^(\d+)\..*$/$1/; $uptime =~ s/^(\d+)\..*$/$1/;

@ -22,4 +22,5 @@ dist_man8_MANS = \
anvil-manage-server-storage.8 \ anvil-manage-server-storage.8 \
anvil-manage-storage-groups.8 \ anvil-manage-storage-groups.8 \
scancore.8 \ scancore.8 \
striker-check-machines.8 \
striker-initialize-host.8 striker-initialize-host.8

@ -0,0 +1,29 @@
.\" Manpage for the Anvil! machine power and access reporting tool.
.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions.
.TH striker-check-machines "8" "June 20 2023" "Anvil! Intelligent Availability™ Platform"
.SH NAME
striker-check-machines \- This program tests access to all known machines and, when a machine is not reachable, checks its power state if out-of-band management is available.
.SH SYNOPSIS
.B striker-check-machines
[\fI\,options\/\fR]
.SH DESCRIPTION
This loops through all known physical machines (Striker dashboards, Anvil! sub-nodes and DR hosts) and tries to connect to each of them. If a machine can't be reached over ssh, and if the machine has recorded IPMI information, an attempt is made to check whether the machine is powered on or off.
.TP
.SH OPTIONS
.TP
\-?, \-h, \fB\-\-help\fR
Show this man page.
.TP
\fB\-\-log-secure\fR
When logging, record sensitive data, like passwords.
.TP
\-v, \-vv, \-vvv
Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data.
.SS "Commands:"
.TP
This program takes no commands.
.TP
.SH AUTHOR
Written by Madison Kelly, Alteeve staff and the Anvil! project contributors.
.SH "REPORTING BUGS"
Report bugs to users@clusterlabs.org

@ -51,6 +51,7 @@ dist_sbin_SCRIPTS = \
scancore \ scancore \
striker-auto-initialize-all \ striker-auto-initialize-all \
striker-boot-machine \ striker-boot-machine \
striker-check-machines \
striker-db-report \ striker-db-report \
striker-db-status \ striker-db-status \
striker-file-manager \ striker-file-manager \

@ -0,0 +1,150 @@
#!/usr/bin/perl
use strict;
use warnings;
use Anvil::Tools;
use Data::Dumper;
use Text::Diff;
use Term::Cap;
use Time::Local;
# striker-check-machines
#
# Walks through every host recorded in the database (other than this one), grouped by host type, and
# reports whether it can be reached. For reachable hosts, the uptime is shown. For unreachable hosts that
# have IPMI information recorded, the IPMI command is called with '-o status' and the result is reported.
#
# Exit codes;
# 0 = Normal exit.
# 1 = No database connections available.

# Don't buffer output, so progress is shown as each host is checked.
$| = 1;

# Work out this program's name and the directory it was run from. If $0 has no directory component (for
# example, when run as 'perl striker-check-machines'), the regexes don't match, so fall back to $0 itself
# and the current directory instead of leaving these undefined.
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
   $THIS_FILE = $0 if not defined $THIS_FILE;
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0];
   $running_directory = "." if not defined $running_directory;
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}

my $anvil = Anvil::Tools->new();

# Read the command line switches. This program takes no switches of its own.
$anvil->Get->switches({debug => 3, list => []});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});

$anvil->Database->connect;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
	# No databases, exit.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "error_0003"});
	$anvil->nice_exit({exit_code => 1});
}

print "Checking the state of all known machines. Please be patient.\n";
$anvil->Database->get_hosts();

# We never check ourselves, so look up our own host UUID once instead of once per host.
my $local_host_uuid = $anvil->Get->host_uuid;

# The heading printed above each group of hosts, keyed by host type.
my %section_heading = (
	striker => "-=] Striker Dashboards;\n",
	node    => "\n-=] Anvil! sub-nodes;\n",
	dr      => "\n-=] DR Hosts\n",
);

foreach my $show_host_type ("striker", "node", "dr")
{
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { show_host_type => $show_host_type }});
	print $section_heading{$show_host_type};
	
	foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
	{
		my $host_uuid       = $anvil->data->{sys}{hosts}{by_name}{$host_name};
		my $host_type       = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
		my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
		my $host_ipmi       = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_ipmi};
		my $access          = "";
		my $say_uptime      = "";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			's1:host_name'       => $host_name,
			's2:short_host_name' => $short_host_name,
			's3:host_uuid'       => $host_uuid,
			's4:host_type'       => $host_type,
			's5:host_ipmi'       => $anvil->Log->is_secure($host_ipmi),
		}});
		
		# Only show hosts of the type being listed on this pass, and skip ourselves.
		next if $host_type ne $show_host_type;
		next if $host_uuid eq $local_host_uuid;
		
		my $matches = $anvil->Network->find_access({
			debug  => 2,
			target => $host_name,
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }});
		
		# Try each network recorded in 'network_access' until one gives us access.
		foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}})
		{
			my $target_ip   = $anvil->data->{network_access}{$network_name}{target_ip_address};
			my $test_access = $anvil->Remote->test_access({target => $target_ip});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				's1:network_name' => $network_name,
				's2:target_ip'    => $target_ip,
				's3:test_access'  => $test_access,
			}});
			next if not $test_access;
			
			# We're good. Read the uptime over this connection and stop looking.
			$access = 1;
			my $uptime = $anvil->Get->uptime({debug => 2, target => $target_ip});
			$say_uptime = $anvil->Convert->time({
				debug     => 2,
				'time'    => $uptime ? $uptime : 0,
				translate => 1,
				long      => 1,
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				's1:access'     => $access,
				's2:uptime'     => $uptime,
				's3:say_uptime' => $say_uptime,
			}});
			last;
		}
		
		if ($access)
		{
			print $short_host_name." is up and has been running for: [".$say_uptime."]\n";
			next;
		}
		
		# We can't reach the host. Can we check the power using IPMI?
		if (not $host_ipmi)
		{
			print "The machine: [".$short_host_name."] appears to be offline, and doesn't appear to have out-of-band management to check the power status.\n";
			next;
		}
		
		# The IPMI command contains the password, so it's logged as secure.
		my $shell_call = $host_ipmi." -o status";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, secure => 1, list => { shell_call => $shell_call }});
		my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, secure => 1});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			output      => $output,
			return_code => $return_code,
		}});
		if ($return_code eq "0")
		{
			# The machine is on, we just can't log into it.
			print "The machine: [".$short_host_name."] appears to be powered ON, but we can't reach it. Is it booting?\n";
		}
		elsif ($return_code eq "1")
		{
			# Unable to connect to the fence device.
			print "The machine: [".$short_host_name."] can not be reached, and we can't check it's power status either. Has it been completely powered off?\n";
		}
		elsif ($return_code eq "2")
		{
			# The machine is off. We only report here, we don't try to start it.
			print "The machine: [".$short_host_name."] is confirmed to be powered OFF.\n";
		}
		else
		{
			# Any other return code (a missing fence agent, a timeout, etc). Say so, rather 
			# than silently leaving this host out of the report.
			print "The machine: [".$short_host_name."] can not be reached, and the power status check returned an unexpected return code: [".$return_code."].\n";
		}
	}
}

print "\n Done!\n";
$anvil->nice_exit({exit_code => 0});
Loading…
Cancel
Save