From 9eec6c49779db0df9a28a75e1e3ced3e8b9aa53c Mon Sep 17 00:00:00 2001 From: Digimer Date: Mon, 29 Nov 2021 22:43:23 -0500 Subject: [PATCH] * Created ScanCore->check_temperature_direct() based around that start logic from ScanCore->post_scan_analysis_striker() temperature check, and updated the latter to use the former. * Updated the logic of when to boot a node or DR host that was found to be off for unknown reasons to require both power and temperature to be OK, and checks against the new 'feature::scancore::disable::boot-unknown-stop' config variable. Signed-off-by: Digimer --- Anvil/Tools/Database.pm | 4 +- Anvil/Tools/ScanCore.pm | 242 +++++++++++++++++++++++++++++----------- anvil.conf | 5 + share/words.xml | 7 +- 4 files changed, 190 insertions(+), 68 deletions(-) diff --git a/Anvil/Tools/Database.pm b/Anvil/Tools/Database.pm index 0d274766..3ee83f85 100644 --- a/Anvil/Tools/Database.pm +++ b/Anvil/Tools/Database.pm @@ -1363,7 +1363,7 @@ sub connect foreach my $uuid (sort {$a cmp $b} keys %{$anvil->data->{database}}) { # Periodically, autovivication causes and empty key to appear. 
-		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uuid => $uuid }});
+		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uuid => $uuid }});
 		next if ((not $uuid) or (not $anvil->Validate->uuid({uuid => $uuid})));
 
 		if (($db_uuid) && ($db_uuid ne $uuid))
@@ -1387,7 +1387,7 @@ sub connect
 		my $name = $anvil->data->{database}{$uuid}{name};
 		my $user = $anvil->data->{database}{$uuid}{user};
 		my $password = $anvil->data->{database}{$uuid}{password};
-		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
 			host => $host,
 			port => $port,
 			name => $name,
diff --git a/Anvil/Tools/ScanCore.pm b/Anvil/Tools/ScanCore.pm
index 9b50f8c0..7e2d6600 100644
--- a/Anvil/Tools/ScanCore.pm
+++ b/Anvil/Tools/ScanCore.pm
@@ -21,6 +21,7 @@ my $THIS_FILE = "ScanCore.pm";
 # check_health
 # check_power
 # check_temperature
+# check_temperature_direct
 # count_servers
 # post_scan_analysis
 # post_scan_analysis_dr
@@ -1160,6 +1161,109 @@ ORDER BY
 }
 
 
+=head2 check_temperature_direct
+
+This calls a target's IPMI interface to check the temperature sensors that are available. The status is returned as;
+
+ 0 = Failed to read temperature sensors / IPMI unavailable
+ 1 = All available temperatures are nominal.
+ 2 = One or more sensors are in warning or critical.
+
+Parameters;
+
+=head3 host_uuid (Optional, default Get->host_uuid() )
+
+This is the host's UUID to look at.
+
+=cut
+sub check_temperature_direct
+{
+	my $self = shift;
+	my $parameter = shift;
+	my $anvil = $self->parent;
+	my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3;
+	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "ScanCore->check_temperature_direct()" }});
+
+	my $host_uuid = defined $parameter->{host_uuid} ? $parameter->{host_uuid} : $anvil->Get->host_uuid;
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+		host_uuid => $host_uuid,
+	}});
+
+	# * 0 - Failed to read temperature sensors / IPMI unavailable
+	# * 1 - All available temperatures are nominal
+	# * 2 - One or more sensors are in warning or critical.
+	my $status = 0;
+	if ((not defined $anvil->data->{machine}{host_uuid}{$host_uuid}{hosts}{host_ipmi}) or (not $anvil->data->{machine}{host_uuid}{$host_uuid}{hosts}{host_ipmi}))
+	{
+		$anvil->Database->get_hosts_info({debug => $debug});
+	}
+	my $host_ipmi = $anvil->data->{machine}{host_uuid}{$host_uuid}{hosts}{host_ipmi};
+	my $host_name = $anvil->data->{machine}{host_uuid}{$host_uuid}{hosts}{host_name};
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+		host_ipmi => $host_ipmi,
+		host_name => $host_name,
+	}});
+
+	my ($ipmitool_command, $ipmi_password) = $anvil->Convert->fence_ipmilan_to_ipmitool({
+		debug => $debug,
+		fence_ipmilan_command => $host_ipmi,
+	});
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+		ipmitool_command => $ipmitool_command,
+		ipmi_password => $anvil->Log->is_secure($ipmi_password),
+	}});
+
+	if ((not $ipmitool_command) or ($ipmitool_command eq "!!error!!"))
+	{
+		# No IPMI tool to call, so the sensors can't be read. Return '0' (unknown).
+		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0573", variables => { host_name => $host_name }});
+		return($status);
+	}
+
+	$anvil->System->collect_ipmi_data({
+		debug => $debug,
+		host_name => $host_name,
+		ipmitool_command => $ipmitool_command,
+		ipmi_password => $ipmi_password,
+	});
+
+	# Now look for thermal values.
+	foreach my $sensor_name (sort {$a cmp $b} keys %{$anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}})
+	{
+		my $current_value = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_value_sensor_value};
+		my $units = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_sensor_units};
+		my $sensor_status = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_sensor_status};
+		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+			current_value => $current_value,
+			sensor_name => $sensor_name,
+			units => $units,
+			sensor_status => $sensor_status,
+		}});
+
+		# Only temperature sensors (units of 'C') are considered. Any sensor status other than
+		# 'ok' is treated as the sensor being in a warning or critical state.
+		if ($units eq "C")
+		{
+			if ($sensor_status eq "ok")
+			{
+				# A healthy temperature sensor was found. Only set '1' if nothing is set yet, so that a '2' from an earlier sensor is kept.
+				$status = 1 if not $status;
+				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { status => $status }});
+			}
+			else
+			{
+				# This sensor isn't OK. Always set '2', overriding a '1' set by an earlier healthy sensor.
+				$status = 2;
+				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { status => $status }});
+			}
+		}
+	}
+
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { status => $status }});
+	return($status);
+}
+
+
 =head2 count_servers
 
 This returns the number of servers running on a given host, as reported by ScanCore (specifically, by counting the number of servers running on the host from the C<< servers >> table). It also counts the total amount of RAM in use by hosted servers.
@@ -2527,9 +2631,7 @@ LIMIT 1;"; if (not $stop_reason) { $stop_reason = "unknown"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { stop_reason => $stop_reason }}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0565", variables => { host_name => $host_name }}); - } if ($stop_reason eq "user") @@ -2538,7 +2640,71 @@ LIMIT 1;"; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0566", variables => { host_name => $host_name }}); next; } - elsif (($stop_reason eq "power") or ($stop_reason eq "unknown")) + elsif ($stop_reason eq "unknown") + { + # Check both power and temp. + if ((not defined $anvil->data->{feature}{scancore}{disable}{'boot-unknown-stop'}) or (not exists $anvil->data->{feature}{scancore}{disable}{'boot-unknown-stop'}) or ($anvil->data->{feature}{scancore}{disable}{'boot-unknown-stop'} eq "")) + { + $anvil->data->{feature}{scancore}{disable}{'boot-unknown-stop'} = 1; + } + if (not $anvil->data->{feature}{scancore}{disable}{'boot-unknown-stop'}) + { + # Ignore. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0671", variables => { host_name => $host_name }}); + } + else + { + # Evaluate for boot. 
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0672", variables => { host_name => $host_name }}); + + # Check power + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0567", variables => { host_name => $host_name }}); + my ($power_health, $shortest_time_on_batteries, $highest_charge_percentage, $estimated_hold_up_time) = $anvil->ScanCore->check_power({ + debug => $debug, + anvil_uuid => $anvil_uuid, + anvil_name => $anvil_name, + host_uuid => $host_uuid, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + power_health => $power_health, + shortest_time_on_batteries => $shortest_time_on_batteries, + highest_charge_percentage => $highest_charge_percentage, + estimated_hold_up_time => $estimated_hold_up_time, + }}); + + # Check temp. + my ($temp_health) = $anvil->ScanCore->check_temperature_direct({ + debug => $debug, + host_uuid => $host_uuid, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { temp_health => $temp_health }}); + + ### Temp + # * 0 = Failed to read temperature sensors / IPMI unavailable + # * 1 = All available temperatures are nominal. + # * 2 = One of more sensors are in warning or critical. + ### Power + # * 0 = No UPSes found for the host + # * 1 = One or more UPSes found and at least one has input power from mains. + # * 2 = One or more UPSes found, all are running on battery. + if (($temp_health eq "1") && ($power_health eq "1")) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0673", variables => { host_name => $host_name }}); + + $shell_call =~ s/--action status/ --action on/; + my ($output, $return_code) = $anvil->System->call({debug => $debug, timeout => 30, shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }}); + + # Mark it as booting. 
+ $anvil->Database->update_host_status({ + debug => $debug, + host_uuid => $host_uuid, + host_status => "booting", + }); + } + } + } + elsif ($stop_reason eq "power") { # Check now if the power is OK $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0567", variables => { host_name => $host_name }}); @@ -2590,72 +2756,20 @@ LIMIT 1;"; } elsif ($stop_reason eq "thermal") { - ### TODO: Switch to ->check_temperature() # Check now if the temperature is OK. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0568", variables => { host_name => $host_name }}); - - my ($ipmitool_command, $ipmi_password) = $anvil->Convert->fence_ipmilan_to_ipmitool({ - debug => 2, - fence_ipmilan_command => $host_ipmi, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - ipmitool_command => $ipmitool_command, - ipmi_password => $anvil->Log->is_secure($ipmi_password), - }}); - - if ((not $ipmitool_command) or ($ipmitool_command eq "!!error!!")) - { - # No IPMI tool to call. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0573", variables => { host_name => $host_name }}); - next; - } - - $anvil->System->collect_ipmi_data({ - host_name => $host_name, - ipmitool_command => $ipmitool_command, - ipmi_password => $ipmi_password, + my ($temp_health) = $anvil->ScanCore->check_temperature_direct({ + debug => $debug, + host_uuid => $host_uuid, }); - # Now look for thermal values. 
- my $sensor_found = 0; - my $temperatures_ok = 1; - foreach my $sensor_name (sort {$a cmp $b} keys %{$anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}}) - { - my $current_value = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_value_sensor_value}; - my $units = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_sensor_units}; - my $status = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_sensor_status}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - current_value => $current_value, - sensor_name => $sensor_name, - units => $units, - status => $status, - }}); - - # If this is a temperature, check to see if it is outside its nominal range and, if - # so, record it into a hash for loading into ScanCore's 'temperature' table. - if ($units eq "C") - { - if (not $sensor_found) - { - # We've found at least one temperature sensor. - $sensor_found = 1; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { sensor_found => $sensor_found }}); - } - - if ($status ne "ok") - { - # Sensor isn't OK yet. - $temperatures_ok = 0; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { temperatures_ok => $temperatures_ok }}); - } - } - } + ### Temp + # * 0 = Failed to read temperature sensors / IPMI unavailable + # * 1 = All available temperatures are nominal. + # * 2 = One of more sensors are in warning or critical. 
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { temp_health => $temp_health }}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - sensor_found => $sensor_found, - temperatures_ok => $temperatures_ok, - }}); - if (($sensor_found) && ($temperatures_ok)) + if ($temp_health eq "1") { ### TODO: We'll want to revisit M2's restart cooldown logic. It never ### actually proved useful in M2, but it doesn't mean it wouldn't help diff --git a/anvil.conf b/anvil.conf index a292cebe..15eb02e8 100644 --- a/anvil.conf +++ b/anvil.conf @@ -11,6 +11,11 @@ sys::privacy::strong = 0 # feature, set this to '1'. feature::scancore::disable::preventative-live-migration = 0 +# If a node is found to be powered off, and there is no reason recorded in the database, it will be booted. +# The assumption is that an accidental power off occurred. If you would like to have nodes that power off +# stay off until manually started, set this to '0' +#feature::scancore::disable::boot-unknown-stop = 1 + ### Database # Database connections; # diff --git a/share/words.xml b/share/words.xml index 91ff0d07..1c75d45f 100644 --- a/share/words.xml +++ b/share/words.xml @@ -1947,7 +1947,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is: The host: [#!variable!host_name!#] is up, no need to check if it needs booting. The host: [#!variable!host_name!#] couldn't be reached directly, but IPMI reports that it is up. Could the IPMI BMC be hung or unplugged? The host: [#!variable!host_name!#] is off. Will check now if it should be booted. - The host: [#!variable!host_name!#] has no stop reason, so we'll boot it up in case it lost power without warning. + The host: [#!variable!host_name!#] has no stop reason, so we'll check to see if we should power it on, in case it lost power or overheated without warning. The host: [#!variable!host_name!#] was stopped by the user, so we'll leave it off. 
The host: [#!variable!host_name!#] was powered off because of power loss. Checking to see if it is now safe to restart it. The host: [#!variable!host_name!#] was powered off because of thermal issues. Checking to see if it is now safe to restart it. @@ -1955,7 +1955,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is: Unable to parse the install manifest uuid: [#!variable!manifest_uuid!#] for the Anvil! [#!variable!anvil_name!#]. As such, unable to determine what UPSes power the machine: [#!variable!host_name!#]. Unable to determine if the power feeding this node is OK or not. The UPS referenced by the 'power_uuid': [#!variable!power_uuid!#] under the host: [#!variable!host_name!#] has no record of being on mains power, so we can't determine how long it's been on batteries. Setting the "shortest time on batteries" to zero seconds. Marking the host as 'online' and clearing the host's stop reason. - The host: [#!variable!host_name!#] is off, but there appears to be a problem translating the 'fence_ipmilan' into a workable 'ipmitool' command. Unable to check the thermal data of the host, and so, unable to determine if it's safe to boot the node. + There appears to be a problem translating the 'fence_ipmilan' into a workable 'ipmitool' command for the host: [#!variable!host_name!#]. Unable to check the thermal data of the host. The host: [#!variable!host_name!#] was powered off because of power loss. Power is back and the UPSes are sufficiently charged. Booting it back up now. The host: [#!variable!host_name!#] was powered off for thermal reasons. All available thermal sensors read as OK now. Booting it back up now. The file: [#!variable!file_path!#] isn't on (or isn't the right size on) Striker: [#!variable!host_name!#]. Not using it to pull from. @@ -2061,6 +2061,9 @@ The file: [#!variable!file!#] needs to be updated. The difference is: No password for the database on the host with UUID: [#!variable!uuid!#], skipping it. 
The firewalld daemon isn't running, skipping firewall setup. The postgresql server is installed. + The host: [#!variable!host_name!#] was powered off for an unknown reason, and 'feature::scancore::disable::boot-unknown-stop' is set to: [#!data!feature::scancore::disable::boot-unknown-stop!#]. Will not boot this host. + The host: [#!variable!host_name!#] was powered off for an unknown reason, and 'feature::scancore::disable::boot-unknown-stop' is set to: [#!data!feature::scancore::disable::boot-unknown-stop!#]. If power and temperature looks good, we'll boot it. + The host: [#!variable!host_name!#] has good power and temperature readings. Booting it back up now. The host name: [#!variable!target!#] does not resolve to an IP address.