* Created ScanCore->check_temperature_direct() based around that start logic from ScanCore->post_scan_analysis_striker() temperature check, and updated the later to use the former.

* Updated the logic of when to boot a node or DR host that was found to be off for unknown reasons to require both poewr and temperature to be OK, and checks against the new 'feature::scancore::disable::boot-unknown-stop' config variable.

Signed-off-by: Digimer <digimer@alteeve.ca>
main
Digimer 3 years ago
parent 2188c3491e
commit 9eec6c4977
  1. 4
      Anvil/Tools/Database.pm
  2. 242
      Anvil/Tools/ScanCore.pm
  3. 5
      anvil.conf
  4. 7
      share/words.xml

@ -1363,7 +1363,7 @@ sub connect
foreach my $uuid (sort {$a cmp $b} keys %{$anvil->data->{database}})
{
# Periodically, autovivication causes and empty key to appear.
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uuid => $uuid }});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uuid => $uuid }});
next if ((not $uuid) or (not $anvil->Validate->uuid({uuid => $uuid})));
if (($db_uuid) && ($db_uuid ne $uuid))
@ -1387,7 +1387,7 @@ sub connect
my $name = $anvil->data->{database}{$uuid}{name};
my $user = $anvil->data->{database}{$uuid}{user};
my $password = $anvil->data->{database}{$uuid}{password};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
host => $host,
port => $port,
name => $name,

@ -21,6 +21,7 @@ my $THIS_FILE = "ScanCore.pm";
# check_health
# check_power
# check_temperature
# check_temperature_direct
# count_servers
# post_scan_analysis
# post_scan_analysis_dr
@ -1160,6 +1161,109 @@ ORDER BY
}
=head2 check_temperature_direct
This calls a target's IPMI interface to check the temperature sensors that are available. The status is returns as;
0 = Failed to read temperature sensors / IPMI unavailable
1 = All available temperatures are nominal.
2 = One of more sensors are in warning or critical.
Parameters;
=head3 host_uuid (Optional, default Get->host_uuid() )
This is the host's UUID to look at.
=cut
sub check_temperature_direct
{
my $self = shift;
my $parameter = shift;
my $anvil = $self->parent;
my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "ScanCore->check_temperature_direct()" }});
my $host_uuid = defined $parameter->{host_uuid} ? $parameter->{host_uuid} : $anvil->Get->host_uuid;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
host_uuid => $host_uuid,
}});
# * 0 - Failed to read temperature sensors / IPMI unavailable
# * 1 - All available temperatures are nominal
# * 2 - One of more sensors are in warning or critical.
my $status = 0;
if ((not defined $anvil->data->{machine}{host_uuid}{$host_uuid}{hosts}{host_ipmi}) or (not $anvil->data->{machine}{host_uuid}{$host_uuid}{hosts}{host_ipmi}))
{
$anvil->Database->get_hosts_info({debug => $debug});
}
my $host_ipmi = $anvil->data->{machine}{host_uuid}{$host_uuid}{hosts}{host_ipmi};
my $host_name = $anvil->data->{machine}{host_uuid}{$host_uuid}{hosts}{host_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
host_ipmi => $host_ipmi,
host_name => $host_name,
}});
my ($ipmitool_command, $ipmi_password) = $anvil->Convert->fence_ipmilan_to_ipmitool({
debug => 2,
fence_ipmilan_command => $host_ipmi,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
ipmitool_command => $ipmitool_command,
ipmi_password => $anvil->Log->is_secure($ipmi_password),
}});
if ((not $ipmitool_command) or ($ipmitool_command eq "!!error!!"))
{
# No IPMI tool to call.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0573", variables => { host_name => $host_name }});
return($status);
}
$anvil->System->collect_ipmi_data({
debug => $debug,
host_name => $host_name,
ipmitool_command => $ipmitool_command,
ipmi_password => $ipmi_password,
});
# Now look for thermal values.
foreach my $sensor_name (sort {$a cmp $b} keys %{$anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}})
{
my $current_value = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_value_sensor_value};
my $units = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_sensor_units};
my $sensor_status = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_sensor_status};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
current_value => $current_value,
sensor_name => $sensor_name,
units => $units,
sensor_status => $sensor_status,
}});
# If this is a temperature, check to see if it is outside its nominal range and, if
# so, record it into a hash for loading into ScanCore's 'temperature' table.
if ($units eq "C")
{
if ($sensor_status eq "ok")
{
# We've found at least one temperature sensor. Set status to '1' if not previously set
$status = 1 if not $status;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { status => $status }});
}
else
{
# Sensor isn't OK yet.
$status = 2 if not $status;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { status => $status }});
}
}
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { status => $status }});
return($status);
}
=head2 count_servers
This returns the number of servers running on a given host, as reported by ScanCore (specifically, by counting the number of servers running on the host from the C<< servers >> table). It also counts the total amount of RAM in use by hosted servers.
@ -2527,9 +2631,7 @@ LIMIT 1;";
if (not $stop_reason)
{
$stop_reason = "unknown";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { stop_reason => $stop_reason }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0565", variables => { host_name => $host_name }});
}
if ($stop_reason eq "user")
@ -2538,7 +2640,71 @@ LIMIT 1;";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0566", variables => { host_name => $host_name }});
next;
}
elsif (($stop_reason eq "power") or ($stop_reason eq "unknown"))
elsif ($stop_reason eq "unknown")
{
# Check both power and temp.
if ((not defined $anvil->data->{feature}{scancore}{disable}{'boot-unknown-stop'}) or (not exists $anvil->data->{feature}{scancore}{disable}{'boot-unknown-stop'}) or ($anvil->data->{feature}{scancore}{disable}{'boot-unknown-stop'} eq ""))
{
$anvil->data->{feature}{scancore}{disable}{'boot-unknown-stop'} = 1;
}
if (not $anvil->data->{feature}{scancore}{disable}{'boot-unknown-stop'})
{
# Ignore.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0671", variables => { host_name => $host_name }});
}
else
{
# Evaluate for boot.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0672", variables => { host_name => $host_name }});
# Check power
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0567", variables => { host_name => $host_name }});
my ($power_health, $shortest_time_on_batteries, $highest_charge_percentage, $estimated_hold_up_time) = $anvil->ScanCore->check_power({
debug => $debug,
anvil_uuid => $anvil_uuid,
anvil_name => $anvil_name,
host_uuid => $host_uuid,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
power_health => $power_health,
shortest_time_on_batteries => $shortest_time_on_batteries,
highest_charge_percentage => $highest_charge_percentage,
estimated_hold_up_time => $estimated_hold_up_time,
}});
# Check temp.
my ($temp_health) = $anvil->ScanCore->check_temperature_direct({
debug => $debug,
host_uuid => $host_uuid,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { temp_health => $temp_health }});
### Temp
# * 0 = Failed to read temperature sensors / IPMI unavailable
# * 1 = All available temperatures are nominal.
# * 2 = One of more sensors are in warning or critical.
### Power
# * 0 = No UPSes found for the host
# * 1 = One or more UPSes found and at least one has input power from mains.
# * 2 = One or more UPSes found, all are running on battery.
if (($temp_health eq "1") && ($power_health eq "1"))
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0673", variables => { host_name => $host_name }});
$shell_call =~ s/--action status/ --action on/;
my ($output, $return_code) = $anvil->System->call({debug => $debug, timeout => 30, shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
# Mark it as booting.
$anvil->Database->update_host_status({
debug => $debug,
host_uuid => $host_uuid,
host_status => "booting",
});
}
}
}
elsif ($stop_reason eq "power")
{
# Check now if the power is OK
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0567", variables => { host_name => $host_name }});
@ -2590,72 +2756,20 @@ LIMIT 1;";
}
elsif ($stop_reason eq "thermal")
{
### TODO: Switch to ->check_temperature()
# Check now if the temperature is OK.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0568", variables => { host_name => $host_name }});
my ($ipmitool_command, $ipmi_password) = $anvil->Convert->fence_ipmilan_to_ipmitool({
debug => 2,
fence_ipmilan_command => $host_ipmi,
my ($temp_health) = $anvil->ScanCore->check_temperature_direct({
debug => $debug,
host_uuid => $host_uuid,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
ipmitool_command => $ipmitool_command,
ipmi_password => $anvil->Log->is_secure($ipmi_password),
}});
if ((not $ipmitool_command) or ($ipmitool_command eq "!!error!!"))
{
# No IPMI tool to call.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0573", variables => { host_name => $host_name }});
next;
}
$anvil->System->collect_ipmi_data({
host_name => $host_name,
ipmitool_command => $ipmitool_command,
ipmi_password => $ipmi_password,
});
### Temp
# * 0 = Failed to read temperature sensors / IPMI unavailable
# * 1 = All available temperatures are nominal.
# * 2 = One of more sensors are in warning or critical.
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { temp_health => $temp_health }});
# Now look for thermal values.
my $sensor_found = 0;
my $temperatures_ok = 1;
foreach my $sensor_name (sort {$a cmp $b} keys %{$anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}})
{
my $current_value = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_value_sensor_value};
my $units = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_sensor_units};
my $status = $anvil->data->{ipmi}{$host_name}{scan_ipmitool_sensor_name}{$sensor_name}{scan_ipmitool_sensor_status};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
current_value => $current_value,
sensor_name => $sensor_name,
units => $units,
status => $status,
}});
# If this is a temperature, check to see if it is outside its nominal range and, if
# so, record it into a hash for loading into ScanCore's 'temperature' table.
if ($units eq "C")
{
if (not $sensor_found)
{
# We've found at least one temperature sensor.
$sensor_found = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { sensor_found => $sensor_found }});
}
if ($status ne "ok")
{
# Sensor isn't OK yet.
$temperatures_ok = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { temperatures_ok => $temperatures_ok }});
}
}
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
sensor_found => $sensor_found,
temperatures_ok => $temperatures_ok,
}});
if (($sensor_found) && ($temperatures_ok))
if ($temp_health eq "1")
{
### TODO: We'll want to revisit M2's restart cooldown logic. It never
### actually proved useful in M2, but it doesn't mean it wouldn't help

@ -11,6 +11,11 @@ sys::privacy::strong = 0
# feature, set this to '1'.
feature::scancore::disable::preventative-live-migration = 0
# If a node is found to be powered off, and there is no reason recorded in the database, it will be booted.
# The assumption is that an accidental power off occurred. If you would like to have nodes that power off
# stay off until manually started, set this to '0'
#feature::scancore::disable::boot-unknown-stop = 1
### Database
# Database connections;
#

@ -1947,7 +1947,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0562">The host: [#!variable!host_name!#] is up, no need to check if it needs booting.</key>
<key name="log_0563">The host: [#!variable!host_name!#] couldn't be reached directly, but IPMI reports that it is up. Could the IPMI BMC be hung or unplugged?</key>
<key name="log_0564">The host: [#!variable!host_name!#] is off. Will check now if it should be booted.</key>
<key name="log_0565">The host: [#!variable!host_name!#] has no stop reason, so we'll boot it up in case it lost power without warning.</key>
<key name="log_0565">The host: [#!variable!host_name!#] has no stop reason, so we'll check to see if we should power it on, in case it lost power or overheated without warning.</key>
<key name="log_0566">The host: [#!variable!host_name!#] was stopped by the user, so we'll leave it off.</key>
<key name="log_0567">The host: [#!variable!host_name!#] was powered off because of power loss. Checking to see if it is now safe to restart it.</key>
<key name="log_0568">The host: [#!variable!host_name!#] was powered off because of thermal issues. Checking to see if it is now safe to restart it.</key>
@ -1955,7 +1955,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0570">Unable to parse the install manifest uuid: [#!variable!manifest_uuid!#] for the Anvil! [#!variable!anvil_name!#]. As such, unable to determine what UPSes power the machine: [#!variable!host_name!#]. Unable to determine if the power feeding this node is OK or not.</key>
<key name="log_0571">The UPS referenced by the 'power_uuid': [#!variable!power_uuid!#] under the host: [#!variable!host_name!#] has no record of being on mains power, so we can't determine how long it's been on batteries. Setting the "shortest time on batteries" to zero seconds.</key>
<key name="log_0572">Marking the host as 'online' and clearing the host's stop reason.</key>
<key name="log_0573">The host: [#!variable!host_name!#] is off, but there appears to be a problem translating the 'fence_ipmilan' into a workable 'ipmitool' command. Unable to check the thermal data of the host, and so, unable to determine if it's safe to boot the node.</key>
<key name="log_0573">There appears to be a problem translating the 'fence_ipmilan' into a workable 'ipmitool' command for the host: [#!variable!host_name!#]. Unable to check the thermal data of the host.</key>
<key name="log_0574">The host: [#!variable!host_name!#] was powered off because of power loss. Power is back and the UPSes are sufficiently charged. Booting it back up now.</key>
<key name="log_0575">The host: [#!variable!host_name!#] was powered off for thermal reasons. All available thermal sensors read as OK now. Booting it back up now.</key>
<key name="log_0576">The file: [#!variable!file_path!#] isn't on (or isn't the right size on) Striker: [#!variable!host_name!#]. Not using it to pull from.</key>
@ -2061,6 +2061,9 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0668">No password for the database on the host with UUID: [#!variable!uuid!#], skipping it.</key>
<key name="log_0669">The firewalld daemon isn't running, skipping firewall setup.</key>
<key name="log_0670">The postgresql server is installed.</key>
<key name="log_0671">The host: [#!variable!host_name!#] was powered off for an unknown reason, and 'feature::scancore::disable::boot-unknown-stop' is set to: [#!data!feature::scancore::disable::boot-unknown-stop!#]. Will not boot this host.</key>
<key name="log_0672">The host: [#!variable!host_name!#] was powered off for an unknown reason, and 'feature::scancore::disable::boot-unknown-stop' is set to: [#!data!feature::scancore::disable::boot-unknown-stop!#]. If power and temperature looks good, we'll boot it.</key>
<key name="log_0673">The host: [#!variable!host_name!#] has good power and temperature readings. Booting it back up now.</key>
<!-- Messages for users (less technical than log entries), though sometimes used for logs, too. -->
<key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key>

Loading…
Cancel
Save