Merge pull request #241 from ClusterLabs/anvil-tools-dev

* Changed the default trigger of live migrations to require a health …
main
digimer-bot 2 years ago committed by GitHub
commit 4a7f9f79df
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 10
      Anvil/Tools.pm
  2. 2
      Anvil/Tools/Alert.pm
  3. 4
      Anvil/Tools/Database.pm
  4. 14
      Anvil/Tools/ScanCore.pm
  5. 50
      scancore-agents/scan-apc-pdu/scan-apc-pdu
  6. 15
      share/words.xml
  7. 3
      tools/anvil-manage-files

@ -1009,6 +1009,16 @@ sub _set_defaults
html => "alteeve",
},
};
$anvil->data->{feature} = {
scancore => {
disable => {
'preventative-live-migration' => 0,
},
threshold => {
'preventative-live-migration' => 2,
},
},
};
return(0);
}

@ -276,7 +276,7 @@ sub check_condition_age
my $clear = defined $parameter->{clear} ? $parameter->{clear} : 0;
my $name = defined $parameter->{name} ? $parameter->{name} : "";
my $host_uuid = defined $parameter->{host_uuid} ? $parameter->{host_uuid} : "NULL";
my $host_uuid = defined $parameter->{host_uuid} ? $parameter->{host_uuid} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
clear => $clear,
name => $name,

@ -1625,7 +1625,7 @@ sub connect
# Read the DB identifier and then check that we've not already connected to this DB.
my $query = "SELECT system_identifier FROM pg_control_system();";
my $identifier = $anvil->Database->query({uuid => $uuid, query => $query, source => $THIS_FILE, line => __LINE__})->[0]->[0];
my $identifier = $anvil->Database->query({debug => $debug, uuid => $uuid, query => $query, source => $THIS_FILE, line => __LINE__})->[0]->[0];
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
query => $query,
identifier => $identifier,
@ -1691,7 +1691,7 @@ sub connect
variable_name => "database::".$uuid."::active",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { active_value => $active_value }});
if (not $active_value)
if ($active_value eq "0")
{
# If we're "retry", we just started up.
if (($retry) && ($is_local))

@ -2022,13 +2022,25 @@ sub post_scan_analysis_node
# Last, evaluate health if we're otherwise OK
if ($peer_health > $local_health)
{
# The user may have set a migration threshold.
my $difference = $peer_health - $local_health;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { difference => $difference }});
if (not $anvil->data->{feature}{scancore}{threshold}{'preventative-live-migration'})
{
$anvil->data->{feature}{scancore}{threshold}{'preventative-live-migration'} = 2;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
'feature::scancore::threshold::preventative-live-migration' => $anvil->data->{feature}{scancore}{threshold}{'preventative-live-migration'},
}});
}
# A user may disable health-based preventative live migrations.
if ($anvil->data->{feature}{scancore}{disable}{'preventative-live-migration'})
{
# Do nothing.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, key => "message_0239"});
}
else
elsif ($difference >= $anvil->data->{feature}{scancore}{threshold}{'preventative-live-migration'})
{
# How long has this been the case?
my $age = $anvil->Alert->check_condition_age({

@ -260,7 +260,7 @@ FROM
my $scan_apc_pdu_link_speed = $row->[10];
my $scan_apc_pdu_phase_count = $row->[11];
my $scan_apc_pdu_outlet_count = $row->[12];
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
scan_apc_pdu_uuid => $scan_apc_pdu_uuid,
scan_apc_pdu_fence_uuid => $scan_apc_pdu_fence_uuid,
scan_apc_pdu_serial_number => $scan_apc_pdu_serial_number,
@ -1385,6 +1385,7 @@ WHERE
}
# Delete this from the SQL hash so we know it didn't vanish.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0726", variables => { hash_key => "sql::scan_apc_pdu_uuid::${scan_apc_pdu_uuid}" }});
delete $anvil->data->{sql}{scan_apc_pdu_uuid}{$scan_apc_pdu_uuid};
}
else
@ -1596,10 +1597,19 @@ INSERT INTO
scan_apc_pdu_ipv4_address => $scan_apc_pdu_ipv4_address,
}});
if ($scan_apc_pdu_model_number ne "DELETED")
{
# Yup! send an alert.
my $query = "
# The PDUs only allow one connection at a time, so if another scan agent is
# connected, we'll get this issue. As such, check how long it's been missing, and
# alert only if it's been missing for more than 10 minutes.
my $age = $anvil->Alert->check_condition_age({name => "scan_apc_pdu::lost_pdu::".$scan_apc_pdu_serial_number});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { age => $age }});
if ($age > 600)
{
# Yup! send an alert.
my $query = "
UPDATE
scan_apc_pdus
SET
@ -1608,22 +1618,23 @@ SET
WHERE
scan_apc_pdu_uuid = ".$anvil->Database->quote($scan_apc_pdu_uuid)."
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
push @{$anvil->data->{sys}{queries}}, $query;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
push @{$anvil->data->{sys}{queries}}, $query;
my $variables = {
model => $scan_apc_pdu_model_number,
serial_numer => $scan_apc_pdu_serial_number,
ip_address => $scan_apc_pdu_ipv4_address,
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_apc_pdu_message_0040", variables => $variables});
$anvil->Alert->register({
alert_level => "warning",
message => "scan_apc_pdu_message_0040",
variables => $variables,
set_by => $THIS_FILE,
sort_position => $anvil->data->{'scan-apc-pdu'}{alert_sort}++,
});
my $variables = {
model => $scan_apc_pdu_model_number,
serial_numer => $scan_apc_pdu_serial_number,
ip_address => $scan_apc_pdu_ipv4_address,
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_apc_pdu_message_0040", variables => $variables});
$anvil->Alert->register({
alert_level => "warning",
message => "scan_apc_pdu_message_0040",
variables => $variables,
set_by => $THIS_FILE,
sort_position => $anvil->data->{'scan-apc-pdu'}{alert_sort}++,
});
}
}
}
@ -2004,6 +2015,9 @@ sub gather_pdu_data
# If I got the serial number, I found the PDU.
next if not $scan_apc_pdu_serial_number;
# In case this PDU disappeared before, this will clear that condition.
$anvil->Alert->check_condition_age({clear => 1, name => "scan_apc_pdu::lost_pdu::".$scan_apc_pdu_serial_number});
#############################################################################################
# Base PDU info #
#############################################################################################

@ -466,7 +466,7 @@ Giving up.
<key name="error_0341">Failed to find the server: [#!variable!server!#] by name or UUID? Exiting.</key>
<key name="error_0342">The protocol: [#!variable!protocol!#] is invalid. Please use '--help' for more information.</key>
<key name="error_0343">The DR host: [#!variable!host_name!#] doesn't appear to be storage group: [#!variable!storage_group!#]. Unable to proceed.</key>
<key name="error_0344">We need: [#!variable!space_needed!# (#!variables!space_needed_bytes!# Bytes)] from the storage group: [#!variable!storage_group!#], but only: [#!variables!space_on_dr!# (#!variable!space_on_dr_bytes!# bytes)] is available on DR. Unable to proceed.</key>
<key name="error_0344">We need: [#!variable!space_needed!# (#!variable!space_needed_bytes!# Bytes)] from the storage group: [#!variable!storage_group!#], but only: [#!variable!space_on_dr!# (#!variable!space_on_dr_bytes!# bytes)] is available on DR. Unable to proceed.</key>
<key name="error_0345">[ Error ] - The check appears to have failed. Expected a return code of '0', but got: [#!variable!return_code!#]
The output, if any, was
====
@ -520,7 +520,7 @@ The definition data passed in was:
====
]]></key>
<key name="error_0368">[ Error ] - Failed to wipe and delete the logical volume: [#!variable!local_lv!#] that was volume number: [#!variable!volume!#] under the server: [#!variable!server!#].</key>
<key name="error_0369">There was a problem deleting: [#!variables!config_file!#]. The rest of the process completed successfully. Please manually remove this file if it still exists.</key>
<key name="error_0369">There was a problem deleting: [#!variable!config_file!#]. The rest of the process completed successfully. Please manually remove this file if it still exists.</key>
<!-- Files templates -->
<!-- NOTE: Translating these files requires an understanding of which lines are translatable -->
@ -576,7 +576,13 @@ sys::privacy::strong = #!data!sys::privacy::strong!#
# Normally, if one node in the Anvil! is healthier than the other, it will pull the servers from the peer
# on to it. This is a process called "preventative live migration". If you would like to disable this
# feature, set this to '1'.
feature::scancore::disable::preventative-live-migration = 0
#feature::scancore::disable::preventative-live-migration = 0
# If "preventative live migration" is enabled, this sets the threshold to trigger migration. The difference
# in health score has to be equal to or greater than the number below. The health scores are usually set to
# '1' per event, though scan agents are free to assign higher scores per event. The default threshold is
# '2'. To migrate on any health difference, set this to '1'. Use whole numbers only.
#feature::scancore::threshold::preventative-live-migration = 2
### Database
# Database connections;
@ -2191,8 +2197,9 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0721">The server: [#!variable!server!#] is ready to boot.</key>
<key name="log_0722">The server: [#!variable!server!#] was found to be running already, but it wasn't marked as booted. Marking it as if it just booted to handle any dependent servers.</key>
<key name="log_0723">The server: [#!variable!server!#] is configured to stay off, ignoring it.</key>
<key name="log_0724">The file: [#!variable!file!#] needs to be added to the database, but since the last scan it's size grew from: [#!variable!old_size_bytes!# (#!variables!old_size_hr!#)] to: [#!variable!new_size_bytes!# (#!variables!new_size_hr!#)]. A difference of: [#!variable!difference_bytes!# (#!variables!difference_hr!#)]. It might still be being uploaded, so we'll keep checking periodocally until the size stops changing.</key>
<key name="log_0724">The file: [#!variable!file!#] needs to be added to the database, but since the last scan its size grew from: [#!variable!old_size_bytes!# (#!variable!old_size_hr!#)] to: [#!variable!new_size_bytes!# (#!variable!new_size_hr!#)]. A difference of: [#!variable!difference_bytes!# (#!variable!difference_hr!#)]. It might still be being uploaded, so we'll keep checking periodically until the size stops changing.</key>
<key name="log_0725">Found the missing file: [#!variable!file!#] in the directory: [#!variable!directory!#]. Updating the database now.</key>
<key name="log_0726">Deleting the hash key: [#!variable!hash_key!#].</key>
<!-- Messages for users (less technical than log entries), though sometimes used for logs, too. -->
<key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key>

@ -28,7 +28,8 @@
# 6 = The file to delete is not under '/mnt/shared/'.
#
# TODO:
# -
# - If two Strikers have the same file name, but different sizes, we get into a yo-yo of updating the two
# sides. If this happens, we need to rsync the larger one over the smaller one.
#
# NOTE:
# - remove unsyncs, add syncs.

Loading…
Cancel
Save