From b8bb7cc4234d56cd25b17a84a5eecc245b0136a9 Mon Sep 17 00:00:00 2001 From: Digimer Date: Thu, 25 Aug 2022 12:43:51 -0400 Subject: [PATCH 1/3] * Changed the default trigger of live migrations to require a health score difference of 2 or higher. This can be user-adjusted using the new 'feature::scancore::threshold::preventative-live-migration' anvil.conf option. Signed-off-by: Digimer --- Anvil/Tools.pm | 10 ++++++++++ Anvil/Tools/ScanCore.pm | 14 +++++++++++++- share/words.xml | 8 +++++++- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/Anvil/Tools.pm b/Anvil/Tools.pm index 2335d59b..c82b803f 100644 --- a/Anvil/Tools.pm +++ b/Anvil/Tools.pm @@ -1009,6 +1009,16 @@ sub _set_defaults html => "alteeve", }, }; + $anvil->data->{feature} = { + scancore => { + disable => { + 'preventative-live-migration' => 0, + }, + threshold => { + 'preventative-live-migration' => 2, + }, + }, + }; return(0); } diff --git a/Anvil/Tools/ScanCore.pm b/Anvil/Tools/ScanCore.pm index da503dd4..7e6cf3bf 100644 --- a/Anvil/Tools/ScanCore.pm +++ b/Anvil/Tools/ScanCore.pm @@ -2022,13 +2022,25 @@ sub post_scan_analysis_node # Last, evaluate health if we're otherwise OK if ($peer_health > $local_health) { + # The user may have set a migration threshold. + my $difference = $peer_health - $local_health; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { difference => $difference }}); + + if (not $anvil->data->{feature}{scancore}{threshold}{'preventative-live-migration'}) + { + $anvil->data->{feature}{scancore}{threshold}{'preventative-live-migration'} = 2; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + 'feature::scancore::threshold::preventative-live-migration' => $anvil->data->{feature}{scancore}{threshold}{'preventative-live-migration'}, + }}); + } + # A user may disable health-based preventative live migrations. 
if ($anvil->data->{feature}{scancore}{disable}{'preventative-live-migration'}) { # Do nothing. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, key => "message_0239"}); } - else + elsif ($difference >= $anvil->data->{feature}{scancore}{threshold}{'preventative-live-migration'}) { # How long has this been the case? my $age = $anvil->Alert->check_condition_age({ diff --git a/share/words.xml b/share/words.xml index e934f2e0..2584c794 100644 --- a/share/words.xml +++ b/share/words.xml @@ -576,7 +576,13 @@ sys::privacy::strong = #!data!sys::privacy::strong!# # Normally, if one node in the Anvil! is healthier than the other, it will pull the servers from the peer # on to it. This is a process called "preventative live migration". If you would like to disable this # feature, set this to '1'. -feature::scancore::disable::preventative-live-migration = 0 +#feature::scancore::disable::preventative-live-migration = 0 + +# If "preventative live migration" is enabled, this sets the threshold to trigger migration. The difference +# in health score has to be equal to or greater than the number below. The health scores are usually set to +# '1' per event, though scan agents are free to assign higher scores per event. The default threshold is +# '2'. To migrate on any health difference, set this to '1'. Use whole numbers only. +#feature::scancore::threshold::preventative-live-migration = 2 ### Database # Database connections; From 99a6593fe616784e4bbe9fed2fecdb9c5a75875f Mon Sep 17 00:00:00 2001 From: Digimer Date: Thu, 25 Aug 2022 21:43:21 -0400 Subject: [PATCH 2/3] * Fixed a bug when connecting to databases when one DB has no variable entries, making it seem like a DB was disabled. 
Signed-off-by: Digimer --- Anvil/Tools/Database.pm | 4 ++-- share/words.xml | 6 +++--- tools/anvil-manage-files | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Anvil/Tools/Database.pm b/Anvil/Tools/Database.pm index ca529f15..2467c7b4 100644 --- a/Anvil/Tools/Database.pm +++ b/Anvil/Tools/Database.pm @@ -1625,7 +1625,7 @@ sub connect # Read the DB identifier and then check that we've not already connected to this DB. my $query = "SELECT system_identifier FROM pg_control_system();"; - my $identifier = $anvil->Database->query({uuid => $uuid, query => $query, source => $THIS_FILE, line => __LINE__})->[0]->[0]; + my $identifier = $anvil->Database->query({debug => $debug, uuid => $uuid, query => $query, source => $THIS_FILE, line => __LINE__})->[0]->[0]; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query, identifier => $identifier, @@ -1691,7 +1691,7 @@ sub connect variable_name => "database::".$uuid."::active", }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { active_value => $active_value }}); - if (not $active_value) + if ($active_value eq "0") { # If we're "retry", we just started up. if (($retry) && ($is_local)) diff --git a/share/words.xml b/share/words.xml index 2584c794..abef2821 100644 --- a/share/words.xml +++ b/share/words.xml @@ -466,7 +466,7 @@ Giving up. Failed to find the server: [#!variable!server!#] by name or UUID? Exiting. The protocol: [#!variable!protocol!#] is invalid. Please use '--help' for more information. The DR host: [#!variable!host_name!#] doesn't appear to be storage group: [#!variable!storage_group!#]. Unable to proceed. - We need: [#!variable!space_needed!# (#!variables!space_needed_bytes!# Bytes)] from the storage group: [#!variable!storage_group!#], but only: [#!variables!space_on_dr!# (#!variable!space_on_dr_bytes!# bytes)] is available on DR. Unable to proceed. 
+ We need: [#!variable!space_needed!# (#!variable!space_needed_bytes!# Bytes)] from the storage group: [#!variable!storage_group!#], but only: [#!variable!space_on_dr!# (#!variable!space_on_dr_bytes!# bytes)] is available on DR. Unable to proceed. [ Error ] - The check appears to have failed. Expected a return code of '0', but got: [#!variable!return_code!#] The output, if any, was ==== @@ -520,7 +520,7 @@ The definition data passed in was: ==== ]]> [ Error ] - Failed to wipe and delete the logical volume: [#!variable!local_lv!#] that was volume number: [#!variable!volume!#] under the server: [#!variable!server!#]. - There was a problem deleting: [#!variables!config_file!#]. The rest of the process completed successfully. Please manually remove this file if it still exists. + There was a problem deleting: [#!variable!config_file!#]. The rest of the process completed successfully. Please manually remove this file if it still exists. @@ -2197,7 +2197,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is: The server: [#!variable!server!#] is ready to boot. The server: [#!variable!server!#] was found to be running already, but it wasn't marked as booted. Marking it as if it just booted to handle any dependent servers. The server: [#!variable!server!#] is configured to stay off, ignoring it. - The file: [#!variable!file!#] needs to be added to the database, but since the last scan it's size grew from: [#!variable!old_size_bytes!# (#!variables!old_size_hr!#)] to: [#!variable!new_size_bytes!# (#!variables!new_size_hr!#)]. A difference of: [#!variable!difference_bytes!# (#!variables!difference_hr!#)]. It might still be being uploaded, so we'll keep checking periodocally until the size stops changing. + The file: [#!variable!file!#] needs to be added to the database, but since the last scan it's size grew from: [#!variable!old_size_bytes!# (#!variable!old_size_hr!#)] to: [#!variable!new_size_bytes!# (#!variable!new_size_hr!#)]. 
A difference of: [#!variable!difference_bytes!# (#!variable!difference_hr!#)]. It might still be being uploaded, so we'll keep checking periodocally until the size stops changing. Found the missing file: [#!variable!file!#] in the directory: [#!variable!directory!#]. Updating the database now. diff --git a/tools/anvil-manage-files b/tools/anvil-manage-files index d07cb7be..76f2b6ee 100755 --- a/tools/anvil-manage-files +++ b/tools/anvil-manage-files @@ -28,7 +28,8 @@ # 6 = The file to delete is not under '/mnt/shared/'. # # TODO: -# - +# - If two Strikers have the same file name, but different sizes, we get into a yo-yo of updating the two +# sides. If this happens, we need to rsync the larger one over the smaller one. # # NOTE: # - remove unsyncs, add syncs. From 89121a2b3bb5af6294b3f32734d70431377769f0 Mon Sep 17 00:00:00 2001 From: Digimer Date: Mon, 29 Aug 2022 17:30:52 -0400 Subject: [PATCH 3/3] * Fixed a bug in Alert->check_condition_age() where not setting a host_uuid caused the returned age to always be 0. * Updated scan_apc_pdu to not report a lost PDU unless it's been gone for ten minutes. Signed-off-by: Digimer --- Anvil/Tools/Alert.pm | 2 +- scancore-agents/scan-apc-pdu/scan-apc-pdu | 52 ++++++++++++++--------- share/words.xml | 1 + 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/Anvil/Tools/Alert.pm b/Anvil/Tools/Alert.pm index dc1d322f..bd27b126 100644 --- a/Anvil/Tools/Alert.pm +++ b/Anvil/Tools/Alert.pm @@ -276,7 +276,7 @@ sub check_condition_age my $clear = defined $parameter->{clear} ? $parameter->{clear} : 0; my $name = defined $parameter->{name} ? $parameter->{name} : ""; - my $host_uuid = defined $parameter->{host_uuid} ? $parameter->{host_uuid} : "NULL"; + my $host_uuid = defined $parameter->{host_uuid} ? 
$parameter->{host_uuid} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { clear => $clear, name => $name, diff --git a/scancore-agents/scan-apc-pdu/scan-apc-pdu b/scancore-agents/scan-apc-pdu/scan-apc-pdu index 940f75be..0dcfb2d3 100755 --- a/scancore-agents/scan-apc-pdu/scan-apc-pdu +++ b/scancore-agents/scan-apc-pdu/scan-apc-pdu @@ -260,7 +260,7 @@ FROM my $scan_apc_pdu_link_speed = $row->[10]; my $scan_apc_pdu_phase_count = $row->[11]; my $scan_apc_pdu_outlet_count = $row->[12]; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { scan_apc_pdu_uuid => $scan_apc_pdu_uuid, scan_apc_pdu_fence_uuid => $scan_apc_pdu_fence_uuid, scan_apc_pdu_serial_number => $scan_apc_pdu_serial_number, @@ -1385,6 +1385,7 @@ WHERE } # Delete this from the SQL hash so we know it didn't vanish. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0726", variables => { hash_key => "sql::scan_apc_pdu_uuid::${scan_apc_pdu_uuid}" }}); delete $anvil->data->{sql}{scan_apc_pdu_uuid}{$scan_apc_pdu_uuid}; } else @@ -1596,10 +1597,19 @@ INSERT INTO scan_apc_pdu_ipv4_address => $scan_apc_pdu_ipv4_address, }}); + if ($scan_apc_pdu_model_number ne "DELETED") { - # Yup! send an alert. - my $query = " + # The PDUs only allow one connection at a time, so if another scan agent is + # connected, we'll get this issue. As such, check how long it's been missing, and + # alert only if it's been missing for 10 minutes. + my $age = $anvil->Alert->check_condition_age({name => "scan_apc_pdu::lost_pdu::".$scan_apc_pdu_serial_number}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { age => $age }}); + + if ($age > 600) + { + # Yup! send an alert. 
+ my $query = " UPDATE scan_apc_pdus SET @@ -1608,22 +1618,23 @@ SET WHERE scan_apc_pdu_uuid = ".$anvil->Database->quote($scan_apc_pdu_uuid)." ;"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }}); - push @{$anvil->data->{sys}{queries}}, $query; - - my $variables = { - model => $scan_apc_pdu_model_number, - serial_numer => $scan_apc_pdu_serial_number, - ip_address => $scan_apc_pdu_ipv4_address, - }; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_apc_pdu_message_0040", variables => $variables}); - $anvil->Alert->register({ - alert_level => "warning", - message => "scan_apc_pdu_message_0040", - variables => $variables, - set_by => $THIS_FILE, - sort_position => $anvil->data->{'scan-apc-pdu'}{alert_sort}++, - }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }}); + push @{$anvil->data->{sys}{queries}}, $query; + + my $variables = { + model => $scan_apc_pdu_model_number, + serial_numer => $scan_apc_pdu_serial_number, + ip_address => $scan_apc_pdu_ipv4_address, + }; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_apc_pdu_message_0040", variables => $variables}); + $anvil->Alert->register({ + alert_level => "warning", + message => "scan_apc_pdu_message_0040", + variables => $variables, + set_by => $THIS_FILE, + sort_position => $anvil->data->{'scan-apc-pdu'}{alert_sort}++, + }); + } } } @@ -2003,6 +2014,9 @@ sub gather_pdu_data # If I got the serial number, I found the PDU. next if not $scan_apc_pdu_serial_number; + + # In case this PDU disappeared before, this will clear that condition. 
+ $anvil->Alert->check_condition_age({clear => 1, name => "scan_apc_pdu::lost_pdu::".$scan_apc_pdu_serial_number}); ############################################################################################# # Base PDU info # diff --git a/share/words.xml b/share/words.xml index abef2821..e36b0d84 100644 --- a/share/words.xml +++ b/share/words.xml @@ -2199,6 +2199,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is: The server: [#!variable!server!#] is configured to stay off, ignoring it. The file: [#!variable!file!#] needs to be added to the database, but since the last scan it's size grew from: [#!variable!old_size_bytes!# (#!variable!old_size_hr!#)] to: [#!variable!new_size_bytes!# (#!variable!new_size_hr!#)]. A difference of: [#!variable!difference_bytes!# (#!variable!difference_hr!#)]. It might still be being uploaded, so we'll keep checking periodocally until the size stops changing. Found the missing file: [#!variable!file!#] in the directory: [#!variable!directory!#]. Updating the database now. + Deleting the hash key: [#!variable!hash_key!#]. The host name: [#!variable!target!#] does not resolve to an IP address.