From 4dcd5057533c0399de4d0734cea6dd97016cd62c Mon Sep 17 00:00:00 2001 From: Digimer Date: Mon, 31 May 2021 13:34:49 -0400 Subject: [PATCH] * Biggest change in this commit; scan-apc-pdu and scan-apc-ups now only run on Striker dashboards! This was because we found that if two machines ran their agents at the same time, the response time from SNMP read requests grew a lot. This meant it was likely a third, fourth and so on machine would also then have their scan agent runs while the existing runs were still trying to process, causing the SNMP reads to get slower still until timeouts popped. * Bumped scancore's scan delay from 30 seconds to 60. * Shortened the age-out time to 24 hours and again boosted the archive thresholds. As we get a feel for the amount of data collected on multi-Anvil! systems over time, we may continue to tune this. * Moved Database->archive_database() to be called daily by anvil-daemon, instead of during '->connect' calls. * Added locking to Database->_age_out_data to avoid resyncs mid-purge. Also moved the power, temperature and ip_address columns into the same 'to_clean' hash as it was duplicate logic. Signed-off-by: Digimer --- Anvil/Tools.pm | 2 +- Anvil/Tools/Database.pm | 108 +++++----------------- Anvil/Tools/ScanCore.pm | 6 +- anvil.conf | 6 +- scancore-agents/scan-apc-pdu/scan-apc-pdu | 8 ++ scancore-agents/scan-apc-ups/scan-apc-ups | 14 ++- scancore-agents/scan-storcli/scan-storcli | 2 +- share/words.xml | 2 +- tools/anvil-daemon | 7 +- tools/scancore | 2 +- 10 files changed, 61 insertions(+), 96 deletions(-) diff --git a/Anvil/Tools.pm b/Anvil/Tools.pm index d4fa4e96..abadd0f2 100644 --- a/Anvil/Tools.pm +++ b/Anvil/Tools.pm @@ -842,7 +842,7 @@ sub _set_defaults database => { # This is the number of hours, after which, transient data (like temperature and # power data) is considered "old" and gets deleted from the database. 
- age_out => 48, + age_out => 24, }, }; $anvil->data->{sys} = { diff --git a/Anvil/Tools/Database.pm b/Anvil/Tools/Database.pm index b43980d5..58a213b1 100644 --- a/Anvil/Tools/Database.pm +++ b/Anvil/Tools/Database.pm @@ -186,7 +186,7 @@ sub archive_database # If not given tables, use the system tables. if (not $tables) { - $tables = $anvil->data->{sys}{database}{check_tables}; + $tables = $anvil->Database->get_tables_from_schema({debug => $debug, schema_file => "all"}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { tables => $tables }}); } @@ -216,11 +216,13 @@ sub archive_database return(1); } + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0451"}); + # Make sure I have sane values. - $anvil->data->{sys}{database}{archive}{compress} = 1 if not defined $anvil->data->{sys}{database}{archive}{compress}; - $anvil->data->{sys}{database}{archive}{count} = 25000 if not defined $anvil->data->{sys}{database}{archive}{count}; - $anvil->data->{sys}{database}{archive}{division} = 30000 if not defined $anvil->data->{sys}{database}{archive}{division}; - $anvil->data->{sys}{database}{archive}{trigger} = 50000 if not defined $anvil->data->{sys}{database}{archive}{trigger}; + $anvil->data->{sys}{database}{archive}{compress} = 1 if not defined $anvil->data->{sys}{database}{archive}{compress}; + $anvil->data->{sys}{database}{archive}{count} = 100000 if not defined $anvil->data->{sys}{database}{archive}{count}; + $anvil->data->{sys}{database}{archive}{division} = 125000 if not defined $anvil->data->{sys}{database}{archive}{division}; + $anvil->data->{sys}{database}{archive}{trigger} = 500000 if not defined $anvil->data->{sys}{database}{archive}{trigger}; $anvil->data->{sys}{database}{archive}{save_to_disk} = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "sys::database::archive::compress" => $anvil->data->{sys}{database}{archive}{compress}, @@ 
-14743,10 +14745,6 @@ sub resync_databases return(0); } - # Archive old data before resync'ing - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0451"}); - $anvil->Database->archive_database({debug => $debug}); - ### NOTE: Don't sort this array, we need to resync in the order that the user passed the tables to us ### to avoid trouble with primary/foreign keys. # We're going to use the array of tables assembles by _find_behind_databases() stored in @@ -15535,6 +15533,9 @@ sub _age_out_data my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Database->_age_out_data()" }}); + # Get a lock. + $anvil->Database->locking({debug => $debug, request => 1}); + # Log our start, as this takes some time to run. my $start_time = time; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0623"}); @@ -15583,6 +15584,8 @@ sub _age_out_data # Commit the DELETEs. $anvil->Database->write({debug => $debug, uuid => $uuid, query => $queries, source => $THIS_FILE, line => __LINE__}); } + + $anvil->Database->locking({debug => $debug, renew => 1}); } # Remove old processed alerts. @@ -15622,6 +15625,7 @@ sub _age_out_data # Commit the DELETEs. $anvil->Database->write({debug => $debug, uuid => $uuid, query => $queries, source => $THIS_FILE, line => __LINE__}); } + $anvil->Database->locking({debug => $debug, renew => 1}); } # Now process power and tempoerature, if not disabled. @@ -15631,13 +15635,14 @@ sub _age_out_data if ($age =~ /\D/) { # Age is not valid, set it to defaults. - $age = 48; + $age = 24; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { age => $age }}); } if ($age == 0) { # Disabled, return. 
+ $anvil->Database->locking({debug => $debug, release => 1}); return(0); } @@ -15649,83 +15654,15 @@ sub _age_out_data old_timestamp => $old_timestamp, }}); - # Purge temperature and power data. - my $tables = {}; - $tables->{temperature} = "temperature_uuid"; - $tables->{power} = "power_uuid"; - $tables->{ip_addresses} = "ip_address_uuid"; - foreach my $table (sort {$a cmp $b} keys %{$tables}) - { - my $uuid_column = $tables->{$table}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - table => $table, - uuid_column => $uuid_column, - }}); - foreach my $uuid (keys %{$anvil->data->{cache}{database_handle}}) - { - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uuid => $uuid }}); - - my $queries = []; - my $query = "SELECT ".$uuid_column." FROM ".$table; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); - - my $results = $anvil->Database->query({uuid => $uuid, query => $query, source => $THIS_FILE, line => __LINE__}); - my $count = @{$results}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - results => $results, - count => $count, - }}); - foreach my $row (@{$results}) - { - my $column_uuid = $row->[0]; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { column_uuid => $column_uuid }}); - - # Find how many records will be left. If it's 0, we'll use an OFFSET 1. - my $query = "SELECT history_id FROM history.".$table." WHERE ".$uuid_column." = ".$anvil->Database->quote($column_uuid)." 
AND modified_date > '".$old_timestamp."';"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); - - my $results = $anvil->Database->query({uuid => $uuid, query => $query, source => $THIS_FILE, line => __LINE__}); - my $count = @{$results}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - results => $results, - count => $count, - }}); - if ($count) - { - # At least one record will be left. - my $query = "DELETE FROM history.".$table." WHERE ".$uuid_column." = ".$anvil->Database->quote($column_uuid)." AND modified_date <= '".$old_timestamp."';"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); - push @{$queries}, $query; - } - else - { - # This would delete everything, reserve at least one record. - foreach my $row (@{$results}) - { - my $history_id = $row->[0]; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { history_id => $history_id }}); - - my $query = "DELETE FROM history.".$table." WHERE ".$uuid_column." = ".$anvil->Database->quote($column_uuid)." AND hostory_id = '".$history_id."';"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); - push @{$queries}, $query; - } - } - } - - my $commits = @{$queries}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { commits => $commits }}); - if ($commits) - { - # Commit the DELETEs. - $anvil->Database->write({debug => $debug, uuid => $uuid, query => $queries, source => $THIS_FILE, line => __LINE__}); - } - } - } - ### Looks for scan agent data that grows quickly. 
# We don't use 'anvil->data' to prevent injecting SQL queries in anvil.conf my $to_clean = {}; + # Power, temperatures and ip addresses + $to_clean->{table}{temperature}{child_table}{temperature}{uuid_column} = "temperature_uuid"; + $to_clean->{table}{power}{child_table}{power}{uuid_column} = "power_uuid"; + $to_clean->{table}{ip_addresses}{child_table}{ip_addresses}{uuid_column} = "ip_address_uuid"; + # scan_apc_pdu $to_clean->{table}{scan_apc_pdus}{child_table}{scan_apc_pdu_phases}{uuid_column} = "scan_apc_pdu_phase_uuid"; $to_clean->{table}{scan_apc_pdus}{child_table}{scan_apc_pdu_variables}{uuid_column} = "scan_apc_pdu_variable_uuid"; @@ -15855,6 +15792,7 @@ sub _age_out_data $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { vacuum => $vacuum }}); undef $queries; } + $anvil->Database->locking({debug => $debug, renew => 1}); } } } @@ -15868,11 +15806,15 @@ sub _age_out_data my $query = "VACUUM FULL;"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); $anvil->Database->write({debug => $debug, uuid => $uuid, query => $query, source => $THIS_FILE, line => __LINE__}); + + $anvil->Database->locking({debug => $debug, renew => 1}); } my $runtime = time - $start_time; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0624", variables => { runtime => $runtime }}); + $anvil->Database->locking({debug => $debug, release => 1}); + return(0); } diff --git a/Anvil/Tools/ScanCore.pm b/Anvil/Tools/ScanCore.pm index b92cf71c..b40e2f99 100644 --- a/Anvil/Tools/ScanCore.pm +++ b/Anvil/Tools/ScanCore.pm @@ -289,7 +289,7 @@ sub call_scan_agents my $runtime = (time - $start_time); my $log_level = $debug; my $string_key = "log_0557"; - if ($runtime > 10) + if ($runtime > 15) { $log_level = 1; $string_key = "log_0621"; @@ -2154,7 +2154,7 @@ LIMIT 1;"; } # Check this target's power state. 
- $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0561", variables => { host_name => $host_name }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0561", variables => { host_name => $host_name }}); # Do we share a network with this system? $anvil->Network->load_ips({ @@ -2206,7 +2206,7 @@ LIMIT 1;"; if ($access) { # It's up. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0562", variables => { host_name => $host_name }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0562", variables => { host_name => $host_name }}); $check_power = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { diff --git a/anvil.conf b/anvil.conf index 543cc918..8ed59c80 100644 --- a/anvil.conf +++ b/anvil.conf @@ -76,9 +76,9 @@ feature::scancore::disable::preventative-live-migration = 0 # NOTE: If the archive directory doesn't exist, Anvil! will create it # automatically the first time it is needed. sys::database::archive::compress = 1 -sys::database::archive::trigger = 100000 -sys::database::archive::count = 50000 -sys::database::archive::division = 75000 +sys::database::archive::trigger = 500000 +sys::database::archive::count = 100000 +sys::database::archive::division = 125000 sys::database::archive::directory = /usr/local/anvil/archives/ # This puts a limit on how many queries (writes, generally) to make in a single batch transaction. This is diff --git a/scancore-agents/scan-apc-pdu/scan-apc-pdu b/scancore-agents/scan-apc-pdu/scan-apc-pdu index a372147a..11bc26d8 100755 --- a/scancore-agents/scan-apc-pdu/scan-apc-pdu +++ b/scancore-agents/scan-apc-pdu/scan-apc-pdu @@ -162,6 +162,14 @@ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => " # Read switches $anvil->Get->switches; +# Too many connections cause the UPS to lag out, so we only run on Strikers. 
+my $host_type = $anvil->Get->host_type(); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }}); +if (($host_type ne "striker") && (not $anvil->data->{switches}{force})) +{ + $anvil->nice_exit({exit_code => 1}); +} + # If we're disabled and '--force' wasn't used, exit. if (($anvil->data->{scancore}{'scan-apc-pdu'}{disable}) && (not $anvil->data->{switches}{force})) { diff --git a/scancore-agents/scan-apc-ups/scan-apc-ups b/scancore-agents/scan-apc-ups/scan-apc-ups index f92290c2..cd9e2236 100755 --- a/scancore-agents/scan-apc-ups/scan-apc-ups +++ b/scancore-agents/scan-apc-ups/scan-apc-ups @@ -189,6 +189,14 @@ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => " # Read switches $anvil->Get->switches; +# Too many connections cause the UPS to lag out, so we only run on Strikers. +my $host_type = $anvil->Get->host_type(); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }}); +if (($host_type ne "striker") && (not $anvil->data->{switches}{force})) +{ + $anvil->nice_exit({exit_code => 1}); +} + # If we're disabled and '--force' wasn't used, exit. if (($anvil->data->{scancore}{'scan-apc-ups'}{disable}) && (not $anvil->data->{switches}{force})) { @@ -232,7 +240,11 @@ gather_ups_data($anvil); find_changes($anvil); # Update the database -$anvil->Database->insert_or_update_updated({updated_by => $THIS_FILE}); +my $updated_uuid = $anvil->Database->insert_or_update_updated({ + debug => 2, + updated_by => $THIS_FILE, +}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { updated_uuid => $updated_uuid }}); # Clean up and go away. 
$anvil->nice_exit({exit_code => 0}); diff --git a/scancore-agents/scan-storcli/scan-storcli b/scancore-agents/scan-storcli/scan-storcli index bf7486e5..d2fb1b24 100755 --- a/scancore-agents/scan-storcli/scan-storcli +++ b/scancore-agents/scan-storcli/scan-storcli @@ -6093,7 +6093,7 @@ AND $message_key = "scan_storcli_warning_0006"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { message_key => $message_key }}); } - elsif ($old_variable_value > $new_variable_value) + elsif ($new_variable_value > $old_variable_value) { # Rising my $jumped = ($new_variable_value - $old_variable_value); diff --git a/share/words.xml b/share/words.xml index a25eae8b..ac6ab639 100644 --- a/share/words.xml +++ b/share/words.xml @@ -1596,7 +1596,7 @@ Failed to promote the DRBD resource: [#!variable!resource!#] primary. Expected a Ready to parse: [#!variable!file!#]. Parsed: [#!variable!records!#], adding/updating them to the database now. Skipping the network scan. The next scheduled scan will be done in: [#!variable!next_scan!#]. Override with '--force'. - Checking to see if any data needs to be archived before starting the resync. + Checking to see if any data needs to be archived. Skipping archiving, not a Striker dashboard. Archiving: [#!variable!records!#] over: [#!variable!loops!#] segments from the table: [#!variable!table!#] from the database on: [#!variable!host!#]. This might take a bit, please be patient. Writing: [#!variable!records!#] to the file: [#!variable!file!#]. 
diff --git a/tools/anvil-daemon b/tools/anvil-daemon index 13ec902d..ae51ade4 100755 --- a/tools/anvil-daemon +++ b/tools/anvil-daemon @@ -172,7 +172,7 @@ $anvil->data->{timing}{daily_checks} = 86400; $anvil->data->{timing}{repo_update_interval} = 86400; $anvil->data->{timing}{next_minute_check} = $now_time - 1; $anvil->data->{timing}{next_daily_check} = ($now_time + $delay) - 1; -$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "s1:timing::minute_checks" => $anvil->data->{timing}{minute_checks}, "s2:timing::daily_checks" => $anvil->data->{timing}{daily_checks}, "s3:timing::repo_update_interval" => $anvil->data->{timing}{repo_update_interval}, @@ -337,7 +337,7 @@ sub set_delay $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { type => $type }}); if ($type eq "striker") { - foreach my $uuid (sort {$a cmp $b} keys %{$anvil->data->{database}}) + foreach my $uuid (keys %{$anvil->data->{database}}) { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "sys::host_uuid" => $anvil->data->{sys}{host_uuid}, @@ -465,6 +465,9 @@ sub handle_periodic_tasks # Age out old data. This takes up to a minute. $anvil->Database->_age_out_data(); + # Archive old data + $anvil->Database->archive_database(); + # Record a job, don't call it directly. It takes too long to run. my ($job_uuid) = $anvil->Database->insert_or_update_jobs({ file => $THIS_FILE, diff --git a/tools/scancore b/tools/scancore index b54f6e35..eaade962 100755 --- a/tools/scancore +++ b/tools/scancore @@ -106,7 +106,7 @@ while(1) prepare_for_run($anvil); # Set our sleep time - my $run_interval = 30; + my $run_interval = 60; if ((exists $anvil->data->{scancore}{timing}{run_interval}) && ($anvil->data->{scancore}{timing}{run_interval} =~ /^\d+$/)) { $run_interval = $anvil->data->{scancore}{timing}{run_interval};