* Biggest change in this commit; scan-apc-pdu and scan-apc-ups now only run on Striker dashboards! This was because we found that if two machines ran their agents at the same time, the response time from SNMP read requests grew a lot. This meant it was likely that a third, fourth and so on machine would also then start their scan agent runs while the existing runs were still trying to process, causing the SNMP reads to get slower still until timeouts popped.

* Bumped scancore's scan delay from 30 seconds to 60.
* Shortened the age-out time to 24 hours and again boosted the archive thresholds. As we get a feel for the amount of data collected on multi-Anvil! systems over time, we may continue to tune this.
* Moved Database->archive_database() to be called daily by anvil-daemon, instead of during '->connect' calls.
* Added locking to Database->_age_out_data to avoid resyncs mid-purge. Also moved the power, temperature and ip_address columns into the same 'to_clean' hash as it was duplicate logic.

Signed-off-by: Digimer <digimer@alteeve.ca>
main
Digimer 4 years ago
parent 8807915bb7
commit 4dcd505753
  1. 2
      Anvil/Tools.pm
  2. 106
      Anvil/Tools/Database.pm
  3. 6
      Anvil/Tools/ScanCore.pm
  4. 6
      anvil.conf
  5. 8
      scancore-agents/scan-apc-pdu/scan-apc-pdu
  6. 14
      scancore-agents/scan-apc-ups/scan-apc-ups
  7. 2
      scancore-agents/scan-storcli/scan-storcli
  8. 2
      share/words.xml
  9. 7
      tools/anvil-daemon
  10. 2
      tools/scancore

@ -842,7 +842,7 @@ sub _set_defaults
database => {
# This is the number of hours, after which, transient data (like temperature and
# power data) is considered "old" and gets deleted from the database.
age_out => 48,
age_out => 24,
},
};
$anvil->data->{sys} = {

@ -186,7 +186,7 @@ sub archive_database
# If not given tables, use the system tables.
if (not $tables)
{
$tables = $anvil->data->{sys}{database}{check_tables};
$tables = $anvil->Database->get_tables_from_schema({debug => $debug, schema_file => "all"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { tables => $tables }});
}
@ -216,11 +216,13 @@ sub archive_database
return(1);
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0451"});
# Make sure I have sane values.
$anvil->data->{sys}{database}{archive}{compress} = 1 if not defined $anvil->data->{sys}{database}{archive}{compress};
$anvil->data->{sys}{database}{archive}{count} = 25000 if not defined $anvil->data->{sys}{database}{archive}{count};
$anvil->data->{sys}{database}{archive}{division} = 30000 if not defined $anvil->data->{sys}{database}{archive}{division};
$anvil->data->{sys}{database}{archive}{trigger} = 50000 if not defined $anvil->data->{sys}{database}{archive}{trigger};
$anvil->data->{sys}{database}{archive}{count} = 100000 if not defined $anvil->data->{sys}{database}{archive}{count};
$anvil->data->{sys}{database}{archive}{division} = 125000 if not defined $anvil->data->{sys}{database}{archive}{division};
$anvil->data->{sys}{database}{archive}{trigger} = 500000 if not defined $anvil->data->{sys}{database}{archive}{trigger};
$anvil->data->{sys}{database}{archive}{save_to_disk} = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
"sys::database::archive::compress" => $anvil->data->{sys}{database}{archive}{compress},
@ -14743,10 +14745,6 @@ sub resync_databases
return(0);
}
# Archive old data before resync'ing
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0451"});
$anvil->Database->archive_database({debug => $debug});
### NOTE: Don't sort this array, we need to resync in the order that the user passed the tables to us
### to avoid trouble with primary/foreign keys.
# We're going to use the array of tables assembles by _find_behind_databases() stored in
@ -15535,6 +15533,9 @@ sub _age_out_data
my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Database->_age_out_data()" }});
# Get a lock.
$anvil->Database->locking({debug => $debug, request => 1});
# Log our start, as this takes some time to run.
my $start_time = time;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0623"});
@ -15583,6 +15584,8 @@ sub _age_out_data
# Commit the DELETEs.
$anvil->Database->write({debug => $debug, uuid => $uuid, query => $queries, source => $THIS_FILE, line => __LINE__});
}
$anvil->Database->locking({debug => $debug, renew => 1});
}
# Remove old processed alerts.
@ -15622,6 +15625,7 @@ sub _age_out_data
# Commit the DELETEs.
$anvil->Database->write({debug => $debug, uuid => $uuid, query => $queries, source => $THIS_FILE, line => __LINE__});
}
$anvil->Database->locking({debug => $debug, renew => 1});
}
# Now process power and temperature, if not disabled.
@ -15631,13 +15635,14 @@ sub _age_out_data
if ($age =~ /\D/)
{
# Age is not valid, set it to defaults.
$age = 48;
$age = 24;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { age => $age }});
}
if ($age == 0)
{
# Disabled, return.
$anvil->Database->locking({debug => $debug, release => 1});
return(0);
}
@ -15649,83 +15654,15 @@ sub _age_out_data
old_timestamp => $old_timestamp,
}});
# Purge temperature and power data.
my $tables = {};
$tables->{temperature} = "temperature_uuid";
$tables->{power} = "power_uuid";
$tables->{ip_addresses} = "ip_address_uuid";
foreach my $table (sort {$a cmp $b} keys %{$tables})
{
my $uuid_column = $tables->{$table};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
table => $table,
uuid_column => $uuid_column,
}});
foreach my $uuid (keys %{$anvil->data->{cache}{database_handle}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { uuid => $uuid }});
my $queries = [];
my $query = "SELECT ".$uuid_column." FROM ".$table;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }});
my $results = $anvil->Database->query({uuid => $uuid, query => $query, source => $THIS_FILE, line => __LINE__});
my $count = @{$results};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
results => $results,
count => $count,
}});
foreach my $row (@{$results})
{
my $column_uuid = $row->[0];
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { column_uuid => $column_uuid }});
# Find how many records will be left. If it's 0, we'll use an OFFSET 1.
my $query = "SELECT history_id FROM history.".$table." WHERE ".$uuid_column." = ".$anvil->Database->quote($column_uuid)." AND modified_date > '".$old_timestamp."';";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }});
my $results = $anvil->Database->query({uuid => $uuid, query => $query, source => $THIS_FILE, line => __LINE__});
my $count = @{$results};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
results => $results,
count => $count,
}});
if ($count)
{
# At least one record will be left.
my $query = "DELETE FROM history.".$table." WHERE ".$uuid_column." = ".$anvil->Database->quote($column_uuid)." AND modified_date <= '".$old_timestamp."';";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }});
push @{$queries}, $query;
}
else
{
# This would delete everything, reserve at least one record.
foreach my $row (@{$results})
{
my $history_id = $row->[0];
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { history_id => $history_id }});
my $query = "DELETE FROM history.".$table." WHERE ".$uuid_column." = ".$anvil->Database->quote($column_uuid)." AND hostory_id = '".$history_id."';";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }});
push @{$queries}, $query;
}
}
}
my $commits = @{$queries};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { commits => $commits }});
if ($commits)
{
# Commit the DELETEs.
$anvil->Database->write({debug => $debug, uuid => $uuid, query => $queries, source => $THIS_FILE, line => __LINE__});
}
}
}
### Looks for scan agent data that grows quickly.
# We don't use 'anvil->data' to prevent injecting SQL queries in anvil.conf
my $to_clean = {};
# Power, temperatures and ip addresses
$to_clean->{table}{temperature}{child_table}{temperature}{uuid_column} = "temperature_uuid";
$to_clean->{table}{power}{child_table}{power}{uuid_column} = "power_uuid";
$to_clean->{table}{ip_addresses}{child_table}{ip_addresses}{uuid_column} = "ip_address_uuid";
# scan_apc_pdu
$to_clean->{table}{scan_apc_pdus}{child_table}{scan_apc_pdu_phases}{uuid_column} = "scan_apc_pdu_phase_uuid";
$to_clean->{table}{scan_apc_pdus}{child_table}{scan_apc_pdu_variables}{uuid_column} = "scan_apc_pdu_variable_uuid";
@ -15855,6 +15792,7 @@ sub _age_out_data
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { vacuum => $vacuum }});
undef $queries;
}
$anvil->Database->locking({debug => $debug, renew => 1});
}
}
}
@ -15868,11 +15806,15 @@ sub _age_out_data
my $query = "VACUUM FULL;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }});
$anvil->Database->write({debug => $debug, uuid => $uuid, query => $query, source => $THIS_FILE, line => __LINE__});
$anvil->Database->locking({debug => $debug, renew => 1});
}
my $runtime = time - $start_time;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0624", variables => { runtime => $runtime }});
$anvil->Database->locking({debug => $debug, release => 1});
return(0);
}

@ -289,7 +289,7 @@ sub call_scan_agents
my $runtime = (time - $start_time);
my $log_level = $debug;
my $string_key = "log_0557";
if ($runtime > 10)
if ($runtime > 15)
{
$log_level = 1;
$string_key = "log_0621";
@ -2154,7 +2154,7 @@ LIMIT 1;";
}
# Check this target's power state.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0561", variables => { host_name => $host_name }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0561", variables => { host_name => $host_name }});
# Do we share a network with this system?
$anvil->Network->load_ips({
@ -2206,7 +2206,7 @@ LIMIT 1;";
if ($access)
{
# It's up.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0562", variables => { host_name => $host_name }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0562", variables => { host_name => $host_name }});
$check_power = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {

@ -76,9 +76,9 @@ feature::scancore::disable::preventative-live-migration = 0
# NOTE: If the archive directory doesn't exist, Anvil! will create it
# automatically the first time it is needed.
sys::database::archive::compress = 1
sys::database::archive::trigger = 100000
sys::database::archive::count = 50000
sys::database::archive::division = 75000
sys::database::archive::trigger = 500000
sys::database::archive::count = 100000
sys::database::archive::division = 125000
sys::database::archive::directory = /usr/local/anvil/archives/
# This puts a limit on how many queries (writes, generally) to make in a single batch transaction. This is

@ -162,6 +162,14 @@ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "
# Read switches
$anvil->Get->switches;
# Too many connections cause the UPS to lag out, so we only run on Strikers.
my $host_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }});
if (($host_type ne "striker") && (not $anvil->data->{switches}{force}))
{
$anvil->nice_exit({exit_code => 1});
}
# If we're disabled and '--force' wasn't used, exit.
if (($anvil->data->{scancore}{'scan-apc-pdu'}{disable}) && (not $anvil->data->{switches}{force}))
{

@ -189,6 +189,14 @@ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "
# Read switches
$anvil->Get->switches;
# Too many connections cause the UPS to lag out, so we only run on Strikers.
my $host_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }});
if (($host_type ne "striker") && (not $anvil->data->{switches}{force}))
{
$anvil->nice_exit({exit_code => 1});
}
# If we're disabled and '--force' wasn't used, exit.
if (($anvil->data->{scancore}{'scan-apc-ups'}{disable}) && (not $anvil->data->{switches}{force}))
{
@ -232,7 +240,11 @@ gather_ups_data($anvil);
find_changes($anvil);
# Update the database
$anvil->Database->insert_or_update_updated({updated_by => $THIS_FILE});
my $updated_uuid = $anvil->Database->insert_or_update_updated({
debug => 2,
updated_by => $THIS_FILE,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { updated_uuid => $updated_uuid }});
# Clean up and go away.
$anvil->nice_exit({exit_code => 0});

@ -6093,7 +6093,7 @@ AND
$message_key = "scan_storcli_warning_0006";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { message_key => $message_key }});
}
elsif ($old_variable_value > $new_variable_value)
elsif ($new_variable_value > $old_variable_value)
{
# Rising
my $jumped = ($new_variable_value - $old_variable_value);

@ -1596,7 +1596,7 @@ Failed to promote the DRBD resource: [#!variable!resource!#] primary. Expected a
<key name="log_0448">Ready to parse: [#!variable!file!#].</key>
<key name="log_0449">Parsed: [#!variable!records!#], adding/updating them to the database now.</key>
<key name="log_0450">Skipping the network scan. The next scheduled scan will be done in: [#!variable!next_scan!#]. Override with '--force'.</key>
<key name="log_0451">Checking to see if any data needs to be archived before starting the resync.</key>
<key name="log_0451">Checking to see if any data needs to be archived.</key>
<key name="log_0452">Skipping archiving, not a Striker dashboard.</key>
<key name="log_0453">Archiving: [#!variable!records!#] over: [#!variable!loops!#] segments from the table: [#!variable!table!#] from the database on: [#!variable!host!#]. This might take a bit, please be patient.</key>
<key name="log_0454">Writing: [#!variable!records!#] to the file: [#!variable!file!#].</key>

@ -172,7 +172,7 @@ $anvil->data->{timing}{daily_checks} = 86400;
$anvil->data->{timing}{repo_update_interval} = 86400;
$anvil->data->{timing}{next_minute_check} = $now_time - 1;
$anvil->data->{timing}{next_daily_check} = ($now_time + $delay) - 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"s1:timing::minute_checks" => $anvil->data->{timing}{minute_checks},
"s2:timing::daily_checks" => $anvil->data->{timing}{daily_checks},
"s3:timing::repo_update_interval" => $anvil->data->{timing}{repo_update_interval},
@ -337,7 +337,7 @@ sub set_delay
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { type => $type }});
if ($type eq "striker")
{
foreach my $uuid (sort {$a cmp $b} keys %{$anvil->data->{database}})
foreach my $uuid (keys %{$anvil->data->{database}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
"sys::host_uuid" => $anvil->data->{sys}{host_uuid},
@ -465,6 +465,9 @@ sub handle_periodic_tasks
# Age out old data. This takes up to a minute.
$anvil->Database->_age_out_data();
# Archive old data
$anvil->Database->archive_database();
# Record a job, don't call it directly. It takes too long to run.
my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
file => $THIS_FILE,

@ -106,7 +106,7 @@ while(1)
prepare_for_run($anvil);
# Set our sleep time
my $run_interval = 30;
my $run_interval = 60;
if ((exists $anvil->data->{scancore}{timing}{run_interval}) && ($anvil->data->{scancore}{timing}{run_interval} =~ /^\d+$/))
{
$run_interval = $anvil->data->{scancore}{timing}{run_interval};

Loading…
Cancel
Save