Updated scancore and anvil-daemon to check their RAM use at the end of each loop; if either is using more than 1 GiB of RAM, it sends an alert and exits.

* Updated Database->resync_databases() to never run on non-striker machines. On Strikers, before a resync, _age_out_data() is called to clear old data in long-off databases.
* Created System->check_ram_use() that is loosely based on anvil-check-memory, but checks to see if it's being controlled by a systemctl-started daemon and, if so, reads the RAM in use from its status output.

Signed-off-by: Digimer <digimer@alteeve.ca>
main
Digimer 3 years ago
parent 0789e885cf
commit d70b9a4956
  1. 13
      Anvil/Tools/Database.pm
  2. 2
      Anvil/Tools/ScanCore.pm
  3. 153
      Anvil/Tools/System.pm
  4. 3
      share/words.xml
  5. 40
      tools/anvil-daemon
  6. 38
      tools/scancore

@ -15691,6 +15691,16 @@ sub resync_databases
return(0); return(0);
} }
# If we're not a striker, don't resync ever.
my $host_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host_type => $host_type }});
if ($host_type ne "striker")
{
# Not a dashboard, don't resync
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0686"});
return(1);
}
# If we're hosting servers, don't resync. Too high of a risk of oom-killer being triggered. # If we're hosting servers, don't resync. Too high of a risk of oom-killer being triggered.
my $server_count = $anvil->Server->count_servers({debug => $debug}); my $server_count = $anvil->Server->count_servers({debug => $debug});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_count => $server_count }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_count => $server_count }});
@ -15700,6 +15710,9 @@ sub resync_databases
return(0); return(0);
} }
# Before resync, age out the data in each DB
$anvil->Database->_age_out_data({debug => $debug});
### NOTE: Don't sort this array, we need to resync in the order that the user passed the tables to us ### NOTE: Don't sort this array, we need to resync in the order that the user passed the tables to us
### to avoid trouble with primary/foreign keys. ### to avoid trouble with primary/foreign keys.
# We're going to use the array of tables assembles by _find_behind_databases() stored in # We're going to use the array of tables assembles by _find_behind_databases() stored in

@ -199,7 +199,7 @@ sub agent_startup
if (($anvil->data->{scancore}{$agent}{disable}) && (not $anvil->data->{switches}{force})) if (($anvil->data->{scancore}{$agent}{disable}) && (not $anvil->data->{switches}{force}))
{ {
# Exit. # Exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, 'print' => 1, key => "log_0646", variables => { program => $THIS_FILE }}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, 'print' => 1, key => "log_0646", variables => { program => $agent }});
$anvil->nice_exit({exit_code => 0}); $anvil->nice_exit({exit_code => 0});
} }

@ -654,6 +654,159 @@ sub check_memory
} }
=head2 check_ram_use
This is meant to be used by daemons to check how much RAM they are using. It returns a list of two values; the first is C<< 0 >> if the in-use RAM is at or below the maximum and C<< 1 >> if the in-use RAM is too high, and the second is the amount of RAM in use, in bytes. If the program is not found to be running, C<< 2, 0 >> is returned.
my ($problem, $used_ram) = $anvil->System->check_ram_use({
program => $THIS_FILE,
max_ram => 1073741824,
});
Parameters;
=head3 program (required)
This is generally C<< $THIS_FILE >>. Though this could be used to check the RAM use of other programs.
=head3 max_ram (optional, default '1073741824' (1 GiB))
This is the limit allowed, in bytes. If the in-use RAM is greater than this amount, the first returned value is set to C<< 1 >> so that the caller can generate and send an alert.
=cut
sub check_ram_use
{
	# Reports how much RAM the given program is using and whether that is more than 'max_ram'.
	# 
	# Parameters (passed as a hash reference);
	#   program - Name of the program to check. Required.
	#   max_ram - Limit in bytes. Optional, defaults to 1073741824 (1 GiB).
	#   debug   - Log level used for the debug log entries. Optional, defaults to 3.
	# 
	# Returns a list of two values; (problem, ram_used)
	#   problem  - 0 if the RAM in use is at or below 'max_ram', 1 if it is above, 2 if no program name 
	#              was given or no running process was found.
	#   ram_used - The RAM in use, in bytes (0 when 'problem' is 2).
	# 
	# Side effect; When smaps is read, the per-PID totals are stored in 'memory::pid::<pid>'.
	my $self      = shift;
	my $parameter = shift;
	my $anvil     = $self->parent;
	my $debug     = defined $parameter->{debug} ? $parameter->{debug} : 3;
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "System->check_ram_use()" }});
	
	my $program = defined $parameter->{program} ? $parameter->{program} : "";
	my $max_ram = defined $parameter->{max_ram} ? $parameter->{max_ram} : 1073741824;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { 
		program => $program,
		max_ram => $max_ram, 
	}});
	
	my $problem  = 0;
	my $ram_used = 0;
	
	# Without a program name there is nothing to look up. Asking systemctl for the status of "" would 
	# not describe any specific program, so report 'not found' instead.
	if ($program eq "")
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0135", variables => { program => $program }});
		return(2, 0);
	}
	
	# See if we're a daemon running under systemctl. If so, the memory reported includes all spawned 
	# child programs, swap, etc. Much more thorough.
	my $shell_call = $anvil->data->{path}{exe}{systemctl}." status ".$program." --lines=0";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
	
	my ($output, $return_code) = $anvil->System->call({debug => $debug, shell_call => $shell_call});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { 
		output      => $output,
		return_code => $return_code, 
	}});
	foreach my $line (split/\n/, $output)
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { line => $line }});
		
		# Only take the size token itself. systemctl can append details after the size (for example 
		# '(peak: ...)' or '(limit: ...)'), and passing those along would break the conversion.
		next if $line !~ /^\s*Memory:\s+(\S+)/;
		my $memory   = $1;
		my $in_bytes = $anvil->Convert->human_readable_to_bytes({size => $memory, base2 => 1});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { 
			memory   => $memory,
			in_bytes => $in_bytes, 
		}});
		
		# Only trust (and pretty-print) the value if the conversion gave us a plain byte count.
		if ($in_bytes =~ /^\d+$/)
		{
			$ram_used = $in_bytes;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { 
				ram_used => $anvil->Convert->add_commas({number => $ram_used})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}).")", 
			}});
		}
		
		# There's only one 'Memory:' line of interest, so we're done either way.
		last;
	}
	
	# If we didn't get the RAM from systemctl, read smaps
	if (not $ram_used)
	{
		my $pids = $anvil->System->pids({debug => $debug, program_name => $program});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { pids => $pids }});
		
		my $pids_found = @{$pids};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { pids_found => $pids_found }});
		
		if (not $pids_found)
		{
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0135", variables => { program => $program }});
			return(2, 0);
		}
		
		# Read in the smaps for each pid. PIDs are numbers, so sort them numerically.
		foreach my $pid (sort {$a <=> $b} @{$pids})
		{
			my $smaps_path = "/proc/".$pid."/smaps";
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { smaps_path => $smaps_path }});
			
			# This will store the amount of RAM used by this specific PID.
			$anvil->data->{memory}{pid}{$pid} = 0;
			
			if (not -e $smaps_path)
			{
				# It is possible that the program just closed.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0433", variables => { pid => $pid }});
				next;
			}
			
			# Read in the file and add up all the 'Private_Dirty' entries.
			my $body = $anvil->Storage->read_file({debug => $debug, file => $smaps_path});
			foreach my $line (split/\n/, $body)
			{
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { line => $line }});
				next if $line !~ /^Private_Dirty:\s+(\d+) (.*B)$/;
				my $size = $1;
				my $type = $2;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { 
					type => $type,
					size => $size, 
				}});
				
				# Nothing to add for empty mappings.
				next if not $size;
				
				# This uses 'kB' for 'KiB' >_>
				$type =  lc($type);
				$type =~ s/b$/ib/ if $type !~ /ib$/;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { type => $type }});
				
				my $size_in_bytes = $anvil->Convert->human_readable_to_bytes({size => $size, type => $type, base2 => 1});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { 
					size_in_bytes => $anvil->Convert->add_commas({number => $size_in_bytes})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $size_in_bytes}).")", 
				}});
				
				$anvil->data->{memory}{pid}{$pid} += $size_in_bytes;
				$ram_used                         += $size_in_bytes;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { 
					"memory::pid::${pid}" => $anvil->Convert->add_commas({number => $anvil->data->{memory}{pid}{$pid}})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $anvil->data->{memory}{pid}{$pid}}).")", 
					ram_used              => $anvil->Convert->add_commas({number => $ram_used})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}).")", 
				}});
			}
		}
	}
	
	# Are we using too much RAM?
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { 
		max_ram  => $anvil->Convert->add_commas({number => $max_ram})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $max_ram}).")", 
		ram_used => $anvil->Convert->add_commas({number => $ram_used})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}).")", 
	}});
	if ($ram_used > $max_ram)
	{
		$problem = 1;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { problem => $problem }});
	}
	
	return($problem, $ram_used);
}
=head2 check_ssh_keys =head2 check_ssh_keys
This method does several things; This method does several things;

@ -502,6 +502,7 @@ The output, if any, was;
</key> </key>
<key name="error_0355">Failed to load the database file: [#!variable!file!#]. Deleting it so it's not considered in the next load attempt.</key> <key name="error_0355">Failed to load the database file: [#!variable!file!#]. Deleting it so it's not considered in the next load attempt.</key>
<key name="error_0356">Failed to read the kernel release on the host: [#!variable!target!#]. The return code was: [#!variable!return_code!#] (expected '0') and the release output, if any, was: [#!variable!output!#].</key> <key name="error_0356">Failed to read the kernel release on the host: [#!variable!target!#]. The return code was: [#!variable!return_code!#] (expected '0') and the release output, if any, was: [#!variable!output!#].</key>
<key name="error_0357">The program: [#!variable!program!#] is using: [#!variable!ram_used!#] (#!variable!ram_used_bytes!# Bytes). This is probably caused by a memory leak, so we will now exit so that systemctl can restart us. If this is happening repeatedly, please contact support.</key>
<!-- Files templates --> <!-- Files templates -->
<!-- NOTE: Translating these files requires an understanding of which lines are translatable --> <!-- NOTE: Translating these files requires an understanding of which lines are translatable -->
@ -2077,6 +2078,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0683">Enabling 'ping' for all users.</key> <key name="log_0683">Enabling 'ping' for all users.</key>
<key name="log_0684">The network interface: [#!variable!nic!#] on the host: [#!variable!host!#] is recorded in the 'history.network_interfaces' table, but has not corresponding entry in the public table. Removing it.</key> <key name="log_0684">The network interface: [#!variable!nic!#] on the host: [#!variable!host!#] is recorded in the 'history.network_interfaces' table, but has not corresponding entry in the public table. Removing it.</key>
<key name="log_0685">[ Note ] - The network bridge: [#!variable!name!#] with 'bridge_uuid': [#!variable!uuid!#] is a duplicate, removing it from the database(s).</key> <key name="log_0685">[ Note ] - The network bridge: [#!variable!name!#] with 'bridge_uuid': [#!variable!uuid!#] is a duplicate, removing it from the database(s).</key>
<key name="log_0686">Skipping resync, not a Striker dashboard.</key>
<!-- Messages for users (less technical than log entries), though sometimes used for logs, too. --> <!-- Messages for users (less technical than log entries), though sometimes used for logs, too. -->
<key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key> <key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key>
@ -3107,6 +3109,7 @@ We will sleep a bit and try again.
<key name="warning_0132">[ Warning ] - Failed to build or install the DRBD kernel module! It is very unlikely that this machine will be able to run any servers until this is fixed.</key> <key name="warning_0132">[ Warning ] - Failed to build or install the DRBD kernel module! It is very unlikely that this machine will be able to run any servers until this is fixed.</key>
<key name="warning_0133">[ Warning ] - Table: [history.#!variable!table!#] not found.</key> <key name="warning_0133">[ Warning ] - Table: [history.#!variable!table!#] not found.</key>
<key name="warning_0134">[ Warning ] - Holding off starting the cluster. Tested access to ourself, and failed. Is '/etc/hosts' populated? Will try again in ten seconds.</key> <key name="warning_0134">[ Warning ] - Holding off starting the cluster. Tested access to ourself, and failed. Is '/etc/hosts' populated? Will try again in ten seconds.</key>
<key name="warning_0135">[ Warning ] - The program: [#!variable!program!#] was not found to be running.</key>
<!-- The entries below here are not sequential, but use a key to find the entry. --> <!-- The entries below here are not sequential, but use a key to find the entry. -->
<!-- Run 'striker-parse-os-list to find new entries. --> <!-- Run 'striker-parse-os-list to find new entries. -->

@ -246,6 +246,9 @@ while(1)
$anvil->nice_exit({exit_code => 0}); $anvil->nice_exit({exit_code => 0});
} }
# Check how much RAM we're using.
check_ram($anvil);
# Disconnect from the database(s) and sleep now. # Disconnect from the database(s) and sleep now.
$anvil->Database->disconnect(); $anvil->Database->disconnect();
sleep(2); sleep(2);
@ -258,6 +261,41 @@ $anvil->nice_exit({exit_code => 0});
# Functions # # Functions #
############################################################################################################# #############################################################################################################
# If we're using too much ram, send an alert and exit.
sub check_ram
{
	# Asks System->check_ram_use() how much RAM this program is using. If it is over the limit, an alert
	# is registered and emailed, the same message is logged, and the program exits so that it can be 
	# restarted.
	# 
	# Parameters;
	#   $anvil - The Anvil tools object used for all calls below.
	# 
	# Returns '0' if the program keeps running. Does not return if the RAM limit was exceeded.
	my ($anvil) = @_;
	
	# Problem 0 == ok, 1 == too much ram used, 2 == no pid found
	my ($problem, $ram_used) = $anvil->System->check_ram_use({program => $THIS_FILE});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		problem  => $problem, 
		ram_used => $anvil->Convert->add_commas({number => $ram_used})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}).")", 
	}});
	
	# Only '1' means we're over the limit. With '2', our process simply wasn't found and 'ram_used' is 
	# '0', so reporting a memory leak and exiting would be wrong.
	if ($problem == 1)
	{
		my $say_ram_used       = $anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used});
		my $say_ram_used_bytes = $anvil->Convert->add_commas({number => $ram_used});
		
		# Send an alert and exit.
		$anvil->Alert->register({alert_level => "notice", message => "error_0357", variables => { 
			program        => $THIS_FILE,
			ram_used       => $say_ram_used,
			ram_used_bytes => $say_ram_used_bytes,
		}, set_by => $THIS_FILE, sort_position => 0});
		$anvil->Email->send_alerts();
		
		# Log the same
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0357", variables => { 
			program        => $THIS_FILE,
			ram_used       => $say_ram_used,
			ram_used_bytes => $say_ram_used_bytes,
		}});
		
		# Exit with RC0 so that systemctl restarts
		$anvil->nice_exit({exit_code => 0});
	}
	
	return(0);
}
# Check to see if we're mapping the network on this host. # Check to see if we're mapping the network on this host.
sub check_if_mapping sub check_if_mapping
{ {
@ -1291,7 +1329,7 @@ sub prep_database
{ {
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
prep_database => $prep_database, prep_database => $prep_database,
"sys}{database}{connections" => $anvil->data->{sys}{database}{connections}, "sys::database::connections" => $anvil->data->{sys}{database}{connections},
}}); }});
if ($prep_database) if ($prep_database)
{ {

@ -163,6 +163,9 @@ while(1)
# Clean up # Clean up
cleanup_after_run($anvil); cleanup_after_run($anvil);
# Check how much RAM we're using.
check_ram($anvil);
# Sleep until it's time to run again. # Sleep until it's time to run again.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0249", variables => { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0249", variables => {
run_interval => $run_interval, run_interval => $run_interval,
@ -181,6 +184,41 @@ $anvil->nice_exit({exit_code => 0});
# Functions # # Functions #
############################################################################################################# #############################################################################################################
# If we're using too much ram, send an alert and exit.
sub check_ram
{
	# Asks System->check_ram_use() how much RAM this program is using. If it is over the limit, an alert
	# is registered and emailed, the same message is logged, and the program exits so that it can be 
	# restarted.
	# 
	# Parameters;
	#   $anvil - The Anvil tools object used for all calls below.
	# 
	# Returns '0' if the program keeps running. Does not return if the RAM limit was exceeded.
	my ($anvil) = @_;
	
	# Problem 0 == ok, 1 == too much ram used, 2 == no pid found
	my ($problem, $ram_used) = $anvil->System->check_ram_use({program => $THIS_FILE});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		problem  => $problem, 
		ram_used => $anvil->Convert->add_commas({number => $ram_used})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}).")", 
	}});
	
	# Only '1' means we're over the limit. With '2', our process simply wasn't found and 'ram_used' is 
	# '0', so reporting a memory leak and exiting would be wrong.
	if ($problem == 1)
	{
		my $say_ram_used       = $anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used});
		my $say_ram_used_bytes = $anvil->Convert->add_commas({number => $ram_used});
		
		# Send an alert and exit.
		$anvil->Alert->register({alert_level => "notice", message => "error_0357", variables => { 
			program        => $THIS_FILE,
			ram_used       => $say_ram_used,
			ram_used_bytes => $say_ram_used_bytes,
		}, set_by => $THIS_FILE, sort_position => 0});
		$anvil->Email->send_alerts();
		
		# Log the same
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0357", variables => { 
			program        => $THIS_FILE,
			ram_used       => $say_ram_used,
			ram_used_bytes => $say_ram_used_bytes,
		}});
		
		# Exit with RC0 so that systemctl restarts
		$anvil->nice_exit({exit_code => 0});
	}
	
	return(0);
}
# This cleans things up after a scan run has completed. # This cleans things up after a scan run has completed.
sub cleanup_after_run sub cleanup_after_run
{ {

Loading…
Cancel
Save