|
|
|
#!/usr/bin/perl
|
|
|
|
#
|
|
|
|
# This daemon monitors and logs performance data. This is meant to help debug issues related to (potential)
|
|
|
|
# performance issues.
|
|
|
|
#
|
|
|
|
# NOTE: This is designed to be minimal overhead, so there is no attempt to connect to the database. As such,
|
|
|
|
# be mindful of what this daemon is used for.
|
|
|
|
#
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
use warnings;
|
|
|
|
use Data::Dumper;
|
|
|
|
use Anvil::Tools;
|
|
|
|
|
|
|
|
# Work out this program's file name and the directory it was started from.
#
# $THIS_FILE is used as the 'source' of log entries and is passed as 'man' when the switches are read.
# If $0 contains no '/' (for example, the program was started as 'perl <name>' from its own directory),
# the whole of $0 is the file name and the running directory is '.'.
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
if (not defined $THIS_FILE)
{
    $THIS_FILE = $0;
}

# \Q...\E keeps characters in the file name (like '.') from being treated as regex metacharacters.
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0];
if (not defined $running_directory)
{
    $running_directory = ".";
}

# Turn a dot-relative directory into an absolute one using the shell's working directory, when known.
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
    if ($running_directory =~ /^\.(?:\/|$)/)
    {
        # '.' or './sub/dir'; the leading dot stands for the working directory itself.
        $running_directory =~ s/^\./$ENV{PWD}/;
    }
    else
    {
        # '../sbin' or '.hidden/bin'; the leading dot is part of a directory name, so the working
        # directory is prefixed instead of replacing the dot.
        $running_directory = $ENV{PWD}."/".$running_directory;
    }
}
|
|
|
|
|
|
|
|
# Turn off buffering on the currently selected output handle (STDOUT by default), so that anything this
# program prints is written out immediately.
$| = 1;

my $anvil = Anvil::Tools->new();

# Read the switches this program understands.
$anvil->Get->switches({
    man  => $THIS_FILE,
    list => [qw(detailed interval print run-once)],
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});

# Record the md5sums now so that the main loop can exit if they change later. The first re-check is due
# 30 seconds from now.
$anvil->Storage->record_md5sums;
my $next_md5sum_check = time() + 30;
|
|
|
|
|
|
|
|
# Runtime settings taken from the command line switches. These are package variables; record_data()
# reads $print and $detailed directly rather than taking them as arguments.
#
# - $interval: Passed to 'sleep' at the end of each pass of the main loop. Only a positive whole number
#              is accepted; anything else (not defined, empty, non-numeric, or '0') falls back to 5. A
#              value of 0 is rejected because 'sleep 0' would let the main loop spin without pausing.
# - $print:    1 if '--print' was used, else 0. Passed as 'print' to the log entries.
# - $detailed: 1 if '--detailed' was used, else 0. Selects the detailed report over the condensed one.
our $interval = do
{
    my $requested = $anvil->data->{switches}{interval};
    ((defined $requested) && ($requested =~ /\A[0-9]+\z/) && ($requested > 0)) ? $requested : 5;
};
our $print    = $anvil->data->{switches}{'print'} ? 1 : 0;
our $detailed = $anvil->data->{switches}{detailed} ? 1 : 0;
|
|
|
|
|
|
|
|
|
|
|
|
# Now go into the main loop. Each pass records one sample, handles '--run-once', re-checks the md5sums
# when that is due, and then sleeps for $interval seconds.
while (1)
{
    record_data($anvil);

    if ($anvil->data->{switches}{'run-once'})
    {
        # We're done; only a single sample was asked for.
        $anvil->nice_exit({exit_code => 0});
    }

    # Re-check the md5sums once the scheduled time has passed, and schedule the next check 30 seconds out.
    if (time > $next_md5sum_check)
    {
        $next_md5sum_check = time + 30;
        $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { next_md5sum_check => $next_md5sum_check }});

        # A true return is treated as "something changed", and the program exits.
        if ($anvil->Storage->check_md5sums)
        {
            # NOTE: We exit with '0' to prevent systemctl from showing a scary red message.
            $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "message_0014"});
            $anvil->nice_exit({exit_code => 0});
        }
    }

    sleep $interval;
}
|
|
|
|
|
|
|
|
|
|
|
|
# This gathers the current load data and writes it to the log.
#
# Parameters;
# - $anvil: The Anvil::Tools handle used to collect the data and to write the log entries.
#
# This reads the file-level $detailed, $print and $THIS_FILE variables. When $detailed is not set, a single
# condensed entry ('log_0846') is logged. When it is set, separate entries are logged for the CPU load
# percentages, the load averages, the process counts, the average CPU times, each CPU core, and each
# storage device. Every entry is logged at level 1 with 'print' set to $print.
#
# Returns '0'.
sub record_data
{
    my ($anvil) = @_;

    # Refresh the data under 'loads' that everything below reads.
    $anvil->Get->load_average({debug => 3});
    my $loads = $anvil->data->{loads};

    # Pass a number through Convert->add_commas.
    my $commas = sub
    {
        my ($number) = @_;
        return($anvil->Convert->add_commas({number => $number}));
    };

    # The CPU time and storage time counters are divided by 100 before being formatted.
    my $hundredths = sub
    {
        my ($number) = @_;
        return($commas->($number / 100));
    };

    if (not $detailed)
    {
        # This is much more condensed.
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0846", variables => {
            one_minute   => $loads->{load_average}{one_minute},
            five_minutes => $loads->{load_average}{five_minute},
            ten_minutes  => $loads->{load_average}{ten_minute},
            iowait       => $loads->{load_percent}{iowait},
            running      => $commas->($loads->{processes}{running}),
            blocked      => $commas->($loads->{processes}{blocked}),
        }});

        return(0);
    }

    # Show the CPU load percents.
    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0840", variables => {
        iowait   => $loads->{load_percent}{iowait},
        user     => $loads->{load_percent}{user},
        steal    => $loads->{load_percent}{steal},
        idle     => $loads->{load_percent}{idle},
        nice     => $loads->{load_percent}{nice},
        'system' => $loads->{load_percent}{'system'},
    }});

    # Show the CPU load levels
    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0835", variables => {
        one_minute   => $loads->{load_average}{one_minute},
        five_minutes => $loads->{load_average}{five_minute},
        ten_minutes  => $loads->{load_average}{ten_minute},
    }});

    # Show the processes. This point is only reached in detailed mode, so the detailed key is used directly.
    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0841", variables => {
        total      => $commas->($loads->{processes}{total}),
        running    => $commas->($loads->{processes}{running}),
        blocked    => $commas->($loads->{processes}{blocked}),
        interrupts => $commas->($loads->{interrupts}{total}),
    }});

    # The per-core entries report the same fields as the average entry, except for 'io_wait'.
    my @core_fields    = qw(user_mode user_mode_nice system_mode idle_tasks hard_irq soft_irq);
    my @average_fields = ("io_wait", @core_fields);

    # CPU average load times
    my %average_times;
    foreach my $field (@average_fields)
    {
        $average_times{$field} = $hundredths->($loads->{cpu}{average}{$field});
    }
    $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0837", variables => \%average_times});

    # Show per-cores, in numeric order.
    foreach my $core (sort {$a <=> $b} keys %{$loads->{cpu}{core}})
    {
        my %core_times = (core => $core);
        foreach my $field (@core_fields)
        {
            $core_times{$field} = $hundredths->($loads->{cpu}{core}{$core}{$field});
        }
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0838", variables => \%core_times});
    }

    # This is the number of IO operations in progress. When IOs in progress is non-zero, the weighted
    # time (in 1/100ths of a second), doing those IOs.
    foreach my $device_name (sort {$a cmp $b} keys %{$loads->{storage}})
    {
        $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0839", variables => {
            device_name         => $device_name,
            in_progress         => $loads->{storage}{$device_name}{ios_currently_in_progress},
            weighted_time_spent => $hundredths->($loads->{storage}{$device_name}{weighted_time_spent_doing_ios}),
        }});
    }

    return(0);
}
|
|
|
|
|