parent
43f4201861
commit
4e367acd11
10 changed files with 210 additions and 4 deletions
@ -0,0 +1,41 @@ |
|||||||
|
.\" Manpage for the Anvil! performance monitoring program |
||||||
|
.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. |
||||||
|
.TH anvil-monitor-performance "8" "February 12 2024" "Anvil! Intelligent Availability™ Platform" |
||||||
|
.SH NAME |
||||||
|
anvil-monitor-performance \- Tool used to log system performance to the Anvil! log to assist with debugging performance issues. |
||||||
|
.SH SYNOPSIS |
||||||
|
.B anvil-monitor-performance |
||||||
|
\fI\,<command> \/\fR[\fI\,options\/\fR] |
||||||
|
.SH DESCRIPTION |
||||||
|
This tool uses the Get->load_average() method to collect performance data and then records the parsed data to the Anvil! logs. |
||||||
|
.TP |
||||||
|
This is meant to be a light-weight tool run by the same-named systemd daemon. It does not connect to the database. |
||||||
|
.TP |
||||||
|
.SH OPTIONS |
||||||
|
.TP |
||||||
|
\-?, \-h, \fB\-\-help\fR |
||||||
|
Show this man page. |
||||||
|
.TP |
||||||
|
\fB\-\-log\-secure\fR |
||||||
|
When logging, record sensitive data, like passwords. |
||||||
|
.TP |
||||||
|
\-v, \-vv, \-vvv |
||||||
|
Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. |
||||||
|
.SS "Commands:" |
||||||
|
.TP |
||||||
|
\fB\-\-detailed\fR |
||||||
|
Log extended performance data. This adds averaged CPU times, plus per-CPU-core and per-block-device metrics. |
||||||
|
.TP |
||||||
|
\fB\-\-interval\fR <seconds> |
||||||
|
By default, the performance data is collected and logged every five seconds. If you want to change this frequency, you can use this switch to set the interval, in seconds, that you wish to use. |
||||||
|
.TP |
||||||
|
\fB\-\-print\fR |
||||||
|
By default, this program logs to /var/log/anvil.log. This switch also sends the logged data to STDOUT. |
||||||
|
.TP |
||||||
|
\fB\-\-run\-once\fR |
||||||
|
This tells the program to collect and report the performance data once, and then to exit. |
||||||
|
.IP |
||||||
|
.SH AUTHOR |
||||||
|
Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. |
||||||
|
.SH "REPORTING BUGS" |
||||||
|
Report bugs to users@clusterlabs.org |
@ -0,0 +1,147 @@ |
|||||||
|
#!/usr/bin/perl |
||||||
|
# |
||||||
|
# This daemon monitors and logs performance data. This is meant to help debug issues related to (potential) |
||||||
|
# performance issues. |
||||||
|
# |
||||||
|
# NOTE: This is designed to be minimal overhead, so there is no attempt to connect to the database. As such, |
||||||
|
# be mindful of what this daemon is used for. |
||||||
|
# |
||||||
|
|
||||||
|
use strict; |
||||||
|
use warnings; |
||||||
|
use Data::Dumper; |
||||||
|
use Anvil::Tools; |
||||||
|
|
||||||
|
# Work out this program's file name and the directory it was run from. The '//' fallbacks cover the case
# where $0 contains no '/' (for example, 'perl anvil-monitor-performance'), which would otherwise leave both
# variables undefined and trigger 'uninitialized value' warnings. The file name is quoted with \Q...\E so
# that any regex metacharacters in it are matched literally.
my $THIS_FILE         = ($0 =~ /^.*\/(.*)$/)[0] // $0;
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0] // ".";
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	# Swap the leading '.' for the working directory to get a full path.
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off output buffering so that anything printed (for example, when '--print' is used) shows up
# immediately.
$| = 1;

my $anvil = Anvil::Tools->new();

# Read switches
$anvil->Get->switches({list => [
	"detailed",
	"interval",
	"print",
	"run-once",
], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});

# Calculate my sum so that we can exit if it changes later.
$anvil->Storage->record_md5sums;
my $next_md5sum_check = time + 30;

# Only accept a positive whole number of seconds for the interval, falling back to five seconds otherwise.
# An interval of '0' would make the 'sleep' at the bottom of the main loop return immediately, turning the
# loop into a busy loop that floods the log.
my $requested_interval = $anvil->data->{switches}{interval} // "";
our $interval = (($requested_interval =~ /^\d+$/) && ($requested_interval > 0)) ? $requested_interval : 5;
our $print    = $anvil->data->{switches}{'print'} ? 1 : 0;
our $detailed = $anvil->data->{switches}{detailed} ? 1 : 0;

# Now go into the main loop
while(1)
{
	record_data($anvil);

	if ($anvil->data->{switches}{'run-once'})
	{
		# We're done.
		$anvil->nice_exit({exit_code => 0});
	}

	# Every 30 seconds (or so), check the sums recorded at startup and exit if the check reports a change.
	if (time > $next_md5sum_check)
	{
		$next_md5sum_check = time + 30;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_md5sum_check => $next_md5sum_check }});
		if ($anvil->Storage->check_md5sums)
		{
			# NOTE: We exit with '0' to prevent systemctl from showing a scary red message.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "message_0014"});
			$anvil->nice_exit({exit_code => 0});
		}
	}

	sleep $interval;
}
||||||
|
|
||||||
|
|
||||||
|
# Collects the current load data and writes it to the log, one log entry per group of metrics.
#
# Parameters;
#  $anvil - The Anvil::Tools object created at the top of this program.
#
# This reads the file-level '$print' and '$detailed' flags. '$print' is passed to every log call as 'print',
# and '$detailed' enables the CPU time, per-core and per-storage-device entries. Always returns '0'.
sub record_data
{
	my ($anvil) = @_;

	# Gather fresh data. Everything logged below is read from the 'loads' section of the data hash.
	$anvil->Get->load_average({debug => 2});

	# CPU load percentages.
	my %load_percent = map { ($_ => $anvil->data->{loads}{load_percent}{$_}) } qw(iowait user steal idle nice system);
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0840", variables => \%load_percent});

	# CPU load averages. Note that two of the log variable names are plural while the data hash keys are
	# singular, hence the explicit mapping.
	my %data_key_for = (
		one_minute   => "one_minute",
		five_minutes => "five_minute",
		ten_minutes  => "ten_minute",
	);
	my %load_average = map { ($_ => $anvil->data->{loads}{load_average}{$data_key_for{$_}}) } keys %data_key_for;
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0835", variables => \%load_average});

	# Process and interrupt counts, formatted with commas. A different string key is used when detailed
	# logging was requested.
	my $process_key = "log_0836";
	if ($detailed)
	{
		$process_key = "log_0841";
	}
	my %process_counts = (
		(map { ($_ => $anvil->Convert->add_commas({number => $anvil->data->{loads}{processes}{$_}})) } qw(total running blocked)),
		interrupts => $anvil->Convert->add_commas({number => $anvil->data->{loads}{interrupts}{total}}),
	);
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => $process_key, variables => \%process_counts});

	# Everything past this point is only logged when detailed output was asked for.
	return(0) if not $detailed;

	# Averaged CPU times. Each value is divided by 100 before being formatted (presumably converting from
	# 1/100ths of a second to seconds - confirm against Get->load_average()).
	my %average_times = map { ($_ => $anvil->Convert->add_commas({number => ($anvil->data->{loads}{cpu}{average}{$_} / 100)})) } qw(io_wait user_mode user_mode_nice system_mode idle_tasks hard_irq soft_irq);
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0837", variables => \%average_times});

	# The same times again, this time for each core, in numeric core order. There is no 'io_wait' in the
	# per-core entry.
	my @core_fields = qw(user_mode user_mode_nice system_mode idle_tasks hard_irq soft_irq);
	foreach my $core (sort {$a <=> $b} keys %{$anvil->data->{loads}{cpu}{core}})
	{
		my %core_times = (
			core => $core,
			(map { ($_ => $anvil->Convert->add_commas({number => ($anvil->data->{loads}{cpu}{core}{$core}{$_} / 100)})) } @core_fields),
		);
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0838", variables => \%core_times});
	}

	# Storage devices, in name order. For each one we log the number of IO operations currently in
	# progress and the weighted time spent doing IOs (recorded in 1/100ths of a second, so it is divided by
	# 100 here).
	foreach my $device_name (sort {$a cmp $b} keys %{$anvil->data->{loads}{storage}})
	{
		my %device_load = (
			device_name         => $device_name,
			in_progress         => $anvil->data->{loads}{storage}{$device_name}{ios_currently_in_progress},
			weighted_time_spent => $anvil->Convert->add_commas({number => ($anvil->data->{loads}{storage}{$device_name}{weighted_time_spent_doing_ios} / 100)}),
		);
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0839", variables => \%device_load});
	}

	return(0);
}
||||||
|
|
@ -0,0 +1,12 @@ |
|||||||
|
[Unit] |
||||||
|
Description=Anvil! Intelligent Availability Platform - Performance Monitor Daemon |
||||||
|
Wants=network.target |
||||||
|
|
||||||
|
[Service] |
||||||
|
Type=simple |
||||||
|
ExecStart=/usr/sbin/anvil-monitor-performance |
||||||
|
ExecStop=/bin/kill -WINCH ${MAINPID} |
||||||
|
Restart=always |
||||||
|
|
||||||
|
[Install] |
||||||
|
WantedBy=multi-user.target |
Loading…
Reference in new issue