Created anvil-monitor-performance tool and daemon.

Signed-off-by: digimer <mkelly@alteeve.ca>
main
digimer 10 months ago
parent 43f4201861
commit 4e367acd11
  1. 1
      anvil.spec.in
  2. 2
      man/Makefile.am
  3. 2
      man/anvil-boot-server.8
  4. 0
      man/anvil-monitor-network.8
  5. 41
      man/anvil-monitor-performance.8
  6. 7
      share/words.xml
  7. 1
      tools/Makefile.am
  8. 147
      tools/anvil-monitor-performance
  9. 1
      units/Makefile.am
  10. 12
      units/anvil-monitor-performance.service

@ -250,6 +250,7 @@ setenforce 0
systemctl enable --now chronyd.service
systemctl enable --now anvil-daemon.service
systemctl enable --now anvil-monitor-network.service
systemctl enable --now anvil-monitor-performance.service
systemctl enable --now scancore.service
%pre striker

@ -28,6 +28,8 @@ dist_man8_MANS = \
anvil-manage-server.8 \
anvil-manage-server-storage.8 \
anvil-manage-storage-groups.8 \
anvil-monitor-network.8 \
anvil-monitor-performance.8 \
anvil-migrate-server.8 \
anvil-network-profiler.8 \
anvil-parse-fence-agents.8 \

@ -16,7 +16,7 @@ This method, when used with '\fB\-\-server\fR all', will honour server boot prio
\-?, \-h, \fB\-\-help\fR
Show this man page.
.TP
\fB\-\-log-secure\fR
\fB\-\-log\-secure\fR
When logging, record sensitive data, like passwords.
.TP
\-v, \-vv, \-vvv

@ -0,0 +1,41 @@
.\" Manpage for the Anvil! performance monitor program
.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions.
.TH anvil-monitor-performance "8" "February 12 2024" "Anvil! Intelligent Availability™ Platform"
.SH NAME
anvil-monitor-performance \- Tool used to log system performance to anvil.log to assist with performance issue debugging.
.SH SYNOPSIS
.B anvil-monitor-performance
\fI\,<command> \/\fR[\fI\,options\/\fR]
.SH DESCRIPTION
This tool uses the Get->load_average() method to collect performance data and then records the parsed data to the Anvil! logs.
.TP
This is meant to be a light-weight tool run by the same-named systemd daemon. It does not connect to the database.
.TP
.SH OPTIONS
.TP
\-?, \-h, \fB\-\-help\fR
Show this man page.
.TP
\fB\-\-log\-secure\fR
When logging, record sensitive data, like passwords.
.TP
\-v, \-vv, \-vvv
Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data.
.SS "Commands:"
.TP
\fB\-\-detailed\fR
Log extended performance data. This adds averaged CPU times, per-CPU-core and per-block-device metrics.
.TP
\fB\-\-interval\fR <seconds>
By default, the performance data is collected and logged every five seconds. If you want to change this frequency, you can use this switch to set the interval seconds you wish to use.
.TP
\fB\-\-print\fR
By default, this program logs to /var/log/anvil.log. This switch also sends the logged data to STDOUT.
.TP
\fB\-\-run\-once\fR
This tells the program to collect and report the performance data once, and then to exit.
.IP
.SH AUTHOR
Written by Madison Kelly, Alteeve staff and the Anvil! project contributors.
.SH "REPORTING BUGS"
Report bugs to users@clusterlabs.org

@ -2714,12 +2714,13 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0832">The host: [#!variable!host_name!#] was not found in the '/etc/hosts' file! We'll wait a few seconds and check again.</key>
<key name="log_0833">All host names were found in '/etc/hosts', ready to proceed!</key>
<key name="log_0834">One or more hosts are not yet in the '/etc/hosts' file with expected IPs. We'll wait a short bit and check again.</key>
<key name="log_0835">The CPU load average is; one minute: [#!variable!one_minute!#], five minutes: [#!variable!five_minutes!#], ten minutes: [#!variable!ten_minutes!#].</key>
<key name="log_0836">Processes; total: [#!variable!total!#], running: [#!variable!running!#], blocked: [#!variable!blocked!#], IRQ interrupts: [#!variable!interrupts!#].</key>
<key name="log_0835">The CPU load average is; (one / five / ten minutes): [#!variable!one_minute!# / #!variable!five_minutes!# / #!variable!ten_minutes!#].</key>
<key name="log_0836">Processes; (total, running, blocked): [#!variable!total!#, #!variable!running!#, #!variable!blocked!#]</key>
<key name="log_0837">Time spent (in secs) doing; IO wait: [#!variable!io_wait!#], user mode: [#!variable!user_mode!#], niced user mode: [#!variable!user_mode_nice!#], system mode: [#!variable!system_mode!#], idle tasks: [#!variable!idle_tasks!#], hard IRQ: [#!variable!hard_irq!#], soft IRQ: [#!variable!soft_irq!#].</key>
<key name="log_0838">CPU Core: [#!variable!core!#], time doing (seconds); user mode: [#!variable!user_mode!#], niced user mode: [#!variable!user_mode_nice!#], system mode: [#!variable!system_mode!#], idle tasks: [#!variable!idle_tasks!#], hard IRQ: [#!variable!hard_irq!#], soft IRQ: [#!variable!soft_irq!#].</key>
<key name="log_0839">Drive: [#!variable!device_name!#], IOs currently in progress: [#!variable!in_progress!#], weighted time spent: [#!variable!weighted_time_spent!# sec].</key>
<key name="log_0840">CPU percent time doing; IO wait: [#!variable!iowait!#], user tasks: [#!variable!user!#], system: [#!variable!system!#], nice'd tasks: [#!variable!nice!#], idle: [#!variable!idle!#], involuntary wait: [#!variable!steal!#].</key>
<key name="log_0840">CPU percent time doing; (IO wait / user / system / nice'd / idle / involuntary wait): [#!variable!iowait!# / #!variable!user!# / #!variable!system!# / #!variable!nice!# / #!variable!idle!# / #!variable!steal!#].</key>
<key name="log_0841">Processes; (total / running / blocked / IRQ interrupts): [#!variable!total!# / #!variable!running!# / #!variable!blocked!# / #!variable!interrupts!#].</key>
<!-- Messages for users (less technical than log entries), though sometimes used for logs, too. -->
<key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key>

@ -27,6 +27,7 @@ dist_sbin_SCRIPTS = \
anvil-manage-vnc-pipe \
anvil-migrate-server \
anvil-monitor-network \
anvil-monitor-performance \
anvil-network-profiler \
anvil-parse-fence-agents \
anvil-pcs-wrapper \

@ -0,0 +1,147 @@
#!/usr/bin/perl
#
# This daemon monitors and logs performance data. This is meant to help debug issues related to (potential)
# performance issues.
#
# NOTE: This is designed to be minimal overhead, so there is no attempt to connect to the database. As such,
# be mindful of what this daemon is used for.
#
use strict;
use warnings;
use Data::Dumper;
use Anvil::Tools;
# Work out this program's file name and the directory it was called from.
my $THIS_FILE         = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off output buffering so that anything sent to STDOUT (see '--print') is not held back in a buffer.
$| = 1;

my $anvil = Anvil::Tools->new();

# Read switches
$anvil->Get->switches({list => [
	"detailed", 
	"interval", 
	"print", 
	"run-once", 
], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});

# Calculate my sum so that we can exit if it changes later.
$anvil->Storage->record_md5sums;
my $next_md5sum_check = time + 30;

# Only accept an interval that is a whole number of seconds greater than zero, otherwise fall back to five
# seconds. An interval of '0' would make 'sleep' return immediately, turning the main loop below into a busy
# loop, which is the opposite of what a light-weight monitor should do.
my $requested_interval = defined $anvil->data->{switches}{interval} ? $anvil->data->{switches}{interval} : "";

# These three are package variables because record_data() reads '$print' and '$detailed'.
our $interval = (($requested_interval =~ /^\d+$/) && ($requested_interval > 0)) ? $requested_interval : 5;
our $print    = $anvil->data->{switches}{'print'}  ? 1 : 0;
our $detailed = $anvil->data->{switches}{detailed} ? 1 : 0;

# Now go into the main loop
while(1)
{
	record_data($anvil);
	
	if ($anvil->data->{switches}{'run-once'})
	{
		# We're done.
		$anvil->nice_exit({exit_code => 0});
	}
	
	# Every 30 seconds (at most), see if the sums recorded at start-up have changed and exit if so.
	if (time > $next_md5sum_check)
	{
		$next_md5sum_check = time + 30;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_md5sum_check => $next_md5sum_check }});
		if ($anvil->Storage->check_md5sums)
		{
			# NOTE: We exit with '0' to prevent systemctl from showing a scary red message.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "message_0014"});
			$anvil->nice_exit({exit_code => 0});
		}
	}
	
	sleep $interval;
}
# Collects the current load data and logs it, also sending it to STDOUT when '--print' was used. When
# '--detailed' was used, the averaged CPU times, the per-core CPU times and the per-device storage IO data
# are logged as well.
#
# Parameters;
#  $anvil - The Anvil::Tools handle created at the top of this program.
#
# Returns '0'.
sub record_data
{
	my ($anvil) = @_;
	
	# Gather the data. Everything logged below is read back from 'loads' in the data hash.
	$anvil->Get->load_average({debug => 2});
	
	# CPU load percentages.
	my $load_percents = {};
	foreach my $name ("iowait", "user", "steal", "idle", "nice", "system")
	{
		$load_percents->{$name} = $anvil->data->{loads}{load_percent}{$name};
	}
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0840", variables => $load_percents});
	
	# CPU load averages. Note that the string variables are plural for five and ten minutes, while the
	# keys in the data hash are singular.
	my $load_averages = {};
	foreach my $pair (["one_minute", "one_minute"], ["five_minutes", "five_minute"], ["ten_minutes", "ten_minute"])
	{
		my ($variable, $data_key)    = @{$pair};
		$load_averages->{$variable} = $anvil->data->{loads}{load_average}{$data_key};
	}
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0835", variables => $load_averages});
	
	# Process counts. The detailed string also shows the interrupts, the short one doesn't use it.
	my $process_key = "log_0836";
	if ($detailed)
	{
		$process_key = "log_0841";
	}
	my @process_variables = ();
	foreach my $name ("total", "running", "blocked")
	{
		push @process_variables, $name, $anvil->Convert->add_commas({number => $anvil->data->{loads}{processes}{$name}});
	}
	push @process_variables, "interrupts", $anvil->Convert->add_commas({number => $anvil->data->{loads}{interrupts}{total}});
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => $process_key, variables => {@process_variables}});
	
	# Everything past this point is only logged when '--detailed' was used.
	return(0) if not $detailed;
	
	# The CPU times are divided by 100 before being shown; The strings label them as seconds.
	my @core_fields = ("user_mode", "user_mode_nice", "system_mode", "idle_tasks", "hard_irq", "soft_irq");
	
	# Averaged CPU times. This is the only place that 'io_wait' is shown.
	my @average_variables = ();
	foreach my $field ("io_wait", @core_fields)
	{
		push @average_variables, $field, $anvil->Convert->add_commas({number => ($anvil->data->{loads}{cpu}{average}{$field} / 100)});
	}
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0837", variables => {@average_variables}});
	
	# Per-core CPU times, in numeric core order.
	foreach my $core (sort {$a <=> $b} keys %{$anvil->data->{loads}{cpu}{core}})
	{
		my @this_core_variables = ("core", $core);
		foreach my $field (@core_fields)
		{
			push @this_core_variables, $field, $anvil->Convert->add_commas({number => ($anvil->data->{loads}{cpu}{core}{$core}{$field} / 100)});
		}
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0838", variables => {@this_core_variables}});
	}
	
	# Per storage device, show the number of IO operations in progress and the weighted time spent doing 
	# IOs. The weighted time is divided by 100 to convert 1/100ths of a second to seconds.
	foreach my $device_name (sort keys %{$anvil->data->{loads}{storage}})
	{
		my @device_variables = (
			"device_name",         $device_name, 
			"in_progress",         $anvil->data->{loads}{storage}{$device_name}{ios_currently_in_progress}, 
			"weighted_time_spent", $anvil->Convert->add_commas({number => ($anvil->data->{loads}{storage}{$device_name}{weighted_time_spent_doing_ios} / 100)}), 
		);
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => $print, level => 1, key => "log_0839", variables => {@device_variables}});
	}
	
	return(0);
}

@ -4,6 +4,7 @@ servicedir = $(SYSTEMD_UNIT_DIR)
dist_service_DATA = \
anvil-daemon.service \
anvil-monitor-network.service \
anvil-monitor-performance.service \
anvil-safe-start.service \
scancore.service \
striker-ui-api.service

@ -0,0 +1,12 @@
[Unit]
Description=Anvil! Intelligent Availability Platform - Performance Monitor Daemon
Wants=network.target
[Service]
Type=simple
ExecStart=/usr/sbin/anvil-monitor-performance
ExecStop=/bin/kill -WINCH ${MAINPID}
Restart=always
[Install]
WantedBy=multi-user.target
Loading…
Cancel
Save