From e052c75e2f582f23e02d59a927f1ca2d00d59d02 Mon Sep 17 00:00:00 2001 From: Digimer Date: Mon, 1 Feb 2021 12:56:46 -0500 Subject: [PATCH 1/9] * Added a check to anvil-delete-server to remove the XML definition file. * Added checks to anvil-provision-server to see if an existing server name is flagged as DELETED, instead of outright rejecting a given server name. Signed-off-by: Digimer --- share/words.xml | 4 ++ tools/anvil-delete-server | 14 +++++ tools/anvil-provision-server | 108 ++++++++++++++++++++++++++++------- 3 files changed, 105 insertions(+), 21 deletions(-) diff --git a/share/words.xml b/share/words.xml index 9c909832..d3674532 100644 --- a/share/words.xml +++ b/share/words.xml @@ -655,6 +655,8 @@ It should be provisioned in the next minute or two. The server delete is complete on this host! It looks like ScanCore has not yet run on one or both nodes in this Anvil! system. Missing resource data, so unable to proceed. Manually calling 'scan-drbd' to ensure that the new agent is recorded. + The server name: [#!variable!server_name!#] is already used by another server. + Deleting the server's definition file: [#!variable!file!#]... Starting: [#!variable!program!#]. @@ -1654,6 +1656,8 @@ Are you sure that you want to delete the server: [#!variable!server_name!#]? [Ty The server is running here, assigning the job to this host. Preparing to delete a server. Preparing to migrate a server (or all servers). + - #!variable!server_name!# (Current state: [#!variable!server_state!#]) + - * #!variable!server_name!# (Deleted, name can be reused) Saved the mail server information successfully! diff --git a/tools/anvil-delete-server b/tools/anvil-delete-server index c2e26831..cf716ef5 100755 --- a/tools/anvil-delete-server +++ b/tools/anvil-delete-server @@ -195,6 +195,20 @@ WHERE $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "job_0215"}); } + # Delete the XML definition file. 
+ my $resource_file = $anvil->data->{path}{directories}{shared}{definitions}."/".$server_name.".xml"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_file => $resource_file }}); + if (-f $resource_file) + { + # Remove it. + $anvil->Job->update_progress({ + progress => 80, + message => "job_0220,!!file!".$resource_file."!!", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "job_0220", variables => { file => $resource_file }}); + unlink $resource_file; + } + $anvil->Job->update_progress({ progress => 100, message => "job_0216", diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index 813852d2..025907e5 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -423,8 +423,8 @@ sub provision_server my ($handle, $return_code) = $anvil->System->call({ background => 1, shell_call => $shell_call, - stdout_file => "/var/log/anvil_server_".$server.".stdout", - stderr_file => "/var/log/anvil_server_".$server.".stderr", + stdout_file => "/var/log/anvil-server_".$server.".stdout", + stderr_file => "/var/log/anvil-server_".$server.".stderr", }); my $pid = $handle->pid(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { @@ -1236,14 +1236,24 @@ sub parse_job_data }}); if (exists $anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}{$server}) { - # Duplicate name - $anvil->Job->update_progress({ - progress => 100, - message => "error_0198,!!server_name!".$anvil->data->{job}{server_name}."!!", - job_status => "failed", - }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => 'err', key => "error_0198", variables => { server_name => $anvil->data->{job}{server_name} }}); - $anvil->nice_exit({exit_code => 1}); + # Is this name used by a server marked as DELETED? 
+ my $server_uuid = $anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}{$server}{server_uuid}; + my $server_state = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + server_uuid => $server_uuid, + server_state => $server_state, + }}); + if ($server_state ne "DELETED") + { + # Duplicate name + $anvil->Job->update_progress({ + progress => 100, + message => "error_0198,!!server_name!".$anvil->data->{job}{server_name}."!!", + job_status => "failed", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => 'err', key => "error_0198", variables => { server_name => $anvil->data->{job}{server_name} }}); + $anvil->nice_exit({exit_code => 1}); + } } if (not $anvil->data->{job}{server_name}) @@ -1564,7 +1574,8 @@ sub interactive_ask_server_name $anvil->Database->get_servers({debug => 2}); ### TODO: Figure out how many rows we have and break the server list into columns if too long. - my $retry = 0; + my $retry = 0; + my $duplicate = ""; while(1) { my $default = ""; @@ -1580,13 +1591,41 @@ sub interactive_ask_server_name # Show all the current server names. 
if ($retry) { - print $anvil->Words->string({key => "job_0159"})."\n\n"; + if ($duplicate) + { + print $anvil->Words->string({key => "job_0219", variables => { server_name => $duplicate }})."\n\n"; + $duplicate = ""; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { duplicate => $duplicate }}); + } + else + { + print $anvil->Words->string({key => "job_0159"})."\n\n"; + } } my $anvil_uuid = $anvil->data->{new_server}{anvil_uuid}; print $anvil->Words->string({key => "job_0160", variables => { anvil_name => $anvil->data->{new_server}{anvil_name} }})."\n"; foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}}) { - print "- ".$server_name."\n"; + my $server_uuid = $anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}{$server_name}{server_uuid}; + my $server_state = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + server_uuid => $server_uuid, + server_state => $server_state, + }}); + if ($server_state eq "DELETED") + { + print "- ".$server_name." ("..")\n"; + print $anvil->Words->string({key => "message_0220", variables => { + server_name => $server_name, + }})."\n"; + } + else + { + print $anvil->Words->string({key => "message_0219", variables => { + server_name => $server_name, + server_state => $server_state, + }})."\n"; + } } print $terminal->Tgoto('cm', 0, 3)."? "; @@ -1602,14 +1641,41 @@ sub interactive_ask_server_name # Reload in case a new anvil! was saved while we waited. $anvil->Database->get_servers(); - if (($answer) && (not exists $anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}{$answer})) + if ($answer) { - # Valid. - $anvil->data->{new_server}{name} = $answer; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - "new_server::name" => $anvil->data->{new_server}{name}, - }}); - + # Duplicate? 
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { answer => $answer }}); + if (exists $anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}{$answer}) + { + my $server_uuid = $anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}{$answer}{server_uuid}; + my $server_state = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + server_uuid => $server_uuid, + server_state => $server_state, + }}); + if ($server_state eq "DELETED") + { + # Valid, we can re-use deleted server names. + $anvil->data->{new_server}{name} = $answer; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "new_server::name" => $anvil->data->{new_server}{name}, + }}); + } + else + { + # Invalid, duplicate. + $duplicate = $answer; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { duplicate => $duplicate }}); + } + } + else + { + # Valid. + $anvil->data->{new_server}{name} = $answer; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "new_server::name" => $anvil->data->{new_server}{name}, + }}); + } last; } else @@ -2218,7 +2284,7 @@ sub interactive_ask_server_os }}); # Still here? - $os_list .= " - [".sprintf("%-10s", $os_code)."] - ".$os_name."\n"; + $os_list .= " - ".sprintf("%-10s", $os_code)." - ".$os_name."\n"; } my $retry = 0; From 50d529e07cedc73450572a31e0cc693d3ed32b50 Mon Sep 17 00:00:00 2001 From: Digimer Date: Wed, 3 Feb 2021 19:21:15 -0500 Subject: [PATCH 2/9] * Added a check to anvil-delete-server to remove the XML definition file. * Added checks to anvil-provision-server to see if an existing server name is flagged as DELETED, instead of outright rejecting a given server name. 
Signed-off-by: Digimer --- tools/test.pl | 66 +++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 36 deletions(-) diff --git a/tools/test.pl b/tools/test.pl index 02512052..1a59d91d 100755 --- a/tools/test.pl +++ b/tools/test.pl @@ -26,44 +26,38 @@ $anvil->Get->switches; $anvil->Database->connect({debug => 3}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0132"}); -my $key_string = 'scan_drbd_message_0007,!!resource_name!srv00-sql1!!,!!resource_state!#!string!scan_drbd_unit_0004!#!!,!!resource_xml!] from key string: [scan_drbd_message_0007,!!resource_name!srv00-sql1!!,!!resource_state!#!string!scan_drbd_unit_0004!#!!,!!resource_xml! - - - /dev/drbd_srv00-sql1_0 - /dev/mk-a02n01_ssd0/srv00-sql1_0 - internal - -
(null)
-
- - - /dev/drbd_srv00-sql1_0 - /dev/mk-a02n02_ssd0/srv00-sql1_0 - internal - -
(null)
-
- -
10.101.12.1
-
10.101.12.2
-
-
-
-
-
-
!!'; -my $out_string = $anvil->Words->parse_banged_string({ +my $key_string = 'message_0190 +job_0185 +job_0186,!!minor!5!!,!!port!7803!! +job_0188,!!job_uuid!12eeded2-c5bb-4295-8c8e-665bd9c9b83a!!,!!peer_name!mk-a02n01.digimer.ca!! +job_0189,!!lv_path!/dev/mk-a02n02_ssd0/srv02-lab02_0!! +job_0218 +job_0190,!!resource!srv02-lab02!! +job_0191,!!resource!srv02-lab02!! +job_0192 +job_0195 +job_0203,!!resource!srv02-lab02!! +job_0199,!!shell_call!/usr/bin/virt-install --connect qemu:///system \ +--name srv02-lab02 \ + --os-variant win2k19 \ + --memory 8192 \ + --events on_poweroff=destroy,on_reboot=restart \ + --vcpus 6,sockets=1,cores=6 \ + --cpu host \ + --network bridge=ifn1_bridge1,model=virtio \ + --graphics spice \ + --sound ich9 \ + --clock offset=localtime \ + --boot menu=on \ + --disk path=/dev/drbd/by-res/srv02-lab02/0,target.bus=virtio,driver.io=threads,cache=writeback,driver.discard=unmap,boot.order=1 \ + --disk path=/mnt/shared/files/Windows_Server_2019_eval.iso,device=cdrom,shareable=on,boot.order=2 \ + --disk path=/mnt/shared/files/virtio-win-0.1.185.iso,device=cdrom,shareable=on,boot.order=3 --force \ + --noautoconsole --wait -1 > /var/log/anvil-server_srv02-lab02.log +!! +job_0200'; +my ($free_minor, $free_port) = $anvil->Words->parse_banged_string({ debug => 2, key_string => $key_string, }); -print "Got: -==== -".$out_string." -==== -"; - $anvil->nice_exit({exit_code => 0}); From 8d0f87391225c0dd55b4a6ac20779033b68ef2ee Mon Sep 17 00:00:00 2001 From: Digimer Date: Fri, 5 Feb 2021 23:34:51 -0500 Subject: [PATCH 3/9] * Updated scan-storcli to check if a MegaRAID controller exists and neither storcli64 nor perccli64 exists. If a controller is found but no RPM is installed, it checks to see if the host is Dell and then decides to try and install perccli or storcli. * Reworked scan-ipmitool so that on nodes and dr hosts, it only scans itself. On strikers, it scans all hosts found in active Anvil! systems with a host_ipmi entry. 
 * For all agents, reduced log verbosity to not push too much noise into anvil.log while scancore is running in the background. Signed-off-by: Digimer --- Anvil/Tools.pm | 1 + Anvil/Tools/Database.pm | 2 +- Anvil/Tools/Get.pm | 6 +- scancore-agents/scan-apc-pdu/scan-apc-pdu | 14 +- scancore-agents/scan-apc-ups/scan-apc-ups | 8 +- scancore-agents/scan-cluster/scan-cluster | 8 +- scancore-agents/scan-drbd/scan-drbd | 6 +- scancore-agents/scan-hardware/scan-hardware | 12 +- scancore-agents/scan-ipmitool/scan-ipmitool | 145 ++++++++++++------ scancore-agents/scan-lvm/scan-lvm | 6 +- scancore-agents/scan-server/scan-server | 8 +- scancore-agents/scan-storcli/scan-storcli | 117 ++++++++++++-- scancore-agents/scan-storcli/scan-storcli.xml | 3 + share/anvil.sql | 2 +- tools/scancore | 4 +- 15 files changed, 242 insertions(+), 100 deletions(-) diff --git a/Anvil/Tools.pm b/Anvil/Tools.pm index e64783f9..c5e4f487 100644 --- a/Anvil/Tools.pm +++ b/Anvil/Tools.pm @@ -1145,6 +1145,7 @@ sub _set_paths journalctl => "/usr/bin/journalctl", logger => "/usr/bin/logger", ls => "/usr/bin/ls", + lspci => "/usr/sbin/lspci", lsblk => "/usr/bin/lsblk", lvchange => "/usr/sbin/lvchange", lvcreate => "/usr/sbin/lvcreate", diff --git a/Anvil/Tools/Database.pm b/Anvil/Tools/Database.pm index b7f4e940..90ab2f83 100644 --- a/Anvil/Tools/Database.pm +++ b/Anvil/Tools/Database.pm @@ -1252,7 +1252,7 @@ sub connect average_time => $average_time, }}); - my $ping_time = tv_interval ($start_time, [gettimeofday]); + #my $ping_time = tv_interval ($start_time, [gettimeofday]); #print "[".$ping_time."] - Pinged: [$host:$port:$name:$user]\n"; if (not $pinged) diff --git a/Anvil/Tools/Get.pm b/Anvil/Tools/Get.pm index f29f2819..79e251e5 100644 --- a/Anvil/Tools/Get.pm +++ b/Anvil/Tools/Get.pm @@ -1479,7 +1479,11 @@ sub free_memory =head2 host_type -This method tries to determine the host type and returns a value suitable for use is the C<< hosts >> table. 
+This method tries to determine the host type and returns a value suitable for use is the C<< hosts >> table. Returned values are; + + striker - Striker dashboards + node - Anvil! nodes (active protection of VMs) + dr - DR Hosts (passive DR host targets) my $type = $anvil->Get->host_type(); diff --git a/scancore-agents/scan-apc-pdu/scan-apc-pdu b/scancore-agents/scan-apc-pdu/scan-apc-pdu index ae065612..dde2b528 100755 --- a/scancore-agents/scan-apc-pdu/scan-apc-pdu +++ b/scancore-agents/scan-apc-pdu/scan-apc-pdu @@ -75,9 +75,7 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) $running_directory =~ s/^\./$ENV{PWD}/; } -my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1}); -$anvil->Log->level({set => 2}); -$anvil->Log->secure({set => 1}); +my $anvil = Anvil::Tools->new(); # Make sure we're running as 'root' # $< == real UID, $> == effective UID @@ -159,7 +157,7 @@ $anvil->data->{snmp} = { }; $anvil->Storage->read_config(); -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0115", variables => { program => $THIS_FILE }}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); # Read switches $anvil->Get->switches; @@ -883,7 +881,7 @@ WHERE if ($new_uptime > $old_uptime) { # Normal, info-level - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_apc_pdu_message_0018", variables => $variables}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_apc_pdu_message_0018", variables => $variables}); $anvil->Alert->register({ alert_level => "info", message => "scan_apc_pdu_message_0018", @@ -933,9 +931,9 @@ WHERE old_total_wattage_draw => $old_total_wattage_draw, new_total_wattage_draw => $new_total_wattage_draw, }; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_apc_pdu_message_0020", variables => $variables}); + $anvil->Log->entry({source 
=> $THIS_FILE, line => __LINE__, level => 2, key => "scan_apc_pdu_message_0020", variables => $variables}); $anvil->Alert->register({ - alert_level => "notice", + alert_level => "info", message => "scan_apc_pdu_message_0020", variables => $variables, set_by => $THIS_FILE, @@ -1035,7 +1033,7 @@ WHERE old_phase_current_amperage => $old_scan_apc_pdu_phase_current_amperage, new_phase_current_amperage => $new_scan_apc_pdu_phase_current_amperage, }; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_apc_pdu_message_0023", variables => $variables}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_apc_pdu_message_0023", variables => $variables}); $anvil->Alert->register({ alert_level => "info", message => "scan_apc_pdu_message_0023", diff --git a/scancore-agents/scan-apc-ups/scan-apc-ups b/scancore-agents/scan-apc-ups/scan-apc-ups index d3256764..f15648ac 100755 --- a/scancore-agents/scan-apc-ups/scan-apc-ups +++ b/scancore-agents/scan-apc-ups/scan-apc-ups @@ -40,9 +40,7 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) $running_directory =~ s/^\./$ENV{PWD}/; } -my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1}); -$anvil->Log->level({set => 2}); -$anvil->Log->secure({set => 1}); +my $anvil = Anvil::Tools->new(); # Make sure we're running as 'root' # $< == real UID, $> == effective UID @@ -185,7 +183,7 @@ $anvil->data->{snmp} = { }; $anvil->Storage->read_config(); -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0115", variables => { program => $THIS_FILE }}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); # Read switches $anvil->Get->switches; @@ -736,7 +734,7 @@ INSERT INTO new_value => "#!string!scan_apc_ups_last_transfer_".sprintf("%04d", $say_scan_apc_ups_last_transfer_reason)."!#", old_value => "#!string!scan_apc_ups_last_transfer_".sprintf("%04d", 
$say_old_scan_apc_ups_last_transfer_reason)."!#", }; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_apc_ups_warning_0015", variables => $variables}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_apc_ups_warning_0015", variables => $variables}); $anvil->Alert->register({alert_level => "notice", message => "scan_apc_ups_warning_0015", variables => $variables, set_by => $THIS_FILE, sort_position => $anvil->data->{'scan-apc-pdu'}{alert_sort}++}); } if ($scan_apc_ups_manufactured_date ne $old_scan_apc_ups_manufactured_date) diff --git a/scancore-agents/scan-cluster/scan-cluster b/scancore-agents/scan-cluster/scan-cluster index ad8cd0e9..336641c6 100755 --- a/scancore-agents/scan-cluster/scan-cluster +++ b/scancore-agents/scan-cluster/scan-cluster @@ -34,9 +34,7 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) $running_directory =~ s/^\./$ENV{PWD}/; } -my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1}); -$anvil->Log->level({set => 2}); -$anvil->Log->secure({set => 1}); +my $anvil = Anvil::Tools->new(); # Make sure we're running as 'root' # $< == real UID, $> == effective UID @@ -69,7 +67,7 @@ if ($problem) $anvil->nice_exit({exit_code => 1}); } -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_log_0001", variables => { program => $THIS_FILE }}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); if ($anvil->data->{switches}{purge}) { # This can be called when doing bulk-database purges. 
@@ -85,7 +83,7 @@ my $host_type = $anvil->Get->host_type; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_type => $host_type }}); if ($host_type ne "node") { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_log_0002", variables => { host_type => $host_type }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_cluster_log_0002", variables => { host_type => $host_type }}); $anvil->nice_exit({exit_code => 0}); } diff --git a/scancore-agents/scan-drbd/scan-drbd b/scancore-agents/scan-drbd/scan-drbd index cc4cca92..2c27c6b2 100755 --- a/scancore-agents/scan-drbd/scan-drbd +++ b/scancore-agents/scan-drbd/scan-drbd @@ -36,9 +36,7 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) $running_directory =~ s/^\./$ENV{PWD}/; } -my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1}); -$anvil->Log->level({set => 2}); -$anvil->Log->secure({set => 1}); +my $anvil = Anvil::Tools->new(); $anvil->data->{'scan-drbd'}{alert_sort} = 2; $anvil->data->{'scan-drbd'}{queries} = []; @@ -75,7 +73,7 @@ if ($problem) { $anvil->nice_exit({exit_code => 1}); } -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_drbd_log_0001", variables => { program => $THIS_FILE }}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); if ($anvil->data->{switches}{purge}) { diff --git a/scancore-agents/scan-hardware/scan-hardware b/scancore-agents/scan-hardware/scan-hardware index ffa4062b..da1e88f0 100755 --- a/scancore-agents/scan-hardware/scan-hardware +++ b/scancore-agents/scan-hardware/scan-hardware @@ -31,9 +31,7 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) $running_directory =~ s/^\./$ENV{PWD}/; } -my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1}); -$anvil->Log->level({set => 2}); -$anvil->Log->secure({set => 1}); +my $anvil = 
Anvil::Tools->new(); # Make sure we're running as 'root' # $< == real UID, $> == effective UID @@ -54,7 +52,7 @@ $anvil->data->{scancore}{'scan-hardware'}{swap}{high_threshold} = 75; $anvil->data->{switches}{force} = 0; $anvil->Storage->read_config(); -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, 'print' => 1, key => "log_0115", variables => { program => $THIS_FILE }}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, 'print' => 1, key => "log_0115", variables => { program => $THIS_FILE }}); # Read switches $anvil->Get->switches; @@ -913,15 +911,15 @@ sub find_changes my $say_new_scan_hardware_memory_free = $anvil->Convert->bytes_to_human_readable({'bytes' => $new_scan_hardware_memory_free})." (".$anvil->Convert->add_commas({number => $new_scan_hardware_memory_free})." #!string!scan_hardware_unit_0001!#)"; my $say_old_scan_hardware_memory_free = $anvil->Convert->bytes_to_human_readable({'bytes' => $old_scan_hardware_memory_free})." (".$anvil->Convert->add_commas({number => $old_scan_hardware_memory_free})." #!string!scan_hardware_unit_0001!#)"; $anvil->Alert->register({set_by => $THIS_FILE, alert_level => "info", message => "scan_hardware_alert_0018,!!new!".$say_new_scan_hardware_memory_free."!!,!!old!".$say_old_scan_hardware_memory_free."!!"}); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_hardware_alert_0018", variables => { new => $say_new_scan_hardware_memory_free, old => $say_old_scan_hardware_memory_free}}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_hardware_alert_0018", variables => { new => $say_new_scan_hardware_memory_free, old => $say_old_scan_hardware_memory_free}}); } if ($new_scan_hardware_swap_free ne $old_scan_hardware_swap_free) { - $update = 1; + $update = 1; my $say_new_scan_hardware_swap_free = $anvil->Convert->bytes_to_human_readable({'bytes' => $new_scan_hardware_swap_free})." 
(".$anvil->Convert->add_commas({number => $new_scan_hardware_swap_free})." #!string!scan_hardware_unit_0001!#)"; my $say_old_scan_hardware_swap_free = $anvil->Convert->bytes_to_human_readable({'bytes' => $old_scan_hardware_swap_free})." (".$anvil->Convert->add_commas({number => $old_scan_hardware_swap_free})." #!string!scan_hardware_unit_0001!#)"; $anvil->Alert->register({set_by => $THIS_FILE, alert_level => "info", message => "scan_hardware_alert_0019,!!new!".$say_new_scan_hardware_swap_free."!!,!!old!".$say_old_scan_hardware_swap_free."!!"}); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_hardware_alert_0019", variables => { new => $say_new_scan_hardware_swap_free, old => $say_old_scan_hardware_swap_free}}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_hardware_alert_0019", variables => { new => $say_new_scan_hardware_swap_free, old => $say_old_scan_hardware_swap_free}}); my $new_swap_bytes_used = $new_scan_hardware_swap_total - $new_scan_hardware_swap_free; my $old_swap_bytes_used = $old_scan_hardware_swap_total - $old_scan_hardware_swap_free; diff --git a/scancore-agents/scan-ipmitool/scan-ipmitool b/scancore-agents/scan-ipmitool/scan-ipmitool index 64dd2fdb..438803e1 100755 --- a/scancore-agents/scan-ipmitool/scan-ipmitool +++ b/scancore-agents/scan-ipmitool/scan-ipmitool @@ -192,7 +192,7 @@ $anvil->data->{'scan-ipmitool'} = { }; $anvil->Storage->read_config(); -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, 'print' => 1 , key => "log_0115", variables => { program => $THIS_FILE }}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, 'print' => 1 , key => "log_0115", variables => { program => $THIS_FILE }}); # Read switches $anvil->data->{switches}{force} = 0; @@ -2014,10 +2014,10 @@ sub find_ipmi_targets # If I am a node, I will only scan myself. 
my $host_type = $anvil->Get->host_type(); - my $hostname = $anvil->Get->host_name(); + my $host_name = $anvil->Get->host_name(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type, - hostname => $hostname, + host_name => $host_name, }}); # Do I have local IPMI access? @@ -2044,71 +2044,124 @@ sub find_ipmi_targets } } - # Find all known hosts (except ourself) with a host_ipmi value set. + # Which hosts we scan depends on if we're a Striker dashboard or not. If we are, we'll try to scan + # all machines in all Anvil! systems. Otherwise, we only scan ourselves. + if ($host_type ne "striker") + { + # We're not a dashboard, so we don't scan others. + return($ipmi_targets); + } + + # Loop through Anvil! systems. my $query = " SELECT - host_name, - host_uuid, - host_ipmi + anvil_name, + anvil_node1_host_uuid, + anvil_node2_host_uuid, + anvil_dr1_host_uuid FROM - hosts + anvils WHERE - host_ipmi != '' -AND - host_uuid != ".$anvil->Database->quote($anvil->Get->host_uuid)." + anvil_description != 'DELETED' ;"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { query => $query }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }}); my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__}); my $count = @{$results}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { results => $results, count => $count, }}); foreach my $row (@{$results}) { - # We've got an entry in the 'scan_hardware' table, so now we'll look for data in the node and - # services tables. - my $host_name = $row->[0]; - my $host_uuid = $row->[1]; - my $host_ipmi = $row->[2]; + # For each host_uuid, get the IPMI info. 
+ my $anvil_name = $row->[0]; + my $anvil_node1_host_uuid = $row->[1]; + my $anvil_node2_host_uuid = $row->[2]; + my $anvil_dr1_host_uuid = defined $row->[3] ? $row->[3] : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - host_name => $host_name, - host_uuid => $host_uuid, - host_ipmi => $anvil->Log->is_secure($host_ipmi), + 's1:anvil_name' => $anvil_name, + 's2:anvil_node1_host_uuid' => $anvil_node1_host_uuid, + 's3:anvil_node2_host_uuid' => $anvil_node2_host_uuid, + 's4:anvil_dr1_host_uuid' => $anvil_dr1_host_uuid, }}); - # Get the ipaddress and see if I can ping the target. If I can't, there is no sense in - # recording this entry. - my $access = 0; - my $target = ""; - if (($host_ipmi =~ /-a (.*?) /) or ($host_ipmi =~ /-ip (.*?) /)) + my $query = " +SELECT + host_name, + host_uuid, + host_ipmi +FROM + hosts +WHERE + host_ipmi != '' +AND + ( + host_uuid = ".$anvil->Database->quote($anvil_node1_host_uuid)." + OR + host_uuid = ".$anvil->Database->quote($anvil_node2_host_uuid); + if ($anvil_dr1_host_uuid) { - $target = $1; + $query .= " + OR + host_uuid = ".$anvil->Database->quote($anvil_dr1_host_uuid); } - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { target => $target }}); - if ($target) + $query .= " + ) +;"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }}); + + my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__}); + my $count = @{$results}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + results => $results, + count => $count, + }}); + foreach my $row (@{$results}) { - ($access, my $average_time) = $anvil->Network->ping({ping => $target}); + # We've got an entry in the 'scan_hardware' table, so now we'll look for data in the node and + # services tables. 
+ my $host_name = $row->[0]; + my $host_uuid = $row->[1]; + my $host_ipmi = $row->[2]; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - access => $access, - average_time => $average_time, + host_name => $host_name, + host_uuid => $host_uuid, + host_ipmi => $anvil->Log->is_secure($host_ipmi), + }}); + + # Get the ipaddress and see if I can ping the target. If I can't, there is no sense in + # recording this entry. + my $access = 0; + my $target = ""; + if (($host_ipmi =~ /-a (.*?) /) or ($host_ipmi =~ /-ip (.*?) /)) + { + $target = $1; + } + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { target => $target }}); + if ($target) + { + ($access, my $average_time) = $anvil->Network->ping({ping => $target}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + access => $access, + average_time => $average_time, + }}); + } + next if not $access; + $ipmi_targets++; + + # Convert to an 'ipmitool' call. 
+ my ($ipmitool_command, $ipmi_password) = $anvil->Convert->fence_ipmilan_to_ipmitool({fence_ipmilan_command => $host_ipmi}); + + $anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{host_ipmi} = $host_ipmi; + $anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{ipmitool_command} = $ipmitool_command; + $anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{ipmi_password} = $ipmi_password; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "scan-ipmitool::host_name::${host_name}::host_ipmi" => $anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{host_ipmi}, + "scan-ipmitool::host_name::${host_name}::ipmitool_command" => $anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{ipmitool_command}, + "scan-ipmitool::host_name::${host_name}::ipmi_password" => $anvil->Log->is_secure($anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{ipmi_password}), }}); } - next if not $access; - $ipmi_targets++; - - # Convert to an 'ipmitool' call. - my ($ipmitool_command, $ipmi_password) = $anvil->Convert->fence_ipmilan_to_ipmitool({fence_ipmilan_command => $host_ipmi}); - - $anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{host_ipmi} = $host_ipmi; - $anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{ipmitool_command} = $ipmitool_command; - $anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{ipmi_password} = $ipmi_password; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - "scan-ipmitool::host_name::${host_name}::host_ipmi" => $anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{host_ipmi}, - "scan-ipmitool::host_name::${host_name}::ipmitool_command" => $anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{ipmitool_command}, - "scan-ipmitool::host_name::${host_name}::ipmi_password" => $anvil->Log->is_secure($anvil->data->{'scan-ipmitool'}{host_name}{$host_name}{ipmi_password}), - }}); } $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { ipmi_targets 
=> $ipmi_targets }}); diff --git a/scancore-agents/scan-lvm/scan-lvm b/scancore-agents/scan-lvm/scan-lvm index 2167314a..87d243fd 100755 --- a/scancore-agents/scan-lvm/scan-lvm +++ b/scancore-agents/scan-lvm/scan-lvm @@ -36,9 +36,7 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) $running_directory =~ s/^\./$ENV{PWD}/; } -my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1}); -$anvil->Log->level({set => 2}); -$anvil->Log->secure({set => 1}); +my $anvil = Anvil::Tools->new(); # Make sure we're running as 'root' # $< == real UID, $> == effective UID @@ -54,7 +52,7 @@ $anvil->data->{scancore}{'scan-lvm'}{disable} = 0; $anvil->data->{switches}{force} = 0; $anvil->Storage->read_config(); -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0115", variables => { program => $THIS_FILE }}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); # Read switches $anvil->Get->switches; diff --git a/scancore-agents/scan-server/scan-server b/scancore-agents/scan-server/scan-server index 656e2ea6..aa2ac12f 100755 --- a/scancore-agents/scan-server/scan-server +++ b/scancore-agents/scan-server/scan-server @@ -35,9 +35,7 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) $running_directory =~ s/^\./$ENV{PWD}/; } -my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1}); -$anvil->Log->level({set => 2}); -$anvil->Log->secure({set => 1}); +my $anvil = Anvil::Tools->new(); # Make sure we're running as 'root' # $< == real UID, $> == effective UID @@ -71,7 +69,7 @@ if ($problem) $anvil->nice_exit({exit_code => 1}); } -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_server_log_0001", variables => { program => $THIS_FILE }}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_server_log_0001", variables => { program => $THIS_FILE }}); # There are no tables for this agent, so '--purge' 
is useless here. @@ -80,7 +78,7 @@ my $host_type = $anvil->Get->host_type; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_type => $host_type }}); if ($host_type eq "striker") { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_server_log_0002", variables => { host_type => $host_type }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_server_log_0002", variables => { host_type => $host_type }}); $anvil->nice_exit({exit_code => 0}); } diff --git a/scancore-agents/scan-storcli/scan-storcli b/scancore-agents/scan-storcli/scan-storcli index 536d070d..bf7486e5 100755 --- a/scancore-agents/scan-storcli/scan-storcli +++ b/scancore-agents/scan-storcli/scan-storcli @@ -75,9 +75,7 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) $running_directory =~ s/^\./$ENV{PWD}/; } -my $anvil = Anvil::Tools->new({log_level => 2, log_secure => 1}); -$anvil->Log->level({set => 2}); -$anvil->Log->secure({set => 1}); +my $anvil = Anvil::Tools->new(); # Make sure we're running as 'root' # $< == real UID, $> == effective UID @@ -207,7 +205,7 @@ $anvil->data->{'scan-storcli'} = { }; $anvil->Storage->read_config(); -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, 'print' => 1 , key => "log_0115", variables => { program => $THIS_FILE }}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, 'print' => 1, key => "log_0115", variables => { program => $THIS_FILE }}); # Read switches $anvil->data->{switches}{force} = 0; @@ -240,7 +238,7 @@ if ($anvil->data->{switches}{purge}) } -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_storcli_message_0001"}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "scan_storcli_message_0001"}); # This does two things; It checks to see if storcli64 is installed (exits '1' if not, exits '2' if not # 
executable) and then checks to see if any controllers are found in the system (exits '3' if not). @@ -2724,7 +2722,7 @@ INSERT INTO low_critical_temperature => $low_critical, low_warning_temperature => $low_warning, }; - my $log_level = $alert_level eq "notice" ? 2 : 1; + my $log_level = $alert_level eq "notice" ? 3 : 2; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $log_level, key => $message_key, variables => $variables}); $anvil->Alert->register({ alert_level => $alert_level, @@ -3284,7 +3282,7 @@ WHERE low_warning_temperature => $low_warning, jump => $jump, }; - my $log_level = $alert_level eq "notice" ? 2 : 1; + my $log_level = $alert_level eq "notice" ? 3 : 2; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $log_level, key => $message_key, variables => $variables}); $anvil->Alert->register({ alert_level => $alert_level, @@ -10040,12 +10038,37 @@ sub find_lsi_controllers $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "path::exe::storcli64" => $anvil->data->{path}{exe}{storcli64} }}); } - # First, do we have storcli64 installed? + # Do we have storcli64 installed? if (not -e $anvil->data->{path}{exe}{storcli64}) { - # Nope, exit. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "scan_storcli_error_0001", variables => { path => $anvil->data->{path}{exe}{storcli64} }}); - $anvil->nice_exit({exit_code => 1}); + # Nope, Call lspci to see if there's a MegaRAID controller. If there is, the user may need to + # install the RPM. 
+ my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{lspci}}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + my $megaraid_installed = 0; + foreach my $line (split/\n/, $output) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); + + if ($line =~ /MegaRAID/i) + { + # This host appears to have a RAID card, but it's not installed. Lets try to + # install it for them. + $megaraid_installed = install_storcli($anvil); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { megaraid_installed => $megaraid_installed }}); + } + } + + # exit. + if (not $megaraid_installed) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "scan_storcli_error_0001", variables => { path => $anvil->data->{path}{exe}{storcli64} }}); + $anvil->nice_exit({exit_code => 1}); + } } # Make sure it is executable @@ -10083,7 +10106,7 @@ sub find_lsi_controllers } else { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "scan_storcli_error_0003", variables => { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 2, level => 0, key => "scan_storcli_error_0003", variables => { path => $anvil->data->{path}{exe}{storcli64}, }}); $anvil->nice_exit({exit_code => 3}); @@ -10091,3 +10114,73 @@ sub find_lsi_controllers return(0); } + +sub install_storcli +{ + my ($anvil) = @_; + + # Tell the user what we're doing. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, key => "scan_storcli_note_0071"}); + + # Is this a Dell? + my $is_dell = 0; + my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{dmidecode}." 
--string system-manufacturer"}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + foreach my $line (split/\n/, $output) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); + if ($line =~ /Dell/i) + { + $is_dell = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { is_dell => $is_dell }}); + } + } + + my $rpm_name = $is_dell ? "perccli" : "storcli"; + my $shell_call = $anvil->data->{path}{exe}{dnf}." -y install ".$rpm_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + # Check now to see if the program is installed. If it is, register an alert announcing we installed it. + my $program = $is_dell ? $anvil->data->{path}{exe}{perccli64} : $anvil->data->{path}{exe}{storcli64}; + if (-e $program) + { + # Installed successfully! + my $variables = { + rpm => $rpm_name, + }; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_storcli_note_0072", variables => $variables}); + $anvil->Alert->register({ + alert_level => "notice", + message => "scan_storcli_note_0072", + show_header => 1, + variables => $variables, + sort_position => 0, + set_by => $THIS_FILE, + }); + + # Before we return, if we installed for Dell, switch out the 'storcli' program path. 
+ if ($is_dell) + { + $anvil->data->{path}{exe}{storcli64} = $anvil->data->{path}{exe}{perccli64}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "path::exe::storcli64" => $anvil->data->{path}{exe}{storcli64} }}); + } + + return(1); + } + else + { + # Didn't work. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, key => "scan_storcli_note_0073"}); + } + + return(0); +} diff --git a/scancore-agents/scan-storcli/scan-storcli.xml b/scancore-agents/scan-storcli/scan-storcli.xml index 6117c89b..2ed90fe4 100644 --- a/scancore-agents/scan-storcli/scan-storcli.xml +++ b/scancore-agents/scan-storcli/scan-storcli.xml @@ -334,6 +334,9 @@ The temperature of the Battery Backup Unit (BBU): [#!variable!serial_number!#] h - Controller: [#!variable!serial_number!#]: '#!variable!name!#' has changed: [#!variable!old_value!#] -> [#!variable!new_value!#] NOTE: This is expected and is no reason for concern. + The host appears to have a MegaRAID controller, but the tool to communicate with it is not installed. We'll try to install it now. + An LSI-based (MegaRAID) controller was found, but the management tool was not installed. The RPM: [#!variable!rpm!#] has now been installed to allow monitoring storage. + The install didn't work, we'll try again in the next scan. diff --git a/share/anvil.sql b/share/anvil.sql index 92963c1d..76b26650 100644 --- a/share/anvil.sql +++ b/share/anvil.sql @@ -337,7 +337,7 @@ CREATE TRIGGER trigger_sessions CREATE TABLE anvils ( anvil_uuid uuid not null primary key, anvil_name text not null, - anvil_description text not null, -- This is a short, one-line (usually) description of this particular Anvil!. It is displayed in the Anvil! selection list. + anvil_description text not null, -- This is a short, one-line (usually) description of this particular Anvil!. It is displayed in the Anvil! selection list. This is set to 'DELETED' when an Anvil! is removed. 
anvil_password text not null, -- This is the 'hacluster' user password. It is also used to access nodes that don't have a specific password set. anvil_node1_host_uuid uuid, -- This is the host_uuid of the machine that is used as node 1. anvil_node2_host_uuid uuid, -- This is the host_uuid of the machine that is used as node 2. diff --git a/tools/scancore b/tools/scancore index d61bf03c..5b8307a2 100755 --- a/tools/scancore +++ b/tools/scancore @@ -16,7 +16,9 @@ # - Record how long a server's migration took in the past, and use that to determine which node to evacuate # during load shed. Also, track how long it takes for servers to stop to determine when to initiate a total # shutdown. -# - +# - Add a '--silence-alerts --anvil ' and '--restore-alerts --anvil ' to temporarily +# disable/re-enable alerts. This is to allow for quiet maintenance without stopping scancore itself. +# use strict; use warnings; From 0ec1bf6b6aa27667ff2fbde31c11a0d78c4b8352 Mon Sep 17 00:00:00 2001 From: Digimer Date: Fri, 5 Feb 2021 23:41:48 -0500 Subject: [PATCH 4/9] * Updated DRBD->delete_resource() to return a success if asked to delete a non-existent resource (as can happen when partial anvil-delete-server runs are re-run). * Reworked DRBD->get_next_resource() to pull from the database, and to no longer do that increments-of-three nonsense. Avoidable complexity. Also added a call to Cluster->get_anvil_uuid() if the 'anvil_uuid' parameter wasn't passed. * Updated Database->get_host_from_uuid() and ->get_hosts() to now take an 'include_deleted' parameter and default to not returning deleted hosts. This fixed issues where anvil-{delete,provision}-server calls could assign jobs to now-deleted hosts with reused host names. * Updated anvil-delete-server to print log entries to STDOUT. Also updated it to not wait for the shutdown of a server in pacemaker to complete, and instead to destroy it after calling pacemaker's resource stop. 
Updated to also check to see if the server being deleted is already out of pacemaker and, if so, skip that step and directly try to destroy the server, if it's running. * Updated anvil-provision-server to force 'peer_mode' runs to pull their TCP Port and DRBD minor numbers from the job. This fixes a bug where the same resource on two machines could use different TCP ports. Signed-off-by: Digimer --- Anvil/Tools/Cluster.pm | 8 +- Anvil/Tools/DRBD.pm | 208 ++++++++++++++++------------------- Anvil/Tools/Database.pm | 54 +++++++-- Anvil/Tools/Job.pm | 3 +- share/words.xml | 8 +- tools/anvil-delete-server | 186 ++++++++++++++++++++++--------- tools/anvil-provision-server | 30 ++--- tools/test.pl | 35 +----- 8 files changed, 301 insertions(+), 231 deletions(-) diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index dc97b877..5e3a5477 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -1694,7 +1694,7 @@ sub parse_cib if ($anvil->Network->is_local({host => $target})) { # Local call - ($cib_data, $return_code) = $anvil->System->call({debug => $debug, shell_call => $shell_call}); + ($cib_data, $return_code) = $anvil->System->call({debug => ($debug + 1), shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { cib_data => $cib_data, return_code => $return_code, @@ -1704,7 +1704,7 @@ sub parse_cib { # Remote call. 
($cib_data, my $error, $return_code) = $anvil->Remote->call({ - debug => $debug, + debug => ($debug + 1), shell_call => $shell_call, target => $target, port => $port, @@ -2415,7 +2415,7 @@ sub parse_crm_mon if ($anvil->Network->is_local({host => $target})) { # Local call - ($crm_mon_data, $return_code) = $anvil->System->call({debug => $debug, shell_call => $shell_call}); + ($crm_mon_data, $return_code) = $anvil->System->call({debug => ($debug + 1), shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { crm_mon_data => $crm_mon_data, return_code => $return_code, @@ -2463,7 +2463,7 @@ sub parse_crm_mon foreach my $resource ($dom->findnodes('/pacemaker-result/resources/resource')) { next if $resource->{resource_agent} ne "ocf::alteeve:server"; - my $id = $resource->{id}; + my $id = $resource->{id}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { id => $id }}); foreach my $variable (sort {$a cmp $b} keys %{$resource}) { diff --git a/Anvil/Tools/DRBD.pm b/Anvil/Tools/DRBD.pm index 097631b4..e9aa076d 100644 --- a/Anvil/Tools/DRBD.pm +++ b/Anvil/Tools/DRBD.pm @@ -278,9 +278,9 @@ sub delete_resource $anvil->DRBD->gather_data({debug => $debug}); if (not exists $anvil->data->{new}{resource}{$resource}) { - # Resource not found. + # Resource not found, so it appears to already be gone. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0228", variables => { resource => $resource }}); - return('!!error!!'); + return(0); } my $waiting = 1; @@ -1100,23 +1100,25 @@ sub get_devices =head2 get_next_resource -This returns the next free DRBD minor number and the next free TCP port. The minor number is the first one found to be free. The TCP port is allocated in steps of three. That is to say, if the last used TCP port is '7790', then '7793' is considered the next free port. 
This is to ensure that if a DR host is added or used, the three adjacent ports are available for use in one resource configuration. +This returns the next free DRBD minor number and the next free TCP port. The minor number and TCP port returned are ones found to be free on both/all machines in Anvil! system. As such, the returned values may skip values free on any given system. -Minor numbers are not grouped as resources and volumes can be referenced by name, so the DRBD minor number is less important for human users. +If a resource name is given, then the caller can either return an error if the name matches (useful for name conflict checks) or return the first (lowest) minor number and TCP used by the resource. my ($free_minor, $free_port) = $anvil->DRBD->get_next_resource({anvil_uuid => "a5ae5242-e9d3-46c9-9ce8-306855aa56db"}) If there is a problem, two empty strings will be returned. +B<< Note >>: Deleted resources, volumes and peers are ignored! As such, a minor or TCP port that used to be used by deleted resource can be returned. + Parameters; -=head3 anvil_uuid (required) +=head3 anvil_uuid (optional, default 'Cluster->get_anvil_uuid') -This is the Anvil! in which we're looking for the next free resources. +This is the Anvil! in which we're looking for the next free resources. It's required, but generally it doesn't need to be specified as we can find it via C<< Cluster->get_anvil_uuid() >>. =head3 resource_name (optional) -If this is set, and the resource is found to already exist, the first DRBD minor number and first used TCP port are returned. Alternatively, if C<< force_unique >> is set to C<< 1 >>, and the resource is found to exist, C<< !!error!! >> is returned. +If this is set, and the resource is found to already exist, the first DRBD minor number and first used TCP port are returned. Alternatively, if C<< force_unique >> is set to C<< 1 >>, and the resource is found to exist, empty strings are returned. 
=head3 force_unique (optional, default '0') @@ -1131,8 +1133,6 @@ sub get_next_resource my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "DRBD->get_next_resource()" }}); - my $free_minor = ""; - my $free_port = ""; my $anvil_uuid = defined $parameter->{anvil_uuid} ? $parameter->{anvil_uuid} : ""; my $resource_name = defined $parameter->{resource_name} ? $parameter->{resource_name} : ""; my $force_unique = defined $parameter->{force_unique} ? $parameter->{force_unique} : 0; @@ -1142,21 +1142,30 @@ sub get_next_resource force_unique => $force_unique, }}); + # If we weren't passed an anvil_uuid, see if we can find one locally + if (not $anvil_uuid) + { + $anvil_uuid = $anvil->Cluster->get_anvil_uuid({debug => $debug}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { anvil_uuid => $anvil_uuid }}); + } + if (not $anvil_uuid) { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "DRBD->get_next_resource()", parameter => "anvil_uuid" }}); - return($free_minor, $free_port); + return("", ""); } $anvil->Database->get_anvils({debug => $debug}); if (not exists $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}) { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0162", variables => { anvil_uuid => $anvil_uuid }}); - return($free_minor, $free_port); + return("", ""); } # Read in the resource information from both nodes. They _should_ be identical, but that's not 100% # certain. 
+ my $free_minor = ""; + my $free_port = ""; my $node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; my $node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid}; my $dr1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_dr1_host_uuid}; @@ -1166,26 +1175,53 @@ sub get_next_resource dr1_host_uuid => $dr1_host_uuid, }}); - my $query = " +my $query = " SELECT - scan_drbd_resource_host_uuid, - scan_drbd_resource_name, - scan_drbd_resource_xml + a.host_uuid, + a.host_name, + b.scan_drbd_resource_name, + c.scan_drbd_volume_number, + c.scan_drbd_volume_device_path, + c.scan_drbd_volume_device_minor, + d.scan_drbd_peer_host_name, + d.scan_drbd_peer_ip_address, + d.scan_drbd_peer_protocol, + d.scan_drbd_peer_fencing, + d.scan_drbd_peer_tcp_port FROM - scan_drbd_resources + hosts a, + scan_drbd_resources b, + scan_drbd_volumes c, + scan_drbd_peers d WHERE - scan_drbd_resource_host_uuid = ".$anvil->Database->quote($node1_host_uuid)." -OR - scan_drbd_resource_host_uuid = ".$anvil->Database->quote($node2_host_uuid)." "; + a.host_uuid = b.scan_drbd_resource_host_uuid +AND + b.scan_drbd_resource_uuid = c.scan_drbd_volume_scan_drbd_resource_uuid +AND + c.scan_drbd_volume_uuid = d.scan_drbd_peer_scan_drbd_volume_uuid +AND + b.scan_drbd_resource_xml != 'DELETED' +AND + c.scan_drbd_volume_device_path != 'DELETED' +AND + d.scan_drbd_peer_connection_state != 'DELETED' +AND + ( + scan_drbd_resource_host_uuid = ".$anvil->Database->quote($node1_host_uuid)." + OR + scan_drbd_resource_host_uuid = ".$anvil->Database->quote($node2_host_uuid)." "; if ($dr1_host_uuid) { $query .= " -OR - scan_drbd_resource_host_uuid = ".$anvil->Database->quote($dr1_host_uuid)." "; + OR + scan_drbd_resource_host_uuid = ".$anvil->Database->quote($dr1_host_uuid)." 
"; } $query .= " + ) ORDER BY - scan_drbd_resource_name ASC + b.scan_drbd_resource_name ASC, + c.scan_drbd_volume_device_minor ASC, + d.scan_drbd_peer_tcp_port ASC ;"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); @@ -1197,110 +1233,58 @@ ORDER BY }}); foreach my $row (@{$results}) { - my $scan_drbd_resource_host_uuid = $row->[0]; - my $scan_drbd_resource_name = $row->[1]; - my $scan_drbd_resource_xml = $row->[2]; + # I don't really need most of this, but it helps with debugging + my $host_uuid = $row->[0]; + my $host_name = $row->[1]; + my $scan_drbd_resource_name = $row->[2]; + my $scan_drbd_volume_number = $row->[3]; + my $scan_drbd_volume_device_path = $row->[4]; + my $scan_drbd_volume_device_minor = $row->[5]; + my $scan_drbd_peer_host_name = $row->[6]; + my $scan_drbd_peer_ip_address = $row->[7]; + my $scan_drbd_peer_protocol = $row->[8]; + my $scan_drbd_peer_fencing = $row->[9]; + my $scan_drbd_peer_tcp_port = $row->[10]; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - scan_drbd_resource_host_uuid => $scan_drbd_resource_host_uuid, - scan_drbd_resource_name => $scan_drbd_resource_name, - scan_drbd_resource_xml => $scan_drbd_resource_xml, + 's1:host_uuid' => $host_uuid, + 's2:host_name' => $host_name, + 's3:scan_drbd_resource_name' => $scan_drbd_resource_name, + 's4:scan_drbd_volume_number' => $scan_drbd_volume_number, + 's5:scan_drbd_volume_device_path' => $scan_drbd_volume_device_path, + 's6:scan_drbd_volume_device_minor' => $scan_drbd_volume_device_minor, + 's7:scan_drbd_peer_host_name' => $scan_drbd_peer_host_name, + 's8:scan_drbd_peer_ip_address' => $scan_drbd_peer_ip_address, + 's9:scan_drbd_peer_protocol' => $scan_drbd_peer_protocol, + 's10:scan_drbd_peer_fencing' => $scan_drbd_peer_fencing, + 's11:scan_drbd_peer_tcp_port' => $scan_drbd_peer_tcp_port, }}); - next if $scan_drbd_resource_xml eq "DELETED"; - - local $@; - my $dom = eval { 
XML::LibXML->load_xml(string => $scan_drbd_resource_xml); }; - if ($@) - { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "error_0111", variables => { - xml_body => $scan_drbd_resource_xml, - eval_error => $@, - }}); - next; - } - - # Successful parse! - my $local_minor = ""; - my $local_port = ""; - foreach my $name ($dom->findnodes('/resource')) - { - my $resource = $name->{name}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { resource => $resource }}); - - foreach my $host ($name->findnodes('./host')) - { - my $host_name = $host->{name}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host_name => $host_name }}); - - foreach my $volume_vnr ($host->findnodes('./volume')) - { - my $volume = $volume_vnr->{vnr}; - my $minor = $volume_vnr->findvalue('./device/@minor'); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - 's1:host_name' => $host_name." 
\@ ".$resource."/".$volume, - 's2:minor' => $minor, - }}); - - $anvil->data->{drbd}{used_resources}{minor}{$minor}{used} = 1; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - "drbd::used_resources::minor::${minor}::used" => $anvil->data->{drbd}{used_resources}{minor}{$minor}{used}, - }}); - - if (not $local_minor) - { - $local_minor = $minor; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { local_minor => $local_minor }}); - } - } - } - - foreach my $connection ($name->findnodes('./connection')) - { - foreach my $host ($connection->findnodes('./host')) - { - my $host_name = $host->{name}; - my $tcp_port = $host->findvalue('./address/@port'); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - host_name => $host_name, - tcp_port => $tcp_port, - }}); - - $anvil->data->{drbd}{used_resources}{tcp_port}{$tcp_port}{used} = 1; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - "drbd::used_resources::tcp_port::${tcp_port}::used" => $anvil->data->{drbd}{used_resources}{tcp_port}{$tcp_port}{used}, - }}); - - if (not $local_port) - { - $local_port = $tcp_port; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { local_port => $local_port }}); - } - } - } - } + $anvil->data->{drbd}{used_resources}{minor}{$scan_drbd_volume_device_minor}{used} = 1; + $anvil->data->{drbd}{used_resources}{tcp_port}{$scan_drbd_peer_tcp_port}{used} = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "drbd::used_resources::minor::${scan_drbd_volume_device_minor}::used" => $anvil->data->{drbd}{used_resources}{minor}{$scan_drbd_volume_device_minor}{used}, + "drbd::used_resources::tcp_port::${scan_drbd_peer_tcp_port}::used" => $anvil->data->{drbd}{used_resources}{tcp_port}{$scan_drbd_peer_tcp_port}{used}, + }}); - # Is the user looking for 
this resource? - if (($resource_name) && ($resource_name eq $scan_drbd_resource_name)) + if (($resource_name) && ($scan_drbd_resource_name eq $resource_name)) { - # If we're force_unique, error. + # Found the resource the user was asking for. if ($force_unique) { # Error out. - return('!!error!!'); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => 'err', key => "error_0237", variables => { resource_name => $resource_name }}); + return("", ""); } else { - $free_minor = $local_minor; - $free_port = $local_port; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - free_minor => $free_minor, - free_port => $free_port, - }}); - return($free_minor, $free_port); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0592", variables => { resource_name => $resource_name }}); + return($scan_drbd_volume_device_minor, $scan_drbd_peer_tcp_port); } } } + # If I'm here, I need to find the next free TCP port. We'll look for the next minor number for this + # host. my $looking = 1; $free_minor = 0; while($looking) @@ -1323,7 +1307,7 @@ ORDER BY { if (exists $anvil->data->{drbd}{used_resources}{tcp_port}{$free_port}) { - $free_port += 3; + $free_port++; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { free_port => $free_port }}); } else diff --git a/Anvil/Tools/Database.pm b/Anvil/Tools/Database.pm index 90ab2f83..ee3c5670 100644 --- a/Anvil/Tools/Database.pm +++ b/Anvil/Tools/Database.pm @@ -2510,6 +2510,10 @@ Parameters; This is the host UUID we're querying the name of. +=head3 include_deleted (optional, default '0') + +If set to C<< 1 >>, hosts that are deleted are included. If you use this, and a machine was replaced, then watch for multiple host UUIDs. + =head3 short (optional, default '0') If set to C<< 1 >>, the short host name is returned. When set to C<< 0 >>, the full host name is returned. 
@@ -2523,12 +2527,14 @@ sub get_host_from_uuid my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Database->get_host_from_uuid()" }}); - my $host_name = ""; - my $host_uuid = defined $parameter->{host_uuid} ? $parameter->{host_uuid} : ""; - my $short = defined $parameter->{short} ? $parameter->{short} : 0; + my $host_name = ""; + my $host_uuid = defined $parameter->{host_uuid} ? $parameter->{host_uuid} : ""; + my $include_deleted = defined $parameter->{include_deleted} ? $parameter->{include_deleted} : 0; + my $short = defined $parameter->{short} ? $parameter->{short} : 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - host_uuid => $host_uuid, - short => $short, + host_uuid => $host_uuid, + include_deleted => $include_deleted, + short => $short, }}); if (not $host_uuid) @@ -2538,7 +2544,21 @@ sub get_host_from_uuid return($host_name); } - my $query = "SELECT host_name FROM hosts WHERE host_uuid = ".$anvil->Database->quote($host_uuid).";"; + my $query = " +SELECT + host_name +FROM + hosts +WHERE + host_uuid = ".$anvil->Database->quote($host_uuid); + if (not $include_deleted) + { + $query .= " +AND + host_key != 'DELETED'"; + } + $query .= " +;"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__}); @@ -2592,7 +2612,13 @@ And to simplify look-ups by UUID or name; To prevent some cases of recursion, C<< hosts::loaded >> is set on successful load, and if this is set, this method immediately returns with C<< 0 >>. -This method takes no parameters. +Parameters; + +=head3 include_deleted (optional, default '0') + +By default, hosts that have been deleted (C<< host_key >> set to C<< DELETED >>) are not returned. 
If this is set to C<< 1 >>, those deleted hosts are included. + +B<< Note >>: Be careful when using this. If a machine was replaced, then there could be two (or more) host UUIDs for a given host name. =cut sub get_hosts @@ -2603,6 +2629,11 @@ sub get_hosts my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Database->get_hosts()" }}); + my $include_deleted = defined $parameter->{include_deleted} ? $parameter->{include_deleted} : 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + include_deleted => $include_deleted, + }}); + # Delete any data from past scans. delete $anvil->data->{hosts}{host_uuid}; delete $anvil->data->{sys}{hosts}{by_uuid}; @@ -2620,7 +2651,14 @@ SELECT host_ipmi, modified_date FROM - hosts + hosts "; + if (not $include_deleted) + { + $query .= " +WHERE + host_key != 'DELETED'"; + } + $query .= " ;"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); diff --git a/Anvil/Tools/Job.pm b/Anvil/Tools/Job.pm index 56dcfc00..3c1bfa62 100644 --- a/Anvil/Tools/Job.pm +++ b/Anvil/Tools/Job.pm @@ -356,9 +356,10 @@ FROM WHERE job_command LIKE ".$anvil->Database->quote("%".$program."%")." AND - job_progress != '100' + job_progress = 0 AND job_host_uuid = ".$anvil->Database->quote($host_uuid)." +LIMIT 1 ;"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }}); my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__}); diff --git a/share/words.xml b/share/words.xml index d3674532..0835efa6 100644 --- a/share/words.xml +++ b/share/words.xml @@ -310,7 +310,7 @@ Output (if any): Unable to delete the server resource: [#!variable!server_name!#] as this node is not (yet) a full member of the cluster. 
It looks like to removal of the server resource: [#!variable!server_name!#] failed. The return code should have been '0', but: [#!variable!return_code!#] was returned. The 'pcs' command output, if any, was: [#!variable!output!#]. It looks like to removal of the server resource: [#!variable!server_name!#] failed. Unsafe to proceed with the removal of the server. Please check the logs for more information. - Unable to delete the resource: [#!variable!resource!#] because it wasn't found in DRBD's config. + Unable to delete the resource: [#!variable!resource!#] because it wasn't found in DRBD's config. This can happen if a previous delete partially completed, in which case this is not a problem. One or more peers need us, and we're not allowed to wait. Deletion aborted. The shell call: [#!variable!shell_call!#] was expected to return '0', but instead the return code: [#!variable!return_code!#] was received. The output, if any, was: [#!variable!output!#]. This host is not an Anvil! node or DR host, unable to migrate servers. @@ -319,6 +319,8 @@ Output (if any): Unable to find the target host to migrate to the job UUID: [#!variable!job_uuid!#]. The migration target host: [#!variable!target_host_uuid!#] is either invalid, or doesn't match one of the nodes in this Anvil! system. There appears to be no resource data in the database for the host: [#!variable!host_name!#]. Has ScanCore run and, specifically, has 'scan-hardware' run yet? Unable to provide available resources for this Anvil! system. + The resource name: [#!variable!resource_name!#] already exists, and 'force_unique' is set. This is likely a name conflict, returning empty values for the minor number and TCP port. + This node is not yet fully in the cluster. Sleeping for a bit, then we'll exit. The job will try again shortly after. @@ -657,6 +659,9 @@ It should be provisioned in the next minute or two. Manually calling 'scan-drbd' to ensure that the new agent is recorded.
The server name: [#!variable!server_name!#] is already used by another server. Deleting the server's definition file: [#!variable!file!#]... + The server: [#!variable!server_name!#] was not found in the cluster configuration. This can happen if a server was partially deleted and we're trying again. + Preparing to delete the server: [#!variable!server_name!#]. + Using virsh to destroy (force off) the server: [#!variable!server_name!#], if it is still running. Starting: [#!variable!program!#]. @@ -1342,6 +1347,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is: Deleting the file: [#!variable!file!#]. Wiping the metadata from the DRBD resource: [#!variable!resource!#]. Wiping any file system signatures and then deleting the logical volume: [#!variable!device_path!#]. + The resource name: [#!variable!resource_name!#] was found, returning the first TCP port and minor number. The host name: [#!variable!target!#] does not resolve to an IP address. diff --git a/tools/anvil-delete-server b/tools/anvil-delete-server index cf716ef5..0fee1e94 100755 --- a/tools/anvil-delete-server +++ b/tools/anvil-delete-server @@ -26,23 +26,21 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) $| = 1; my $anvil = Anvil::Tools->new(); -$anvil->Log->level({set => 2}); -$anvil->Log->secure({set => 1}); -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); # Read switches (target ([user@]host[:port]) and the file with the target's password. If the password is # passed directly, it will be used. Otherwise, the password will be read from the database. 
$anvil->data->{switches}{'job-uuid'} = ""; $anvil->Get->switches; +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'} }}); $anvil->Database->connect(); -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 3, secure => 0, key => "log_0132"}); if (not $anvil->data->{sys}{database}{connections}) { # No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try # again after we exit. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0218"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, 'print' => 1, level => 0, priority => "err", key => "error_0218"}); sleep 10; $anvil->nice_exit({exit_code => 1}); } @@ -54,14 +52,14 @@ if (not $anvil->data->{sys}{database}{connections}) # and, if available, the DR host. At this point, the job acts the same regardless of the host. The DRBD # resource will stopped and then have it's metadata wiped, The LV backing the device will be deleted next. -$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid({debug => 2}); +$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'sys::anvil_uuid' => $anvil->data->{sys}{anvil_uuid} }}); # If we don't have a job UUID, try to find one. if (not $anvil->data->{switches}{'job-uuid'}) { # Load the job data. 
- $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE}); + $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({debug => 2, program => $THIS_FILE}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); } @@ -93,7 +91,7 @@ if ($anvil->data->{switches}{'job-uuid'}) } # Log an exit. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0217"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, 'print' => 1, level => 0, priority => 'err', key => "error_0217"}); $anvil->nice_exit({exit_code => 1}); } @@ -105,7 +103,7 @@ else if (not $anvil->data->{sys}{anvil_uuid}) { # We can't do anything, exit. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0217"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, 'print' => 1, level => 0, priority => 'err', key => "error_0217"}); $anvil->nice_exit({exit_code => 1}); } @@ -144,11 +142,17 @@ sub run_jobs remove_from_pacemaker($anvil); } + $anvil->Job->update_progress({ + progress => 25, + message => "job_0222,!!server_name!".$server_name."!!", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0222", variables => { server_name => $server_name }}); + $anvil->Job->update_progress({ progress => 50, message => "job_0213", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "job_0213"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0213"}); ### NOTE: If we're a DR host, and the server wasn't used here, this is expected to fail # Delete the DRBD resource and backing storage @@ -162,7 +166,7 @@ sub run_jobs message => "error_0228,!!resource!".$server_name."!!", 
job_status => "failed", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => 'err', key => "error_0228", variables => { resource => $server_name }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0228", variables => { resource => $server_name }}); $anvil->nice_exit({exit_code => 1}); } @@ -170,7 +174,7 @@ sub run_jobs progress => 60, message => "job_0214", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "job_0214"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0214"}); # Make sure the server is flagged as DELETEd. $anvil->Database->get_servers(); @@ -184,7 +188,8 @@ SET server_state = 'DELETED', modified_date = ".$anvil->Database->quote($anvil->data->{sys}{database}{timestamp})." WHERE - server_uuid = ".$anvil->Database->quote($server_uuid).";"; + server_uuid = ".$anvil->Database->quote($server_uuid)." +;"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { query => $query }}); $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__}); @@ -192,7 +197,7 @@ WHERE progress => 70, message => "job_0215", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "job_0215"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0215"}); } # Delete the XML definition file. 
@@ -205,7 +210,7 @@ WHERE progress => 80, message => "job_0220,!!file!".$resource_file."!!", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "job_0220", variables => { file => $resource_file }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0220", variables => { file => $resource_file }}); unlink $resource_file; } @@ -213,7 +218,7 @@ WHERE progress => 100, message => "job_0216", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "job_0216"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0216"}); return(0); } @@ -235,46 +240,110 @@ sub remove_from_pacemaker progress => 10, message => "job_0210,!!server_name!".$server_name."!!", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "job_0210", variables => { server_name => $server_name }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0210", variables => { server_name => $server_name }}); - my $problem = $anvil->Cluster->shutdown_server({ - debug => 2, - server => $server_name, - 'wait' => 1, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); - if ($problem) + if (not $anvil->data->{cib}{parsed}{data}{server}{$server_name}) { - # Failed to stop. + # Server is already out of the cluster. 
$anvil->Job->update_progress({ - progress => 100, - message => "error_0223,!!server_name!".$server_name."!!", - job_status => "failed", + progress => 20, + message => "job_0221,!!server_name!".$server_name."!!", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => 'err', key => "error_0223", variables => { server_name => $server_name }}); - $anvil->nice_exit({exit_code => 1}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0211", variables => { server_name => $server_name }}); + + # Force the server off now, just in case it's running outside the cluster + $anvil->Job->update_progress({ + progress => 25, + message => "job_0223,!!server_name!".$server_name."!!", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0223", variables => { server_name => $server_name }}); + my $success = $anvil->Server->shutdown_virsh({ + debug => 2, + force => 1, + 'wait' => 1, + server => $server_name, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { success => $success }}); + if (not $success) + { + # Failed to stop + $anvil->Job->update_progress({ + progress => 100, + message => "error_0223,!!server_name!".$server_name."!!", + job_status => "failed", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0223", variables => { server_name => $server_name }}); + $anvil->nice_exit({exit_code => 1}); + } } - - # Server is off now. - $anvil->Job->update_progress({ - progress => 20, - message => "job_0211,!!server_name!".$server_name."!!", - }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "job_0211", variables => { server_name => $server_name }}); - - # Delete the resource. 
- $problem = $anvil->Cluster->delete_server({debug => 2, server_name => $server_name}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); - if ($problem) + elsif ($anvil->data->{cib}{parsed}{data}{server}{$server_name}{status} ne "off") { - # Something went wrong + # As we're going to delete the server, we won't wait. We'll come back here and destroy the + # server if it's still running. + my $problem = $anvil->Cluster->shutdown_server({ + debug => 2, + server => $server_name, + 'wait' => 0, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + if ($problem) + { + # Failed to stop. + $anvil->Job->update_progress({ + progress => 100, + message => "error_0223,!!server_name!".$server_name."!!", + job_status => "failed", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0223", variables => { server_name => $server_name }}); + $anvil->nice_exit({exit_code => 1}); + } + + # Force the server off now. 
$anvil->Job->update_progress({ - progress => 100, - message => "error_0227,!!server_name!".$server_name."!!", - job_status => "failed", + progress => 20, + message => "job_0223,!!server_name!".$server_name."!!", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => 'err', key => "error_0227", variables => { server_name => $server_name }}); - $anvil->nice_exit({exit_code => 1}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0223", variables => { server_name => $server_name }}); + my $success = $anvil->Server->shutdown_virsh({ + debug => 2, + force => 1, + 'wait' => 1, + server => $server_name, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { success => $success }}); + if (not $success) + { + # Failed to stop + $anvil->Job->update_progress({ + progress => 100, + message => "error_0223,!!server_name!".$server_name."!!", + job_status => "failed", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0223", variables => { server_name => $server_name }}); + $anvil->nice_exit({exit_code => 1}); + } + + # Server is off now. + $anvil->Job->update_progress({ + progress => 25, + message => "job_0211,!!server_name!".$server_name."!!", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0211", variables => { server_name => $server_name }}); + + # Delete the resource. 
+ $problem = $anvil->Cluster->delete_server({debug => 2, server_name => $server_name}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + if ($problem) + { + # Something went wrong + $anvil->Job->update_progress({ + progress => 100, + message => "error_0227,!!server_name!".$server_name."!!", + job_status => "failed", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0227", variables => { server_name => $server_name }}); + $anvil->nice_exit({exit_code => 1}); + } } # Register the job with the peers. @@ -315,7 +384,7 @@ sub remove_from_pacemaker progress => $progress, message => "job_0212,!!host_name!".$host_name."!!", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "job_0212", variables => { host_name => $host_name }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0212", variables => { host_name => $host_name }}); $progress += 10; } @@ -352,7 +421,7 @@ sub parse_job_data message => "error_0219,!!job_uuid!".$anvil->data->{switches}{'job-uuid'}."!!", job_status => "failed", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => 'err', key => "error_0219", variables => { job_uuid => $anvil->data->{switches}{'job-uuid'} }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0219", variables => { job_uuid => $anvil->data->{switches}{'job-uuid'} }}); $anvil->nice_exit({exit_code => 1}); } @@ -367,7 +436,7 @@ sub parse_job_data message => "error_0220,!!server_uuid!".$server_uuid."!!", job_status => "failed", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => 'err', key => "error_0220", variables => { server_uuid => $server_uuid }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' 
=> 1, level => 0, priority => 'err', key => "error_0220", variables => { server_uuid => $server_uuid }}); $anvil->nice_exit({exit_code => 1}); } @@ -380,10 +449,21 @@ sub parse_job_data { # The cluster isn't running, sleep and exit. $anvil->Job->update_progress({ - progress => 0, + progress => 10, message => "error_0222", }); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => 'err', key => "error_0222"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0222"}); + sleep 10; + $anvil->nice_exit({exit_code => 1}); + } + elsif (not $anvil->data->{cib}{parsed}{'local'}{ready}) + { + # We're not a full member (yet) + $anvil->Job->update_progress({ + progress => 10, + message => "error_0238", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0238"}); sleep 10; $anvil->nice_exit({exit_code => 1}); } diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index 025907e5..1005bcf2 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -8,6 +8,9 @@ # 0 = Normal exit. # 1 = Any problem that causes an early exit. 
# +# TODO: Support cloning; Example +# - virt-clone --original-xml /mnt/shared/definitions/.xml --name --file --check path_exists=off +# use strict; use warnings; @@ -1067,28 +1070,19 @@ sub check_drbd_minor_and_port $anvil->nice_exit({exit_code => 1}); } - if ((($anvil->data->{job}{drbd_minor} eq "") or ($anvil->data->{job}{drbd_tcp_port} eq "")) && (not $anvil->data->{job}{peer_mode})) + if (not $anvil->data->{job}{peer_mode}) { - my ($free_minor, $free_port) = $anvil->DRBD->get_next_resource({ + # We're primary, so query the minor number and TCP port + # The peer must use the TCP and minor as set in the job + ($anvil->data->{job}{drbd_minor}, $anvil->data->{job}{drbd_tcp_port}) = $anvil->DRBD->get_next_resource({ debug => 2, anvil_uuid => $anvil->data->{job}{anvil_uuid}, resource_name => $anvil->data->{job}{server_name}, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - free_minor => $free_minor, - free_port => $free_port, + 'job::drbd_minor' => $anvil->data->{job}{drbd_minor}, + 'job::drbd_tcp_port' => $anvil->data->{job}{drbd_tcp_port}, }}); - - if ($anvil->data->{job}{drbd_minor} eq "") - { - $anvil->data->{job}{drbd_minor} = $free_minor; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'job::drbd_minor' => $anvil->data->{job}{drbd_minor} }}); - } - if ($anvil->data->{job}{drbd_tcp_port} eq "") - { - $anvil->data->{job}{drbd_tcp_port} = $free_port; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'job::drbd_tcp_port' => $anvil->data->{job}{drbd_tcp_port} }}); - } } # If we don't have a DRBD minor or TCP port, we're stuck. @@ -1614,10 +1608,8 @@ sub interactive_ask_server_name }}); if ($server_state eq "DELETED") { - print "- ".$server_name." ("..")\n"; - print $anvil->Words->string({key => "message_0220", variables => { - server_name => $server_name, - }})."\n"; + ### NOTE: This could get cluttered, so for now we'll not show them. 
+ #print $anvil->Words->string({key => "message_0220", variables => { server_name => $server_name }})."\n"; } else { diff --git a/tools/test.pl b/tools/test.pl index 1a59d91d..98a94eb8 100755 --- a/tools/test.pl +++ b/tools/test.pl @@ -26,38 +26,7 @@ $anvil->Get->switches; $anvil->Database->connect({debug => 3}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0132"}); -my $key_string = 'message_0190 -job_0185 -job_0186,!!minor!5!!,!!port!7803!! -job_0188,!!job_uuid!12eeded2-c5bb-4295-8c8e-665bd9c9b83a!!,!!peer_name!mk-a02n01.digimer.ca!! -job_0189,!!lv_path!/dev/mk-a02n02_ssd0/srv02-lab02_0!! -job_0218 -job_0190,!!resource!srv02-lab02!! -job_0191,!!resource!srv02-lab02!! -job_0192 -job_0195 -job_0203,!!resource!srv02-lab02!! -job_0199,!!shell_call!/usr/bin/virt-install --connect qemu:///system \ ---name srv02-lab02 \ - --os-variant win2k19 \ - --memory 8192 \ - --events on_poweroff=destroy,on_reboot=restart \ - --vcpus 6,sockets=1,cores=6 \ - --cpu host \ - --network bridge=ifn1_bridge1,model=virtio \ - --graphics spice \ - --sound ich9 \ - --clock offset=localtime \ - --boot menu=on \ - --disk path=/dev/drbd/by-res/srv02-lab02/0,target.bus=virtio,driver.io=threads,cache=writeback,driver.discard=unmap,boot.order=1 \ - --disk path=/mnt/shared/files/Windows_Server_2019_eval.iso,device=cdrom,shareable=on,boot.order=2 \ - --disk path=/mnt/shared/files/virtio-win-0.1.185.iso,device=cdrom,shareable=on,boot.order=3 --force \ - --noautoconsole --wait -1 > /var/log/anvil-server_srv02-lab02.log -!! 
-job_0200'; -my ($free_minor, $free_port) = $anvil->Words->parse_banged_string({ - debug => 2, - key_string => $key_string, -}); +my ($minor, $tcp_port) = $anvil->DRBD->get_next_resource({debug => 2}); +print "Next free minor: [".$minor."], tcp port: [".$tcp_port."]\n"; $anvil->nice_exit({exit_code => 0}); From 9dbb39da5b3ca16e5e50d92cf37a637932e3a5eb Mon Sep 17 00:00:00 2001 From: Digimer Date: Wed, 3 Feb 2021 12:58:18 -0500 Subject: [PATCH 5/9] * Added support for manually setting the server's UUID in anvil-provision-server. Also, if a server name existed before but was deleted, the old UUID is re-used to provide better continuity. The user can override this behaviour with the new --uuid switch. Signed-off-by: Digimer --- tools/anvil-provision-server | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index 1005bcf2..716ff031 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -40,6 +40,7 @@ $anvil->data->{switches}{os} = ""; $anvil->data->{switches}{cpu} = ""; $anvil->data->{switches}{'job-uuid'} = ""; $anvil->data->{switches}{name} = ""; +$anvil->data->{switches}{uuid} = ""; $anvil->data->{switches}{ram} = ""; $anvil->data->{switches}{'storage-group'} = ""; $anvil->data->{switches}{'storage-size'} = ""; @@ -50,6 +51,7 @@ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list 'switches::cpu' => $anvil->data->{switches}{cpu}, 'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'}, 'switches::name' => $anvil->data->{switches}{name}, + 'switches::uuid' => $anvil->data->{switches}{uuid}, 'switches::ram' => $anvil->data->{switches}{ram}, 'switches::storage-group' => $anvil->data->{switches}{'storage-group'}, 'switches::storage-size' => $anvil->data->{switches}{'storage-size'}, @@ -409,6 +411,10 @@ sub provision_server $shell_call .= " --boot menu=on \\\n"; $shell_call .= " --disk 
path=/dev/drbd/by-res/".$server."/0,target.bus=virtio,driver.io=threads,cache=writeback,driver.discard=unmap,boot.order=1 \\\n"; $shell_call .= " --disk path=".$anvil->data->{job}{install_iso_path}.",device=cdrom,shareable=on,boot.order=2 \\\n"; + if ($anvil->data->{job}{server_uuid}) + { + $shell_call .= " --uuid=".$anvil->data->{job}{server_uuid}." \\\n"; + } if ($anvil->data->{job}{driver_iso_path}) { $shell_call .= " --disk path=".$anvil->data->{job}{driver_iso_path}.",device=cdrom,shareable=on,boot.order=3 --force \\\n"; @@ -1123,6 +1129,11 @@ sub parse_job_data $anvil->data->{job}{server_name} = $1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'job::server_name' => $anvil->data->{job}{server_name} }}); } + if ($line =~ /server_uuid=(.*)$/) + { + $anvil->data->{job}{server_uuid} = $1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'job::server_uuid' => $anvil->data->{job}{server_uuid} }}); + } if ($line =~ /cpu_cores=(.*)$/) { $anvil->data->{job}{cpu_cores} = $1; @@ -1433,6 +1444,7 @@ sub interactive_question } $anvil->data->{new_server}{name} = $anvil->data->{switches}{name} ? $anvil->data->{switches}{name} : ""; + $anvil->data->{new_server}{uuid} = $anvil->data->{switches}{uuid} ? $anvil->data->{switches}{uuid} : ""; # If this is a node, load the anvil_uuid automatically. @@ -1647,10 +1659,13 @@ sub interactive_ask_server_name }}); if ($server_state eq "DELETED") { - # Valid, we can re-use deleted server names. + # Valid, we can re-use deleted server names. We'll also re-use the + # UUID, if the user didn't specifically specify a UUID. 
$anvil->data->{new_server}{name} = $answer; + $anvil->data->{new_server}{uuid} = $server_uuid if not $anvil->data->{new_server}{uuid}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "new_server::name" => $anvil->data->{new_server}{name}, + "new_server::uuid" => $anvil->data->{new_server}{uuid}, }}); } else @@ -2396,6 +2411,11 @@ storage_group_uuid=".$anvil->data->{new_server}{storage_group}." storage_size=".$anvil->data->{new_server}{storage_size}." install_iso=".$anvil->data->{new_server}{install_media}." driver_iso=".$anvil->data->{new_server}{driver_disc}; + if ($anvil->data->{new_server}{uuid}) + { + $job_data .= " +server_uuid=".$anvil->data->{new_server}{name}; + } print "\n".$anvil->Words->string({key => "job_0183", variables => { job_data => $job_data }})."\n"; # Register the job with the primary node on the Anvil! (or node 1, if neither node is primary). From 569270541e9bbe3ff23583582c4e63e3cb69fe51 Mon Sep 17 00:00:00 2001 From: Digimer Date: Wed, 3 Feb 2021 14:34:08 -0500 Subject: [PATCH 6/9] * Added 'tar' as a dependency because somehow I went three years without this... Signed-off-by: Digimer --- anvil.spec.in | 1 + 1 file changed, 1 insertion(+) diff --git a/anvil.spec.in b/anvil.spec.in index de53d3cb..5fe40062 100644 --- a/anvil.spec.in +++ b/anvil.spec.in @@ -98,6 +98,7 @@ Requires: rsync Requires: screen Requires: smartmontools Requires: syslinux +Requires: tar Requires: tmux Requires: unzip Requires: usbutils From 2be14d93a6df3fc57f551931695fec2b7f5f1e7b Mon Sep 17 00:00:00 2001 From: Digimer Date: Sun, 7 Feb 2021 17:03:05 -0500 Subject: [PATCH 7/9] * Added a check to anvil-delete-server to remove the XML definition file. * Added checks to anvil-provision-server to see if an existing server name is flagged as DELETED, instead of outright rejecting a given server name. 
Signed-off-by: Digimer --- tools/test.pl | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/tools/test.pl b/tools/test.pl index 98a94eb8..1a59d91d 100755 --- a/tools/test.pl +++ b/tools/test.pl @@ -26,7 +26,38 @@ $anvil->Get->switches; $anvil->Database->connect({debug => 3}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0132"}); -my ($minor, $tcp_port) = $anvil->DRBD->get_next_resource({debug => 2}); -print "Next free minor: [".$minor."], tcp port: [".$tcp_port."]\n"; +my $key_string = 'message_0190 +job_0185 +job_0186,!!minor!5!!,!!port!7803!! +job_0188,!!job_uuid!12eeded2-c5bb-4295-8c8e-665bd9c9b83a!!,!!peer_name!mk-a02n01.digimer.ca!! +job_0189,!!lv_path!/dev/mk-a02n02_ssd0/srv02-lab02_0!! +job_0218 +job_0190,!!resource!srv02-lab02!! +job_0191,!!resource!srv02-lab02!! +job_0192 +job_0195 +job_0203,!!resource!srv02-lab02!! +job_0199,!!shell_call!/usr/bin/virt-install --connect qemu:///system \ +--name srv02-lab02 \ + --os-variant win2k19 \ + --memory 8192 \ + --events on_poweroff=destroy,on_reboot=restart \ + --vcpus 6,sockets=1,cores=6 \ + --cpu host \ + --network bridge=ifn1_bridge1,model=virtio \ + --graphics spice \ + --sound ich9 \ + --clock offset=localtime \ + --boot menu=on \ + --disk path=/dev/drbd/by-res/srv02-lab02/0,target.bus=virtio,driver.io=threads,cache=writeback,driver.discard=unmap,boot.order=1 \ + --disk path=/mnt/shared/files/Windows_Server_2019_eval.iso,device=cdrom,shareable=on,boot.order=2 \ + --disk path=/mnt/shared/files/virtio-win-0.1.185.iso,device=cdrom,shareable=on,boot.order=3 --force \ + --noautoconsole --wait -1 > /var/log/anvil-server_srv02-lab02.log +!! 
+job_0200'; +my ($free_minor, $free_port) = $anvil->Words->parse_banged_string({ + debug => 2, + key_string => $key_string, +}); $anvil->nice_exit({exit_code => 0}); From b2dab95459db3249dd4f7e976c3782b7f16d5669 Mon Sep 17 00:00:00 2001 From: Digimer Date: Fri, 5 Feb 2021 23:41:48 -0500 Subject: [PATCH 8/9] * Updated DRBD->delete_resource() to return a success if asked to delete a non-existent resource (as can happen when partial anvil-delete-server runs are re-run). * Reworked DRBD->get_next_resource() to pull from the database, and to no longer do that increments-of-three nonsense. Avoidable complexity. Also added a call to Cluster->get_anvil_uuid() if the 'anvil_uuid' parameter wasn't passed. * Updated Database->get_host_from_uuid() and ->get_hosts() to now take 'include_deleted' parameter and default to not returning deleted hosts. This fixed issues where anvil-{delete,provision}-server calls could assign jobs to now-deleted hosts with reused host names. * Updated anvil-delete-server to print log entries to STDOUT. Also updated it to not wait for the shutdown of a server in pacemaker to complete, and instead to destroy it after calling pacemaker's resource stop. Updated to also check to see if the server being deleted is already out of pacemaker and, if so, skip that step and directly try to destroy the server, if it's running. * Updated anvil-provision-server to force 'peer_mode' runs to pull their TCP Port and DRBD minor numbers from the job. This fixes a bug where the same resource on two machines could use different TCP ports. 
Signed-off-by: Digimer --- tools/test.pl | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/tools/test.pl b/tools/test.pl index 1a59d91d..98a94eb8 100755 --- a/tools/test.pl +++ b/tools/test.pl @@ -26,38 +26,7 @@ $anvil->Get->switches; $anvil->Database->connect({debug => 3}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0132"}); -my $key_string = 'message_0190 -job_0185 -job_0186,!!minor!5!!,!!port!7803!! -job_0188,!!job_uuid!12eeded2-c5bb-4295-8c8e-665bd9c9b83a!!,!!peer_name!mk-a02n01.digimer.ca!! -job_0189,!!lv_path!/dev/mk-a02n02_ssd0/srv02-lab02_0!! -job_0218 -job_0190,!!resource!srv02-lab02!! -job_0191,!!resource!srv02-lab02!! -job_0192 -job_0195 -job_0203,!!resource!srv02-lab02!! -job_0199,!!shell_call!/usr/bin/virt-install --connect qemu:///system \ ---name srv02-lab02 \ - --os-variant win2k19 \ - --memory 8192 \ - --events on_poweroff=destroy,on_reboot=restart \ - --vcpus 6,sockets=1,cores=6 \ - --cpu host \ - --network bridge=ifn1_bridge1,model=virtio \ - --graphics spice \ - --sound ich9 \ - --clock offset=localtime \ - --boot menu=on \ - --disk path=/dev/drbd/by-res/srv02-lab02/0,target.bus=virtio,driver.io=threads,cache=writeback,driver.discard=unmap,boot.order=1 \ - --disk path=/mnt/shared/files/Windows_Server_2019_eval.iso,device=cdrom,shareable=on,boot.order=2 \ - --disk path=/mnt/shared/files/virtio-win-0.1.185.iso,device=cdrom,shareable=on,boot.order=3 --force \ - --noautoconsole --wait -1 > /var/log/anvil-server_srv02-lab02.log -!! 
-job_0200'; -my ($free_minor, $free_port) = $anvil->Words->parse_banged_string({ - debug => 2, - key_string => $key_string, -}); +my ($minor, $tcp_port) = $anvil->DRBD->get_next_resource({debug => 2}); +print "Next free minor: [".$minor."], tcp port: [".$tcp_port."]\n"; $anvil->nice_exit({exit_code => 0}); From 6009590352acd44c2bbcd5f5a45016af54595c5e Mon Sep 17 00:00:00 2001 From: Digimer Date: Sun, 7 Feb 2021 16:55:43 -0500 Subject: [PATCH 9/9] * Fixed a bug in scan-apc-ups where changes in the transfer reason were not being recorded. * Cleaned up a lot of logging to reduce the amount of log entries when running at log level 1. * Bumped the scan-ipmitool default 'jump' range to 10c. Signed-off-by: Digimer --- Anvil/Tools/Email.pm | 5 ++--- Anvil/Tools/ScanCore.pm | 11 ++++++++--- scancore-agents/scan-apc-ups/scan-apc-ups | 20 ++++++++++++++++++-- scancore-agents/scan-ipmitool/scan-ipmitool | 2 +- share/words.xml | 2 +- tools/scancore | 13 ++++++++----- 6 files changed, 38 insertions(+), 15 deletions(-) diff --git a/Anvil/Tools/Email.pm b/Anvil/Tools/Email.pm index 035e3942..ec9a7157 100644 --- a/Anvil/Tools/Email.pm +++ b/Anvil/Tools/Email.pm @@ -623,10 +623,9 @@ Reply-To: ".$reply_to." my $file_time = $anvil->Get->date_and_time({file_name => 1}); my $short_uuid = $anvil->Get->uuid({short => 1}); my $file_name = $anvil->data->{path}{directories}{alert_emails}."/alert_email.".$file_time.".".$short_uuid; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0541", variables => { file => $file_name }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0541", variables => { file => $file_name }}); my $problem = $anvil->Storage->write_file({ - debug => 3, file => $file_name, body => $email_body, }); @@ -640,7 +639,7 @@ Reply-To: ".$reply_to." 
} else { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0542", variables => { to => $to }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0542", variables => { to => $to }}); my $shell_call = $anvil->data->{path}{exe}{mailx}." -t < ".$file_name; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }}); diff --git a/Anvil/Tools/ScanCore.pm b/Anvil/Tools/ScanCore.pm index 663b48f2..07a24272 100644 --- a/Anvil/Tools/ScanCore.pm +++ b/Anvil/Tools/ScanCore.pm @@ -259,7 +259,7 @@ sub call_scan_agents $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }}); # Tell the user this agent is about to run... - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0252", variables => { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0252", variables => { agent_name => $agent_name, timeout => $timeout, }}); @@ -269,9 +269,14 @@ sub call_scan_agents { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { line => $line }}); } - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0557", variables => { + + # If an agent takes a while to run, log it with higher verbosity + my $runtime = (time - $start_time); + my $log_level = $runtime > 10 ? 
1 : $debug; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { output => $output, runtime => $runtime }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => $log_level, key => "log_0557", variables => { agent_name => $agent_name, - runtime => (time - $start_time), + runtime => $runtime, return_code => $return_code, }}); diff --git a/scancore-agents/scan-apc-ups/scan-apc-ups b/scancore-agents/scan-apc-ups/scan-apc-ups index f15648ac..7f17abfa 100755 --- a/scancore-agents/scan-apc-ups/scan-apc-ups +++ b/scancore-agents/scan-apc-ups/scan-apc-ups @@ -698,6 +698,8 @@ INSERT INTO # some being more critical than others. if ($scan_apc_ups_last_transfer_reason ne $old_scan_apc_ups_last_transfer_reason) { + $ups_changed = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { ups_changed => $ups_changed }}); ### NOTE: This used to be 'warning' level, but it caused way too many ### false alarms. 
If a serious issue arises, the input voltage @@ -729,13 +731,27 @@ INSERT INTO $say_old_scan_apc_ups_last_transfer_reason = 99; } + # The level of the alert will depend on the new/old state + my $log_level = 3; + my $alert_level = "info"; + if (($scan_apc_ups_last_transfer_reason eq "2") or ($old_scan_apc_ups_last_transfer_reason eq "2") or + ($scan_apc_ups_last_transfer_reason eq "3") or ($old_scan_apc_ups_last_transfer_reason eq "3") or + ($scan_apc_ups_last_transfer_reason eq "4") or ($old_scan_apc_ups_last_transfer_reason eq "3") or + ($scan_apc_ups_last_transfer_reason eq "6") or ($old_scan_apc_ups_last_transfer_reason eq "6") or + ($scan_apc_ups_last_transfer_reason eq "8") or ($old_scan_apc_ups_last_transfer_reason eq "8") or + ($scan_apc_ups_last_transfer_reason eq "10") or ($old_scan_apc_ups_last_transfer_reason eq "10")) + { + $log_level = 2; + $alert_level = "notice"; + } + my $variables = { ups_name => $scan_apc_ups_name, new_value => "#!string!scan_apc_ups_last_transfer_".sprintf("%04d", $say_scan_apc_ups_last_transfer_reason)."!#", old_value => "#!string!scan_apc_ups_last_transfer_".sprintf("%04d", $say_old_scan_apc_ups_last_transfer_reason)."!#", }; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_apc_ups_warning_0015", variables => $variables}); - $anvil->Alert->register({alert_level => "notice", message => "scan_apc_ups_warning_0015", variables => $variables, set_by => $THIS_FILE, sort_position => $anvil->data->{'scan-apc-pdu'}{alert_sort}++}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $log_level, key => "scan_apc_ups_warning_0015", variables => $variables}); + $anvil->Alert->register({alert_level => $alert_level, message => "scan_apc_ups_warning_0015", variables => $variables, set_by => $THIS_FILE, sort_position => $anvil->data->{'scan-apc-pdu'}{alert_sort}++}); } if ($scan_apc_ups_manufactured_date ne $old_scan_apc_ups_manufactured_date) { diff --git 
a/scancore-agents/scan-ipmitool/scan-ipmitool b/scancore-agents/scan-ipmitool/scan-ipmitool index 438803e1..adf1ac37 100755 --- a/scancore-agents/scan-ipmitool/scan-ipmitool +++ b/scancore-agents/scan-ipmitool/scan-ipmitool @@ -124,7 +124,7 @@ $anvil->data->{'scan-ipmitool'} = { high_critical => 55, low_warning => 5, low_critical => 0, - jump => 5, + jump => 10, ### TODO: Some sensors define their hysteresis which we can read using: ### ipmitool ... sensor get "Ambient" buffer => 2, diff --git a/share/words.xml b/share/words.xml index 0835efa6..9d05966c 100644 --- a/share/words.xml +++ b/share/words.xml @@ -949,7 +949,7 @@ The body of the file: [#!variable!file!#] does not match the new body. The file [ Warning ] - The local system is not yet configured. Scancore will check once a minute and start running once configured. [ Cleared ] - The local system is now configured, proceeding. ScanCore is entering the main loop now. - ----=] ScanCore loop finished. Sleeping for: [#!variable!run_interval!#] seconds. ]=-------------------------------------- + ----=] ScanCore loop finished after: [#!variable!runtime!#]. Sleeping for: [#!variable!run_interval!#] seconds. ]=-------------------------------------- The md5sum of: [#!variable!file!#] has changed since the daemon started. * [#!variable!old_sum!#] -> [#!variable!new_sum!#] diff --git a/tools/scancore b/tools/scancore index 5b8307a2..e5b92515 100755 --- a/tools/scancore +++ b/tools/scancore @@ -97,10 +97,11 @@ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level while(1) { # Do the various pre-run tasks. - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { 'sys::log::level' => $anvil->data->{sys}{'log'}{level} }}); + my $start_time = time; prepare_for_run($anvil); # Do we have at least one database? 
+ my $agent_runtime = 0; if ($anvil->data->{sys}{database}{connections}) { # Run the normal tasks @@ -147,7 +148,10 @@ while(1) { $run_interval = $anvil->data->{scancore}{timing}{run_interval}; } - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0249", variables => { run_interval => $run_interval }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0249", variables => { + run_interval => $run_interval, + runtime => (time - $start_time), + }}); sleep($run_interval); # In case something has changed, exit. @@ -211,7 +215,7 @@ sub prepare_for_run $anvil->Get->switches(); $anvil->Words->read(); $anvil->Database->connect(); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0132"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0132"}); # See if the mail server needs to be updated. $anvil->Email->check_config; @@ -303,9 +307,8 @@ sub startup_tasks my ($anvil) = @_; # Make sure our stop reason is cleared. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0572"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0572"}); my $variable_uuid = $anvil->Database->insert_or_update_variables({ - debug => 2, variable_name => 'system::stop_reason', variable_value => '', variable_default => '',