From e7537b0ca3ea399281db438a4408616a9c76c0a6 Mon Sep 17 00:00:00 2001 From: digimer Date: Tue, 25 Apr 2023 13:12:13 -0400 Subject: [PATCH 1/8] * Fixed a bug where, when DRBD->gather_data() calls 'drbdadm dump-xml' and the output includes usage data, it breaks XML parsing. * Fixed a bug in Get->available_resources() where DELETED servers were being counted in the used resources math. Signed-off-by: digimer --- Anvil/Tools/DRBD.pm | 18 ++++++++++++++++++ Anvil/Tools/Get.pm | 12 +++++++++--- notes | 1 + 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/Anvil/Tools/DRBD.pm b/Anvil/Tools/DRBD.pm index 93cfc116..fab080de 100644 --- a/Anvil/Tools/DRBD.pm +++ b/Anvil/Tools/DRBD.pm @@ -738,6 +738,24 @@ sub gather_data local_short_host_name => $local_short_host_name, }}); + # Often, annoyingly, DRBD reports a message about usage before showing the XML. We need to detect and + # strip that off. + my $new_xml = ""; + my $in_xml = 0; + foreach my $line (split/\n/, $xml) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { line => $line }}); + if ($line =~ /Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { in_xml => $in_xml }}); + } + next if not $in_xml; + $new_xml .= $line."\n"; + } + $xml = $new_xml; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { xml => $xml }}); + local $@; my $dom = eval { XML::LibXML->load_xml(string => $xml); }; if ($@) diff --git a/Anvil/Tools/Get.pm b/Anvil/Tools/Get.pm index 2b80a9a4..fe0b5f0d 100644 --- a/Anvil/Tools/Get.pm +++ b/Anvil/Tools/Get.pm @@ -601,7 +601,7 @@ WHERE scan_hardware_cpu_cores => $scan_hardware_cpu_cores, scan_hardware_cpu_threads => $scan_hardware_cpu_threads, scan_hardware_cpu_model => $scan_hardware_cpu_model, - scan_hardware_ram_total => $scan_hardware_ram_total, + scan_hardware_ram_total => $scan_hardware_ram_total." 
(".$anvil->Convert->bytes_to_human_readable({'bytes' => $scan_hardware_ram_total}).")", }}); $anvil->data->{anvil_resources}{$anvil_uuid}{host_uuid}{$host_uuid}{cpu}{cores} = $scan_hardware_cpu_cores; @@ -663,7 +663,9 @@ SELECT FROM servers WHERE - server_anvil_uuid = ".$anvil->Database->quote($anvil_uuid)." + server_anvil_uuid = ".$anvil->Database->quote($anvil_uuid)." +AND + server_state != 'DELETED' ORDER BY server_name ASC;"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); @@ -693,9 +695,13 @@ ORDER BY if (not exists $anvil->data->{anvil_resources}{ram}{reserved}) { $anvil->data->{anvil_resources}{ram}{reserved} = $default_reserved; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + default_reserved => $default_reserved, + "anvil_resources::ram::reserved" => $anvil->data->{anvil_resources}{ram}{reserved}, + }}); } $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - "anvil_resources::ram::reserved" => $anvil->data->{anvil_resources}{ram}{reserved}." 
(".$anvil->Convert->bytes_to_human_readable({'bytes' => $anvil->data->{anvil_resources}{ram}{reserved}}).")", + "anvil_resources::ram::reserved" => $anvil->data->{anvil_resources}{ram}{reserved}, }}); $anvil->data->{anvil_resources}{ram}{reserved} =~ s/,//g; diff --git a/notes b/notes index 628d8f1b..3c0ce0dd 100644 --- a/notes +++ b/notes @@ -17,6 +17,7 @@ Common queries; * SELECT a.host_name, b.file_name, c.file_location_active FROM hosts a, files b, file_locations c WHERE a.host_uuid = c.file_location_host_uuid AND b.file_uuid = c.file_location_file_uuid ORDER BY b.file_name ASC, a.host_name ASC; * SELECT a.dr_link_uuid, b.host_name, c.anvil_name, a.dr_link_note FROM dr_links a, hosts b, anvils c WHERE a.dr_link_host_uuid = b.host_uuid AND a.dr_link_anvil_uuid = c.anvil_uuid ORDER BY c.anvil_name ASC, b.host_name ASC; * SELECT a.storage_group_uuid, d.storage_group_member_uuid, b.anvil_name, a.storage_group_name, c.host_name, d.storage_group_member_vg_uuid, d.storage_group_member_note FROM storage_groups a, anvils b, hosts c, storage_group_members d WHERE a.storage_group_uuid = d.storage_group_member_storage_group_uuid AND a.storage_group_anvil_uuid = b.anvil_uuid AND c.host_uuid = d.storage_group_member_host_uuid ORDER BY a.storage_group_name ASC, c.host_name ASC; +* SELECT a.scan_hardware_uuid, b.host_name, a.scan_hardware_cpu_cores AS cores, a.scan_hardware_cpu_threads AS threads, pg_size_pretty(a.scan_hardware_ram_total) AS ram_total, pg_size_pretty(a.scan_hardware_memory_total) AS memory_total, pg_size_pretty(a.scan_hardware_memory_free) AS memory_free FROM scan_hardware a, hosts b WHERE a.scan_hardware_host_uuid = b.host_uuid ORDER BY b.host_name ASC; # Fail a resource for testing purposes. From 895f1ec2627c3d0031c97401b960d06b4a6dd008 Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 28 Apr 2023 00:19:53 -0400 Subject: [PATCH 2/8] This fixes a race condition when multiple servers are provisioned at (nearly) the same time. 
* In DRBD->get_next_resource(), implemented a "hold" system where the DRBD minor and TCP port(s) returned are marked as being held for one minute. So subsequent calls won't use the same numbers. * In anvil-daemon, added a check in run_jobs() where only one instance of a given job command will be started per 2-second loop. This should help reduce the chance of simultaneous race conditions in general. * Removed from anvil-provision-server and most other tools the call to Job->get_job_uuid(). If the program is called without the job_uuid, don't try to find it. This allows a human (or script) to make repeated calls to a program without one of those calls running a pending job instead. Signed-off-by: digimer --- Anvil/Tools/Cluster.pm | 3 + Anvil/Tools/DRBD.pm | 138 ++++++++++++++++++++++++++++++ notes | 3 + share/words.xml | 2 + tools/anvil-boot-server | 8 -- tools/anvil-daemon | 22 +++++ tools/anvil-delete-server | 8 -- tools/anvil-download-file | 13 --- tools/anvil-manage-power | 5 -- tools/anvil-manage-server | 8 -- tools/anvil-manage-server-storage | 8 -- tools/anvil-migrate-server | 8 -- tools/anvil-provision-server | 13 +-- tools/anvil-rename-server | 8 -- tools/anvil-safe-stop | 8 -- tools/anvil-shutdown-server | 8 -- tools/anvil-sync-shared | 8 -- tools/anvil-update-system | 8 -- tools/striker-boot-machine | 8 -- 19 files changed, 172 insertions(+), 115 deletions(-) diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index 76c2cf03..3f6c846a 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -143,6 +143,9 @@ sub add_server if (exists $anvil->data->{cib}{parsed}{cib}{resources}{primitive}{$server_name}{type}) { # The server already exists + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "cib::parsed::cib::resources::primitive::${server_name}::type" => $anvil->data->{cib}{parsed}{cib}{resources}{primitive}{$server_name}{type}, + }}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 
level => 0, priority => "err", key => "error_0213", variables => { server_name => $server_name }}); return("!!error!!"); } diff --git a/Anvil/Tools/DRBD.pm b/Anvil/Tools/DRBD.pm index fab080de..26a2d135 100644 --- a/Anvil/Tools/DRBD.pm +++ b/Anvil/Tools/DRBD.pm @@ -1921,6 +1921,75 @@ ORDER BY } else { + # See if this minor is held by someone. + my $variable_name = "drbd::hold::minor::".$free_minor."::until"; + my ($variable_value, $variable_uuid, undef) = $anvil->Database->read_variable({ + debug => $debug, + variable_name => $variable_name, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + 's1:variable_name' => $variable_name, + 's2:variable_value' => $variable_value, + 's3:variable_uuid' => $variable_uuid, + }}); + + if (($variable_value) && ($variable_value !~ /^\d+$/)) + { + # Bad value, clear it. + $variable_uuid = $anvil->Database->insert_or_update_variables({ + debug => $debug, + variable_uuid => $variable_uuid, + variable_value => "0", + update_value_only => "", + }); + $variable_value = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + variable_uuid => $variable_uuid, + variable_value => $variable_value + }}); + } + + if ($variable_uuid) + { + my $now_time = time; + my $age = $now_time - $variable_value; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + age => $age, + now_time => $now_time, + }}); + if (($variable_value) && ($now_time > $variable_value)) + { + # This is being held, move on. + $free_minor++; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { free_minor => $free_minor }}); + next; + } + else + { + # Either the hold is stale or invalid, delete it. 
+ $variable_uuid = $anvil->Database->insert_or_update_variables({ + debug => $debug, + variable_uuid => $variable_uuid, + variable_value => "0", + update_value_only => "", + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { variable_uuid => $variable_uuid }}); + } + } + + # To prevent race conditions, put a one minute hold on the minor number. + $variable_uuid = $anvil->Database->insert_or_update_variables({ + debug => $debug, + variable_name => $variable_name, + variable_value => time+60, + variable_default => "0", + variable_description => "striker_0301", + variable_section => "hold", + variable_source_uuid => "NULL", + variable_source_table => "", + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }}); + $looking = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { looking => $looking }}); } @@ -1956,6 +2025,74 @@ ORDER BY } else { + # See if this minor is held by someone. + my $variable_name = "drbd::hold::tcp_port::".$check_port."::until"; + my ($variable_value, $variable_uuid, undef) = $anvil->Database->read_variable({ + debug => $debug, + variable_name => $variable_name, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + 's1:variable_name' => $variable_name, + 's2:variable_value' => $variable_value, + 's3:variable_uuid' => $variable_uuid, + }}); + + if (($variable_value) && ($variable_value !~ /^\d+$/)) + { + # Bad value, clear it. 
+ $variable_uuid = $anvil->Database->insert_or_update_variables({ + debug => $debug, + variable_uuid => $variable_uuid, + variable_value => "0", + update_value_only => "", + }); + $variable_value = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + variable_uuid => $variable_uuid, + variable_value => $variable_value + }}); + } + + if ($variable_uuid) + { + my $now_time = time; + my $age = $now_time - $variable_value; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + age => $age, + now_time => $now_time }}); + if (($variable_value) && ($now_time > $variable_value)) + { + # This is being held, move on. + $check_port++; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { check_port => $check_port }}); + next; + } + else + { + # Either the hold is stale or invalid, delete it. + $variable_uuid = $anvil->Database->insert_or_update_variables({ + debug => $debug, + variable_uuid => $variable_uuid, + variable_value => "0", + update_value_only => "", + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { variable_uuid => $variable_uuid }}); + } + } + + # To prevent a race condition, put a one minute hold on this port number. + $variable_uuid = $anvil->Database->insert_or_update_variables({ + debug => $debug, + variable_name => $variable_name, + variable_value => time+60, + variable_default => "0", + variable_description => "striker_0301", + variable_section => "hold", + variable_source_uuid => "NULL", + variable_source_table => "", + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }}); + # This is a free port. $free_ports .= $check_port.","; $port_count++; @@ -1977,6 +2114,7 @@ ORDER BY } } + # Mark these ports as assigned. 
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { free_minor => $free_minor, free_ports => $free_ports, diff --git a/notes b/notes index 3c0ce0dd..0ba53630 100644 --- a/notes +++ b/notes @@ -20,6 +20,9 @@ Common queries; * SELECT a.scan_hardware_uuid, b.host_name, a.scan_hardware_cpu_cores AS cores, a.scan_hardware_cpu_threads AS threads, pg_size_pretty(a.scan_hardware_ram_total) AS ram_total, pg_size_pretty(a.scan_hardware_memory_total) AS memory_total, pg_size_pretty(a.scan_hardware_memory_free) AS memory_free FROM scan_hardware a, hosts b WHERE a.scan_hardware_host_uuid = b.host_uuid ORDER BY b.host_name ASC; +for lv in $(lvscan | grep deploy| awk '{print $2}' | sed s/\'//g); do lvremove -y $lv; done; rm -f /etc/drbd.d/an-test-deploy*; lvscan; ls -lah /etc/drbd.d/ + + # Fail a resource for testing purposes. crm_resource --fail --resource srv02-b -N vm-a01n01 diff --git a/share/words.xml b/share/words.xml index c93548d5..68080f25 100644 --- a/share/words.xml +++ b/share/words.xml @@ -2371,6 +2371,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is: The DR host: [#!variable!host!#] as been linked to the Anvil! node: [#!variable!anvil!#]. The DR host: [#!variable!host!#] as been _unlinked_ to the Anvil! node: [#!variable!anvil!#]. The DR host: [#!variable!host!#] was not linked to the Anvil! node: [#!variable!anvil!#], nothing to do. + The job: [#!variable!command!#] (with job UUID: [#!variable!job_uuid!#]) is being skipped for now, already started a job (started job_uuid: [#!variable!started_job!#]) with this command on this loop. The host name: [#!variable!target!#] does not resolve to an IP address. @@ -3213,6 +3214,7 @@ If you are comfortable that the target has changed for a known reason, you can s TCP Port Migration Network link #!variable!number!# This is where you configure the optional network dedicated to RAM-copy during live migrations. 
+ This puts a temporary hold on a DRBD minor number or TCP port so that it isn't used again in the time between when it was queried as the next free number, and before it can be used. #!variable!number!#/sec diff --git a/tools/anvil-boot-server b/tools/anvil-boot-server index a64e5ce7..3a40042f 100755 --- a/tools/anvil-boot-server +++ b/tools/anvil-boot-server @@ -43,14 +43,6 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->nice_exit({exit_code => 1}); } -# If we don't have a job UUID, try to find one. -if (not $anvil->data->{switches}{'job-uuid'}) -{ - # Load the job data. - $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} - if ($anvil->data->{switches}{'job-uuid'}) { # Load the job data. diff --git a/tools/anvil-daemon b/tools/anvil-daemon index bcd15b75..c0116114 100755 --- a/tools/anvil-daemon +++ b/tools/anvil-daemon @@ -1481,6 +1481,28 @@ sub run_jobs updated_seconds_ago => $updated_seconds_ago, }}); + # To minimize the chance of race conditions, any given command will be called only once at a + # time. If two jobs of the same command exist, only one will be called. + if ($job_progress != 100) + { + my $short_command = $job_command; + $short_command =~ s/\s.*$//; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { short_command => $short_command }}); + if (exists $anvil->data->{sys}{started}{$short_command}) + { + # Skip it. 
+ my $started_job = $anvil->data->{sys}{started}{$short_command}; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0737", variables => { + started_job => $started_job, + job_uuid => $job_uuid, + command => $short_command, + }}); + next; + } + $anvil->data->{sys}{started}{$short_command} = $job_uuid; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "sys::started::${short_command}" => $anvil->data->{sys}{started}{$short_command} }}); + } + # If this is a start-up call, only start jobs whose status is 'anvil_startup'. if (($startup) && ($job_status ne "anvil_startup")) { diff --git a/tools/anvil-delete-server b/tools/anvil-delete-server index 3dac3791..4f756eb3 100755 --- a/tools/anvil-delete-server +++ b/tools/anvil-delete-server @@ -54,14 +54,6 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'sys::anvil_uuid' => $anvil->data->{sys}{anvil_uuid} }}); -# If we don't have a job UUID, try to find one. -if (not $anvil->data->{switches}{'job-uuid'}) -{ - # Load the job data. - $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({debug => 2, program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} - # If we still don't have a job-uuit, go into interactive mode. 
if ($anvil->data->{switches}{'job-uuid'}) { diff --git a/tools/anvil-download-file b/tools/anvil-download-file index e56dd664..4e516a8c 100755 --- a/tools/anvil-download-file +++ b/tools/anvil-download-file @@ -105,19 +105,6 @@ sub get_job_details { my ($anvil) = @_; - # If I don't have a job-uuid, see if any jobs are pending - if (not $anvil->data->{switches}{'job-uuid'}) - { - my $job_uuid = $anvil->Job->get_job_uuid({debug => 2, program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); - if ($anvil->Validate->uuid({uuid => $job_uuid})) - { - # Got one! - $anvil->data->{switches}{'job-uuid'} = $job_uuid; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'} }}); - } - } - # If we've got a job-uuid, load the details. $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'} }}); if ($anvil->data->{switches}{'job-uuid'}) diff --git a/tools/anvil-manage-power b/tools/anvil-manage-power index 115075fb..1bf093e1 100755 --- a/tools/anvil-manage-power +++ b/tools/anvil-manage-power @@ -228,11 +228,6 @@ sub do_poweroff $job_uuid = $anvil->data->{switches}{'job-uuid'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); } - else - { - $job_uuid = $anvil->Job->get_job_uuid({debug => 2, program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); - } # Make sure the 'reboot needed' flag is set. When 'anvil-daemon' starts, it will use this to confirm # that it is starting post-reboot and clear it. 
diff --git a/tools/anvil-manage-server b/tools/anvil-manage-server index 7dfe2c02..42aec54b 100755 --- a/tools/anvil-manage-server +++ b/tools/anvil-manage-server @@ -52,14 +52,6 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->nice_exit({exit_code => 1}); } -# If we don't have a job UUID, try to find one. -if (not $anvil->data->{switches}{'job-uuid'}) -{ - # Load the job data. - $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} - # If we still don't have a job-uuit, go into interactive mode. if ($anvil->data->{switches}{'job-uuid'}) { diff --git a/tools/anvil-manage-server-storage b/tools/anvil-manage-server-storage index 3f19ac74..2f956b28 100755 --- a/tools/anvil-manage-server-storage +++ b/tools/anvil-manage-server-storage @@ -70,14 +70,6 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->nice_exit({exit_code => 1}); } -# If we don't have a job UUID, try to find one. -if (not $anvil->data->{switches}{'job-uuid'}) -{ - # Load the job data. - $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} - $anvil->Database->get_hosts(); $anvil->Database->get_anvils(); $anvil->Database->get_servers(); diff --git a/tools/anvil-migrate-server b/tools/anvil-migrate-server index 52348b40..fd6b75a3 100755 --- a/tools/anvil-migrate-server +++ b/tools/anvil-migrate-server @@ -59,14 +59,6 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->nice_exit({exit_code => 1}); } -# If we don't have a job UUID, try to find one. -if (not $anvil->data->{switches}{'job-uuid'}) -{ - # Load the job data. 
- $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} - if ($anvil->data->{switches}{'job-uuid'}) { # Load the job data. diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index 3ccee7dc..ad24b2f4 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -66,14 +66,6 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->nice_exit({exit_code => 1}); } -# If we don't have a job UUID, try to find one. -if (not $anvil->data->{switches}{'job-uuid'}) -{ - # Load the job data. - $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} - # If we still don't have a job-uuit, go into interactive mode. if ($anvil->data->{switches}{'job-uuid'}) { @@ -354,7 +346,10 @@ sub add_server_to_cluster $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0207"}); # Is our peer in the cluster? For that matter, are we? - my $problem = $anvil->Cluster->add_server({server_name => $anvil->data->{job}{server_name}}); + my $problem = $anvil->Cluster->add_server({ + debug => 2, + server_name => $anvil->data->{job}{server_name}, + }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); if ($problem) { diff --git a/tools/anvil-rename-server b/tools/anvil-rename-server index c010bf69..bef64cd4 100755 --- a/tools/anvil-rename-server +++ b/tools/anvil-rename-server @@ -54,14 +54,6 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->nice_exit({exit_code => 1}); } -# If we don't have a job UUID, try to find one. 
-if (not $anvil->data->{switches}{'job-uuid'}) -{ - # Load the job data. - $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} - # If we still don't have a job-uuit, go into interactive mode. if ($anvil->data->{switches}{'job-uuid'}) { diff --git a/tools/anvil-safe-stop b/tools/anvil-safe-stop index 7b217eed..6c6b99cf 100755 --- a/tools/anvil-safe-stop +++ b/tools/anvil-safe-stop @@ -74,14 +74,6 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->nice_exit({exit_code => 1}); } -# If we don't have a job UUID, try to find one. -if (not $anvil->data->{switches}{'job-uuid'}) -{ - # Load the job data. - $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} - # If we still don't have a job-uuit, go into interactive mode. if ($anvil->data->{switches}{'job-uuid'}) { diff --git a/tools/anvil-shutdown-server b/tools/anvil-shutdown-server index a1cdbf7c..992c0b01 100755 --- a/tools/anvil-shutdown-server +++ b/tools/anvil-shutdown-server @@ -53,14 +53,6 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->nice_exit({exit_code => 1}); } -# If we don't have a job UUID, try to find one. -if (not $anvil->data->{switches}{'job-uuid'}) -{ - # Load the job data. - $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} - if ($anvil->data->{switches}{'job-uuid'}) { # Load the job data. 
diff --git a/tools/anvil-sync-shared b/tools/anvil-sync-shared index be8d839c..db6cffa3 100755 --- a/tools/anvil-sync-shared +++ b/tools/anvil-sync-shared @@ -40,14 +40,6 @@ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => " $anvil->Database->connect; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0132"}); -# If we don't have a job-uuid, look for one. -if (not $anvil->data->{switches}{'job-uuid'}) -{ - # Load the job data. - $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({debug => 2, program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} - # If we have a job-uuid, process it. if ($anvil->data->{switches}{'job-uuid'}) { diff --git a/tools/anvil-update-system b/tools/anvil-update-system index 06600601..78874e2f 100755 --- a/tools/anvil-update-system +++ b/tools/anvil-update-system @@ -54,14 +54,6 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->nice_exit({exit_code => 1}); } -# Did we get called with a job UUID? If not, try to find a pending job and take it. -$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -if (not $anvil->data->{switches}{'job-uuid'}) -{ - # See if a job is waiting to run. - $anvil->data->{switches}{job_uuid} = $anvil->Job->get_job_uuid({program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} if ($anvil->data->{switches}{'job-uuid'}) { # Load the job details. If anything is returned, there was a problem. 
diff --git a/tools/striker-boot-machine b/tools/striker-boot-machine index 2ffdc1dd..d9ef51b6 100755 --- a/tools/striker-boot-machine +++ b/tools/striker-boot-machine @@ -48,14 +48,6 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->nice_exit({exit_code => 1}); } -# If we don't have a job UUID, try to find one. -if (not $anvil->data->{switches}{'job-uuid'}) -{ - # Load the job data. - $anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }}); -} - if ($anvil->data->{switches}{'job-uuid'}) { # Load the job data. From 9a58f4d1ffc323607a2739c2df2161705aa8f7d3 Mon Sep 17 00:00:00 2001 From: digimer Date: Sun, 30 Apr 2023 19:47:58 -0400 Subject: [PATCH 3/8] * This is a small commit to increase logging while chasing down a race condition issue with assembling storage groups. Signed-off-by: digimer --- Anvil/Tools/Get.pm | 2 +- tools/anvil-manage-storage-groups | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Anvil/Tools/Get.pm b/Anvil/Tools/Get.pm index fe0b5f0d..0ec80857 100644 --- a/Anvil/Tools/Get.pm +++ b/Anvil/Tools/Get.pm @@ -527,7 +527,7 @@ sub available_resources # This both loads storage group data and assembles ungrouped VGs into storage groups, when possible. 
$anvil->Cluster->assemble_storage_groups({ - debug => $debug, + debug => 2, anvil_uuid => $anvil_uuid, }); diff --git a/tools/anvil-manage-storage-groups b/tools/anvil-manage-storage-groups index 62500ca2..ee8ca50a 100755 --- a/tools/anvil-manage-storage-groups +++ b/tools/anvil-manage-storage-groups @@ -32,6 +32,8 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) $| = 1; my $anvil = Anvil::Tools->new(); +$anvil->Log->level({set => 2}); +$anvil->Log->secure({set => 1}); $anvil->Get->switches({list => [ "add", From 1bba56a5b1596b95f27c17a28abe1592a287b34f Mon Sep 17 00:00:00 2001 From: digimer Date: Mon, 1 May 2023 10:54:51 -0400 Subject: [PATCH 4/8] Hard coded anvil-provision-server to log level 2 while chasing a race condition in storage groups. Signed-off-by: digimer --- tools/anvil-provision-server | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index ad24b2f4..6c42f8e6 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -31,6 +31,8 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) $| = 1; my $anvil = Anvil::Tools->new(); +$anvil->Log->level({set => 2}); +$anvil->Log->secure({set => 1}); # Read switches $anvil->Get->switches({list => [ From d64044c7d187607e06f19bf37192ce3de872dd8b Mon Sep 17 00:00:00 2001 From: digimer Date: Mon, 1 May 2023 13:48:27 -0400 Subject: [PATCH 5/8] Test fix for storage group race condition. Signed-off-by: digimer --- Anvil/Tools/Cluster.pm | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index 3f6c846a..34ac84b7 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -378,7 +378,10 @@ ORDER BY scan_lvm_vg_size => $scan_lvm_vg_size." 
(".$anvil->Convert->bytes_to_human_readable({'bytes' => $scan_lvm_vg_size}).")", scan_lvm_vg_internal_uuid => $scan_lvm_vg_internal_uuid, }}); - + + # Using previously loaded storage group data can be racy, so query group membership directly. + $anvil->Database->get_storage_group_data({debug => $debug}); + # Skip VGs that are in a group already. if ((exists $anvil->data->{storage_groups}{vg_uuid}{$scan_lvm_vg_internal_uuid}) && ($anvil->data->{storage_groups}{vg_uuid}{$scan_lvm_vg_internal_uuid}{storage_group_uuid})) From e483840ceb3234a4e32e469d43fbd7561060bfaf Mon Sep 17 00:00:00 2001 From: digimer Date: Mon, 1 May 2023 20:29:20 -0400 Subject: [PATCH 6/8] Second attempt to fix the storage group race condition. This time, we only let node 1 assemble storage groups. Signed-off-by: digimer --- Anvil/Tools/Cluster.pm | 3 --- Anvil/Tools/Get.pm | 23 +++++++++++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index 34ac84b7..5fac82fe 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -379,9 +379,6 @@ ORDER BY scan_lvm_vg_internal_uuid => $scan_lvm_vg_internal_uuid, }}); - # Using previously loaded storage group data can be racy, so query group membership directly. - $anvil->Database->get_storage_group_data({debug => $debug}); - # Skip VGs that are in a group already. if ((exists $anvil->data->{storage_groups}{vg_uuid}{$scan_lvm_vg_internal_uuid}) && ($anvil->data->{storage_groups}{vg_uuid}{$scan_lvm_vg_internal_uuid}{storage_group_uuid})) diff --git a/Anvil/Tools/Get.pm b/Anvil/Tools/Get.pm index 0ec80857..7a566660 100644 --- a/Anvil/Tools/Get.pm +++ b/Anvil/Tools/Get.pm @@ -516,20 +516,27 @@ sub available_resources $anvil->Database->get_bridges({debug => $debug}); # Get the details. 
+ my $host_uuid = $anvil->Get->host_uuid; my $anvil_name = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_name}; my $node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; my $node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - anvil_name => $anvil_name, - node1_host_uuid => $node1_host_uuid, - node2_host_uuid => $node2_host_uuid, + 's1:anvil_name' => $anvil_name, + 's2:node1_host_uuid' => $node1_host_uuid, + 's3:node2_host_uuid' => $node2_host_uuid, + 's4:host_uuid' => $host_uuid, }}); - # This both loads storage group data and assembles ungrouped VGs into storage groups, when possible. - $anvil->Cluster->assemble_storage_groups({ - debug => 2, - anvil_uuid => $anvil_uuid, - }); + # If we're node 1, we'll try to assemble the storage group. Onle node 1 does this to help avoid race + # conditions. This both loads storage group data and assembles ungrouped VGs into storage groups, + # when possible. + if ($host_uuid eq $node1_host_uuid) + { + $anvil->Cluster->assemble_storage_groups({ + debug => 2, + anvil_uuid => $anvil_uuid, + }); + } # This will store the available resources based on the least of the nodes. $anvil->data->{anvil_resources}{$anvil_uuid}{cpu}{cores} = 0; From 510db7025333ee8867ffc276ec4738f8ad894b8f Mon Sep 17 00:00:00 2001 From: digimer Date: Tue, 2 May 2023 00:07:40 -0400 Subject: [PATCH 7/8] Another attempt to resolve the storage group race condition. This moves the check for auto-assembly to scan-lvm. It only works for the first assembly; after that the user can/should use anvil-manage-storage-groups. 
Signed-off-by: digimer --- Anvil/Tools/Database.pm | 9 +++++- Anvil/Tools/Get.pm | 13 --------- scancore-agents/scan-lvm/scan-lvm | 48 +++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/Anvil/Tools/Database.pm b/Anvil/Tools/Database.pm index 1e842e1d..8d0dba12 100644 --- a/Anvil/Tools/Database.pm +++ b/Anvil/Tools/Database.pm @@ -5900,7 +5900,6 @@ ORDER BY my $storage_group_member_host_uuid = $row->[4]; my $storage_group_member_vg_uuid = $row->[5]; # This is the VG's internal UUID my $storage_group_member_note = $row->[6]; # If this is 'DELETED', the link isn't used anymore - my $storage_group_member_host_name = $anvil->data->{hosts}{host_uuid}{$storage_group_member_host_uuid}{short_host_name}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { storage_group_uuid => $storage_group_uuid, storage_group_anvil_uuid => $storage_group_anvil_uuid, @@ -5909,6 +5908,14 @@ ORDER BY storage_group_member_host_uuid => $storage_group_member_host_uuid, storage_group_member_vg_uuid => $storage_group_member_vg_uuid, storage_group_member_note => $storage_group_member_note, + }}); + + if (not exists $anvil->data->{hosts}{host_uuid}{$storage_group_member_host_uuid}) + { + $anvil->Database->get_hosts({debug => $debug}); + } + my $storage_group_member_host_name = $anvil->data->{hosts}{host_uuid}{$storage_group_member_host_uuid}{short_host_name}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { storage_group_member_host_name => $storage_group_member_host_name, }}); diff --git a/Anvil/Tools/Get.pm b/Anvil/Tools/Get.pm index 7a566660..3147b0d0 100644 --- a/Anvil/Tools/Get.pm +++ b/Anvil/Tools/Get.pm @@ -516,7 +516,6 @@ sub available_resources $anvil->Database->get_bridges({debug => $debug}); # Get the details. 
- my $host_uuid = $anvil->Get->host_uuid; my $anvil_name = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_name}; my $node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; my $node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid}; @@ -524,20 +523,8 @@ sub available_resources 's1:anvil_name' => $anvil_name, 's2:node1_host_uuid' => $node1_host_uuid, 's3:node2_host_uuid' => $node2_host_uuid, - 's4:host_uuid' => $host_uuid, }}); - # If we're node 1, we'll try to assemble the storage group. Onle node 1 does this to help avoid race - # conditions. This both loads storage group data and assembles ungrouped VGs into storage groups, - # when possible. - if ($host_uuid eq $node1_host_uuid) - { - $anvil->Cluster->assemble_storage_groups({ - debug => 2, - anvil_uuid => $anvil_uuid, - }); - } - # This will store the available resources based on the least of the nodes. $anvil->data->{anvil_resources}{$anvil_uuid}{cpu}{cores} = 0; $anvil->data->{anvil_resources}{$anvil_uuid}{cpu}{threads} = 0; diff --git a/scancore-agents/scan-lvm/scan-lvm b/scancore-agents/scan-lvm/scan-lvm index 8e17eb86..7f59de3e 100755 --- a/scancore-agents/scan-lvm/scan-lvm +++ b/scancore-agents/scan-lvm/scan-lvm @@ -84,6 +84,9 @@ read_last_scan($anvil); # Loog for changes find_changes($anvil); +# Check storage for storage groups. +check_storage_groups($anvil); + # Shut down. $anvil->ScanCore->agent_shutdown({agent => $THIS_FILE}); $anvil->nice_exit({exit_code => 0}); @@ -93,6 +96,51 @@ $anvil->nice_exit({exit_code => 0}); # Functions # ############################################################################################################# +sub check_storage_groups +{ + my ($anvil) = @_; + + # Are we in an Anvil!? If so, are there any storage groups yet? 
+ my $anvil_uuid = $anvil->Cluster->get_anvil_uuid(); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { anvil_uuid => $anvil_uuid }}); + + if ($anvil_uuid) + { + # If we're node 1, we'll try to assemble the storage group. Only node 1 does this to help avoid race + # conditions. This both loads storage group data and assembles ungrouped VGs into storage groups, + # when possible. + $anvil->Database->get_anvils(); + my $host_uuid = $anvil->Get->host_uuid; + my $anvil_name = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_name}; + my $node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; + my $node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:anvil_name' => $anvil_name, + 's2:node1_host_uuid' => $node1_host_uuid, + 's3:node2_host_uuid' => $node2_host_uuid, + 's4:host_uuid' => $host_uuid, + }}); + if ($host_uuid eq $node1_host_uuid) + { + # Are there any storage groups yet? If not, try to assemble one. 
+ my $query = "SELECT COUNT(*) FROM storage_groups WHERE storage_group_anvil_uuid = ".$anvil->Database->quote($anvil_uuid).";"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }}); + + my $count = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__})->[0]->[0]; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { count => $count }}); + if (not $count) + { + $anvil->Cluster->assemble_storage_groups({ + debug => 2, + anvil_uuid => $anvil_uuid, + }); + } + } + } + + return(0); +} + sub find_changes { my ($anvil) = @_; From 26fa3c7e32985d53c723b0e1e3c130c44e55b9a1 Mon Sep 17 00:00:00 2001 From: digimer Date: Tue, 2 May 2023 16:28:05 -0400 Subject: [PATCH 8/8] Fixed a bug where Get->available_resources() was missing LVM/storage group data in some cases. Signed-off-by: digimer --- Anvil/Tools/Get.pm | 3 +- notes | 120 +++++++++++++++++++++------------------------ 2 files changed, 57 insertions(+), 66 deletions(-) diff --git a/Anvil/Tools/Get.pm b/Anvil/Tools/Get.pm index 3147b0d0..2d38328b 100644 --- a/Anvil/Tools/Get.pm +++ b/Anvil/Tools/Get.pm @@ -514,7 +514,8 @@ sub available_resources # Load hosts and network bridges. This loads Anvil! data as well $anvil->Database->get_hosts({debug => $debug}); $anvil->Database->get_bridges({debug => $debug}); - + $anvil->Database->get_lvm_data({debug => $debug}); + # Get the details. 
my $anvil_name = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_name}; my $node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; diff --git a/notes b/notes index 0ba53630..dc0a3bf4 100644 --- a/notes +++ b/notes @@ -14,10 +14,16 @@ Create "Node status" which returns "degraded" if the peer is gone Common queries; * SELECT a.job_uuid, b.host_name, a.job_command, a.job_data, a.job_progress, a.job_status FROM jobs a, hosts b WHERE a.job_host_uuid = b.host_uuid AND a.job_progress != 100; -* SELECT a.host_name, b.file_name, c.file_location_active FROM hosts a, files b, file_locations c WHERE a.host_uuid = c.file_location_host_uuid AND b.file_uuid = c.file_location_file_uuid ORDER BY b.file_name ASC, a.host_name ASC; * SELECT a.dr_link_uuid, b.host_name, c.anvil_name, a.dr_link_note FROM dr_links a, hosts b, anvils c WHERE a.dr_link_host_uuid = b.host_uuid AND a.dr_link_anvil_uuid = c.anvil_uuid ORDER BY c.anvil_name ASC, b.host_name ASC; * SELECT a.storage_group_uuid, d.storage_group_member_uuid, b.anvil_name, a.storage_group_name, c.host_name, d.storage_group_member_vg_uuid, d.storage_group_member_note FROM storage_groups a, anvils b, hosts c, storage_group_members d WHERE a.storage_group_uuid = d.storage_group_member_storage_group_uuid AND a.storage_group_anvil_uuid = b.anvil_uuid AND c.host_uuid = d.storage_group_member_host_uuid ORDER BY a.storage_group_name ASC, c.host_name ASC; * SELECT a.scan_hardware_uuid, b.host_name, a.scan_hardware_cpu_cores AS cores, a.scan_hardware_cpu_threads AS threads, pg_size_pretty(a.scan_hardware_ram_total) AS ram_total, pg_size_pretty(a.scan_hardware_memory_total) AS memory_total, pg_size_pretty(a.scan_hardware_memory_free) AS memory_free FROM scan_hardware a, hosts b WHERE a.scan_hardware_host_uuid = b.host_uuid ORDER BY b.host_name ASC; +* SELECT a.scan_apc_ups_name AS name, a.scan_apc_ups_serial_number AS sn, a.scan_apc_ups_health AS health, a.scan_apc_ups_nmc_serial_number AS nmc_sn, 
a.scan_apc_ups_nmc_mac_address AS mac, a.scan_apc_ups_ip AS ip, b._percentage_charge AS charge, d.scan_apc_ups_battery_temperature AS btemp FROM scan_apc_upses a, scan_apc_ups_input b, scan_apc_ups_output c, scan_apc_ups_batteries d WHERE a.scan_apc_ups_uuid = b.scan_apc_ups_input_scan_apc_ups_uuid AND a.scan_apc_ups_uuid = c.scan_apc_ups_output_scan_apc_ups_uuid AND a.scan_apc_ups_uuid = d.scan_apc_ups_battery_scan_apc_ups_uuid ORDER BY name ASC; + +* SELECT b.host_name, a.network_interface_uuid, a.network_interface_mac_address AS mac, a.network_interface_name AS name, a.network_interface_speed AS speed, a.network_interface_link_state AS link, a.network_interface_operational AS op, a.network_interface_duplex AS duplex, a.network_interface_medium AS medium, a.network_interface_bond_uuid AS bond_uuid, a.network_interface_bridge_uuid AS bridge_uuid FROM network_interfaces a, hosts b WHERE a.network_interface_host_uuid = b.host_uuid AND b.host_name LIKE 'an-a02%' AND a.network_interface_operational != 'DELETED' ORDER BY b.host_name ASC, a.network_interface_name ASC; +* SELECT b.host_name, a.bond_uuid, a.bond_name, a.bond_mode, a.bond_mtu AS mtu, a.bond_primary_interface AS primary, a.bond_active_interface AS active, a.bond_mac_address AS mac, a.bond_operational AS op, c.bridge_name, a.modified_date FROM bonds a, hosts b, bridges c WHERE a.bond_host_uuid = b.host_uuid AND a.bond_bridge_uuid = c.bridge_uuid AND (b.host_uuid = 'b4e46faf-0ebe-e211-a0d6-00262d0ca874' OR b.host_uuid = '4ba42b4e-9bf7-e311-a889-899427029de4') ORDER BY b.host_name ASC, a.bond_name ASC; +* SELECT b.host_name, a.bridge_uuid, a.bridge_name, a.bridge_id, a.bridge_mtu FROM bridges a, hosts b WHERE a.bridge_host_uuid = b.host_uuid AND b.host_name LIKE 'an-a02%' ORDER BY b.host_name ASC, a.bridge_name ASC; +* SELECT a.host_name, b.file_name, c.file_location_active FROM hosts a, files b, file_locations c WHERE a.host_uuid = c.file_location_host_uuid AND b.file_uuid = c.file_location_file_uuid ORDER BY 
b.file_name ASC, a.host_name ASC; +* SELECT b.host_name, a.health_agent_name, a.health_source_name, a.health_source_weight FROM health a, hosts b WHERE b.host_uuid = a.health_host_uuid AND b.host_name LIKE 'an-a02%' order by b.host_name ASC, a.health_agent_name ASC, a.health_source_weight ASC; for lv in $(lvscan | grep deploy| awk '{print $2}' | sed s/\'//g); do lvremove -y $lv; done; rm -f /etc/drbd.d/an-test-deploy*; lvscan; ls -lah /etc/drbd.d/ @@ -29,14 +35,11 @@ crm_resource --fail --resource srv02-b -N vm-a01n01 # Recover without reboot crm_resource --resource srv01-a --refresh - uname -r; grubby --default-kernel; lsinitrd -m /boot/initramfs-4.18.0-448.el8.x86_64.img | grep lvm; systemctl is-enabled scancore.service; dnf -y update; systemctl disable --now anvil-daemon; systemctl disable --now scancore - When pairing Striker, make sure new config goes to all known nodes! - dnf -y update && dnf -y install https://www.alteeve.com/an-repo/m3/anvil-release-latest.noarch.rpm && alteeve-repo-setup -y && dnf -y install anvil-striker --allowerasing dnf -y update && dnf -y install https://www.alteeve.com/an-repo/m3/anvil-release-latest.noarch.rpm && alteeve-repo-setup -y && dnf -y install anvil-node --allowerasing dnf -y update && dnf -y install https://www.alteeve.com/an-repo/m3/anvil-release-latest.noarch.rpm && alteeve-repo-setup -y && dnf -y install anvil-dr --allowerasing @@ -60,7 +63,6 @@ firewall-cmd --permanent --zone=IFN1 --add-port=22869/tcp firewall-cmd --reload - # Configure APC PDUs and UPSes tcpip -i 10.201.2.3 -s 255.255.0.0 -g 10.201.255.254 web -h enable @@ -75,7 +77,6 @@ snmp -S enable -c2 public -a2 writeplus watch 'echo "striker 1"; ssh root@an-striker01 "grep ^database /etc/anvil/anvil.conf | grep host"; echo "striker 2"; ssh root@an-striker02 "grep ^database /etc/anvil/anvil.conf | grep host"; echo "node 1"; ssh root@an-a01n01 "grep ^database /etc/anvil/anvil.conf | grep host"; echo "node 2"; ssh root@an-a01n02 "grep ^database 
/etc/anvil/anvil.conf | grep host"; echo "dr 1"; ssh root@an-a01dr01 "grep ^database /etc/anvil/anvil.conf | grep host";' - Anvil! to Anvil! live migration; 1. Create LVs 2. Make sure /etc/hosts is populated @@ -95,13 +96,6 @@ Deleting Resource - srv01-cs8 10. -TODO: -- Remove this; (step 2) "This is the user name that you will log into Striker as and the name of the user that owns the database" -- Being set to the gateway, not the default DNS - "This is the domain name server(s) to use when resolving domain names. You can specify 2 or more, separated by commas." -- The web UI password isn't being set properly during Striker stage-2 setup -- Changing the password doesn't log out active webui sessions. -- host_health is a duplicate of 'health' - ============ # Dump @@ -111,13 +105,9 @@ su - postgres -c "pg_dump --schema-only anvil > /var/lib/pgsql/anvil_schema.out" su - postgres -c "dropdb anvil" && su - postgres -c "createdb --owner admin anvil" && su - postgres -c "psql anvil < /var/lib/pgsql/anvil.out" su postgres -c "psql anvil" -SELECT a.scan_apc_ups_name AS name, a.scan_apc_ups_serial_number AS sn, a.scan_apc_ups_health AS health, a.scan_apc_ups_nmc_serial_number AS nmc_sn, a.scan_apc_ups_nmc_mac_address AS mac, a.scan_apc_ups_ip AS ip, b._percentage_charge AS charge, d.scan_apc_ups_battery_temperature AS btemp FROM scan_apc_upses a, scan_apc_ups_input b, scan_apc_ups_output c, scan_apc_ups_batteries d WHERE a.scan_apc_ups_uuid = b.scan_apc_ups_input_scan_apc_ups_uuid AND a.scan_apc_ups_uuid = c.scan_apc_ups_output_scan_apc_ups_uuid AND a.scan_apc_ups_uuid = d.scan_apc_ups_battery_scan_apc_ups_uuid ORDER BY name ASC; ============ -dnf -y install augeas - - Jenkins; Initial setup: @@ -1076,8 +1066,8 @@ OS10# write memory ### Set hostname: OS10# configure terminal -OS10(config)# hostname zo-switch02 -zo-switch02(config)# +OS10(config)# hostname an-switch02 +an-switch02(config)# ======] VLT Config [======= @@ -1111,7 +1101,7 @@ OS10# show vlt 1 mismatch (If no 
issues, VLT is OK) # See how I am and my role (* == switch you're on) -zo-switch02(config)# show vlt 1 role +an-switch02(config)# show vlt 1 role VLT Unit ID Role ------------------------ * 1 secondary @@ -1126,31 +1116,31 @@ OS10(conf-if-ma-1/1/1)# ip address 10.201.1.2/16 OS10(conf-if-ma-1/1/1)# no shutdown OS10(conf-if-ma-1/1/1)# exit OS10(config)# write memory -OS10(config)# hostname zo-switch01 -zo-switch01(config)# interface vlan 100 -zo-switch01(conf-if-vl-100)# description BCN1 -zo-switch01(config)# exit -zo-switch01(conf-if-vl-100)# interface range ethernet 1/1/1-1/1/14 -zo-switch01(conf-range-eth1/1/1-1/1/10)# switchport access vlan 100 -zo-switch01(conf-range-eth1/1/1-1/1/10)# no shutdown -zo-switch01(conf-range-eth1/1/1-1/1/10)# exit +OS10(config)# hostname an-switch01 +an-switch01(config)# interface vlan 100 +an-switch01(conf-if-vl-100)# description BCN1 +an-switch01(config)# exit +an-switch01(conf-if-vl-100)# interface range ethernet 1/1/1-1/1/14 +an-switch01(conf-range-eth1/1/1-1/1/10)# switchport access vlan 100 +an-switch01(conf-range-eth1/1/1-1/1/10)# no shutdown +an-switch01(conf-range-eth1/1/1-1/1/10)# exit -zo-switch01(config)# interface vlan 200 -zo-switch01(conf-if-vl-200)# description SN1 -zo-switch01(conf-if-vl-200)# exit -zo-switch01(config)# interface range ethernet 1/1/11-1/1/14 -zo-switch01(conf-range-eth1/1/11-1/1/14)# switchport access vlan 200 -zo-switch01(conf-range-eth1/1/11-1/1/14)# no shutdown -zo-switch01(conf-range-eth1/1/11-1/1/14)# exit +an-switch01(config)# interface vlan 200 +an-switch01(conf-if-vl-200)# description SN1 +an-switch01(conf-if-vl-200)# exit +an-switch01(config)# interface range ethernet 1/1/11-1/1/14 +an-switch01(conf-range-eth1/1/11-1/1/14)# switchport access vlan 200 +an-switch01(conf-range-eth1/1/11-1/1/14)# no shutdown +an-switch01(conf-range-eth1/1/11-1/1/14)# exit -zo-switch01(config)# interface vlan 300 -zo-switch01(conf-if-vl-300)# description IFN1 -zo-switch01(conf-if-vl-300)# exit 
-zo-switch01(config)# interface range ethernet 1/1/15-1/1/24 -zo-switch01(conf-range-eth1/1/15-1/1/24)# switchport access vlan 300 -zo-switch01(conf-range-eth1/1/15-1/1/24)# no shutdown -zo-switch01(conf-range-eth1/1/15-1/1/24)# exit -zo-switch01(config)# show vlan +an-switch01(config)# interface vlan 300 +an-switch01(conf-if-vl-300)# description IFN1 +an-switch01(conf-if-vl-300)# exit +an-switch01(config)# interface range ethernet 1/1/15-1/1/24 +an-switch01(conf-range-eth1/1/15-1/1/24)# switchport access vlan 300 +an-switch01(conf-range-eth1/1/15-1/1/24)# no shutdown +an-switch01(conf-range-eth1/1/15-1/1/24)# exit +an-switch01(config)# show vlan Codes: * - Default VLAN, M - Management VLAN, R - Remote Port Mirroring VLANs, @ - Attached to Virtual Network, P - Primary, C - Community, I - Isolated Q: A - Access (Untagged), T - Tagged @@ -1164,12 +1154,12 @@ Q: A - Access (Untagged), T - Tagged 300 Active IFN1 T Po1000 A Eth1/1/15-1/1/24 4094 Active T Po1000 -zo-switch01(config)# write memory +an-switch01(config)# write memory ### Delete a VLAN: -zo-switch02(config)# no interface vlan 3400 -zo-switch02(config)# show vlan +an-switch02(config)# no interface vlan 3400 +an-switch02(config)# show vlan === Firmware Update === @@ -1251,7 +1241,7 @@ reboot: machine restart ### NOTE: The login prompt will appear before the system is ready to log in. The default username and password revert to 'admin' / 'admin', but this won't work for the first couple of minutes. ## OLD -zo-switch02# show version +an-switch02# show version Dell EMC Networking OS10 Enterprise Copyright (c) 1999-2020 by Dell Inc. All Rights Reserved. OS Version: 10.5.0.4 @@ -1260,10 +1250,10 @@ Build Time: 2020-01-30T21:08:56+0000 System Type: S4128T-ON Architecture: x86_64 Up Time: 22:49:57 -zo-switch02# +an-switch02# ## New -zo-a01n01# show version +an-a01n01# show version Dell EMC Networking OS10 Enterprise Copyright (c) 1999-2021 by Dell Inc. All Rights Reserved. 
OS Version: 10.5.2.3 @@ -1539,13 +1529,13 @@ $body = $cgi->param('POSTDATA') # gives you the body of the request as a string, ausearch -c 'drbdsetup' --raw | audit2allow -M my-drbdsetup && semodule -X 300 -i my-drbdsetup.pp -May 02 13:35:21 zo-a01n02.zennioptical.com setroubleshoot[5333]: SELinux is preventing /usr/sbin/drbdsetup from create access on the netlink_generic_socket labeled drbd_t. For complete SELinux messages run: sealert -l 4079c288-db4a-4f44-a588-94f1dbfff269 -May 02 13:35:21 zo-a01n02.zennioptical.com setroubleshoot[5333]: SELinux is preventing /usr/sbin/drbdsetup from create access on the netlink_generic_socket labeled drbd_t. +May 02 13:35:21 an-a01n02.zennioptical.com setroubleshoot[5333]: SELinux is preventing /usr/sbin/drbdsetup from create access on the netlink_generic_socket labeled drbd_t. For complete SELinux messages run: sealert -l 4079c288-db4a-4f44-a588-94f1dbfff269 +May 02 13:35:21 an-a01n02.zennioptical.com setroubleshoot[5333]: SELinux is preventing /usr/sbin/drbdsetup from create access on the netlink_generic_socket labeled drbd_t. If you believe that drbdsetup should be allowed create access on netlink_generic_socket labeled drbd_t by default. # ausearch -c 'drbdsetup' --raw | audit2allow -M my-drbdsetup # semodule -X 300 -i my-drbdsetup.pp - If you believe that virsh should be allowed read access on the srv16-zo-psql-qa.xml file by default. + If you believe that virsh should be allowed read access on the srv16-an-psql-qa.xml file by default. 
# ausearch -c 'virsh' --raw | audit2allow -M my-virsh # semodule -X 300 -i my-virsh.pp @@ -1609,17 +1599,17 @@ Gi1/0/24 + Gi2/0/24 Dell LACP Config (OS10 - https://www.dell.com/support/kbdoc/en-us/000102901/dell-emc-networking-os10-how-to-set-up-virtual-link-trunking-vlt) * On both switches; -zo-switch02# configure terminal +an-switch02# configure terminal * IFN Port channel is 3 -zo-switch02(config)# interface port-channel 3 -zo-switch02(conf-if-po-3)# <165>1 2021-10-19T04:58:56.022086+00:00 zo-switch02 dn_alm 920 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_ASTATE_UP: Interface admin state up :port-channel3 -<165>1 2021-10-19T04:58:56.022722+00:00 zo-switch02 dn_alm 920 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_DN: Interface operational state is down :port-channel3 -zo-switch02(conf-if-po-3)# lacp fallback enable -zo-switch02(conf-if-po-3)# description IFN1 -zo-switch02(conf-if-po-3)# exit -zo-switch02(config)# exit -zo-switch02# show port-channel summary +an-switch02(config)# interface port-channel 3 +an-switch02(conf-if-po-3)# <165>1 2021-10-19T04:58:56.022086+00:00 an-switch02 dn_alm 920 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_ASTATE_UP: Interface admin state up :port-channel3 +<165>1 2021-10-19T04:58:56.022722+00:00 an-switch02 dn_alm 920 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_DN: Interface operational state is down :port-channel3 +an-switch02(conf-if-po-3)# lacp fallback enable +an-switch02(conf-if-po-3)# description IFN1 +an-switch02(conf-if-po-3)# exit +an-switch02(config)# exit +an-switch02# show port-channel summary Flags: D - Down I - member up but inactive P - member up and active U - Up (port-channel) F - Fallback Activated @@ -1629,10 +1619,10 @@ Group Port-Channel Type Protocol Member Ports 3 port-channel3 (D) Eth STATIC 1000 port-channel1000 (U) Eth STATIC 1/1/25(P) 1/1/26(P) -zo-switch02# configure terminal -zo-switch02(config)# interface ethernet 1/1/24 -zo-switch02(conf-if-eth1/1/24)# 
channel-group 3 -zo-switch02(conf-if-eth1/1/24)# <165>1 2021-10-19T05:09:41.237808+00:00 zo-switch02 dn_alm 920 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_UP: Interface operational state is up :port-channel3 +an-switch02# configure terminal +an-switch02(config)# interface ethernet 1/1/24 +an-switch02(conf-if-eth1/1/24)# channel-group 3 +an-switch02(conf-if-eth1/1/24)# <165>1 2021-10-19T05:09:41.237808+00:00 an-switch02 dn_alm 920 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_UP: Interface operational state is up :port-channel3 exit exit