From 171ea7400043f6fda1f78cb97353e16ca1d7e576 Mon Sep 17 00:00:00 2001 From: Digimer Date: Wed, 6 Jul 2022 19:18:35 -0400 Subject: [PATCH] * There is a fix in this commit to resolve a race condition where, when reconfiguring the network, the request to set a job to reboot would fail because the connections to all Strikers could be lost, causing Database->_test_access() to error out, blocking the reboot. When restarted, the network would not be changed, so no reboot would be requested, leaving the machine in an inaccessible state. * Updated anvil-boot-server when called with '--all' to honour boot ordering, delays and conditions. * Updated Database->get_servers() to collect the server's XML as well as data from the 'servers' table. * Updated anvil-provision-server to make a new DRBD resource 'secondary' after forcing it to primary to begin the initial sync. Signed-off-by: Digimer --- Anvil/Tools/Database.pm | 113 +++++++++++++--------- notes | 1 + share/words.xml | 10 +- tools/anvil-boot-server | 178 +++++++++++++++++++++++++++++++---- tools/anvil-configure-host | 1 + tools/anvil-provision-server | 11 +++ 6 files changed, 252 insertions(+), 62 deletions(-) diff --git a/Anvil/Tools/Database.pm b/Anvil/Tools/Database.pm index 6b5e2404..f4d80304 100644 --- a/Anvil/Tools/Database.pm +++ b/Anvil/Tools/Database.pm @@ -4662,7 +4662,7 @@ WHERE =head2 get_servers -This loads all known servers from the database. +This loads all known servers from the database, including the corresponding C<< server_definition_xml >> from the C<< server_definitions >> table. servers::server_uuid::::server_name servers::server_uuid::::server_anvil_uuid @@ -4680,6 +4680,8 @@ This loads all known servers from the database. 
servers::server_uuid::::server_configured_ram servers::server_uuid::::server_updated_by_user servers::server_uuid::::server_boot_time + servers::server_uuid::::server_definition_uuid + servers::server_uuid::::server_definition_xml To simplify lookup of server UUIDs by server names, this hash is also set; @@ -4703,25 +4705,30 @@ sub get_servers my $query = " SELECT - server_uuid, - server_name, - server_anvil_uuid, - server_user_stop, - server_start_after_server_uuid, - server_start_delay, - server_host_uuid, - server_state, - server_live_migration, - server_pre_migration_file_uuid, - server_pre_migration_arguments, - server_post_migration_file_uuid, - server_post_migration_arguments, - server_ram_in_use, - server_configured_ram, - server_updated_by_user, - server_boot_time + a.server_uuid, + a.server_name, + a.server_anvil_uuid, + a.server_user_stop, + a.server_start_after_server_uuid, + a.server_start_delay, + a.server_host_uuid, + a.server_state, + a.server_live_migration, + a.server_pre_migration_file_uuid, + a.server_pre_migration_arguments, + a.server_post_migration_file_uuid, + a.server_post_migration_arguments, + a.server_ram_in_use, + a.server_configured_ram, + a.server_updated_by_user, + a.server_boot_time, + b.server_definition_uuid, + b.server_definition_xml FROM - servers + servers a, + server_definitions b +WHERE + a.server_uuid = b.server_definition_server_uuid ;"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); @@ -4750,24 +4757,28 @@ FROM my $server_configured_ram = $row->[14]; my $server_updated_by_user = $row->[15]; my $server_boot_time = $row->[16]; + my $server_definition_uuid = $row->[17]; + my $server_definition_xml = $row->[18]; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - server_uuid => $server_uuid, - server_name => $server_name, - server_anvil_uuid => $server_anvil_uuid, - server_user_stop => $server_user_stop, - 
server_start_after_server_uuid => $server_start_after_server_uuid, - server_start_delay => $server_start_delay, - server_host_uuid => $server_host_uuid, - server_state => $server_state, - server_live_migration => $server_live_migration, - server_pre_migration_file_uuid => $server_pre_migration_file_uuid, - server_pre_migration_arguments => $server_pre_migration_arguments, - server_post_migration_file_uuid => $server_post_migration_file_uuid, - server_post_migration_arguments => $server_post_migration_arguments, - server_ram_in_use => $server_ram_in_use, - server_configured_ram => $server_configured_ram, - server_updated_by_user => $server_updated_by_user, - server_boot_time => $server_boot_time, + 's01:server_uuid' => $server_uuid, + 's02:server_name' => $server_name, + 's03:server_anvil_uuid' => $server_anvil_uuid, + 's04:server_user_stop' => $server_user_stop, + 's05:server_start_after_server_uuid' => $server_start_after_server_uuid, + 's06:server_start_delay' => $server_start_delay, + 's07:server_host_uuid' => $server_host_uuid, + 's08:server_state' => $server_state, + 's09:server_live_migration' => $server_live_migration, + 's10:server_pre_migration_file_uuid' => $server_pre_migration_file_uuid, + 's11:server_pre_migration_arguments' => $server_pre_migration_arguments, + 's12:server_post_migration_file_uuid' => $server_post_migration_file_uuid, + 's13:server_post_migration_arguments' => $server_post_migration_arguments, + 's14:server_ram_in_use' => $server_ram_in_use, + 's15:server_configured_ram' => $server_configured_ram, + 's16:server_updated_by_user' => $server_updated_by_user, + 's17:server_boot_time' => $server_boot_time, + 's18:server_definition_uuid' => $server_definition_uuid, + 's19:server_definition_xml' => $server_definition_xml, }}); # Record the data in the hash, too. 
@@ -4787,6 +4798,8 @@ FROM $anvil->data->{servers}{server_uuid}{$server_uuid}{server_configured_ram} = $server_configured_ram; $anvil->data->{servers}{server_uuid}{$server_uuid}{server_updated_by_user} = $server_updated_by_user; $anvil->data->{servers}{server_uuid}{$server_uuid}{server_boot_time} = $server_boot_time; + $anvil->data->{servers}{server_uuid}{$server_uuid}{server_definition_uuid} = $server_definition_uuid; + $anvil->data->{servers}{server_uuid}{$server_uuid}{server_definition_xml} = $server_definition_xml; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "servers::server_uuid::${server_uuid}::server_anvil_uuid" => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_anvil_uuid}, "servers::server_uuid::${server_uuid}::server_user_stop" => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_user_stop}, @@ -4803,6 +4816,8 @@ FROM "servers::server_uuid::${server_uuid}::server_configured_ram" => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_configured_ram}, "servers::server_uuid::${server_uuid}::server_updated_by_user" => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_updated_by_user}, "servers::server_uuid::${server_uuid}::server_boot_time" => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_boot_time}, + "servers::server_uuid::${server_uuid}::server_definition_uuid" => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_definition_uuid}, + "servers::server_uuid::${server_uuid}::server_definition_xml" => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_definition_xml}, }}); # Store the servers in a hash under each Anvil!, sortable. @@ -11209,9 +11224,11 @@ This indicates when a server was stopped by a user. If this is set to C<< 1 >>, If the user wants to boot this server after another server, this can be set to C<< servers >> -> C<< server_uuid >>. 
When set, the server referenced will be booted (at least) C<< server_start_delay >> seconds before this server is booted. -=head3 server_start_delay (optional, default '30') +B<< Note >>: If this is set to C<< 00000000-0000-0000-0000-000000000000 >>, the server will be left off. + +=head3 server_start_delay (optional, default '0') -If C<< server_start_after_server_uuid >> is set, then this value controls the delay between when the referenced server boots and when this server boots. +If C<< server_start_after_server_uuid >> is set, then this value controls the delay between when the referenced server boots and when this server boots. This value is ignored if the server is not set to boot after another server. B<< Note >>: This is the B<< minimum >> delay! It's possible that the actual delay could be a bit more than this value. @@ -18416,12 +18433,22 @@ sub _test_access $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "sys::database::connections" => $anvil->data->{sys}{database}{connections} }}); if (not $anvil->data->{sys}{database}{connections}) { - # No connections are left, die. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0196"}); - $anvil->nice_exit({exit_code => 1}); + # No connections are left. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0366"}); - # In case we're still alive, die. - die $THIS_FILE." ".__LINE__."; exiting on DB connection error.\n"; + # It's possible the network was just reconfigured, and they were trying to update a + # job in the database. If so, this failure can be hit. To handle this, we'll check + # if 'sys::reboot' is set. If so, we'll reboot now. 
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "sys::reboot" => $anvil->data->{sys}{reboot} }}); + if ($anvil->data->{sys}{reboot}) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0196"}); + my $shell_call = $anvil->data->{path}{exe}{systemctl}." reboot"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code }}); + } + return(1); } } diff --git a/notes b/notes index 6d5aab04..30729be8 100644 --- a/notes +++ b/notes @@ -25,6 +25,7 @@ firewall-cmd --permanent --zone=IFN1 --add-port=22869/tcp firewall-cmd --reload + # Configure APC PDUs and UPSes tcpip -i 10.201.2.3 -s 255.255.0.0 -g 10.201.255.254 web -h enable diff --git a/share/words.xml b/share/words.xml index e2584362..5e43b966 100644 --- a/share/words.xml +++ b/share/words.xml @@ -511,6 +511,7 @@ The output, if any, was; There are two or more entries on the host: [#!variable!host!#] in the history table: [#!variable!table!#]! The duplicate modidied_date and column UUID are: [#!variable!key!#] (time is UTC), and the query that exposed the dupplicate was: [#!variable!query!#]. This is likely caused by two database writes where the 'modified_date' wasn't updated between writes. [ Error ] - There was a problem purging records. The details of the problem should be in the logs. The table: [#!variable!table!#] has an entry in the history schema that doesn't have a corresponding record in the public schema. This is likely a resync artifact of a deleted record. Purging the record: [#!variable!uuid_column!#:#!variable!column_uuid!#] from all databases. 
+ [ Error ] - Failed to reconnect to the database, and now no connections remain. @@ -1554,7 +1555,7 @@ The database connection error was: Switching the default database handle to use the database: [#!variable!server!#] prior to reconnect attempt. Switching the default database to read from to the database: [#!variable!server!#] prior to reconnect attempt. Ready to try to reconnect to: [#!variable!server!#], but delaying for: [#!variable!delay!#] seconds to give the database a chance to come back online in case this is a transient issue. - Failed to reconnect to the database, and now no connections remail. Exiting. + The reboot flag was set. Rebooting NOW! maintenance_mode() was passed an invalid 'set' value: [#!variable!set!#]. No action taken.]]> The user: [#!variable!user!#] logged out successfully. A system reboot has been requested via the Striker UI. @@ -2143,6 +2144,13 @@ The file: [#!variable!file!#] needs to be updated. The difference is: Closing the firewall port: [#!variable!port!#/#!variable!protocol!#] for the zone: [#!variable!zone!#]! Closing the firewall port range: [#!variable!port!#/#!variable!protocol!#] for the zone: [#!variable!zone!#]! Changes were made to the firewall, reloading now. + This server will boot: [#!variable!delay!#] after the server: [#!variable!server!#]. Checking if it's time to boot it or not. + The server: [#!variable!boot_after_server!#] hasn't booted yet, holding off booting: [#!variable!this_server!#]. + Evaluating the booting of the server: [#!variable!server!#]. + The server: [#!variable!boot_after_server!#] has booted, but we need to wait: [#!variable!time_to_wait!#] seconds before we can start this server: [#!variable!this_server!#]. + The server: [#!variable!server!#] is ready to boot. + The server: [#!variable!server!#] was found to be running already, but it wasn't marked as booted. Marking it as if it just booted to handle any dependent servers. 
+ The server: [#!variable!server!#] is configured to stay off, ignoring it. The host name: [#!variable!target!#] does not resolve to an IP address. diff --git a/tools/anvil-boot-server b/tools/anvil-boot-server index 90ee9b19..11945879 100755 --- a/tools/anvil-boot-server +++ b/tools/anvil-boot-server @@ -189,7 +189,7 @@ sub wait_for_pacemaker my $waiting = 1; while($waiting) { - my $problem = $anvil->Cluster->parse_cib({debug => 2}); + my $problem = $anvil->Cluster->parse_cib({debug => 3}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); if (not $problem) { @@ -328,29 +328,171 @@ sub boot_all_servers return(0); } + # Load information about the servers on this Anvil!. + my $anvil_uuid = $anvil->data->{sys}{anvil_uuid}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { anvil_uuid => $anvil_uuid }}); + my $increment = int(70 / $server_count); my $percent = 15; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { increment => $increment }}); - foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{server}}) + + # Loop until all are processed. + my $waiting = 1; + my $start_time = time; + while($waiting) { - my $status = $anvil->data->{cib}{parsed}{data}{server}{$server}{status}; - my $host_name = $anvil->data->{cib}{parsed}{data}{server}{$server}{host_name}; - my $role = $anvil->data->{cib}{parsed}{data}{server}{$server}{role}; - my $active = $anvil->data->{cib}{parsed}{data}{server}{$server}{active}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 's1:server' => $server, - 's2:status' => $status, - 's2:host_name' => $host_name, - 's4:role' => $role, - 's5:active' => $active, - }}); + # Get a list of servers now. + $anvil->Database->get_servers({debug => 3}); - if ($status eq "off") + # This will get set to 0 if any servers are waiting to boot. 
+ my $all_processed = 1; + foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{server}}) + { + my $status = $anvil->data->{cib}{parsed}{data}{server}{$server_name}{status}; + my $host_name = $anvil->data->{cib}{parsed}{data}{server}{$server_name}{host_name}; + my $role = $anvil->data->{cib}{parsed}{data}{server}{$server_name}{role}; + my $active = $anvil->data->{cib}{parsed}{data}{server}{$server_name}{active}; + my $server_uuid = $anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}{$server_name}{server_uuid}; + my $boot_delay = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_start_delay}; + $boot_delay = 0 if not $boot_delay; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:server_name' => $server_name, + 's2:status' => $status, + 's2:host_name' => $host_name, + 's4:role' => $role, + 's5:active' => $active, + 's6:server_uuid' => $server_uuid, + 's7:boot_delay' => $boot_delay, + }}); + + if (not exists $anvil->data->{boot_server}{$server_name}{processed}) + { + # This will get set to the boot time once we actually start it. This will let + # us time when servers that boot after this server can boot. + $anvil->data->{boot_server}{$server_name}{processed} = 0; + } + elsif ($anvil->data->{boot_server}{$server_name}{processed}) + { + # Already processed. 
+ next; + } + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0719", variables => { server => $server_name }}); + + my $boot_after_server_uuid = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_start_after_server_uuid}; + $boot_after_server_uuid = "" if not defined $boot_after_server_uuid; + $boot_after_server_uuid = "" if $boot_after_server_uuid eq "NULL"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { boot_after_server_uuid => $boot_after_server_uuid }}); + if ($boot_after_server_uuid) + { + if ($boot_after_server_uuid eq "00000000-0000-0000-0000-000000000000") + { + # This server is configured to stay off. + $anvil->data->{boot_server}{$server_name}{processed} = time; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "boot_server::${server_name}::processed" => $anvil->data->{boot_server}{$server_name}{processed}, + }}); + + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0723", variables => { server => $server_name }}); + next; + } + + # What's the server's name. + my $boot_after_server_name = $anvil->data->{servers}{server_uuid}{$boot_after_server_uuid}{server_name}; + $boot_after_server_name = "" if not defined $boot_after_server_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { boot_after_server_name => $boot_after_server_name }}); + + # Has this server processed? 
+ if ($boot_after_server_name) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0717", variables => { + delay => $boot_delay, + server => $boot_after_server_name, + }}); + if (not exists $anvil->data->{boot_server}{$boot_after_server_name}) + { + $anvil->data->{boot_server}{$boot_after_server_name}{processed} = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "boot_server::${boot_after_server_name}::processed" => $anvil->data->{boot_server}{$boot_after_server_name}{processed}, + }}); + } + + if ($anvil->data->{boot_server}{$boot_after_server_name}{processed}) + { + my $processed_seconds_ago = time - $anvil->data->{boot_server}{$boot_after_server_name}{processed}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { processed_seconds_ago => $processed_seconds_ago }}); + if ($processed_seconds_ago > $boot_delay) + { + # Ready to boot. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0721", variables => { server => $server_name }}); + } + else + { + # Not ready yet. + $all_processed = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_processed => $all_processed }}); + + my $time_to_wait = $boot_delay - $processed_seconds_ago; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0720", variables => { + boot_after_server => $boot_after_server_name, + this_server => $server_name, + time_to_wait => $time_to_wait, + }}); + next; + } + } + else + { + # The other server hasn't processed yet. 
+ $all_processed = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_processed => $all_processed }}); + + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0718", variables => { + boot_after_server => $boot_after_server_name, + this_server => $server_name, + }}); + next; + } + } + } + + if ($status eq "off") + { + # Boot it. + my $wait = $anvil->data->{switches}{'wait'} ? 1 : 0; + $percent += $increment; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 'wait' => $wait, + percent => $percent, + }}); + boot_server($anvil, $server_name, $wait, $percent); + + # If we're here, the server processed. + $anvil->data->{boot_server}{$server_name}{processed} = time; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "boot_server::${server_name}::processed" => $anvil->data->{boot_server}{$server_name}{processed}, + }}); + } + elsif (not $anvil->data->{boot_server}{$server_name}{processed}) + { + # It may have booted before we ran. + $anvil->data->{boot_server}{$server_name}{processed} = time; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0722", variables => { server => $server_name }}); + } + } + + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_processed => $all_processed }}); + if ($all_processed) + { + # We're done! + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + else { - # Boot it. - my $wait = $anvil->data->{switches}{'wait'} ? 1 : 0; - $percent += $increment; - boot_server($anvil, $server, $wait, $percent); + # Wait a bit. 
+ sleep 2; + my $problem = $anvil->Cluster->parse_cib({debug => 3}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); } } diff --git a/tools/anvil-configure-host b/tools/anvil-configure-host index 00ea8be5..899c1762 100755 --- a/tools/anvil-configure-host +++ b/tools/anvil-configure-host @@ -1242,6 +1242,7 @@ sub reconfigure_network { # In an attempt to make network changes more reliable, we'll just reboot. This shouldn't # actually be hit anymore as any change should have triggered the reboot above. + $anvil->data->{sys}{reboot} = 1; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, 'print' => 1, key => "log_0687", variables => { reason => "#!string!log_0631!#" }}); do_reboot($anvil); diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index 72b38694..6375cf9d 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -857,6 +857,17 @@ sub startup_resource } $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0580", variables => { resource => $anvil->data->{job}{server_name} }}); + # Make it secondary again. This is needed as it won't auto-demote + # otherwise. + $shell_call = $anvil->data->{path}{exe}{drbdadm}." secondary ".$anvil->data->{job}{server_name}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + # set the fencing back $shell_call = $anvil->data->{path}{exe}{drbdadm}." adjust ".$anvil->data->{job}{server_name}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});