From 0874ad571a294d1c300858153a4d0a513f46e0f9 Mon Sep 17 00:00:00 2001 From: digimer Date: Tue, 18 Apr 2023 14:33:58 -0400 Subject: [PATCH] Updated anvil-safe-start to not give up on starting corosync/pacemaker if it fails on the first try. Signed-off-by: digimer --- share/words.xml | 9 +++++++ tools/anvil-safe-start | 55 +++++++++++++++++++++++++++++++++--------- 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/share/words.xml b/share/words.xml index d44b7617..c93548d5 100644 --- a/share/words.xml +++ b/share/words.xml @@ -386,6 +386,7 @@ The attempt to start the servers appears to have failed. The return code '0' was ==== #!variable!output!# ==== +We're done waiting, exiting out. ' or '--server-uuid .]]> Could not find the server: [#!variable!server!#] on this Anvil! in the database. @@ -3587,6 +3588,14 @@ The error was: [ Warning ] - The test "fail file": [#!variable!fail_file!#] was found. So long as this file exists, the ocf:alteeve:server RA will return 'OCF_ERR_GENERIC' (exit code 1). Delete the file to resume normal operation. [ Warning ] - The configured reserved RAM was set to: [#!variable!was!#], which appears invalid. It must be an integer value representing the amount of RAM to reserve, in MiB. The reserved RAM is being set to: [#!variable!was!#]. [ Warning ] - The configured reserved RAM was set to: [#!variable!was!#], which appears invalid. It must be an integer value representing the amount of RAM to reserve, in MiB. The reserved RAM is being set to: [#!variable!was!#]. + +The attempt to start the servers appears to have failed. The return code '0' was expected, but: [#!variable!return_code!#] was received. The output was: +==== +#!variable!output!# +==== +We will wait: [#!variable!waiting!#] seconds and then try again. We'll give up if it keeps failing after: [#!variable!time_left!#] seconds. + + diff --git a/tools/anvil-safe-start b/tools/anvil-safe-start index cb4f50d2..d04e7ffe 100755 --- a/tools/anvil-safe-start +++ b/tools/anvil-safe-start @@ -284,22 +284,55 @@ sub start_pacemaker ### TODO: A lot more testing is needed for degraded single-node start later. ### Should we use --all, or wait for our peer? For now, we wait. - #my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start --all"; - my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + ### NOTE: This can be racy during initial setup, calling the start before /etc/hosts is + ### populated. So this watches for that corner case. + my $wait_until = time + 120; + my $waiting = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - return_code => $return_code, + wait_until => $wait_until, + waiting => $waiting, }}); - if ($return_code) + while($waiting) { - # What?! Fail out, we're done. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0256", variables => { + #my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start --all"; + my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code, }}); - $anvil->nice_exit({exit_code => 1}); + if ($return_code) + { + # Are we done waiting? + if (time > $wait_until) + { + # We're done. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0256", variables => { + output => $output, + return_code => $return_code, + }}); + $anvil->nice_exit({exit_code => 1}); + } + else + { + # Report the error and sleep + my $time_left = $wait_until - time; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "alert", key => "warning_0153", variables => { + output => $output, + return_code => $return_code, + time_left => $time_left, + waiting => 10, + }}); + sleep 10; + } + } + else + { + # Success! + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } } ### TODO: We may implement the logic to fence our peer (similar to cman's post_join_delay' @@ -309,7 +342,7 @@ sub start_pacemaker # the peer and, if the fence succeeds, unblock quorum. my $start_time = time; my $wait_for_peer = $start_time + 120; - my $waiting = 1; + $waiting = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { start_time => $start_time, wait_for_peer => $wait_for_peer,