diff --git a/share/words.xml b/share/words.xml
index d44b7617..c93548d5 100644
--- a/share/words.xml
+++ b/share/words.xml
@@ -386,6 +386,7 @@ The attempt to start the servers appears to have failed. The return code '0' was
====
#!variable!output!#
====
+We're done waiting, exiting out.
' or '--server-uuid .]]>
Could not find the server: [#!variable!server!#] on this Anvil! in the database.
@@ -3587,6 +3588,14 @@ The error was:
[ Warning ] - The test "fail file": [#!variable!fail_file!#] was found. So long as this file exists, the ocf:alteeve:server RA will return 'OCF_ERR_GENERIC' (exit code 1). Delete the file to resume normal operation.
[ Warning ] - The configured reserved RAM was set to: [#!variable!was!#], which appears invalid. It must be an integer value representing the amount of RAM to reserve, in MiB. The reserved RAM is being set to: [#!variable!was!#].
[ Warning ] - The configured reserved RAM was set to: [#!variable!was!#], which appears invalid. It must be an integer value representing the amount of RAM to reserve, in MiB. The reserved RAM is being set to: [#!variable!was!#].
+
+The attempt to start the servers appears to have failed. The return code '0' was expected, but: [#!variable!return_code!#] was received. The output was:
+====
+#!variable!output!#
+====
+We will wait: [#!variable!waiting!#] seconds and then try again. We'll give up if it is still failing in: [#!variable!time_left!#] seconds.
+
+
diff --git a/tools/anvil-safe-start b/tools/anvil-safe-start
index cb4f50d2..d04e7ffe 100755
--- a/tools/anvil-safe-start
+++ b/tools/anvil-safe-start
@@ -284,22 +284,55 @@ sub start_pacemaker
### TODO: A lot more testing is needed for degraded single-node start later.
### Should we use --all, or wait for our peer? For now, we wait.
- #my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start --all";
- my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start";
- $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
- my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
+ ### NOTE: This can be racy during initial setup, calling the start before /etc/hosts is
+ ### populated. So this watches for that corner case.
+ my $wait_until = time + 120;
+ my $waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
- output => $output,
- return_code => $return_code,
+ wait_until => $wait_until,
+ waiting => $waiting,
}});
- if ($return_code)
+ while($waiting)
{
- # What?! Fail out, we're done.
- $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0256", variables => {
+ #my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start --all";
+ my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start";
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
+ my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
- $anvil->nice_exit({exit_code => 1});
+ if ($return_code)
+ {
+ # Are we done waiting?
+ if (time > $wait_until)
+ {
+ # We're done.
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0256", variables => {
+ output => $output,
+ return_code => $return_code,
+ }});
+ $anvil->nice_exit({exit_code => 1});
+ }
+ else
+ {
+ # Report the error and sleep
+ my $time_left = $wait_until - time;
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "alert", key => "warning_0153", variables => {
+ output => $output,
+ return_code => $return_code,
+ time_left => $time_left,
+ waiting => 10,
+ }});
+ sleep 10;
+ }
+ }
+ else
+ {
+ # Success!
+ $waiting = 0;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
+ }
}
### TODO: We may implement the logic to fence our peer (similar to cman's post_join_delay'
@@ -309,7 +342,7 @@ sub start_pacemaker
# the peer and, if the fence succeeds, unblock quorum.
my $start_time = time;
my $wait_for_peer = $start_time + 120;
- my $waiting = 1;
+ $waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
start_time => $start_time,
wait_for_peer => $wait_for_peer,