Updated anvil-safe-start to not give up on starting corosync/pacemaker if it fails on the first try.

Signed-off-by: digimer <mkelly@alteeve.ca>
main
digimer 2 years ago
parent 8ba613952c
commit 0874ad571a
  1. 9
      share/words.xml
  2. 55
      tools/anvil-safe-start

@ -386,6 +386,7 @@ The attempt to start the servers appears to have failed. The return code '0' was
====
#!variable!output!#
====
We're done waiting, exiting out.
</key>
<key name="error_0276"><![CDATA[No server specified to rename. Please use '--server <name>' or '--server-uuid <UUID>'.]]></key>
<key name="error_0277">Could not find the server: [#!variable!server!#] on this Anvil! in the database.</key>
@ -3587,6 +3588,14 @@ The error was:
<key name="warning_0150">[ Warning ] - The test "fail file": [#!variable!fail_file!#] was found. So long as this file exists, the ocf:alteeve:server RA will return 'OCF_ERR_GENERIC' (exit code 1). Delete the file to resume normal operation.</key>
<key name="warning_0151">[ Warning ] - The configured reserved RAM was set to: [#!variable!was!#], which appears invalid. It must be an integer value representing the amount of RAM to reserve, in MiB. The reserved RAM is being set to: [#!variable!was!#].</key>
<key name="warning_0152">[ Warning ] - The configured reserved RAM was set to: [#!variable!was!#], which appears invalid. It must be an integer value representing the amount of RAM to reserve, in MiB. The reserved RAM is being set to: [#!variable!was!#].</key>
<key name="warning_0153">
The attempt to start the cluster appears to have failed. The return code '0' was expected, but: [#!variable!return_code!#] was received. The output was:
====
#!variable!output!#
====
We will wait: [#!variable!waiting!#] seconds and then try again. We'll give up if it keeps failing after: [#!variable!time_left!#] seconds.
</key>
</language>
<!-- 日本語 -->
<language name="jp" long_name="日本語" description="Anvil! language file.">

@ -284,22 +284,55 @@ sub start_pacemaker
### TODO: A lot more testing is needed for degraded single-node start later.
### Should we use --all, or wait for our peer? For now, we wait.
#my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start --all";
my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
### NOTE: This can be racy during initial setup, calling the start before /etc/hosts is
### populated. So this watches for that corner case.
my $wait_until = time + 120;
my $waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
wait_until => $wait_until,
waiting => $waiting,
}});
if ($return_code)
while($waiting)
{
# What?! Fail out, we're done.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0256", variables => {
#my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start --all";
my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
$anvil->nice_exit({exit_code => 1});
if ($return_code)
{
# Are we done waiting?
if (time > $wait_until)
{
# We're done.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0256", variables => {
output => $output,
return_code => $return_code,
}});
$anvil->nice_exit({exit_code => 1});
}
else
{
# Report the error and sleep
my $time_left = $wait_until - time;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "alert", key => "warning_0153", variables => {
output => $output,
return_code => $return_code,
time_left => $time_left,
waiting => 10,
}});
sleep 10;
}
}
else
{
# Success!
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
}
### TODO: We may implement the logic to fence our peer (similar to cman's post_join_delay'
@ -309,7 +342,7 @@ sub start_pacemaker
# the peer and, if the fence succeeds, unblock quorum.
my $start_time = time;
my $wait_for_peer = $start_time + 120;
my $waiting = 1;
$waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
start_time => $start_time,
wait_for_peer => $wait_for_peer,

Loading…
Cancel
Save