* Updated anvil-provision-server to handle startup when the peer doesn't create/connect its DRBD resource (ie: node is offline).

Signed-off-by: digimer <digimer@gravitar.alteeve.com>
main
digimer 2 years ago
parent a5cee52153
commit b666caec64
  1. 3
      notes
  2. 9
      share/words.xml
  3. 101
      tools/anvil-provision-server

@ -1,4 +1,7 @@
Add 'lsof' to Required
When pairing Striker, make sure new config goes to all known nodes!
Immediately set drbdadm to 'secondary' after 'primary --force'

@ -571,6 +571,7 @@ The definition data passed in was:
<key name="error_0396">[ Error ] - The Anvil! UUID: [#!variable!uuid!#] was not found.</key>
<key name="error_0397">[ Error ] - The DR link UUID: [#!variable!uuid!#] was not found.</key>
<key name="error_0398">[ Error ] - There was a problem processing the requested network: [#!variable!network!#]. Details should be logged.</key>
<key name="error_0399">[ Error ] - It looks like the new device: [#!variable!resource!#] failed to appear. Unable to proceed.</key>
<!-- Files templates -->
<!-- NOTE: Translating these files requires an understanding of which lines are translatable -->
@ -1227,7 +1228,7 @@ Use the 'Short ID' that best matches your OS.</key>
<key name="job_0215">The server has been flagged as deleted now.</key>
<key name="job_0216">The server delete is complete on this host!</key>
<key name="job_0217">It looks like ScanCore has not yet run on one or both nodes in this Anvil! system. Missing resource data, so unable to proceed.</key>
<key name="job_0218">Manually calling 'scan-drbd' to ensure that the new agent is recorded.</key>
<key name="job_0218">Manually calling 'scan-drbd' to ensure that the new resource is recorded.</key>
<key name="job_0219">The server name: [#!variable!server_name!#] is already used by another server.</key>
<key name="job_0220">Deleting the server's definition file: [#!variable!file!#]...</key>
<key name="job_0221">The server: [#!variable!server_name!#] was not found in the cluster configuration. This can happen if a server was partially deleted and we're trying again.</key>
@ -1476,6 +1477,12 @@ Note: This is a permanent action! If you protect this server again later, a full
<key name="job_0428">The server: [#!variable!server!#] is still running two minutes after asking it to stop. It might have woken up on the first press and ignored the shutdown request (Hi Windows). Pressing the power button again.</key>
<key name="job_0429">Copying the Long-throw (drbd proxy) license file: [#!variable!file!#] into place.</key>
<key name="job_0430">The fence device: [#!variable!device!#] no longer has a port associated with it, will remove it.</key>
<key name="job_0431">Calling drbdadm adjust to load the new resource, then waiting for the DRBD device to appear.</key>
<key name="job_0432">The new DRBD resource: [#!variable!resource!#] now exists!</key>
<key name="job_0433">Still waiting for the new DRBD resource: [#!variable!resource!#] to appear...</key>
<key name="job_0434">Waiting for up to a minute to see if the peer connects before provisioning the server.</key>
<key name="job_0435">One or more peer disk state or roles are 'unknown', waiting: [#!variable!waiting!#] seconds longer.</key>
<key name="job_0436">Peer disk state or role still unknown after one minute, proceeding without it.</key>
<!-- Log entries -->
<key name="log_0001">Starting: [#!variable!program!#].</key>

@ -706,6 +706,16 @@ sub provision_server
return_code => $return_code,
}});
# Call an adjust in case we forced the resource to disable fencing.
$shell_call = $anvil->data->{path}{exe}{drbdadm}." adjust ".$anvil->data->{job}{server_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
return(0);
}
@ -758,8 +768,8 @@ sub startup_resource
task => "up",
});
# If both sides are Inconsistent, for node 1 to primary
# If both sides are Inconsistent, force node 1 to primary. If we're primary and the peer is
# Unknown, force to primary.
my $waiting = 1;
while($waiting)
{
@ -924,8 +934,93 @@ sub startup_resource
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
if ($all_ready)
{
# If we're here, we're mostly ready, however we're in a bit of a spot if our
# peer hasn't connected. We're going to wait up to one minute for the peer to
# connect. If it doesn't, we're going to force back to primary.
$anvil->Job->update_progress({
progress => 59,
message => "job_0434",
});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0434"});
my $wait_until = time + 60;
my $resource = $anvil->data->{job}{server_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:time' => time,
's2:wait_until' => $wait_until,
's3:resource' => $resource,
}});
while ($waiting)
{
$anvil->DRBD->gather_data({debug => 3});
my $any_unknown = 0;
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{new}{resource}{$resource}{volume}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { volume => $volume }});
foreach my $peer (sort {$a cmp $b} keys %{$anvil->data->{new}{resource}{$resource}{volume}{$volume}{peer}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer => $peer }});
if (not defined $anvil->data->{new}{resource}{$resource}{volume}{$volume}{peer}{$peer}{local_disk_state})
{
$any_unknown = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { volume => $volume }});
next;
}
my $local_disk_state = $anvil->data->{new}{resource}{$resource}{volume}{$volume}{peer}{$peer}{local_disk_state};
my $peer_disk_state = $anvil->data->{new}{resource}{$resource}{volume}{$volume}{peer}{$peer}{peer_disk_state};
my $local_role = $anvil->data->{new}{resource}{$resource}{volume}{$volume}{peer}{$peer}{local_role};
my $peer_role = $anvil->data->{new}{resource}{$resource}{volume}{$volume}{peer}{$peer}{peer_role};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:local_disk_state' => $local_disk_state,
's2:peer_disk_state' => $peer_disk_state,
's3:local_role' => $local_role,
's4:peer_role' => $peer_role,
}});
if (($peer_disk_state =~ /unknown/i) or ($peer_role =~ /unknown/i))
{
$any_unknown = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { volume => $volume }});
}
last if $any_unknown;
}
last if $any_unknown;
}
if ($any_unknown)
{
if (time > $wait_until)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0436"});
my $shell_call = $anvil->data->{path}{exe}{drbdsetup}." net-options ".$anvil->data->{job}{server_name}." ".$anvil->data->{job}{drbd_peer_node_id}." --set-defaults --_name=".$anvil->data->{job}{peer_short_name}." --protocol=C --fencing=dont-care";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
else
{
my $waiting = $wait_until - time;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0435", variables => { waiting => $waiting }});
sleep 5;
}
}
else
{
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
}
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
if ($waiting)

Loading…
Cancel
Save