* Updated ocf:alteeve:server to better handle starting up DRBD resources before trying to boot a VM.

Signed-off-by: Digimer <digimer@alteeve.ca>
main
Digimer 3 years ago
parent 920d38d250
commit 3c0435a455
  1. 6
      notes
  2. 284
      ocf/alteeve/server
  3. 4
      share/words.xml

@ -312,6 +312,12 @@ pcs constraint location srv01-test prefers el8-a01n01=200 el8-a01n02=100
stonith-max-attempts=INFINITY stonith-max-attempts=INFINITY
cluster-recheck-interval puts an upper bound on the "i give up" time cluster-recheck-interval puts an upper bound on the "i give up" time
====
pcs resource create srv01-cs8 ocf:alteeve:server name="srv01-cs8" meta allow-migrate="true" target-role="stopped" op monitor interval="60" start timeout="INFINITY" on-fail="block" stop timeout="INFINITY" on-fail="block" migrate_to timeout="INFINITY"
pcs constraint location srv01-cs8 prefers mk-a02n01=200 mk-a02n02=100
==== DRBD notes ==== DRBD notes
* resources can contain any US-ASCII character, except for spaces * resources can contain any US-ASCII character, except for spaces

@ -799,22 +799,23 @@ sub start_drbd_resource
peer => $peer, peer => $peer,
}}); }});
### TODO: Local start up is below; move the peer check to after local startup is handled
# Do we need startup? # Do we need startup?
my $startup_needed = 0; my $local_startup_needed = 0;
$anvil->DRBD->get_status({debug => 3}); $anvil->DRBD->get_status({debug => 3});
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{ {
# Is the current resource up locally already? If it is, we're done. # Is the current resource up locally already?
my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : ""; my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
resource => $resource, 's1:resource' => $resource,
role => $role, 's2:role' => $role,
}}); }});
if ((lc($role) ne "secondary") && (lc($role) ne "primary")) if ((lc($role) ne "secondary") && (lc($role) ne "primary"))
{ {
$startup_needed = 1; $local_startup_needed = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $startup_needed }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }});
last; last;
} }
else else
@ -826,95 +827,234 @@ sub start_drbd_resource
} }
} }
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $startup_needed }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }});
if (not $startup_needed) if ($local_startup_needed)
{ {
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0431"}); foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
return(0); {
my $peer_ip = $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0419", variables => {
server => $server,
peer => $peer,
peer_ip => $peer_ip,
resource => $resource,
}});
# Bring the local resource up
$anvil->DRBD->manage_resource({
resource => $resource,
task => "up",
});
# Bring the peer's resource up.
$anvil->DRBD->manage_resource({
resource => $resource,
task => "up",
target => $peer_ip,
});
# Now wait for it to be connected or UpToDate...
my $waiting = 1;
while($waiting)
{
$anvil->DRBD->get_status({debug => 3});
print "==] ".$local_host." [==] ".$resource." [==] ".$peer." [==\n";
print Dumper $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$peer};
print "=========================================================\n";
my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$peer}{'connection-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { connection_state => $connection_state }});
my $all_ready = 1;
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}})
{
my $disk_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'};
my $replication_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$peer}{volume}{$volume}{'replication-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
disk_state => $disk_state,
replication_state => $replication_state,
}});
# If the peer isn't connected (directly or by being in Sync), or this volume
# isn't UpToDate, we need to keep waiting.
if ((lc($disk_state) ne "uptodate") && ($replication_state !~ /^Sync/i) && (lc($connection_state) ne "connected"))
{
$all_ready = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
}
}
die;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
if ($all_ready)
{
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
if ($waiting)
{
sleep 1;
}
}
}
# If auto-promote isn't set, promote the resource.
if (not $anvil->data->{drbd}{config}{$local_host}{'auto-promote'})
{
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0420", variables => {
server => $server,
resource => $resource,
}});
# Make the local resource primary.
$anvil->DRBD->manage_resource({
resource => $resource,
task => "primary",
});
}
}
} }
# Start DRBD locally. # See if we're inconsistent and, if so, if we can connect our peers.
sleep 5;
$anvil->DRBD->get_status({debug => 3});
my $peer_startup_needed = 0;
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{ {
my $peer_ip = $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address}; # Is the current resource up locally already?
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0419", variables => { my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : "";
server => $server, $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
peer => $peer, 's1:resource' => $resource,
peer_ip => $peer_ip, 's2:role' => $role,
resource => $resource,
}}); }});
# Bring the local resource up # Check all volumes.
$anvil->DRBD->manage_resource({ foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}})
resource => $resource, {
task => "up", my $disk_state = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} : "";
}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disk_state => $disk_state }});
# Bring the peer's resource up.
$anvil->DRBD->manage_resource({
resource => $resource,
task => "up",
target => $peer_ip,
});
# Now wait for it to be connected or UpToDate... if ((lc($disk_state) eq "consistent") or
my $waiting = 1; (lc($disk_state) eq "outdated") or
while($waiting) (lc($disk_state) eq "failed") or
(not $disk_state))
{
# This will trigger trying to ssh into peer(s) and up'ing their resource.
$peer_startup_needed = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }});
last;
}
}
}
# Do we need to start the resource on our peers?
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }});
if (not $peer_startup_needed)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0431"});
return(0);
}
# Start DRBD on the peer(s).
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }});
foreach my $host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}})
{ {
$anvil->DRBD->get_status({debug => 3}); my $is_local = $anvil->Network->is_local({host => $host});
my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer}{'connection-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
connection_state => $connection_state, 's1:host' => $host,
's2:is_local' => $is_local,
}}); }});
my $all_ready = 1; my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$host}{'connection-state'};
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}}) $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { connection_state => $connection_state }});
if (lc($connection_state) ne "connected")
{ {
my $disk_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'}; # Try to connect to the peer and up this resource.
my $replication_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer}{volume}{$volume}{'replication-state'}; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0694", variables => {
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host => $host,
disk_state => $disk_state, resource => $resource,
replication_state => $replication_state, connection_state => $connection_state,
}}); }});
my ($access) = $anvil->Remote->test_access({target => $host});
# If the peer isn't connected (directly or by being in Sync), or this volume $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }});
# isn't UpToDate, we need to keep waiting. if ($access)
if ((lc($disk_state) ne "uptodate") && ($replication_state !~ /^Sync/i) && (lc($connection_state) ne "connected"))
{ {
$all_ready = 0; my ($output, $error, $return_code) = $anvil->Remote->call({
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }}); target => $host,
shell_call => $anvil->data->{path}{exe}{drbdadm}." up ".$resource,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0695", variables => {
return_code => $return_code,
error => $error,
output => $output,
}});
}
else
{
# No access
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0136", variables => { host => $host }});
} }
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
if ($all_ready)
{
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
if ($waiting)
{
sleep 1;
} }
} }
} }
# If auto-promote isn't set, promote the resource. # Loop until all our resources are Connected or UpToDate
if (not $anvil->data->{drbd}{config}{$host}{'auto-promote'}) my $waiting = 1;
my $wait_until = time + 30;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:time' => time,
's2:wait_until' => $wait_until,
}});
while($waiting)
{ {
sleep 5;
my $all_connected = 1;
$anvil->DRBD->get_status({debug => 3});
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{ {
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0420", variables => { foreach my $host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}})
server => $server, {
resource => $resource, $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host => $host }});
}}); next if $anvil->Network->is_local({host => $host});
# Make the local resource primary. foreach my $connection (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}})
$anvil->DRBD->manage_resource({ {
resource => $resource, my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{'connection-state'};
task => "primary", $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
}); connection => $connection,
connection_state => $connection_state,
}});
if (lc($connection_state) ne "connected")
{
$all_connected = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_connected => $all_connected }});
}
}
}
}
if ($all_connected)
{
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
elsif (time > $wait_until)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0137"});
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
} }
} }

@ -2086,6 +2086,8 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0691">Requested to power-off as part of the anvil-safe-stop job.</key> <key name="log_0691">Requested to power-off as part of the anvil-safe-stop job.</key>
<key name="log_0692">The anvil-safe-stop job has completed and will now power off.</key> <key name="log_0692">The anvil-safe-stop job has completed and will now power off.</key>
<key name="log_0693">The anvil-configure-host tool is requesting a reboot.</key> <key name="log_0693">The anvil-configure-host tool is requesting a reboot.</key>
<key name="log_0694">The connection to: [#!variable!host!#] for the resource: [#!variable!resource!#] is in the connection state: [#!variable!connection_state!#]. Will try to connect to the peer and up the resource now.</key>
<key name="log_0695">The request to start the resource had the return code: [#!variable!return_code!#]. Call output, if any, was: [#!variable!output!#]. Errors, if any, were: [#!variable!error!#].</key>
<!-- Messages for users (less technical than log entries), though sometimes used for logs, too. --> <!-- Messages for users (less technical than log entries), though sometimes used for logs, too. -->
<key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key> <key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key>
@ -3117,6 +3119,8 @@ We will sleep a bit and try again.
<key name="warning_0133">[ Warning ] - Table: [history.#!variable!table!#] not found.</key> <key name="warning_0133">[ Warning ] - Table: [history.#!variable!table!#] not found.</key>
<key name="warning_0134">[ Warning ] - Holding off starting the cluster. Tested access to ourself, and failed. Is '/etc/hosts' populated? Will try again in ten seconds.</key> <key name="warning_0134">[ Warning ] - Holding off starting the cluster. Tested access to ourself, and failed. Is '/etc/hosts' populated? Will try again in ten seconds.</key>
<key name="warning_0135">[ Warning ] - The program: [#!variable!program!#] was not found to be running.</key> <key name="warning_0135">[ Warning ] - The program: [#!variable!program!#] was not found to be running.</key>
<key name="warning_0136">[ Warning ] - Failed to connect to the host: [#!variable!host!#]! Unable to up the resource, so the server may not start. If the peer can't be recovered, manually forcing the local resource(s) to UpToDate may be required.</key>
<key name="warning_0137">[ Warning ] - Timed out waiting for the connections to the peers, and the local resource(s) is not in 'UpToDate' state. Booting the server will likely fail.</key>
<!-- The entries below here are not sequential, but use a key to find the entry. --> <!-- The entries below here are not sequential, but use a key to find the entry. -->
<!-- Run 'striker-parse-os-list' to find new entries. --> <!-- Run 'striker-parse-os-list' to find new entries. -->

Loading…
Cancel
Save