* Updated ocf:alteeve:server to better handle starting up DRBD resources before trying to boot a VM.

Signed-off-by: Digimer <digimer@alteeve.ca>
main
Digimer 3 years ago
parent 87a2454a09
commit e62e5d7b0c
  1. 6
      notes
  2. 280
      ocf/alteeve/server
  3. 4
      share/words.xml

@ -312,6 +312,12 @@ pcs constraint location srv01-test prefers el8-a01n01=200 el8-a01n02=100
stonith-max-attempts=INFINITY
cluster-recheck-interval puts an upper bound on the "i give up" time
====
pcs resource create srv01-cs8 ocf:alteeve:server name="srv01-cs8" meta allow-migrate="true" target-role="stopped" op monitor interval="60" start timeout="INFINITY" on-fail="block" stop timeout="INFINITY" on-fail="block" migrate_to timeout="INFINITY"
pcs constraint location srv01-cs8 prefers mk-a02n01=200 mk-a02n02=100
==== DRBD notes
* resource names can contain any US-ASCII character, except for spaces

@ -799,22 +799,23 @@ sub start_drbd_resource
peer => $peer,
}});
### TODO: Local start up is below; move the peer check so that it runs after local startup is handled.
# Do we need startup?
my $startup_needed = 0;
my $local_startup_needed = 0;
$anvil->DRBD->get_status({debug => 3});
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
# Is the current resource up locally already? If it is, we're done.
# Is the current resource up locally already?
my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
resource => $resource,
role => $role,
's1:resource' => $resource,
's2:role' => $role,
}});
if ((lc($role) ne "secondary") && (lc($role) ne "primary"))
{
$startup_needed = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $startup_needed }});
$local_startup_needed = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }});
last;
}
else
@ -826,95 +827,234 @@ sub start_drbd_resource
}
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $startup_needed }});
if (not $startup_needed)
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }});
if ($local_startup_needed)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0431"});
return(0);
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
my $peer_ip = $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0419", variables => {
server => $server,
peer => $peer,
peer_ip => $peer_ip,
resource => $resource,
}});
# Bring the local resource up
$anvil->DRBD->manage_resource({
resource => $resource,
task => "up",
});
# Bring the peer's resource up.
$anvil->DRBD->manage_resource({
resource => $resource,
task => "up",
target => $peer_ip,
});
# Now wait for it to be connected or UpToDate...
my $waiting = 1;
while($waiting)
{
$anvil->DRBD->get_status({debug => 3});
print "==] ".$local_host." [==] ".$resource." [==] ".$peer." [==\n";
print Dumper $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$peer};
print "=========================================================\n";
my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$peer}{'connection-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { connection_state => $connection_state }});
my $all_ready = 1;
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}})
{
my $disk_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'};
my $replication_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$peer}{volume}{$volume}{'replication-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
disk_state => $disk_state,
replication_state => $replication_state,
}});
# If the peer isn't connected (directly or by being in Sync), or this volume
# isn't UpToDate, we need to keep waiting.
if ((lc($disk_state) ne "uptodate") && ($replication_state !~ /^Sync/i) && (lc($connection_state) ne "connected"))
{
$all_ready = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
}
}
die;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
if ($all_ready)
{
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
if ($waiting)
{
sleep 1;
}
}
}
# If auto-promote isn't set, promote the resource.
if (not $anvil->data->{drbd}{config}{$local_host}{'auto-promote'})
{
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0420", variables => {
server => $server,
resource => $resource,
}});
# Make the local resource primary.
$anvil->DRBD->manage_resource({
resource => $resource,
task => "primary",
});
}
}
}
# Start DRBD locally.
# See if we're inconsistent and, if so, if we can connect our peers.
sleep 5;
$anvil->DRBD->get_status({debug => 3});
my $peer_startup_needed = 0;
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
my $peer_ip = $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0419", variables => {
server => $server,
peer => $peer,
peer_ip => $peer_ip,
resource => $resource,
# Is the current resource up locally already?
my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:resource' => $resource,
's2:role' => $role,
}});
# Bring the local resource up
$anvil->DRBD->manage_resource({
resource => $resource,
task => "up",
});
# Check all volumes.
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}})
{
my $disk_state = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disk_state => $disk_state }});
# Bring the peer's resource up.
$anvil->DRBD->manage_resource({
resource => $resource,
task => "up",
target => $peer_ip,
});
if ((lc($disk_state) eq "consistent") or
(lc($disk_state) eq "outdated") or
(lc($disk_state) eq "failed") or
(not $disk_state))
{
# This will trigger trying to ssh into peer(s) and up'ing their resource.
$peer_startup_needed = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }});
last;
}
}
}
# Now wait for it to be connected or UpToDate...
my $waiting = 1;
while($waiting)
{
$anvil->DRBD->get_status({debug => 3});
# Do we need to start the resource on our peers?
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }});
if (not $peer_startup_needed)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0431"});
return(0);
}
my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer}{'connection-state'};
# Start DRBD on the peer(s).
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }});
foreach my $host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}})
{
my $is_local = $anvil->Network->is_local({host => $host});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
connection_state => $connection_state,
's1:host' => $host,
's2:is_local' => $is_local,
}});
my $all_ready = 1;
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}})
my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$host}{'connection-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { connection_state => $connection_state }});
if (lc($connection_state) ne "connected")
{
my $disk_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'};
my $replication_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer}{volume}{$volume}{'replication-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
disk_state => $disk_state,
replication_state => $replication_state,
# Try to connect to the peer and up this resource.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0694", variables => {
host => $host,
resource => $resource,
connection_state => $connection_state,
}});
# If the peer isn't connected (directly or by being in Sync), or this volume
# isn't UpToDate, we need to keep waiting.
if ((lc($disk_state) ne "uptodate") && ($replication_state !~ /^Sync/i) && (lc($connection_state) ne "connected"))
my ($access) = $anvil->Remote->test_access({target => $host});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }});
if ($access)
{
my ($output, $error, $return_code) = $anvil->Remote->call({
target => $host,
shell_call => $anvil->data->{path}{exe}{drbdadm}." up ".$resource,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0695", variables => {
return_code => $return_code,
error => $error,
output => $output,
}});
}
else
{
$all_ready = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
# No access
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0136", variables => { host => $host }});
}
}
}
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
if ($all_ready)
# Loop until all our resources are Connected or UpToDate
my $waiting = 1;
my $wait_until = time + 30;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:time' => time,
's2:wait_until' => $wait_until,
}});
while($waiting)
{
sleep 5;
my $all_connected = 1;
$anvil->DRBD->get_status({debug => 3});
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
foreach my $host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}})
{
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host => $host }});
next if $anvil->Network->is_local({host => $host});
foreach my $connection (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}})
{
my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{'connection-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
connection => $connection,
connection_state => $connection_state,
}});
if ($waiting)
{
sleep 1;
if (lc($connection_state) ne "connected")
{
$all_connected = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_connected => $all_connected }});
}
}
}
}
}
# If auto-promote isn't set, promote the resource.
if (not $anvil->data->{drbd}{config}{$host}{'auto-promote'})
{
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
if ($all_connected)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0420", variables => {
server => $server,
resource => $resource,
}});
# Make the local resource primary.
$anvil->DRBD->manage_resource({
resource => $resource,
task => "primary",
});
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
elsif (time > $wait_until)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0137"});
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
}

@ -2086,6 +2086,8 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0691">Requested to power-off as part of the anvil-safe-stop job.</key>
<key name="log_0692">The anvil-safe-stop job has completed and will now power off.</key>
<key name="log_0693">The anvil-configure-host tool is requesting a reboot.</key>
<key name="log_0694">The connection to: [#!variable!host!#] for the resource: [#!variable!resource!#] is in the connection state: [#!variable!connection_state!#]. Will try to connect to the peer and up the resource now.</key>
<key name="log_0695">The request to start the resource had the return code: [#!variable!return_code!#]. Call output, if any, was: [#!variable!output!#]. Errors, if any, were: [#!variable!error!#].</key>
<!-- Messages for users (less technical than log entries), though sometimes used for logs, too. -->
<key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key>
@ -3117,6 +3119,8 @@ We will sleep a bit and try again.
<key name="warning_0133">[ Warning ] - Table: [history.#!variable!table!#] not found.</key>
<key name="warning_0134">[ Warning ] - Holding off starting the cluster. Tested access to ourself, and failed. Is '/etc/hosts' populated? Will try again in ten seconds.</key>
<key name="warning_0135">[ Warning ] - The program: [#!variable!program!#] was not found to be running.</key>
<key name="warning_0136">[ Warning ] - Failed to connect to the host: [#!variable!host!#]! Unable to up the resource, so the server may not start. If the peer can't be recovered, manually forcing the local resource(s) to UpToDate may be required.</key>
<key name="warning_0137">[ Warning ] - Timed out waiting for the connections to the peers, and the local resource(s) is not in 'UpToDate' state. Booting the server will likely fail.</key>
<!-- The entries below here are not sequential, but use a key to find the entry. -->
<!-- Run 'striker-parse-os-list to find new entries. -->

Loading…
Cancel
Save