diff --git a/Anvil/Tools/DRBD.pm b/Anvil/Tools/DRBD.pm
index 247be8e4..3fd91272 100644
--- a/Anvil/Tools/DRBD.pm
+++ b/Anvil/Tools/DRBD.pm
@@ -1648,7 +1648,7 @@ sub get_status
($output, $anvil->data->{drbd}{status}{$host}{return_code}) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
output => $output,
- "drbd::status::${host}::return_code" => $anvil->data->{drbd}{status}{return_code},
+ "drbd::status::${host}::return_code" => $anvil->data->{drbd}{status}{$host}{return_code},
}});
}
else
@@ -1666,7 +1666,7 @@ sub get_status
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
error => $error,
output => $output,
- "drbd::status::${host}::return_code" => $anvil->data->{drbd}{status}{return_code},
+ "drbd::status::${host}::return_code" => $anvil->data->{drbd}{status}{$host}{return_code},
}});
}
@@ -1913,12 +1913,14 @@ sub manage_resource
### can block startup, so to be safe, during start, we'll call adjust
if ($task eq "up")
{
+ # This generally brings up the resource
my $shell_call = $anvil->data->{path}{exe}{drbdadm}." adjust ".$resource;
my $output = "";
my $return_code = 255;
if ($anvil->Network->is_local({host => $target}))
{
# Local.
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
output => $output,
@@ -1928,6 +1930,7 @@ sub manage_resource
else
{
# Remote call.
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
($output, my $error, $return_code) = $anvil->Remote->call({
debug => $debug,
shell_call => $shell_call,
@@ -1944,12 +1947,15 @@ sub manage_resource
}
}
+ # If we 'adjust'ed above, this will likely complain that the backing disk already exists, and that's
+ # fine.
my $shell_call = $anvil->data->{path}{exe}{drbdadm}." ".$task." ".$resource;
my $output = "";
my $return_code = 255;
if ($anvil->Network->is_local({host => $target}))
{
# Local.
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
output => $output,
@@ -1959,6 +1965,7 @@ sub manage_resource
else
{
# Remote call.
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
($output, my $error, $return_code) = $anvil->Remote->call({
debug => $debug,
shell_call => $shell_call,
diff --git a/notes b/notes
index 85a9cc5a..d29efd84 100644
--- a/notes
+++ b/notes
@@ -312,6 +312,12 @@ pcs constraint location srv01-test prefers el8-a01n01=200 el8-a01n02=100
stonith-max-attempts=INFINITY
cluster-recheck-interval puts an upper bound on the "i give up" time
+====
+
+pcs resource create srv01-cs8 ocf:alteeve:server name="srv01-cs8" meta allow-migrate="true" target-role="stopped" op monitor interval="60" start timeout="INFINITY" on-fail="block" stop timeout="INFINITY" on-fail="block" migrate_to timeout="INFINITY"
+pcs constraint location srv01-cs8 prefers mk-a02n01=200 mk-a02n02=100
+
+
==== DRBD notes
* resources can contain an US-ASCII character, except for spaces
diff --git a/ocf/alteeve/server b/ocf/alteeve/server
index 3205f661..b8c8a129 100755
--- a/ocf/alteeve/server
+++ b/ocf/alteeve/server
@@ -792,29 +792,27 @@ sub start_drbd_resource
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $host = $anvil->Get->short_host_name;
- my $peer = $anvil->data->{drbd}{config}{$host}{peer};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
- server => $server,
- host => $host,
- peer => $peer,
+ server => $server,
+ host => $host,
}});
# Do we need startup?
- my $startup_needed = 0;
+ my $local_startup_needed = 0;
$anvil->DRBD->get_status({debug => 3});
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
- # Is the current resource up locally already? If it is, we're done.
+ # Is the current resource up locally already?
my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
- resource => $resource,
- role => $role,
+ 's1:resource' => $resource,
+ 's2:role' => $role,
}});
if ((lc($role) ne "secondary") && (lc($role) ne "primary"))
{
- $startup_needed = 1;
- $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $startup_needed }});
+ $local_startup_needed = 1;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }});
last;
}
else
@@ -826,95 +824,219 @@ sub start_drbd_resource
}
}
- $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $startup_needed }});
- if (not $startup_needed)
+ # Do I need to start the DRBD resource locally? If so, do so.
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }});
+ if ($local_startup_needed)
{
- $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0431"});
- return(0);
+ foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
+ {
+ # Bring the local resource up
+ $anvil->DRBD->manage_resource({
+ debug => 2,
+ resource => $resource,
+ task => "up",
+ });
+
+ # Now wait for it to come up.
+ my $waiting = 1;
+ my $wait_until = time + 10;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+ 's1:time' => time,
+ 's2:wait_until' => $wait_until,
+ }});
+ while($waiting)
+ {
+ $anvil->DRBD->get_status({debug => 3});
+
+ my $all_up = 1;
+ foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}})
+ {
+ my $disk_state = lc($anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'});
+ $disk_state = "" if not defined $disk_state;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+ 's1:resource' => $resource,
+ 's2:volume' => $volume,
+ 's3:disk_state' => $disk_state,
+ }});
+
+ if (($disk_state ne "inconsistent") &&
+ ($disk_state ne "outdated") &&
+ ($disk_state ne "consistent") &&
+ ($disk_state ne "uptodate"))
+ {
+ $all_up = 0;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_up => $all_up }});
+ }
+ }
+
+ if ($all_up)
+ {
+ $waiting = 0;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
+ }
+ if ($waiting)
+ {
+ sleep 2;
+ }
+ elsif (time > $wait_until)
+ {
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0138"});
+ $waiting = 0;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
+ }
+ }
+ }
+
+ # If auto-promote isn't set, promote the resource.
+ if (not $anvil->data->{drbd}{config}{$local_host}{'auto-promote'})
+ {
+ foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
+ {
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0420", variables => {
+ server => $server,
+ resource => $resource,
+ }});
+ # Make the local resource primary.
+ $anvil->DRBD->manage_resource({
+ resource => $resource,
+ task => "primary",
+ });
+ }
+ }
}
- # Start DRBD locally.
+ # See if we're inconsistent and, if so, if we can connect our peers.
+ sleep 2;
+ $anvil->DRBD->get_status({debug => 3});
+ my $peer_startup_needed = 0;
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
- my $peer_ip = $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address};
- $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0419", variables => {
- server => $server,
- peer => $peer,
- peer_ip => $peer_ip,
- resource => $resource,
+ # Is the current resource up locally already?
+ my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : "";
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+ 's1:resource' => $resource,
+ 's2:role' => $role,
}});
- # Bring the local resource up
- $anvil->DRBD->manage_resource({
- resource => $resource,
- task => "up",
- });
-
- # Bring the peer's resource up.
- $anvil->DRBD->manage_resource({
- resource => $resource,
- task => "up",
- target => $peer_ip,
- });
+ # Check all volumes.
+ foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}})
+ {
+ my $disk_state = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} : "";
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disk_state => $disk_state }});
- # Now wait for it to be connected or UpToDate...
- my $waiting = 1;
- while($waiting)
+ if ((lc($disk_state) eq "consistent") or
+ (lc($disk_state) eq "outdated") or
+ (lc($disk_state) eq "failed") or
+ (not $disk_state))
+ {
+ # This will trigger trying to ssh into peer(s) and up'ing their resource.
+ $peer_startup_needed = 1;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }});
+ last;
+ }
+ }
+ }
+
+ # Do we need to start the resource on our peers?
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }});
+ if (not $peer_startup_needed)
+ {
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0431"});
+ return(0);
+ }
+
+ # Start DRBD on the peer(s).
+ foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
+ {
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }});
+ foreach my $host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}})
{
- $anvil->DRBD->get_status({debug => 3});
-
- my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer}{'connection-state'};
+ my $is_local = $anvil->Network->is_local({host => $host});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
- connection_state => $connection_state,
+ 's1:host' => $host,
+ 's2:is_local' => $is_local,
}});
- my $all_ready = 1;
- foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}})
+ my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$host}{'connection-state'};
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { connection_state => $connection_state }});
+
+ if (lc($connection_state) ne "connected")
{
- my $disk_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'};
- my $replication_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer}{volume}{$volume}{'replication-state'};
- $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
- disk_state => $disk_state,
- replication_state => $replication_state,
+ # Try to connect to the peer and up this resource.
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0694", variables => {
+ host => $host,
+ resource => $resource,
+ connection_state => $connection_state,
}});
-
- # Is the peer isn't connected (directly or by being in Sync), or this volume
- # isn't UpToDate, we need to keep waiting.
- if ((lc($disk_state) ne "uptodate") && ($replication_state !~ /^Sync/i) && (lc($connection_state) ne "connected"))
+ my ($access) = $anvil->Remote->test_access({target => $host});
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }});
+ if ($access)
{
- $all_ready = 0;
- $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0695", variables => {
+ host => $host,
+ resource => $resource,
+ }});
+ $anvil->DRBD->manage_resource({
+ debug => 2,
+ resource => $resource,
+ task => "up",
+ target => $host,
+ });
+ }
+ else
+ {
+ # No access
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0136", variables => { host => $host }});
}
- }
-
- $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
- if ($all_ready)
- {
- $waiting = 0;
- $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
- }
-
- if ($waiting)
- {
- sleep 1;
}
}
}
- # If auto-promote isn't set, promote the resource.
- if (not $anvil->data->{drbd}{config}{$host}{'auto-promote'})
+ # Loop until all our resources are Connected or UpToDate
+ my $waiting = 1;
+ my $wait_until = time + 30;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+ 's1:time' => time,
+ 's2:wait_until' => $wait_until,
+ }});
+ while($waiting)
{
+ sleep 5;
+ my $all_connected = 1;
+ $anvil->DRBD->get_status({debug => 3});
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
- $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0420", variables => {
- server => $server,
- resource => $resource,
- }});
- # Make the local resource primary.
- $anvil->DRBD->manage_resource({
- resource => $resource,
- task => "primary",
- });
+ foreach my $host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}})
+ {
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host => $host }});
+ next if $anvil->Network->is_local({host => $host});
+ foreach my $connection (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}})
+ {
+ my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{'connection-state'};
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+ connection => $connection,
+ connection_state => $connection_state,
+ }});
+
+ if (lc($connection_state) ne "connected")
+ {
+ $all_connected = 0;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_connected => $all_connected }});
+ }
+ }
+ }
+ }
+
+ if ($all_connected)
+ {
+ $waiting = 0;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
+ }
+ elsif (time > $wait_until)
+ {
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0137"});
+ $waiting = 0;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
}
diff --git a/share/words.xml b/share/words.xml
index dcf027fa..e77a2309 100644
--- a/share/words.xml
+++ b/share/words.xml
@@ -2085,6 +2085,10 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
Kernel updated, reboot queued.
Requested to power-off as part of the anvil-safe-stop job.
The anvil-safe-stop job has completed and will now power off.
+ The anvil-configure-host tool is requesting a reboot.
+ The connection to: [#!variable!host!#] for the resource: [#!variable!resource!#] is in the connection state: [#!variable!connection_state!#]. Will try to connect to the peer and up the resource now.
+ About to request the start of the resource: [#!variable!resource!#] on: [#!variable!host!#].
+ The peer: [#!variable!peer!#] is defined in the resource: [#!variable!resource!#] but we don't connect to it, ignoring it.
The host name: [#!variable!target!#] does not resolve to an IP address.
@@ -3116,6 +3120,9 @@ We will sleep a bit and try again.
[ Warning ] - Table: [history.#!variable!table!#] not found.
[ Warning ] - Holding off starting the cluster. Tested access to ourself, and failed. Is '/etc/hosts' populated? Will try again in ten seconds.
[ Warning ] - The program: [#!variable!program!#] was not found to be running.
+ [ Warning ] - Failed to connect to the host: [#!variable!host!#]! Unable to up the resource, so the server may not start. If the peer can't be recovered, manually forcing the local resource(s) to UpToDate may be required.
+ [ Warning ] - Timed out waiting for the connections to the peers, and the local resource(s) is not in 'UpToDate' state. Booting the server will likely fail.
+ [ Warning ] - Timed out waiting for the connections to the peers.
diff --git a/tools/anvil-configure-host b/tools/anvil-configure-host
index 18b25b76..a9c8fbb0 100755
--- a/tools/anvil-configure-host
+++ b/tools/anvil-configure-host
@@ -113,6 +113,7 @@ sub do_reboot
my ($anvil) = @_;
# Mark that a reboot is needed, in case something kills us before we actually reboot.
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0687", variables => { reason => "log_0693" }});
my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
file => $THIS_FILE,
line => __LINE__,
@@ -1233,7 +1234,7 @@ sub reconfigure_network
# If we should reset, do so now.
if ($anvil->data->{sys}{reboot})
{
- $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, 'print' => 1, key => "log_0631"});
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, 'print' => 1, key => "log_0687", variables => { reason => "log_0631" }});
do_reboot($anvil);
}
@@ -1241,7 +1242,7 @@ sub reconfigure_network
{
# In an attempt to make network changes more reliable, we'll just reboot. This shouldn't
# actually be hit anymore as any change should have triggered the reboot above.
- $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, 'print' => 1, key => "log_0631"});
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, 'print' => 1, key => "log_0687", variables => { reason => "log_0631" }});
do_reboot($anvil);
# # Re-read the config