From 920d38d250318a47aca4574f4292e151412cc051 Mon Sep 17 00:00:00 2001 From: Digimer Date: Thu, 17 Feb 2022 13:56:29 -0500 Subject: [PATCH 1/2] Moved anvil-configure-host reboot logging to use log_0687 to help grep for reboot causes. Signed-off-by: Digimer --- share/words.xml | 1 + tools/anvil-configure-host | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/share/words.xml b/share/words.xml index dcf027fa..092fcb75 100644 --- a/share/words.xml +++ b/share/words.xml @@ -2085,6 +2085,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is: Kernel updated, reboot queued. Requested to power-off as part of the anvil-safe-stop job. The anvil-safe-stop job has completed and will now power off. + The anvil-configure-host tool is requesting a reboot. The host name: [#!variable!target!#] does not resolve to an IP address. diff --git a/tools/anvil-configure-host b/tools/anvil-configure-host index 18b25b76..a9c8fbb0 100755 --- a/tools/anvil-configure-host +++ b/tools/anvil-configure-host @@ -113,6 +113,7 @@ sub do_reboot my ($anvil) = @_; # Mark that a reboot is needed, in case something kills us before we actually reboot. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0687", variables => { reason => "log_0693" }}); my ($job_uuid) = $anvil->Database->insert_or_update_jobs({ file => $THIS_FILE, line => __LINE__, @@ -1233,7 +1234,7 @@ sub reconfigure_network # If we should reset, do so now. if ($anvil->data->{sys}{reboot}) { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, 'print' => 1, key => "log_0631"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, 'print' => 1, key => "log_0687", variables => { reason => "log_0631" }}); do_reboot($anvil); } @@ -1241,7 +1242,7 @@ sub reconfigure_network { # In an attempt to make network changes more reliable, we'll just reboot. 
This shouldn't # actually be hit anymore as any change should have triggered the reboot above. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, 'print' => 1, key => "log_0631"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, 'print' => 1, key => "log_0687", variables => { reason => "log_0631" }}); do_reboot($anvil); # # Re-read the config From 3c0435a45512186c410c897e9aa7b8380c613d4f Mon Sep 17 00:00:00 2001 From: Digimer Date: Fri, 18 Feb 2022 22:37:43 -0500 Subject: [PATCH 2/2] * Updated ocf:alteeve:server to better handle starting up DRBD resources before trying to boot a VM. Signed-off-by: Digimer --- notes | 6 + ocf/alteeve/server | 284 +++++++++++++++++++++++++++++++++------------ share/words.xml | 4 + 3 files changed, 222 insertions(+), 72 deletions(-) diff --git a/notes b/notes index 85a9cc5a..d29efd84 100644 --- a/notes +++ b/notes @@ -312,6 +312,12 @@ pcs constraint location srv01-test prefers el8-a01n01=200 el8-a01n02=100 stonith-max-attempts=INFINITY cluster-recheck-interval puts an upper bound on the "i give up" time +==== + +pcs resource create srv01-cs8 ocf:alteeve:server name="srv01-cs8" meta allow-migrate="true" target-role="stopped" op monitor interval="60" start timeout="INFINITY" on-fail="block" stop timeout="INFINITY" on-fail="block" migrate_to timeout="INFINITY" +pcs constraint location srv01-cs8 prefers mk-a02n01=200 mk-a02n02=100 + + ==== DRBD notes * resources can contain an US-ASCII character, except for spaces diff --git a/ocf/alteeve/server b/ocf/alteeve/server index 3205f661..83606215 100755 --- a/ocf/alteeve/server +++ b/ocf/alteeve/server @@ -799,22 +799,23 @@ sub start_drbd_resource peer => $peer, }}); + ### TODO: Local start up below, move the peer check to have local startup is handled # Do we need startup? 
- my $startup_needed = 0; + my $local_startup_needed = 0; $anvil->DRBD->get_status({debug => 3}); foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) { - # Is the current resource up locally already? If it is, we're done. + # Is the current resource up locally already? my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - resource => $resource, - role => $role, + 's1:resource' => $resource, + 's2:role' => $role, }}); if ((lc($role) ne "secondary") && (lc($role) ne "primary")) { - $startup_needed = 1; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $startup_needed }}); + $local_startup_needed = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }}); last; } else @@ -826,95 +827,234 @@ sub start_drbd_resource } } - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $startup_needed }}); - if (not $startup_needed) + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }}); + if ($local_startup_needed) { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0431"}); - return(0); + foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) + { + my $peer_ip = $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address}; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0419", variables => { + server => $server, + peer => $peer, + peer_ip => $peer_ip, + resource => $resource, + }}); + + # Bring the local resource up + 
$anvil->DRBD->manage_resource({ + resource => $resource, + task => "up", + }); + + # Bring the peer's resource up. + $anvil->DRBD->manage_resource({ + resource => $resource, + task => "up", + target => $peer_ip, + }); + + # Now wait for it to be connected or UpToDate... + my $waiting = 1; + while($waiting) + { + $anvil->DRBD->get_status({debug => 3}); + + print "==] ".$local_host." [==] ".$resource." [==] ".$peer." [==\n"; + print Dumper $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$peer}; + print "=========================================================\n"; + + my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$peer}{'connection-state'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { connection_state => $connection_state }}); + + my $all_ready = 1; + foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}}) + { + my $disk_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'}; + my $replication_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$peer}{volume}{$volume}{'replication-state'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + disk_state => $disk_state, + replication_state => $replication_state, + }}); + + # If the peer isn't connected (directly or by being in Sync), or this volume + # isn't UpToDate, we need to keep waiting. 
+ if ((lc($disk_state) ne "uptodate") && ($replication_state !~ /^Sync/i) && (lc($connection_state) ne "connected")) + { + $all_ready = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }}); + } + } + + die; + + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }}); + if ($all_ready) + { + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + + if ($waiting) + { + sleep 1; + } + } + } + + # If auto-promote isn't set, promote the resource. + if (not $anvil->data->{drbd}{config}{$local_host}{'auto-promote'}) + { + foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0420", variables => { + server => $server, + resource => $resource, + }}); + # Make the local resource primary. + $anvil->DRBD->manage_resource({ + resource => $resource, + task => "primary", + }); + } + } } - # Start DRBD locally. + # See if we're inconsistent and, if so, if we can connect our peers. + sleep 5; + $anvil->DRBD->get_status({debug => 3}); + my $peer_startup_needed = 0; foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) { - my $peer_ip = $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address}; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0419", variables => { - server => $server, - peer => $peer, - peer_ip => $peer_ip, - resource => $resource, + # Is the current resource up locally already? + my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? 
$anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : ""; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:resource' => $resource, + 's2:role' => $role, }}); - # Bring the local resource up - $anvil->DRBD->manage_resource({ - resource => $resource, - task => "up", - }); - - # Bring the peer's resource up. - $anvil->DRBD->manage_resource({ - resource => $resource, - task => "up", - target => $peer_ip, - }); + # Check all volumes. + foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}}) + { + my $disk_state = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} : ""; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disk_state => $disk_state }}); - # Now wait for it to be connected or UpToDate... - my $waiting = 1; - while($waiting) + if ((lc($disk_state) eq "consistent") or + (lc($disk_state) eq "outdated") or + (lc($disk_state) eq "failed") or + (not $disk_state)) + { + # This will trigger trying to ssh into peer(s) and up'ing their resource. + $peer_startup_needed = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }}); + last; + } + } + } + + # Do we need to start the resource on our peers? + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }}); + if (not $peer_startup_needed) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0431"}); + return(0); + } + + # Start DRBD on the peer(s). 
+ foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }}); + foreach my $host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}}) { - $anvil->DRBD->get_status({debug => 3}); - - my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer}{'connection-state'}; + my $is_local = $anvil->Network->is_local({host => $host}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - connection_state => $connection_state, + 's1:host' => $host, + 's2:is_local' => $is_local, }}); - my $all_ready = 1; - foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}}) + my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$host}{'connection-state'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { connection_state => $connection_state }}); + + if (lc($connection_state) ne "connected") { - my $disk_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'}; - my $replication_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer}{volume}{$volume}{'replication-state'}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - disk_state => $disk_state, - replication_state => $replication_state, + # Try to connect to the peer and up this resource. 
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0694", variables => { + host => $host, + resource => $resource, + connection_state => $connection_state, }}); - - # Is the peer isn't connected (directly or by being in Sync), or this volume - # isn't UpToDate, we need to keep waiting. - if ((lc($disk_state) ne "uptodate") && ($replication_state !~ /^Sync/i) && (lc($connection_state) ne "connected")) + my ($access) = $anvil->Remote->test_access({target => $host}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }}); + if ($access) { - $all_ready = 0; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }}); + my ($output, $error, $return_code) = $anvil->Remote->call({ + target => $host, + shell_call => $anvil->data->{path}{exe}{drbdadm}." up ".$resource, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0695", variables => { + return_code => $return_code, + error => $error, + output => $output, + }}); + } + else + { + # No access + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0136", variables => { host => $host }}); } - } - - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }}); - if ($all_ready) - { - $waiting = 0; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); - } - - if ($waiting) - { - sleep 1; } } } - # If auto-promote isn't set, promote the resource. 
- if (not $anvil->data->{drbd}{config}{$host}{'auto-promote'}) + # Loop until all our resources are Connected or UpToDate + my $waiting = 1; + my $wait_until = time + 30; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:time' => time, + 's2:wait_until' => $wait_until, + }}); + while($waiting) { + sleep 5; + my $all_connected = 1; + $anvil->DRBD->get_status({debug => 3}); foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) { - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0420", variables => { - server => $server, - resource => $resource, - }}); - # Make the local resource primary. - $anvil->DRBD->manage_resource({ - resource => $resource, - task => "primary", - }); + foreach my $host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}}) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host => $host }}); + next if $anvil->Network->is_local({host => $host}); + foreach my $connection (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}}) + { + my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{'connection-state'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + connection => $connection, + connection_state => $connection_state, + }}); + + if (lc($connection_state) ne "connected") + { + $all_connected = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_connected => $all_connected }}); + } + } + } + } + + if ($all_connected) + { + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + elsif (time > $wait_until) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0137"}); + $waiting = 0; + 
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); } } diff --git a/share/words.xml b/share/words.xml index 092fcb75..36a0661a 100644 --- a/share/words.xml +++ b/share/words.xml @@ -2086,6 +2086,8 @@ The file: [#!variable!file!#] needs to be updated. The difference is: Requested to power-off as part of the anvil-safe-stop job. The anvil-safe-stop job has completed and will now power off. The anvil-configure-host tool is requesting a reboot. + The connection to: [#!variable!host!#] for the resource: [#!variable!resource!#] is in the connection state: [#!variable!connection_state!#]. Will try to connect to the peer and up the resource now. + The request to start the resource had the return code: [#!variable!return_code!#]. Call output, if any, was: [#!variable!output!#]. Errors, if any, were: [#!variable!error!#]. The host name: [#!variable!target!#] does not resolve to an IP address. @@ -3117,6 +3119,8 @@ We will sleep a bit and try again. [ Warning ] - Table: [history.#!variable!table!#] not found. [ Warning ] - Holding off starting the cluster. Tested access to ourself, and failed. Is '/etc/hosts' populated? Will try again in ten seconds. [ Warning ] - The program: [#!variable!program!#] was not found to be running. + [ Warning ] - Failed to connect to the host: [#!variable!host!#]! Unable to up the resource, so the server may not start. If the peer can't be recovered, manually forcing the local resource(s) to UpToDate may be required. + [ Warning ] - Timed out waiting for the connections to the peers, and the local resource(s) is not in 'UpToDate' state. Booting the server will likely fail.