* Created Cluster->recover_server() that uses crm_resource to try to recover a server that has entered a FAILED state.

* Updated (but not yet completed) scan-cluster's check_resources() function to check if a FAILED server is ready to try to recover.

Signed-off-by: digimer <mkelly@alteeve.ca>
main
digimer 2 years ago
parent f9689a7106
commit 1afa7ce09e
  1. 44
      Anvil/Tools/Cluster.pm
  2. 4
      notes
  3. 99
      scancore-agents/scan-cluster/scan-cluster

@ -35,6 +35,7 @@ my $THIS_FILE = "Cluster.pm";
# parse_cib
# parse_crm_mon
# parse_quorum
# recover_server
# shutdown_server
# start_cluster
# which_node
@ -4268,6 +4269,49 @@ sub parse_quorum
} }
=head2 recover_server

This tries to recover a C<< FAILED >> resource (server) by calling C<< crm_resource --resource <server> --refresh >>.

If the C<< server >> parameter is not passed, or if C<< crm_resource >> exits with a non-zero return code, C<< !!error!! >> is returned. On success, C<< 0 >> is returned.

Parameters;

=head3 server (required)

This is the server (resource) name to try to recover.

=cut
sub recover_server
{
	my $self      = shift;
	my $parameter = shift;
	my $anvil     = $self->parent;
	my $debug     = defined $parameter->{debug} ? $parameter->{debug} : 3;
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Cluster->recover_server()" }});
	
	my $server = defined $parameter->{server} ? $parameter->{server} : "";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
		server => $server,
	}});
	
	# Without a resource name there is nothing to refresh.
	if (not $server)
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0020", variables => { method => "Cluster->recover_server()", parameter => "server" }});
		return("!!error!!");
	}
	
	# Ask pacemaker to refresh the resource's state.
	my $shell_call = $anvil->data->{path}{exe}{crm_resource}." --resource ".$server." --refresh";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
	
	my ($output, $return_code) = $anvil->System->call({debug => $debug, shell_call => $shell_call});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
		output      => $output,
		return_code => $return_code,
	}});
	
	# A non-zero return code means the refresh request failed. Report that to the caller instead of 
	# claiming success.
	if ($return_code)
	{
		return("!!error!!");
	}
	
	return(0);
}
=head2 shutdown_server
This shuts down a server that is running on the Anvil! system. If there is a problem, C<< !!error!! >> is returned. On success, C<< 0 >> is returned.

@ -20,6 +20,10 @@ Common queries;
# Fail a resource for testing purposes.
crm_resource --fail --resource srv02-b -N vm-a01n01
# Recover without reboot
crm_resource --resource srv01-a --refresh
uname -r; grubby --default-kernel; lsinitrd -m /boot/initramfs-4.18.0-448.el8.x86_64.img | grep lvm; systemctl is-enabled scancore.service;
dnf -y update; systemctl disable --now anvil-daemon; systemctl disable --now scancore

@ -99,6 +99,9 @@ check_config($anvil);
# Check the fence delay
check_fence_delay($anvil);

# Check for failed resources
check_resources($anvil);

# Shut down.
$anvil->ScanCore->agent_shutdown({agent => $THIS_FILE});
@ -107,6 +110,102 @@ $anvil->ScanCore->agent_shutdown({agent => $THIS_FILE});
# Functions # # Functions #
############################################################################################################# #############################################################################################################
# This looks for failed resources and, if found, tries to recover them.
# 
# For each resource that crm_mon flags as failed, the 'local' and 'peer' nodes are searched for the server. 
# Recovery is attempted when either;
# - The server was found and this host is a ready cluster member, or
# - The server was not found on either node and both nodes are ready.
# 
# Parameters;
# - $anvil: The Anvil::Tools handle.
# 
# This always returns '0'.
sub check_resources
{
	my ($anvil) = @_;
	
	foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}})
	{
		# The 'failed' variable can hold the strings 'true' or 'false', convert those to 1 or 0.
		my $failed = exists $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed} ? $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed} : 0;
		if ($failed eq "true")
		{
			$failed = 1;
		}
		elsif ($failed eq "false")
		{
			$failed = 0;
		}
		print "Server: [".$server."], failed? [".$failed."]\n";
		
		# Nothing to do for healthy resources.
		if (not $failed)
		{
			next;
		}
		
		# Who am I and who is my peer? See if the server is running on either host.
		print "- Checking if it's safe to recover!\n";
		my $attempt_recovery = 0;
		my $server_found     = 0;
		my $both_nodes_ready = 1;
		foreach my $target ("local", "peer")
		{
			my $node_ready = $anvil->data->{cib}{parsed}{$target}{ready};
			my $node_name  = $anvil->data->{cib}{parsed}{$target}{name};
			my $host_uuid  = $anvil->Get->host_uuid_from_name({host_name => $node_name});
			my $is_local   = $host_uuid eq $anvil->Get->host_uuid ? 1 : 0;
			print "- Searching node: [".$node_name." (".$host_uuid.")] which is in ready state: [".$node_ready."]\n";
			
			# If either node is not ready, we can't trust a "server not found" result.
			if (not $node_ready)
			{
				$both_nodes_ready = 0;
			}
			
			if ($is_local)
			{
				# Search for the server here
				$anvil->Server->find({debug => 2});
				print "- Searching for the server on the local system.\n";
			}
			else
			{
				# Search for the server on the peer.
				my $target_ip = $anvil->Network->find_target_ip({host_uuid => $host_uuid});
				print "- Searching for the server on the peer using IP: [".$target_ip."]\n";
				$anvil->Server->find({
					debug  => 2,
					target => $target_ip,
				});
			}
			
			my $server_host   = defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : "";
			my $server_status = defined $anvil->data->{server}{location}{$server}{status}    ? $anvil->data->{server}{location}{$server}{status}    : "";
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				server_host   => $server_host,
				server_status => $server_status,
			}});
			print "- Host: [".$server_host."], status: [".$server_status."]\n";
			
			if ($server_host)
			{
				$server_found = 1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_found => $server_found }});
				
				if (($node_ready) && ($is_local))
				{
					# Go ahead with recovery
					print "The server is running locally and we're a full cluster member. Will attempt recover.\n";
					$attempt_recovery = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { attempt_recovery => $attempt_recovery }});
				}
			}
		}
		
		# Decide if recovery should go ahead.
		if ((not $server_found) && ($both_nodes_ready))
		{
			print "Both nodes are up and the server wasn't found anywhere. Attempting recovery.\n";
			$attempt_recovery = 1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { attempt_recovery => $attempt_recovery }});
		}
		elsif (($server_found) && (not $attempt_recovery))
		{
			print "The server was found to be running, but not here (or this node is not fully in the cluster). NOT attempting recovery yet.\n";
		}
		
		# This is deliberately a separate check, not part of the chain above, so that recovery runs 
		# no matter which of the two cases set 'attempt_recovery'.
		if ($attempt_recovery)
		{
			print "Attempting recovery now...\n";
			$anvil->Cluster->recover_server({
				debug  => 2,
				server => $server,
			});
		}
	}
	
	return(0);
}
# Check to see if we need to move the fence delay. # Check to see if we need to move the fence delay.
sub check_fence_delay sub check_fence_delay
{ {

Loading…
Cancel
Save