Updated scan-cluster to check for FAILED resources (servers) and, if any are found, attempt to recover them.

Signed-off-by: digimer <mkelly@alteeve.ca>
main
digimer 2 years ago
parent 1afa7ce09e
commit 83aa4e6a5f
  1. 129
      scancore-agents/scan-cluster/scan-cluster
  2. 12
      scancore-agents/scan-cluster/scan-cluster.xml

@ -117,20 +117,18 @@ sub check_resources
foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}}) foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}})
{ {
my $failed = exists $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed} ? $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed} : 0; # This is used for alerts, if needed below.
if ($failed eq "true") my $variables = { server => $server };
{
$failed = 1; my $failed = check_if_server_failed($anvil, $server);
} $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
elsif ($failed eq "false") server => $server,
{ failed => $failed,
$failed = 0; }});
}
print "Server: [".$server."], failed? [".$failed."]\n";
if ($failed) if ($failed)
{ {
# Who am I and who is my peer? See if the server is running on either host. # Who am I and who is my peer? See if the server is running on either host.
print "- Checking if it's safe to recover!\n"; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_log_0003", variables => { server => $server }});
my $attempt_recovery = 0; my $attempt_recovery = 0;
my $server_found = 0; my $server_found = 0;
my $both_nodes_ready = 1; my $both_nodes_ready = 1;
@ -139,7 +137,11 @@ sub check_resources
my $node_ready = $anvil->data->{cib}{parsed}{$target}{ready}; my $node_ready = $anvil->data->{cib}{parsed}{$target}{ready};
my $node_name = $anvil->data->{cib}{parsed}{$target}{name}; my $node_name = $anvil->data->{cib}{parsed}{$target}{name};
my $host_uuid = $anvil->Get->host_uuid_from_name({host_name => $node_name}); my $host_uuid = $anvil->Get->host_uuid_from_name({host_name => $node_name});
print "- Searching node: [".$node_name." (".$host_uuid.")] which is in ready state: [".$node_ready."]\n"; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_log_0004", variables => {
node_name => $node_name,
host_uuid => $host_uuid,
node_ready => $node_ready,
}});
if (not $node_ready) if (not $node_ready)
{ {
$both_nodes_ready = 1; $both_nodes_ready = 1;
@ -149,13 +151,13 @@ sub check_resources
{ {
# Search for the server here # Search for the server here
$anvil->Server->find({debug => 2}); $anvil->Server->find({debug => 2});
print "- Searching for the server on the local system.\n"; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_log_0005"});
} }
else else
{ {
# Search for the server on the peer. # Search for the server on the peer.
my $target_ip = $anvil->Network->find_target_ip({host_uuid => $host_uuid}); my $target_ip = $anvil->Network->find_target_ip({host_uuid => $host_uuid});
print "- Searching for the server on the peer using IP: [".$target_ip."]\n"; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_log_0006", variables => { target_ip => $target_ip }});
$anvil->Server->find({ $anvil->Server->find({
debug => 2, debug => 2,
target => $target_ip, target => $target_ip,
@ -163,11 +165,10 @@ sub check_resources
} }
my $server_host = defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : ""; my $server_host = defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : "";
my $server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : ""; my $server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => {
server_host => $server_host, server_host => $server_host,
server_status => $server_status, server_status => $server_status,
}}); }});
print "- Host: [".$server_host."], status: [".$server_status."]\n";
if ($server_host) if ($server_host)
{ {
$server_found = 1; $server_found = 1;
@ -175,7 +176,7 @@ sub check_resources
if (($node_ready) && ($host_uuid eq $anvil->Get->host_uuid)) if (($node_ready) && ($host_uuid eq $anvil->Get->host_uuid))
{ {
# Go ahead with recovery # Go ahead with recovery
print "The server is running locally and we're a full cluster member. Will attempt recover.\n"; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_log_0007"});
$attempt_recovery = 1; $attempt_recovery = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { attempt_recovery => $attempt_recovery }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { attempt_recovery => $attempt_recovery }});
} }
@ -184,21 +185,87 @@ sub check_resources
if ((not $server_found) && ($both_nodes_ready)) if ((not $server_found) && ($both_nodes_ready))
{ {
print "Both nodes are up and the server wasn't found anywhere. Attempting recovery.\n"; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_log_0008"});
$attempt_recovery = 1; $attempt_recovery = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { attempt_recovery => $attempt_recovery }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { attempt_recovery => $attempt_recovery }});
} }
elsif (($server_found) && (not $attempt_recovery)) elsif (($server_found) && (not $attempt_recovery))
{ {
print "The server was found to be running, but not here (or this node is not fully in the cluster). NOT attempting recovery yet.\n"; # The server was found to be running, but not here (or this node is not fully in the cluster). NOT attempting recovery yet.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_log_0009"});
} }
elsif ($attempt_recovery) elsif ($attempt_recovery)
{ {
print "Attempting recovery now...\n"; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_log_0010"});
$anvil->Cluster->recover_server({ $anvil->Cluster->recover_server({
debug => 2, debug => 2,
server => $server, server => $server,
}); });
# It'll leave 'failed state' for a bit, so we need to wait.
sleep 3;
my $wait_until = time + 10;
my $waiting = 1;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_log_0011"});
while($waiting)
{
my $failed = check_if_server_failed($anvil, $server);
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }});
if ($failed)
{
# No luck...
$waiting = 0;
my $changed = $anvil->Alert->check_alert_sent({
record_locator => "scan_cluster::failed_server::".$server,
set_by => $THIS_FILE,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
waiting => $waiting,
changed => $changed,
}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_alert_0014", variables => $variables});
if ($changed)
{
# Send an alert.
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0014", variables => $variables, set_by => $THIS_FILE});
}
}
elsif (time > $wait_until)
{
# Success!
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_alert_0013", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0013", variables => $variables, set_by => $THIS_FILE});
# Clear the alert, if it existed before
$waiting = 0;
my $changed = $anvil->Alert->check_alert_sent({
record_locator => "scan_cluster::failed_server::".$server,
set_by => $THIS_FILE,
clear => 1,
});
}
else
{
# Wait a sec
sleep 2;
}
}
}
}
else
{
# Make sure that this server wasn't previously failed.
my $changed = $anvil->Alert->check_alert_sent({
record_locator => "scan_cluster::failed_server::".$server,
set_by => $THIS_FILE,
clear => 1,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { changed => $changed }});
if ($changed)
{
# Send the All-good alert.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_alert_0015", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0015", variables => $variables, set_by => $THIS_FILE});
} }
} }
} }
@ -206,6 +273,28 @@ sub check_resources
return(0); return(0);
} }
# Reports whether the given server (pacemaker resource) is flagged as failed.
#
# Parameters;
#   $anvil  - The Anvil::Tools handle used for logging, parsing and data access.
#   $server - The name of the server (resource) to look up.
#
# Returns '1' when the resource's 'failed' variable is the string "true" and '0' when it is "false" or
# when the variable doesn't exist at all. Any other value is handed back to the caller unchanged.
sub check_if_server_failed
{
	my ($anvil, $server) = @_;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { server => $server }});
	
	# Re-run the 'crm_mon' parse before reading the state (presumably this refreshes the cached data
	# read below - confirm against Cluster->parse_crm_mon).
	$anvil->Cluster->parse_crm_mon({debug => 3});
	
	# Default to "not failed" when the resource carries no 'failed' variable.
	my $failed = 0;
	if (exists $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed})
	{
		$failed = $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed};
	}
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { failed => $failed }});
	
	# Convert the textual boolean into a numeric flag. Anything that is neither "true" nor "false" is
	# returned exactly as it was read, without the second log entry.
	if ($failed eq "true")
	{
		$failed = 1;
	}
	elsif ($failed eq "false")
	{
		$failed = 0;
	}
	else
	{
		return($failed);
	}
	
	# Only reached when the value was normalised above.
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { failed => $failed }});
	return($failed);
}
# Check to see if we need to move the fence delay. # Check to see if we need to move the fence delay.
sub check_fence_delay sub check_fence_delay
{ {

@ -37,10 +37,22 @@ In Maintenance Mode: ..... [#!variable!maintenance_mode!#]
#!variable!difference!# #!variable!difference!#
==== ====
</key> </key>
<key name="scan_cluster_alert_0013">The server: [#!variable!server!#] was found to be failed in pacemaker, but it was successfully recovered. This does NOT mean the server rebooted, but it may have. Checking the server is advised.</key>
<key name="scan_cluster_alert_0014">The server: [#!variable!server!#] was found to be failed in pacemaker. The attempt to recover it appears to have failed. The server might well still be running ok, checking the server is advised.</key>
<key name="scan_cluster_alert_0015">The server: [#!variable!server!#] had been found to be failed in pacemaker. It's now recovered. This does NOT mean the server rebooted, but it may have. Checking the server is advised.</key>
<!-- Log entries --> <!-- Log entries -->
<key name="scan_cluster_log_0001">Starting: [#!variable!program!#].</key> <key name="scan_cluster_log_0001">Starting: [#!variable!program!#].</key>
<key name="scan_cluster_log_0002">This host is a: [#!variable!host_type!#], this agent is only useful on nodes. Exiting.</key> <key name="scan_cluster_log_0002">This host is a: [#!variable!host_type!#], this agent is only useful on nodes. Exiting.</key>
<key name="scan_cluster_log_0003">[ Warning ] - The server: [#!variable!server!#] is in a FAILED state! Checking to see if it's safe to attempt recovery.</key>
<key name="scan_cluster_log_0004">Searching node: [#!variable!node_name!# (#!variable!host_uuid!#)] which is in ready state: [#!variable!node_ready!#].</key>
<key name="scan_cluster_log_0005">Searching for the server on the local system.</key>
<key name="scan_cluster_log_0006">Searching for the server on the peer using IP: [#!variable!target_ip!#].</key>
<key name="scan_cluster_log_0007">The server is running locally and we're a full cluster member. Will attempt recovery.</key>
<key name="scan_cluster_log_0008">Both nodes are up and the server wasn't found anywhere. Attempting recovery.</key>
<key name="scan_cluster_log_0009">The server was found to be running, but not here (or this node is not fully in the cluster). NOT attempting recovery yet.</key>
<key name="scan_cluster_log_0010">Attempting recovery now...</key>
<key name="scan_cluster_log_0011">Checking to see if the server has recovered yet...</key>
<!-- Message entries (usually meant to be alerts) --> <!-- Message entries (usually meant to be alerts) -->
<key name="scan_cluster_message_0001"></key> <key name="scan_cluster_message_0001"></key>

Loading…
Cancel
Save