Gracefully handle errors from changed node host names in scan-cluster.

Signed-off-by: digimer <mkelly@alteeve.ca>
main
digimer 1 year ago
parent a35e790a4d
commit a11b87458e
  1. 19
      scancore-agents/scan-cluster/scan-cluster
  2. 1
      scancore-agents/scan-cluster/scan-cluster.xml

@ -418,7 +418,7 @@ sub check_if_server_failed
my ($anvil, $server) = @_; my ($anvil, $server) = @_;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { server => $server }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { server => $server }});
$anvil->Cluster->parse_crm_mon({debug => 3}); $anvil->Cluster->parse_crm_mon({debug => 2});
my $failed = exists $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed} ? $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed} : 0; my $failed = exists $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed} ? $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed} : 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { failed => $failed }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { failed => $failed }});
if ($failed eq "true") if ($failed eq "true")
@ -708,15 +708,24 @@ INSERT INTO
$anvil->Database->get_anvils(); $anvil->Database->get_anvils();
foreach my $scan_cluster_node_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}}) foreach my $scan_cluster_node_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}})
{ {
my $scan_cluster_node_host_uuid = $anvil->Get->host_uuid_from_name({host_name => $scan_cluster_node_name}); my $scan_cluster_node_host_uuid = $anvil->Get->host_uuid_from_name({host_name => $scan_cluster_node_name}) // "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
scan_cluster_node_name => $scan_cluster_node_name,
scan_cluster_node_host_uuid => $scan_cluster_node_host_uuid,
}});
if (not $scan_cluster_node_host_uuid)
{
# Something is wrong with this host. Does the hostname match to node name?
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0016", variables => { node_name => $scan_cluster_node_name }});
next;
}
my $scan_cluster_node_pacemaker_id = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{pacemaker_id}; my $scan_cluster_node_pacemaker_id = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{pacemaker_id};
my $scan_cluster_node_in_ccm = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{in_ccm}; my $scan_cluster_node_in_ccm = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{in_ccm};
my $scan_cluster_node_crmd_member = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{crmd}; my $scan_cluster_node_crmd_member = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{crmd};
my $scan_cluster_node_cluster_member = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{'join'}; my $scan_cluster_node_cluster_member = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{'join'};
my $scan_cluster_node_maintenance_mode = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{'maintenance-mode'}; my $scan_cluster_node_maintenance_mode = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{'maintenance-mode'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
scan_cluster_node_name => $scan_cluster_node_name,
scan_cluster_node_host_uuid => $scan_cluster_node_host_uuid,
scan_cluster_node_pacemaker_id => $scan_cluster_node_pacemaker_id, scan_cluster_node_pacemaker_id => $scan_cluster_node_pacemaker_id,
scan_cluster_node_in_ccm => $scan_cluster_node_in_ccm, scan_cluster_node_in_ccm => $scan_cluster_node_in_ccm,
scan_cluster_node_crmd_member => $scan_cluster_node_crmd_member, scan_cluster_node_crmd_member => $scan_cluster_node_crmd_member,
@ -1044,7 +1053,7 @@ sub collect_data
my ($anvil) = @_; my ($anvil) = @_;
# Pick out core cluster details. # Pick out core cluster details.
my $problem = $anvil->Cluster->parse_cib({debug => 3}); my $problem = $anvil->Cluster->parse_cib({debug => 2});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
# If there was a problem, we're not in the cluster. # If there was a problem, we're not in the cluster.

@ -40,6 +40,7 @@ In Maintenance Mode: ..... [#!variable!maintenance_mode!#]
<key name="scan_cluster_alert_0013">The server: [#!variable!server!#] was found to be failed in pacemaker, but it was successfully recovered. This does NOT mean the server rebooted, but it may have. Checking the server is advised.</key> <key name="scan_cluster_alert_0013">The server: [#!variable!server!#] was found to be failed in pacemaker, but it was successfully recovered. This does NOT mean the server rebooted, but it may have. Checking the server is advised.</key>
<key name="scan_cluster_alert_0014">The server: [#!variable!server!#] was found to be failed in pacemaker. The attempt to recover it appears to have failed. The server might well still be running ok, checking the server is advised.</key> <key name="scan_cluster_alert_0014">The server: [#!variable!server!#] was found to be failed in pacemaker. The attempt to recover it appears to have failed. The server might well still be running ok, checking the server is advised.</key>
<key name="scan_cluster_alert_0015">The server: [#!variable!server!#] had been found to be failed in pacemaker. It's now recovered. This does NOT mean the server rebooted, but it may have. Checking the server is advised.</key> <key name="scan_cluster_alert_0015">The server: [#!variable!server!#] had been found to be failed in pacemaker. It's now recovered. This does NOT mean the server rebooted, but it may have. Checking the server is advised.</key>
<key name="scan_cluster_alert_0016">The node name: [#!variable!node_name!#] failed to translate to a host UUID. Does the node name match the host name?</key>
<!-- Log entries --> <!-- Log entries -->
<key name="scan_cluster_log_0001">Starting: [#!variable!program!#].</key> <key name="scan_cluster_log_0001">Starting: [#!variable!program!#].</key>

Loading…
Cancel
Save