From a11b87458e21ad1478e7349982207735e6d93839 Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 20 Oct 2023 19:15:04 -0400 Subject: [PATCH] Gracefully handle errors from changed node host names in scan-cluster. Signed-off-by: digimer --- scancore-agents/scan-cluster/scan-cluster | 19 ++++++++++++++----- scancore-agents/scan-cluster/scan-cluster.xml | 1 + 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/scancore-agents/scan-cluster/scan-cluster b/scancore-agents/scan-cluster/scan-cluster index dac1623c..a074d669 100755 --- a/scancore-agents/scan-cluster/scan-cluster +++ b/scancore-agents/scan-cluster/scan-cluster @@ -418,7 +418,7 @@ sub check_if_server_failed my ($anvil, $server) = @_; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { server => $server }}); - $anvil->Cluster->parse_crm_mon({debug => 3}); + $anvil->Cluster->parse_crm_mon({debug => 2}); my $failed = exists $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed} ? $anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}{$server}{variables}{failed} : 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { failed => $failed }}); if ($failed eq "true") @@ -708,15 +708,24 @@ INSERT INTO $anvil->Database->get_anvils(); foreach my $scan_cluster_node_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}}) { - my $scan_cluster_node_host_uuid = $anvil->Get->host_uuid_from_name({host_name => $scan_cluster_node_name}); + my $scan_cluster_node_host_uuid = $anvil->Get->host_uuid_from_name({host_name => $scan_cluster_node_name}) // ""; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + scan_cluster_node_name => $scan_cluster_node_name, + scan_cluster_node_host_uuid => $scan_cluster_node_host_uuid, + }}); + if (not $scan_cluster_node_host_uuid) + { + # Something is wrong with this host. 
Does the host name match the node name? + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0016", variables => { node_name => $scan_cluster_node_name }}); + next; + } + my $scan_cluster_node_pacemaker_id = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{pacemaker_id}; my $scan_cluster_node_in_ccm = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{in_ccm}; my $scan_cluster_node_crmd_member = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{crmd}; my $scan_cluster_node_cluster_member = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{'join'}; my $scan_cluster_node_maintenance_mode = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{'maintenance-mode'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - scan_cluster_node_name => $scan_cluster_node_name, - scan_cluster_node_host_uuid => $scan_cluster_node_host_uuid, scan_cluster_node_pacemaker_id => $scan_cluster_node_pacemaker_id, scan_cluster_node_in_ccm => $scan_cluster_node_in_ccm, scan_cluster_node_crmd_member => $scan_cluster_node_crmd_member, @@ -1044,7 +1053,7 @@ sub collect_data my ($anvil) = @_; # Pick out core cluster details. - my $problem = $anvil->Cluster->parse_cib({debug => 3}); + my $problem = $anvil->Cluster->parse_cib({debug => 2}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); # If there was a problem, we're not in the cluster. diff --git a/scancore-agents/scan-cluster/scan-cluster.xml b/scancore-agents/scan-cluster/scan-cluster.xml index ba2b7869..218df643 100644 --- a/scancore-agents/scan-cluster/scan-cluster.xml +++ b/scancore-agents/scan-cluster/scan-cluster.xml @@ -40,6 +40,7 @@ In Maintenance Mode: ..... 
[#!variable!maintenance_mode!#] The server: [#!variable!server!#] was found to be failed in pacemaker, but it was successfully recovered. This does NOT mean the server rebooted, but it may have. Checking the server is advised. The server: [#!variable!server!#] was found to be failed in pacemaker. The attempt to recover it appears to have failed. The server might well still be running ok, checking the server is advised. The server: [#!variable!server!#] had been found to be failed in pacemaker. It's now recovered. This does NOT mean the server rebooted, but it may have. Checking the server is advised. + The node name: [#!variable!node_name!#] failed to translate to a host UUID. Does the node name match the host name? Starting: [#!variable!program!#].