From 7e7b91b2866c561541ded7ce45e0acff88ed4c4e Mon Sep 17 00:00:00 2001 From: Digimer Date: Wed, 14 Jul 2021 12:17:19 -0400 Subject: [PATCH] * Updates anvil-join-anvil to update corosync.conf to use the BCN1 link as the main knet network with the SN1 link as the backup link. * Fixed a bug in Cluster->parse_cib() where the local machine's ready state was being set to the node name. Signed-off-by: Digimer --- Anvil/Tools/Cluster.pm | 2 +- scancore-agents/scan-network/scan-network | 12 +- scancore-agents/scan-network/scan-network.xml | 1 + share/words.xml | 16 +- tools/anvil-join-anvil | 317 +++++++++++++++++- 5 files changed, 343 insertions(+), 5 deletions(-) diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index 68b443b3..21f1524e 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -3319,7 +3319,7 @@ sub parse_cib (($target_short_host_name) && ($node_name =~ /^$target_short_host_name/))) { # Me (or the node the CIB was read from). - $anvil->data->{cib}{parsed}{'local'}{ready} = $node_name; + $anvil->data->{cib}{parsed}{'local'}{ready} = $ready; $anvil->data->{cib}{parsed}{'local'}{name} = $node_name; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "cib::parsed::local::ready" => $anvil->data->{cib}{parsed}{'local'}{ready}, diff --git a/scancore-agents/scan-network/scan-network b/scancore-agents/scan-network/scan-network index 4c16e0cd..5bad0021 100755 --- a/scancore-agents/scan-network/scan-network +++ b/scancore-agents/scan-network/scan-network @@ -55,6 +55,7 @@ $anvil->Get->switches; if (($anvil->data->{scancore}{'scan-hardware'}{disable}) && (not $anvil->data->{switches}{force})) { # Exit. 
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, 'print' => 1, key => "log_0646", variables => { program => $THIS_FILE }}); $anvil->nice_exit({exit_code => 0}); } @@ -1594,10 +1595,17 @@ sub check_interfaces new => $new_mac_address, }; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_network_alert_0044", variables => $variables}); + my $key = "scan_network_alert_0044"; + if ($network_interface_name =~ /^vnet/) + { + # This is a server booting or migrating + $key = "scan_network_alert_0061"; + } + + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => $key, variables => $variables}); $anvil->Alert->register({ alert_level => "notice", - message => "scan_network_alert_0044", + message => $key, variables => $variables, set_by => $THIS_FILE, }); diff --git a/scancore-agents/scan-network/scan-network.xml b/scancore-agents/scan-network/scan-network.xml index eabc4652..dde31e9c 100644 --- a/scancore-agents/scan-network/scan-network.xml +++ b/scancore-agents/scan-network/scan-network.xml @@ -132,6 +132,7 @@ Note: If this is a Storage Network directly connected to the peer, and the peer The bond: [#!variable!name!#] appears to have been stopped or deleted. The last time we saw it, it had transmitted: [#!variable!tx!#] and received: [#!variable!rx!#]. The network interface: [#!variable!name!#] appears to have been removed. The last time we saw it, it had transmitted: [#!variable!tx!#] and received: [#!variable!rx!#]. The IP address: [#!variable!ip!#] appears to no longer be used on this machine. + The network interface: [#!variable!name!#] MAC address has changed from: [#!variable!old!#] to: [#!variable!new!#]. This is normal when a server boots or migrates. Failed to read the network interface speed from the file: [#!variable!file!#]. Ignoring interface. 
diff --git a/share/words.xml b/share/words.xml index 7d62a485..25c5da6f 100644 --- a/share/words.xml +++ b/share/words.xml @@ -1105,6 +1105,12 @@ It should be provisioned in the next minute or two. Power Off Server VM Power off the target server VM by executing a stop script on the first a host within the cluster. + Verifying that corosync is configured to use the SN1 as a fall-back communication channel. + Verifying (and waiting if needed) for the cluster to be up and both BCN1 and SN1 connections to be active. + The cluster is up. + Both the BCN1 and SN1 links are working between the nodes. Checking corosync now... + Synchronizing the new corosync config exited with return code: [#!variable!return_code!#] and output: [#!variable!output!#] + Loading the new corosync config exited with return code: [#!variable!return_code!#] and output: [#!variable!output!#] Starting: [#!variable!program!#]. @@ -1827,7 +1833,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is: ==== #!variable!difference!# ==== - + This system will reboot in: [#!variable!seconds!#] seconds... The bond: [#!variable!bond!#] is completely down, trying to recover member interfaces. The bond: [#!variable!bond!#] is up, but at least one interface is down. Will try to recover now. @@ -1845,6 +1851,14 @@ The file: [#!variable!file!#] needs to be updated. The difference is: Out peer is online, no need to check server location constraints. The server: [#!variable!server!#] has a location constraint that preferres our peer, but our peer is offline. Updating the location constraint to prefer this node. Disabling dual primary for the resource: [#!variable!resource!#] to the node: [#!variable!target_name!# (#!variable!target_node_id!#)]. + The corosync config file is being updated with these differences; +==== +#!variable!difference!# +==== + + Synchronizing corosync config. + Reloading corosync config. + #!variable!program!# is disabled in anvil.conf and '--force' was not used. Exiting. 
The host name: [#!variable!target!#] does not resolve to an IP address.
diff --git a/tools/anvil-join-anvil b/tools/anvil-join-anvil
index 11ddcda2..258e53cf 100755
--- a/tools/anvil-join-anvil
+++ b/tools/anvil-join-anvil
@@ -21,6 +21,7 @@ use warnings;
 use Anvil::Tools;
 use Data::Dumper;
 use String::ShellQuote;
+use Text::Diff;
 
 my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
 my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
@@ -135,7 +136,8 @@ sub update_passwords
 	return(0);
 }
 
-# (wait for out peer and) Configure pacemaker. If this is a DR host, this is skipped.
+
+# (wait for our peer and) Configure pacemaker. If this is a DR host, this is skipped.
 sub configure_pacemaker
 {
 	my ($anvil) = @_;
@@ -1106,6 +1108,330 @@ sub configure_pacemaker
 		}
 	}
 
+	# Update (if needed) corosync.conf to use the BCN1 and SN1 as knet networks.
+	if ($machine eq "node1")
+	{
+		update_progress($anvil, ($anvil->data->{job}{progress} += 1), "job_0344");
+		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0344"});
+
+		check_corosync($anvil);
+	}
+
+	return(0);
+}
+
+# Waits for the cluster to be up and for Remote->test_access to succeed against the peer's '.bcn1' and '.sn1'
+# names, then makes sure corosync.conf uses the BCN1 name as each node's 'ring0_addr' and the SN1 name as its
+# 'ring1_addr', and that the 'totem' section has a 'token:' entry ('token: 10000' is added if none is seen).
+# If the generated config differs from the one read from disk, the file is rewritten (a backup is requested),
+# 'pcs cluster sync' is called and then 'pcs cluster reload corosync' is called.
+#
+# NOTE: This loops, sleeping five seconds between attempts, until the cluster and both links are ready. It
+#       always returns '0'.
+sub check_corosync
+{
+	my ($anvil) = @_;
+
+	update_progress($anvil, ($anvil->data->{job}{progress} += 1), "job_0345");
+	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0345"});
+	my $waiting      = 1;
+	my $anvil_uuid   = $anvil->data->{sys}{anvil_uuid};
+	my $new_password = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_password};
+	while ($waiting)
+	{
+		my $problem         = $anvil->Cluster->parse_cib({debug => 3});
+		my $peer_ready      = $anvil->data->{cib}{parsed}{peer}{ready};
+		my $peer_name       = $anvil->data->{cib}{parsed}{peer}{name};
+		my $peer_short_name = $peer_name;
+		   $peer_short_name =~ s/\..*$//;
+		my $peer_bcn_name   = $peer_short_name.".bcn1";
+		my $peer_sn_name    = $peer_short_name.".sn1";
+		my $local_ready     = $anvil->data->{cib}{parsed}{'local'}{ready};
+		my $local_name      = $anvil->data->{cib}{parsed}{'local'}{name};
+		# The local link names must be derived from the local node's name, not from the peer's name.
+		my $local_short_name = $local_name;
+		   $local_short_name =~ s/\..*$//;
+		my $local_bcn_name   = $local_short_name.".bcn1";
+		my $local_sn_name    = $local_short_name.".sn1";
+		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+			problem          => $problem,
+			peer_ready       => $peer_ready,
+			peer_name        => $peer_name,
+			peer_short_name  => $peer_short_name,
+			peer_bcn_name    => $peer_bcn_name,
+			peer_sn_name     => $peer_sn_name,
+			local_ready      => $local_ready,
+			local_name       => $local_name,
+			local_short_name => $local_short_name,
+			local_bcn_name   => $local_bcn_name,
+			local_sn_name    => $local_sn_name,
+		}});
+		if ((not $problem) && ($peer_ready) && ($local_ready))
+		{
+			update_progress($anvil, $anvil->data->{job}{progress}, "job_0346");
+			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0346"});
+
+			# Verify we can ping the peer on the BCN and SN.
+			my $bcn_access = $anvil->Remote->test_access({
+				target   => $peer_bcn_name,
+				password => $new_password,
+			});
+			my $sn_access = $anvil->Remote->test_access({
+				target   => $peer_sn_name,
+				password => $new_password,
+			});
+			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+				bcn_access => $bcn_access,
+				sn_access  => $sn_access,
+			}});
+			if (($bcn_access) && ($sn_access))
+			{
+				# We're ready!
+				# NOTE: 'progress + 1' is only passed to update_progress(); the stored job progress is not changed.
+				update_progress($anvil, ($anvil->data->{job}{progress}+1), "job_0347");
+				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0347"});
+
+				$waiting = 0;
+				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
+
+				my $in_totem          = 0;
+				my $token_seen        = 0;
+				my $in_nodelist       = 0;
+				my $in_node           = 0;
+				my $ring0_addr        = "";
+				my $ring1_addr        = "";
+				my $in_node_name      = "";
+				my $nodelist_body     = "";
+				my $old_corosync_conf = $anvil->Storage->read_file({file => $anvil->data->{path}{configs}{'corosync.conf'}});
+				my $new_corosync_conf = "";
+				foreach my $line (split/\n/, $old_corosync_conf)
+				{
+					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }});
+					if ($line =~ /totem \{/)
+					{
+						$in_totem = 1;
+						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { in_totem => $in_totem }});
+
+						$new_corosync_conf .= $line."\n";
+						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_corosync_conf => $new_corosync_conf }});
+						next;
+					}
+					if ($line =~ /nodelist \{/)
+					{
+						$in_nodelist = 1;
+						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { in_nodelist => $in_nodelist }});
+
+						$new_corosync_conf .= $line."\n";
+						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_corosync_conf => $new_corosync_conf }});
+						next;
+					}
+
+					if ($in_nodelist)
+					{
+						if ($line =~ /node \{/)
+						{
+							$in_node = 1;
+							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { in_node => $in_node }});
+
+							$new_corosync_conf .= $line."\n";
+							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_corosync_conf => $new_corosync_conf }});
+							next;
+						}
+						if ($in_node)
+						{
+							if ($line =~ /name:(.*)$/)
+							{
+								$in_node_name = $anvil->Words->clean_spaces({string => $1});
+								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { in_node_name => $in_node_name }});
+
+								$nodelist_body .= $line."\n";
+								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { nodelist_body => $nodelist_body }});
+								next;
+							}
+							elsif ($line =~ /ring0_addr:(.*)$/)
+							{
+								$ring0_addr = $anvil->Words->clean_spaces({string => $1});
+								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { ring0_addr => $ring0_addr }});
+
+								$nodelist_body .= $line."\n";
+								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { nodelist_body => $nodelist_body }});
+								next;
+							}
+							elsif ($line =~ /ring1_addr:(.*)$/)
+							{
+								$ring1_addr = $anvil->Words->clean_spaces({string => $1});
+								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { ring1_addr => $ring1_addr }});
+
+								$nodelist_body .= $line."\n";
+								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { nodelist_body => $nodelist_body }});
+								next;
+							}
+							elsif ($line =~ /}/)
+							{
+								$in_node = 0;
+								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { in_node => $in_node }});
+
+								# Is this the local or peer node?
+								my $test_ring0_addr = $peer_bcn_name;
+								my $test_ring1_addr = $peer_sn_name;
+								if (($in_node_name eq $anvil->Get->host_name) or ($in_node_name eq $anvil->Get->short_host_name))
+								{
+									# It's us
+									$test_ring0_addr = $local_bcn_name;
+									$test_ring1_addr = $local_sn_name;
+								}
+								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+									's1:ring0_addr'      => $ring0_addr,
+									's2:test_ring0_addr' => $test_ring0_addr,
+									's3:ring1_addr'      => $ring1_addr,
+									's4:test_ring1_addr' => $test_ring1_addr,
+								}});
+
+								if ((not $ring1_addr) or ($ring0_addr ne $test_ring0_addr) or ($ring1_addr ne $test_ring1_addr))
+								{
+									# Rewrite the nodelist body.
+									my $new_nodelist_body =  " ring0_addr: ".$test_ring0_addr."\n";
+									   $new_nodelist_body .= " ring1_addr: ".$test_ring1_addr."\n";
+									foreach my $nodelist_line (split/\n/, $nodelist_body)
+									{
+										$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { nodelist_line => $nodelist_line }});
+										next if $nodelist_line =~ /ring\d_addr/;
+
+										$new_nodelist_body .= $nodelist_line."\n";
+										$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_nodelist_body => $new_nodelist_body }});
+									}
+
+									$nodelist_body = $new_nodelist_body;
+									$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_nodelist_body => $new_nodelist_body }});
+								}
+
+								$new_corosync_conf .= $nodelist_body;
+								$new_corosync_conf .= $line."\n";
+								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_corosync_conf => $new_corosync_conf }});
+
+								$ring0_addr    = "";
+								$ring1_addr    = "";
+								$in_node_name  = "";
+								$nodelist_body = "";
+								next;
+							}
+							else
+							{
+								# Normal line, stash it.
+								$nodelist_body .= $line."\n";
+								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { nodelist_body => $nodelist_body }});
+								next;
+							}
+						}
+						elsif ($line =~ /}/)
+						{
+							$in_nodelist = 0;
+							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { in_nodelist => $in_nodelist }});
+
+							$new_corosync_conf .= $line."\n";
+							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_corosync_conf => $new_corosync_conf }});
+							next;
+						}
+					}
+					if ($in_totem)
+					{
+						# NOTE: The first '}' seen while in 'totem' is treated as the end of that section.
+						if ($line =~ /}/)
+						{
+							if (not $token_seen)
+							{
+								$new_corosync_conf .= " token: 10000\n";
+								$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_corosync_conf => $new_corosync_conf }});
+							}
+							$new_corosync_conf .= $line."\n";
+							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_corosync_conf => $new_corosync_conf }});
+
+							$in_totem   = 0;
+							$token_seen = 0;
+							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+								in_totem   => $in_totem,
+								token_seen => $token_seen,
+							}});
+							next;
+						}
+						if ($line =~ /token:/)
+						{
+							$token_seen = 1;
+							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { token_seen => $token_seen }});
+						}
+						$new_corosync_conf .= $line."\n";
+						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_corosync_conf => $new_corosync_conf }});
+					}
+					else
+					{
+						# Normal line
+						$new_corosync_conf .= $line."\n";
+						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_corosync_conf => $new_corosync_conf }});
+					}
+				}
+
+				# Take the last new lines of the file bodies.
+				$old_corosync_conf =~ s/\n$//g;
+				$new_corosync_conf =~ s/\n$//g;
+
+				my $difference = diff \$old_corosync_conf, \$new_corosync_conf, { STYLE => 'Unified' };
+				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { difference => $difference }});
+
+				if ($difference)
+				{
+					# Update the corosync.conf, sync it and reload corosync.
+					update_progress($anvil, ($anvil->data->{job}{progress}+1), "log_0643,!!difference!".$difference."!!");
+					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0643", variables => { difference => $difference }});
+					$anvil->Storage->write_file({
+						file      => $anvil->data->{path}{configs}{'corosync.conf'},
+						body      => $new_corosync_conf,
+						overwrite => 1,
+						backup    => 1,
+					});
+
+					# Sync
+					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0644"});
+					my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster sync";
+					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
+
+					# RC 0 is OK
+					my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
+					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+						output      => $output,
+						return_code => $return_code,
+					}});
+					update_progress($anvil, ($anvil->data->{job}{progress}+1), "job_0348,!!return_code!".$return_code."!!,!!output!".$output."!!");
+					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0348", variables => {
+						output      => $output,
+						return_code => $return_code,
+					}});
+
+					# Reload
+					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0645"});
+					$shell_call = $anvil->data->{path}{exe}{pcs}." cluster reload corosync";
+					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
+
+					# RC 0 is OK
+					($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
+					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+						output      => $output,
+						return_code => $return_code,
+					}});
+					update_progress($anvil, ($anvil->data->{job}{progress}+1), "job_0349,!!return_code!".$return_code."!!,!!output!".$output."!!");
+					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0349", variables => {
+						output      => $output,
+						return_code => $return_code,
+					}});
+				}
+			}
+		}
+
+		if ($waiting)
+		{
+			sleep 5;
+		}
+	}
+
 	return(0);
 }