From 5db09f565ddf07181e24bf7c13452fed9691af9c Mon Sep 17 00:00:00 2001 From: Digimer Date: Wed, 10 Mar 2021 14:14:04 -0500 Subject: [PATCH] * Updated anvil-join-anvil to actively call a cluster start once per minute while waiting for initial startup. * Added a check to striker-initialize-host to see if anvil-X RPM is already installed. If so, it will not install the Alteeve repo, even if it's not found. Signed-off-by: Digimer --- share/words.xml | 4 +- tools/anvil-join-anvil | 18 +++++++ tools/striker-initialize-host | 89 +++++++++++++++++++++++++++-------- 3 files changed, 90 insertions(+), 21 deletions(-) diff --git a/share/words.xml b/share/words.xml index 45c555e1..02d9f9d8 100644 --- a/share/words.xml +++ b/share/words.xml @@ -549,7 +549,7 @@ Failure! The return code: [#!variable!return_code!#] was received ('0' was expec Starting the cluster (on both nodes) now. We're node 2, so we will wait until the peer starts the cluster. Both nodes are up! - Still waiting. Node 1: [#!variable!node1_name!#] ready: [#!variable!node1_ready!#] (in_ccm/crmd/join: [#!variable!node1_in_ccm!#/#!variable!node1_crmd!#/#!variable!node1_join!#]), Node 2: [#!variable!node2_name!#] ready: [#!variable!node1_ready!#] (in_ccm/crmd/join: [#!variable!node2_in_ccm!#/#!variable!node2_crmd!#/#!variable!node2_join!#]) + Still waiting. Node 1: [#!variable!node1_name!#] ready: [#!variable!node1_ready!#] (in_ccm/crmd/join: [#!variable!node1_in_ccm!#/#!variable!node1_crmd!#/#!variable!node1_join!#]), Node 2: [#!variable!node2_name!#] ready: [#!variable!node2_ready!#] (in_ccm/crmd/join: [#!variable!node2_in_ccm!#/#!variable!node2_crmd!#/#!variable!node2_join!#]) Cluster hasn't started, calling local start. Corosync is not yet configured, waiting. It will be created when node 1 initializes the cluster. Corosync is configured. Will wait for the cluster to start. If it hasn't started in two minutes, we'll try to join it. @@ -730,6 +730,7 @@ It should be provisioned in the next minute or two. 
One or more machines are not yet accessible on the first BCN. Will check again in a moment. All machines are now available on the first BCN! One of the Striker dashboards has not yet updated network information in the database. We need this to know which IP to tell the peer to use to connect to us. We'll wait a moment and check again. + The cluster still hasn't started. Calling startup again (will try once per minute). Starting: [#!variable!program!#]. @@ -2316,6 +2317,7 @@ Read UUID: .... [#!variable!read_uuid!#] [ Warning ] - We were asked to insert or update a host with the name: [#!variable!host_name!#]. Another host: [#!variable!host_uuid!#] has the same name, which could be a failed node that is being replaced. We're going to set it's 'host_key' to 'DELETED'. If this warning is logged only once, and after a machine is replaced, it's safe to ignore. If this warning is repeatedly being logged, then there are two active machines with the same host name, and that needs to be fixed. [ Warning ] - It looks like the postfix daemon is not running. Enabling and starting it now. [ Warning ] - Checking the mail queue after attempting to start postgres appears to have still failed. Output received was: [#!variable!output!#]. + [ Warning ] - Not installing the Alteeve repo! The package: [#!variable!anvil_role_rpm!#] is already installed. This is OK, but be aware that updates from Alteeve will not be available. To change this, please install: [#!variable!alteeve_repo!#]. diff --git a/tools/anvil-join-anvil b/tools/anvil-join-anvil index cc5ad56e..098d4c4d 100755 --- a/tools/anvil-join-anvil +++ b/tools/anvil-join-anvil @@ -394,6 +394,7 @@ sub configure_pacemaker # Now wait for both nodes to come online. update_progress($anvil, ($anvil->data->{job}{progress} += 2), "job_0109"); my $both_online = 0; + my $start_again = time + 60; until ($both_online) { ### TODO: If we're waiting more that five minutes, call 'pcs cluster start --all' again. 
@@ -433,6 +434,23 @@ sub configure_pacemaker }}); } } + if (time > $start_again) + { + # Call cluster start again. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0272"}); + $start_again = time + 60; + my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start --all"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + start_again => $start_again, + shell_call => $shell_call, + }}); + + my ($output, $return_code) = $anvil->System->call({debug => 3, shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } sleep 5 if not $both_online; } diff --git a/tools/striker-initialize-host b/tools/striker-initialize-host index 9884bdb7..a1caeb04 100755 --- a/tools/striker-initialize-host +++ b/tools/striker-initialize-host @@ -667,35 +667,84 @@ EOF return_code => $return_code, }}); + # In the CI, we'll have custom repos installed. So here we're looking to see if 'anvil-X' is already + # installed. If so, we won't add our repo. + my $anvil_role_rpm = ""; + undef $output; + undef $error; + undef $return_code; + undef $shell_call; + $shell_call = $anvil->data->{path}{exe}{'dnf'}." 
list installed"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + ($output, $error, $return_code) = $anvil->Remote->call({ + debug => 3, + shell_call => $shell_call, + password => $anvil->data->{data}{password}, + port => $anvil->data->{data}{ssh_port}, + target => $anvil->data->{data}{host_ip_address}, + remote_user => "root", + timeout => 300, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + foreach my $line (split/\n/, $output) + { + $line =~ s/\s.*$//; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); + + next if $line =~ /anvil-core/; + if ($line =~ /anvil-(.*).noarch/) + { + $anvil_role_rpm = $1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { anvil_role_rpm => $anvil_role_rpm }}); + last; + } + } + # Install the Alteeve repo, if possible. There may be no Internet access, so it's OK if this fails. if (not -e $anvil->data->{path}{config}{'alteeve-el8.repo'}) { - my ($alteeve_access) = $anvil->Network->check_internet({ - debug => 2, - domains => ["alteeve.com"], - password => $anvil->data->{data}{password}, - port => $anvil->data->{data}{ssh_port}, - target => $anvil->data->{data}{host_ip_address}, - remote_user => "root", - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { alteeve_access => $alteeve_access }}); - if ($alteeve_access) + if ($anvil_role_rpm) { - $shell_call = $anvil->data->{path}{exe}{'dnf'}." 
-y install ".$anvil->data->{path}{urls}{alteeve_repo}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - ($output, $error, $return_code) = $anvil->Remote->call({ - debug => 3, - shell_call => $shell_call, + # There's already an anvil RPM installed, so we're going to skip installing the repo. + # Warn the user though. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, 'print' => 1, key => "job_0042", variables => { + anvil_role_rpm => $anvil_role_rpm, + alteeve_repo => $anvil->data->{path}{urls}{alteeve_repo}, + }}); + } + else + { + my ($alteeve_access) = $anvil->Network->check_internet({ + debug => 2, + domains => ["alteeve.com"], password => $anvil->data->{data}{password}, port => $anvil->data->{data}{ssh_port}, target => $anvil->data->{data}{host_ip_address}, remote_user => "root", }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - error => $error, - return_code => $return_code, - }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { alteeve_access => $alteeve_access }}); + if ($alteeve_access) + { + $shell_call = $anvil->data->{path}{exe}{'dnf'}." -y install ".$anvil->data->{path}{urls}{alteeve_repo}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + ($output, $error, $return_code) = $anvil->Remote->call({ + debug => 3, + shell_call => $shell_call, + password => $anvil->data->{data}{password}, + port => $anvil->data->{data}{ssh_port}, + target => $anvil->data->{data}{host_ip_address}, + remote_user => "root", + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } } }