From e036515df32d4b0e2e4778987103597f674087a7 Mon Sep 17 00:00:00 2001 From: Digimer Date: Wed, 14 Apr 2021 00:26:06 -0400 Subject: [PATCH] * Got anvil-safe-start to the point where it starts the cluster stack. Need to create the 'anvil-boot-server' and 'anvil-shutdown-server' before it can be completed, so those files have been added. * Created Cluster->parse_quorum() to check if a node is quorate as 'have-quorum' in the pacemaker CIB doesn't appear to be super accurate during startup. * Fixed a bug in striker-manage-install-target where if a node didn't have any registered IPs, it would break before generating the repo data. * Fixed a bug in anvil-join-anvil where if the database had to be reconnected, the job data was lost. Signed-off-by: Digimer --- Anvil/Tools.pm | 1 + Anvil/Tools/Cluster.pm | 111 ++++++++++++++++- Anvil/Tools/Striker.pm | 12 +- Anvil/Tools/System.pm | 2 +- share/words.xml | 11 ++ tools/Makefile.am | 2 + tools/anvil-boot-server | 47 ++++++++ tools/anvil-join-anvil | 9 +- tools/anvil-safe-start | 178 +++++++++++++++++++++++++++- tools/anvil-shutdown-server | 0 tools/striker-manage-install-target | 14 ++- 11 files changed, 368 insertions(+), 19 deletions(-) create mode 100755 tools/anvil-boot-server create mode 100755 tools/anvil-shutdown-server diff --git a/Anvil/Tools.pm b/Anvil/Tools.pm index 81a0cdc2..ddefb40d 100644 --- a/Anvil/Tools.pm +++ b/Anvil/Tools.pm @@ -1115,6 +1115,7 @@ sub _set_paths 'chown' => "/usr/bin/chown", chronyc => "/usr/bin/chronyc", cibadmin => "/usr/sbin/cibadmin", + 'corosync-quorumtool' => "/usr/sbin/corosync-quorumtool", cp => "/usr/bin/cp", createdb => "/usr/bin/createdb", createrepo_c => "/usr/bin/createrepo_c", diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index 11e707d2..dc0cd45e 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -26,6 +26,8 @@ my $THIS_FILE = "Cluster.pm"; # is_primary # migrate_server # parse_cib +# parse_crm_mon +# parse_quorum # shutdown_server # start_cluster 
# which_node @@ -2261,7 +2263,7 @@ sub parse_cib # call is to determine what resources are running, and where they are running. $anvil->Cluster->parse_crm_mon({ debug => $debug, - password => $anvil->Log->is_secure($password), + password => $password, port => $port, remote_user => $remote_user, target => $target, @@ -2520,6 +2522,113 @@ sub parse_crm_mon } +=head2 parse_quorum + +This parses C<< corosync-quorumtool -s -p >> to check the status of quorum, as it is more reliable than the CIB's C<< have-quorum >> flag. This does not parse out per-node information. + +B<< Note >>: See C<< man corosync-quorumtool >> for details on what these values store. + +If the cluster is down, C<< 1 >> is returned. Otherwise, C<< 0 >> is returned. + +Data is stored as: + quorum::expected-votes + quorum::flags + quorum::nodes + quorum::quorate + quorum::ring_id + quorum::total-votes + +This method takes no parameters. + +=cut +sub parse_quorum +{ + my $self = shift; + my $parameter = shift; + my $anvil = $self->parent; + my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Cluster->parse_quorum()" }}); + + my ($output, $return_code) = $anvil->System->call({debug => $debug, shell_call => $anvil->data->{path}{exe}{'corosync-quorumtool'}." 
-p -s"}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + output => $output, + return_code => $return_code, + }}); + + if ($return_code) + { + # Cluster is down + return(1); + } + else + { + $anvil->data->{quorum}{'expected-votes'} = ""; + $anvil->data->{quorum}{flags} = ""; + $anvil->data->{quorum}{nodes} = ""; + $anvil->data->{quorum}{quorate} = ""; + $anvil->data->{quorum}{ring_id} = ""; + $anvil->data->{quorum}{'total-votes'} = ""; + } + + foreach my $line (split/\n/, $output) + { + $line = $anvil->Words->clean_spaces({string => $line}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { line => $line }}); + + if ($line =~ /Expected votes:\s+(\d+)$/) + { + $anvil->data->{quorum}{'expected-votes'} = $1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "quorum::expected-votes" => $anvil->data->{quorum}{'expected-votes'}, + }}); + next; + } + if ($line =~ /Flags:\s+(.*)$/) + { + $anvil->data->{quorum}{flags} = $1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "quorum::flags" => $anvil->data->{quorum}{flags}, + }}); + next; + } + if ($line =~ /Nodes:\s+(\d+)$/) + { + $anvil->data->{quorum}{nodes} = $1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "quorum::nodes" => $anvil->data->{quorum}{nodes}, + }}); + next; + } + if ($line =~ /Quorate:\s+(.*)$/) + { + $anvil->data->{quorum}{quorate} = lc($1) eq "yes" ? 
1 : 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "quorum::quorate" => $anvil->data->{quorum}{quorate}, + }}); + next; + } + if ($line =~ /Ring ID:\s+(.*)$/) + { + $anvil->data->{quorum}{ring_id} = $1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "quorum::ring_id" => $anvil->data->{quorum}{ring_id}, + }}); + next; + } + if ($line =~ /Total votes:\s+(\d+)$/) + { + $anvil->data->{quorum}{'total-votes'} = $1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "quorum::total-votes" => $anvil->data->{quorum}{'total-votes'}, + }}); + next; + } + } + + return(0); +} + + =head2 shutdown_server This shuts down a server that is running on the Anvil! system. If there is a problem, C<< !!error!! >> is returned. On success, C<< 0 >> is returned. diff --git a/Anvil/Tools/Striker.pm b/Anvil/Tools/Striker.pm index e0998295..bcc9b4c6 100644 --- a/Anvil/Tools/Striker.pm +++ b/Anvil/Tools/Striker.pm @@ -1025,14 +1025,14 @@ WHERE "manifests::manifest_uuid::${manifest_uuid}::parsed::machine::${machine}::ipmi_ip" => $anvil->data->{manifests}{manifest_uuid}{$manifest_uuid}{parsed}{machine}{$machine}{ipmi_ip}, }}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "ref(parsed_xml->{machines}{$machine}{upses}{ups})" => ref($parsed_xml->{machines}{$machine}{upses}{ups}), }}); if (ref($parsed_xml->{machines}{$machine}{upses}{ups}) eq "HASH") { my $ups_name = $parsed_xml->{machines}{$machine}{upses}{ups}{name}; $anvil->data->{manifests}{manifest_uuid}{$manifest_uuid}{parsed}{machine}{$machine}{ups}{$ups_name}{used} = $parsed_xml->{machines}{$machine}{upses}{ups}{used}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level 
=> $debug, list => { "manifests::manifest_uuid::${manifest_uuid}::parsed::machine::${machine}::ups::${ups_name}::used" => $anvil->data->{manifests}{manifest_uuid}{$manifest_uuid}{parsed}{machine}{$machine}{ups}{$ups_name}{used}, }}); } @@ -1042,20 +1042,20 @@ WHERE { my $ups_name = $hash_ref->{name}; $anvil->data->{manifests}{manifest_uuid}{$manifest_uuid}{parsed}{machine}{$machine}{ups}{$ups_name}{used} = $hash_ref->{used}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "manifests::manifest_uuid::${manifest_uuid}::parsed::machine::${machine}::ups::${ups_name}::used" => $anvil->data->{manifests}{manifest_uuid}{$manifest_uuid}{parsed}{machine}{$machine}{ups}{$ups_name}{used}, }}); } } - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "ref(parsed_xml->{machines}{$machine}{fences}{fence})" => ref($parsed_xml->{machines}{$machine}{fences}{fence}), }}); if (ref($parsed_xml->{machines}{$machine}{fences}{fence}) eq "HASH") { my $fence_name = $parsed_xml->{machines}{$machine}{fences}{fence}{name}; $anvil->data->{manifests}{manifest_uuid}{$manifest_uuid}{parsed}{machine}{$machine}{fence}{$fence_name}{port} = $parsed_xml->{machines}{$machine}{fences}{fence}{port}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "manifests::manifest_uuid::${manifest_uuid}::parsed::machine::${machine}::fence::${fence_name}::port" => $anvil->data->{manifests}{manifest_uuid}{$manifest_uuid}{parsed}{machine}{$machine}{fence}{$fence_name}{port}, }}); } @@ -1065,7 +1065,7 @@ WHERE { my $fence_name = $hash_ref->{name}; 
$anvil->data->{manifests}{manifest_uuid}{$manifest_uuid}{parsed}{machine}{$machine}{fence}{$fence_name}{port} = $hash_ref->{port}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "manifests::manifest_uuid::${manifest_uuid}::parsed::machine::${machine}::fence::${fence_name}::port" => $anvil->data->{manifests}{manifest_uuid}{$manifest_uuid}{parsed}{machine}{$machine}{fence}{$fence_name}{port}, }}); } diff --git a/Anvil/Tools/System.pm b/Anvil/Tools/System.pm index 10c79ecb..ef0e2931 100644 --- a/Anvil/Tools/System.pm +++ b/Anvil/Tools/System.pm @@ -4768,7 +4768,7 @@ sub update_hosts $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0481", variables => { old_ip => $current_ip, new_ip => $ip_address, - name => $name, + host => $name, }}); $changes = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { changes => $changes }}); diff --git a/share/words.xml b/share/words.xml index af09938f..3b87929b 100644 --- a/share/words.xml +++ b/share/words.xml @@ -357,6 +357,12 @@ The error was: Failed to read the lvm.conf file. The reason why should be logged above. Failed to write the lvm.conf file. The reason why should be logged above. + +The attempt to start the cluster appears to have failed. The return code '0' was expected, but: [#!variable!return_code!#] was received. The output was: +==== +#!variable!output!# +==== + @@ -1450,6 +1456,11 @@ The file: [#!variable!file!#] needs to be updated. The difference is: Failed to acess over the peer: [#!variable!peer!#] over the network: [#!variable!network!#] via the peer's IP: [#!variable!peer_ip!#]. At least one network connection to the peer: [#!variable!peer!#] is still down. Waiting a bit and then will check again. All connections to the peer: [#!variable!peer!#] are up! 
+ The cluster does not appear to be running, starting it now. + The cluster isn't up yet, waiting a bit before checking again. + We're online as: [#!variable!node_name!#], but we're not quorate yet. Continuing to wait. + We're online as: [#!variable!node_name!#] and quorate! + We're not online yet. Waiting for 'in_ccm/crmd/join': [#!variable!in_ccm!#/#!variable!crmd!#/#!variable!join!#]. ('in_ccm' = consensus cluster member, communication layer. 'crmd' = cluster resource manager daemon is up, 'join' = allowed to host resources). The host name: [#!variable!target!#] does not resolve to an IP address. diff --git a/tools/Makefile.am b/tools/Makefile.am index 1a192b74..d1baa2db 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -5,6 +5,7 @@ EXTRA_DIST = \ watch_drbd dist_sbin_SCRIPTS = \ + anvil-boot-server \ anvil-change-password \ anvil-check-memory \ anvil-configure-host \ @@ -24,6 +25,7 @@ dist_sbin_SCRIPTS = \ anvil-provision-server \ anvil-safe-start \ anvil-scan-network \ + anvil-shutdown-server \ anvil-sync-shared \ anvil-update-issue \ anvil-update-states \ diff --git a/tools/anvil-boot-server b/tools/anvil-boot-server new file mode 100755 index 00000000..9bca8f8b --- /dev/null +++ b/tools/anvil-boot-server @@ -0,0 +1,47 @@ +#!/usr/bin/perl +# +# This program boots a server. It can be called as either a job from the webui or directly from another +# program or a terminal. +# +# Exit codes; +# 0 = Normal exit. +# 1 = No database connection. +# + +use strict; +use warnings; +use Anvil::Tools; + +my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0]; +my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0]; +if (($running_directory =~ /^\./) && ($ENV{PWD})) +{ + $running_directory =~ s/^\./$ENV{PWD}/; +} + +# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete. 
+$| = 1; + +my $anvil = Anvil::Tools->new(); + +$anvil->data->{switches}{'job-uuid'} = ""; +$anvil->Get->switches; +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); + +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'}, +}}); + +$anvil->Database->connect(); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"}); +if (not $anvil->data->{sys}{database}{connections}) +{ + # No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try + # again after we exit. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0077"}); + sleep 10; + $anvil->nice_exit({exit_code => 1}); +} + +# Pick up the job details +load_job_data($anvil); diff --git a/tools/anvil-join-anvil b/tools/anvil-join-anvil index de735fe3..8546fd44 100755 --- a/tools/anvil-join-anvil +++ b/tools/anvil-join-anvil @@ -146,7 +146,7 @@ sub configure_pacemaker ### TODO: Move these to variables in the 'sys' hash my $anvil_name = $anvil->data->{manifests}{manifest_uuid}{$manifest_uuid}{parsed}{name}; my $anvil_uuid = $anvil->data->{sys}{anvil_uuid}; - my $host_name = $anvil->data->{sys}{host_name}; + my $host_name = $anvil->Get->host_name; my $new_password = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_password}; my $node1_host_uuid = $anvil->data->{sys}{node1_host_uuid} = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; my $node1_host_name = $anvil->data->{hosts}{host_uuid}{$node1_host_uuid}{host_name}; @@ -157,7 +157,7 @@ sub configure_pacemaker my $peer_host_name = $anvil->Get->host_uuid() eq $node1_host_uuid ? $node2_host_name : $node1_host_name; my $peer_host_uuid = $anvil->Get->host_uuid() eq $node1_host_uuid ? 
$node2_host_uuid : $node1_host_uuid; my $escaped_password = shell_quote($new_password); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { machine => $machine, anvil_uuid => $anvil_uuid, anvil_name => $anvil_name, @@ -333,7 +333,7 @@ sub configure_pacemaker { # Try to authenticate against the peer. my $auth_shell_call = $anvil->data->{path}{exe}{pcs}." host auth ".$node1_host_name." ".$node2_host_name." -u hacluster -p ".$escaped_password; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, secure => 1, list => { auth_shell_call => $auth_shell_call }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, secure => 1, list => { auth_shell_call => $auth_shell_call }}); my ($output, $return_code) = $anvil->System->call({debug => 3, secure => 1, shell_call => $auth_shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { @@ -1665,6 +1665,9 @@ sub check_local_network # No databases, sleep and then try again. sleep 2; } + + # reload the job data. + load_job($anvil); } $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, 'print' => 1, key => "job_0084"}); diff --git a/tools/anvil-safe-start b/tools/anvil-safe-start index b7e8a496..8aea60ef 100755 --- a/tools/anvil-safe-start +++ b/tools/anvil-safe-start @@ -13,6 +13,7 @@ # # TODO: # - Make this work on DR hosts. +# - 'pcs quorum unblock' could be useful in sole-survivor cold starts. 
# use strict; @@ -33,7 +34,7 @@ $| = 1; my $anvil = Anvil::Tools->new(); $anvil->Get->switches; -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); # Make sure we're running as 'root' # $< == real UID, $> == effective UID @@ -55,7 +56,6 @@ $anvil->data->{switches}{force} = ""; $anvil->data->{switches}{'local'} = ""; $anvil->data->{switches}{status} = ""; $anvil->Get->switches; -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); # If I have no databases, sleep until I do @@ -75,7 +75,7 @@ if (not $anvil->data->{sys}{database}{connections}) if (not $anvil->data->{sys}{database}{connections}) { # Keep waiting - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 3, secure => 0, key => "log_0439"}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, secure => 0, key => "log_0439"}); } } } @@ -95,6 +95,12 @@ prerun_checks($anvil); # networks. There is no timeout. wait_for_access($anvil); +# Start pacemaker now. +start_pacemaker($anvil); + +# Boot servers. +boot_servers($anvil); + $anvil->nice_exit({exit_code => 0}); @@ -102,6 +108,170 @@ $anvil->nice_exit({exit_code => 0}); # Functions # ############################################################################################################# +# This boots the servers. +sub boot_servers +{ + my ($anvil) = @_; + + ### TODO: We need to handle boot ordering, once the WebUI is at that stage. For now, blind-boot all + ### servers. + + + return(0); +} + +# Start pacemaker and wait until we're quorate. 
+sub start_pacemaker +{ + my ($anvil) = @_; + + my $anvil_uuid = $anvil->data->{sys}{anvil_uuid}; + my $host_uuid = $anvil->Get->host_uuid(); + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $peer_host_uuid = $anvil->data->{sys}{peer_host_uuid}; + my $peer_short_host_name = $anvil->data->{hosts}{host_uuid}{$peer_host_uuid}{short_host_name}; + my $fenced_peer = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + anvil_uuid => $anvil_uuid, + host_uuid => $host_uuid, + short_host_name => $short_host_name, + peer_host_uuid => $peer_host_uuid, + peer_short_host_name => $peer_short_host_name, + }}); + + # Is pacemaker already running? + my ($problem) = $anvil->Cluster->parse_cib({debug => 3}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + + if ($problem) + { + # Nope, start it. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0608"}); + + ### TODO: A lot more testing is needed for degraded single-node start later. + #my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start"; + my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + # What?! Fail out, we're done. 
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0256", variables => { + output => $output, + return_code => $return_code, + }}); + $anvil->nice_exit({exit_code => 1}); + } + + ### TODO: We may implement the logic to fence our peer (similar to cman's 'post_join_delay' + ### logic) at a later time. For now, we'll wait forever for this to exit. This is why + ### we set 'wait_for_peer', even though it's not used yet. + # Now wait up to two minutes for the cluster to start. If it's not up by then, we'll fence + # the peer and, if the fence succeeds, unblock quorum. + my $start_time = time; + my $wait_for_peer = $start_time + 120; + my $waiting = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + start_time => $start_time, + wait_for_peer => $wait_for_peer, + }}); + while ($waiting) + { + $waiting = 0; + my ($problem) = $anvil->Cluster->parse_cib({debug => 3}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + if ($problem) + { + # Can't parse the CIB yet, wait. + $waiting = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + else + { + # Quorum, as reported in the CIB, sets 'have-quorum' to '1' as soon as it + # starts, then retracts it. For this reason, we use 'parse_quorum()' to get + # the quorum directly from corosync/votequorum. + my ($problem) = $anvil->Cluster->parse_quorum({debug => 2}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + if ($problem) + { + # Corosync is down. + $waiting = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + else + { + ### NOTE: We don't worry about maintenance mode yet, as it shouldn't + ### apply, but we may change that view later. + # See where we are. 
+ my $node_name = $anvil->data->{cib}{parsed}{'local'}{name}; + my $maintenance_mode = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{'maintenance-mode'}; + my $in_ccm = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{in_ccm}; + my $crmd = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{crmd}; + my $join = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{'join'}; + my $ready = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{ready}; + my $quorate = $anvil->data->{quorum}{quorate}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:node_name' => $node_name, + 's2:maintenance_mode' => $maintenance_mode, + 's3:in_ccm/crmd/join' => $in_ccm."/".$crmd."/".$join, + 's4:ready' => $ready, + 's5:quorate' => $quorate, + }}); + + # Are we online? + if ($ready) + { + # We're ready, but do we have quorum? + if ($quorate) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0611", variables => { node_name => $node_name }}); + } + else + { + # Nope + $waiting = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + + # Keep waiting, or fence the peer? + if (time > $wait_for_peer) + { + ### TODO: See above, not implemented yet. + # Time to fence. + } + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0610", variables => { node_name => $node_name }}); + } + } + else + { + # Not ready yet. 
+ $waiting = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0612", variables => { + node_name => $node_name, + in_ccm => $in_ccm, + crmd => $crmd, + 'join' => $join, + }}); + } + } + } + + if ($waiting) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0609"}); + sleep 5; + } + } + } + + return(0); +} + # Check for which networks we have and verify that we can ping our peer on each. This function will not # return until all networks are up. sub wait_for_access @@ -274,9 +444,11 @@ sub prerun_checks node2_host_uuid => $node2_host_uuid, }}); + $anvil->data->{sys}{anvil_uuid} = $anvil_uuid; $anvil->data->{sys}{peer_host_uuid} = $host_uuid eq $node1_host_uuid ? $node2_host_uuid : $node1_host_uuid; $anvil->data->{sys}{peer_password} = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_password}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "sys::anvil_uuid" => $anvil->data->{sys}{anvil_uuid}, "sys::peer_host_uuid" => $anvil->data->{sys}{peer_host_uuid}, "sys::peer_password" => $anvil->Log->is_secure($anvil->data->{sys}{peer_password}), }}); diff --git a/tools/anvil-shutdown-server b/tools/anvil-shutdown-server new file mode 100755 index 00000000..e69de29b diff --git a/tools/striker-manage-install-target b/tools/striker-manage-install-target index e3de7f8b..33e05a5c 100755 --- a/tools/striker-manage-install-target +++ b/tools/striker-manage-install-target @@ -1245,21 +1245,25 @@ ORDER BY short_host_name => $short_host_name, }}); $anvil->Network->load_ips({ - debug => 3, + debug => 2, host_uuid => $host_uuid, host => $short_host_name, }); my $access = 0; my ($match) = $anvil->Network->find_matches({ - debug => 3, + debug => 2, first => $local_short_host_name, second 
=> $short_host_name, }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { match => $match }}); + if (ref($match) eq "HASH") + { + my $keys = keys %{$match}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'keys' => $keys }}); + } - my $keys = keys %{$match}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'keys' => $keys }}); - if ($keys) + if (ref($match) eq "HASH") { foreach my $interface (sort {$a cmp $b} keys %{$match->{$short_host_name}}) {