#!/usr/bin/perl
#
# This does boot-time sanity checks on nodes and then, if all is well, joins the cluster and boots servers.
#
# NOTE: Unlike M2, this is controlled by scancore's start, but only if scancore starts up within ten minutes
#       of the node itself booting. This way, stopping/starting scancore won't call us repeatedly. This tool
#       is enabled or disabled via the 'tool::anvil-safe-start::enabled' variable tied to the 'hosts' ->
#       'host_uuid' table.
#
# Exit codes;
# 0 = Normal exit.
# 1 = Any problem that causes an early exit.
#
# TODO:
# - Add job support
# - Make this work on DR hosts.
# - 'pcs quorum unblock' could be useful in sole-survivor cold starts.
#

use strict;
use warnings;
use Anvil::Tools;
use NetAddr::IP;
require POSIX;

# Work out our own program name and the directory we were run from. If '$0' has no directory component
# (ie: we were called as 'perl anvil-safe-start'), the match fails, so we fall back to '$0' itself and to
# the current directory. Without this, '$THIS_FILE' would be undefined and both the log 'source' and the
# later "is another copy running" check would be handed an undefined program name.
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
if ((not defined $THIS_FILE) or ($THIS_FILE eq "")) {
	$THIS_FILE = $0;
}
# The program name is quoted with \Q..\E so that any regex metacharacters in it are matched literally.
my $running_directory = ($0 =~ /^(.*?)\/\Q$THIS_FILE\E$/)[0];
if (not defined $running_directory) {
	$running_directory = ".";
}
if (($running_directory =~ /^\./) && ($ENV{PWD})) {
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $anvil = Anvil::Tools->new();
$anvil->Get->switches;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});

# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0)) {
	# Not root
	print $anvil->Words->string({key => "error_0005"})."\n";
	$anvil->nice_exit({exit_code => 1});
}

# Connect to the database(s). If there are no connections yet, the loop further down waits until there is
# one; we do not proceed without a database.
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});

# Set the default (empty) value for every switch we understand, then (re)read the command line so that
# any switch the user actually passed overrides the default.
$anvil->data->{switches}{'job-uuid'} = "";
$anvil->data->{switches}{disable}    = "";
$anvil->data->{switches}{enable}     = "";
$anvil->data->{switches}{force}      = "";
$anvil->data->{switches}{'local'}    = "";
$anvil->data->{switches}{status}     = "";
$anvil->Get->switches;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
	'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'},
	'switches::disable'  => $anvil->data->{switches}{disable},
	'switches::enable'   => $anvil->data->{switches}{enable},
	'switches::force'    => $anvil->data->{switches}{force},
	'switches::local'    => $anvil->data->{switches}{'local'},
	'switches::status'   => $anvil->data->{switches}{status},
}});

# If I have no databases, sleep until I do
if (not $anvil->data->{sys}{database}{connections}) {
	# Log the problem once, then retry the connection every ten seconds until we have one. There is no
	# timeout on this wait.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "error_0075"});

	until ($anvil->data->{sys}{database}{connections}) {
		sleep 10;

		$anvil->refresh();
		$anvil->Database->connect();
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
		if (not $anvil->data->{sys}{database}{connections}) {
			# Keep waiting
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, secure => 0, key => "log_0439"});
		}
	}
}

### Process
# 1. Check if I am enabled and that no other copies are running.
# 2. Can I ping my peer on all three networks? Loop until true.
#    - Wait here indefinitely
# 3. ...
# 6. Using Start Groups/Delays (and ignoring 'clean' off VMs), boot servers.

# Check to see if we should run. Also checks/sets enable/disable requests.
prerun_checks($anvil);

# Wait until I can ping the peer on all three networks. This will not return until access is available on all
# networks. There is no timeout.
wait_for_access($anvil);

# Start pacemaker now.
start_pacemaker($anvil);

# Boot servers.
boot_servers($anvil);

$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0281"});
$anvil->nice_exit({exit_code => 0});


#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################

# This boots the servers.
#
# It runs 'anvil-boot-server --server all' via System->call and logs the output and return code. If the
# call returns a non-zero code, the failure is logged and the program exits with code '1'. Otherwise, '0'
# is returned.
#
# Parameters;
# - $anvil; The Anvil::Tools handle.
sub boot_servers {
	my ($anvil) = @_;

	# Call 'anvil-boot-server --server all' to boot the servers now.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0614"});
	my $shell_call = $anvil->data->{path}{exe}{'anvil-boot-server'}." --server all";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});

	my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		output      => $output,
		return_code => $return_code,
	}});
	if ($return_code) {
		# What?! Fail out, we're done.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0275", variables => {
			output      => $output,
			return_code => $return_code,
		}});
		$anvil->nice_exit({exit_code => 1});
	}

	return(0);
}

# Start pacemaker and wait until we're quorate.
sub start_pacemaker {
	# If the CIB can't be parsed (pacemaker isn't running), this calls 'pcs cluster start' and then polls
	# every five seconds until the CIB parses, corosync reports quorum data, the local node is 'ready'
	# and the cluster is quorate. There is no timeout. If the CIB parses on the first try, nothing is
	# started and no waiting is done. If 'pcs cluster start' fails, the program exits with code '1'.
	# Returns '0' otherwise.
	#
	# Parameters;
	# - $anvil; The Anvil::Tools handle. 'sys::anvil_uuid' and 'sys::peer_host_uuid' are read here (they
	#           are set in prerun_checks()).
	my ($anvil) = @_;

	# These are only used for logging at this time.
	my $anvil_uuid           = $anvil->data->{sys}{anvil_uuid};
	my $host_uuid            = $anvil->Get->host_uuid();
	my $short_host_name      = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
	my $peer_host_uuid       = $anvil->data->{sys}{peer_host_uuid};
	my $peer_short_host_name = $anvil->data->{hosts}{host_uuid}{$peer_host_uuid}{short_host_name};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		anvil_uuid           => $anvil_uuid,
		host_uuid            => $host_uuid,
		short_host_name      => $short_host_name,
		peer_host_uuid       => $peer_host_uuid,
		peer_short_host_name => $peer_short_host_name,
	}});

	# Is pacemaker already running?
	my ($problem) = $anvil->Cluster->parse_cib({debug => 3});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
	if ($problem) {
		# Nope, start it.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0608"});

		### TODO: A lot more testing is needed for degraded single-node start later.
		###       Should we use --all, or wait for our peer? For now, we wait.
		#my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start --all";
		my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});

		my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			output      => $output,
			return_code => $return_code,
		}});
		if ($return_code) {
			# What?! Fail out, we're done.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0256", variables => {
				output      => $output,
				return_code => $return_code,
			}});
			$anvil->nice_exit({exit_code => 1});
		}

		### TODO: We may implement the logic to fence our peer (similar to cman's 'post_join_delay'
		###       logic) at a later time. For now, we'll wait forever for this to exit. This is why
		###       we set 'wait_for_peer', even though it's not used yet.
		# The plan is to wait up to two minutes for the cluster to start and, if it's not up by then,
		# fence the peer and, if the fence succeeds, unblock quorum. The fence step is NOT implemented
		# yet, so in practice the loop below waits without a timeout.
		my $start_time    = time;
		my $wait_for_peer = $start_time + 120;
		my $waiting       = 1;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			start_time    => $start_time,
			wait_for_peer => $wait_for_peer,
		}});
		while ($waiting) {
			# Assume we're done; any check below that isn't satisfied sets this back to '1'.
			$waiting = 0;
			my ($cib_problem) = $anvil->Cluster->parse_cib({debug => 3});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $cib_problem }});
			if ($cib_problem) {
				# Can't parse the CIB yet, wait.
				$waiting = 1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
			}
			else {
				# Quorum, as reported in the CIB, sets 'have-quorum' to '1' as soon as it
				# starts, then retracts it. For this reason, we use 'parse_quorum()' to get
				# the quorum directly from corosync/votequorum.
				my ($quorum_problem) = $anvil->Cluster->parse_quorum({debug => 2});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $quorum_problem }});
				if ($quorum_problem) {
					# Corosync is down.
					$waiting = 1;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
				}
				else {
					### NOTE: We don't worry about maintenance mode yet, as it shouldn't
					###       apply, but we may change that view later.
					# See where we are.
					my $node_name        = $anvil->data->{cib}{parsed}{'local'}{name};
					my $maintenance_mode = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{'maintenance-mode'};
					my $in_ccm           = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{in_ccm};
					my $crmd             = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{crmd};
					my $join             = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{'join'};
					my $ready            = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{ready};
					my $quorate          = $anvil->data->{quorum}{quorate};
					# Any of the state values may not be set yet, so undefined values are shown
					# as empty strings rather than triggering 'uninitialized' warnings.
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						's1:node_name'        => $node_name,
						's2:maintenance_mode' => $maintenance_mode,
						's3:in_ccm/crmd/join' => join("/", map { defined $_ ? $_ : "" } ($in_ccm, $crmd, $join)),
						's4:ready'            => $ready,
						's5:quorate'          => $quorate,
					}});

					# Are we online?
					if ($ready) {
						# We're ready, but do we have quorum?
						if ($quorate) {
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0611", variables => { node_name => $node_name }});
						}
						else {
							# Nope
							$waiting = 1;
							$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});

							# Keep waiting, or fence the peer?
							if (time > $wait_for_peer) {
								### TODO: See above, not implemented yet. Do we want to do this? If so:
								# Time to fence. Use 'pcs stonith fence <peer>', verify it succeeded,
								# then do 'pcs quorum unblock --force' to finish startup.
							}
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0610", variables => { node_name => $node_name }});
						}
					}
					else {
						# Not ready yet.
						$waiting = 1;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0612", variables => {
							node_name => $node_name,
							in_ccm    => $in_ccm,
							crmd      => $crmd,
							'join'    => $join,
						}});
					}
				}
			}
			if ($waiting) {
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0609"});
				sleep 5;
			}
		}
	}

	return(0);
}

# Check for which networks we have and verify that we can ping our peer on each. This function will not
# return until all networks are up.
#
# For each local interface whose name starts with 'bcn', 'sn' or 'ifn', the peer's interfaces are searched
# for one with the same subnet mask whose address is within our network. The first match is tested with
# Remote->test_access(). If any test fails, we sleep five seconds, reload the IPs and test again. There is
# no timeout. Returns '0' once a full pass completes with no failed test.
#
# Parameters;
# - $anvil; The Anvil::Tools handle. 'sys::peer_host_uuid' and 'sys::peer_password' are read here (they
#           are set in prerun_checks()).
sub wait_for_access {
	my ($anvil) = @_;

	my $host_uuid            = $anvil->Get->host_uuid();
	my $short_host_name      = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
	my $peer_host_uuid       = $anvil->data->{sys}{peer_host_uuid};
	my $peer_short_host_name = $anvil->data->{hosts}{host_uuid}{$peer_host_uuid}{short_host_name};
	my $peer_password        = $anvil->data->{sys}{peer_password};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		host_uuid            => $host_uuid,
		short_host_name      => $short_host_name,
		peer_host_uuid       => $peer_host_uuid,
		peer_short_host_name => $peer_short_host_name,
		peer_password        => $anvil->Log->is_secure($peer_password),
	}});

	### NOTE(review): If no local interface matches a peer interface at all (ie: no IPs were loaded), no
	###               test runs, nothing sets 'waiting' and this returns as if all networks were up.
	###               Confirm whether that is the desired behaviour.
	my $waiting = 1;
	while ($waiting) {
		# This will get set back to '1' if any network fails its access test.
		$waiting = 0;

		# Load IPs (again, to catch changes that might be delaying startup)
		$anvil->Network->load_ips({
			clear     => 1,
			host      => $short_host_name,
			host_uuid => $host_uuid,
		});
		$anvil->Network->load_ips({
			clear     => 1,
			host      => $peer_short_host_name,
			host_uuid => $peer_host_uuid,
		});

		# Loop through our interfaces and then loop our peers. Test access over them and set
		# 'waiting' back to '1' if the connection fails.
		foreach my $interface (sort {$a cmp $b} keys %{$anvil->data->{network}{$short_host_name}{interface}}) {
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				interface => $interface,
				waiting   => $waiting,
			}});

			# Once one network has failed, skip the rest of this pass; everything is retested
			# after the sleep below.
			next if $waiting;

			# Only care about our networks.
			if (($interface !~ /^bcn/) && ($interface !~ /^sn/) && ($interface !~ /^ifn/)) {
				# Not an interface we care about
				next;
			}

			# The network name is the part before the first underscore. It is only used in log
			# messages, so fall back to the full interface name if there is no underscore.
			my $this_network = ($interface =~ /^(.*?)_/)[0];
			if (not defined $this_network) {
				$this_network = $interface;
			}
			my $ip_address  = $anvil->data->{network}{$short_host_name}{interface}{$interface}{ip};
			my $subnet_mask = $anvil->data->{network}{$short_host_name}{interface}{$interface}{subnet_mask};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				's1:this_network' => $this_network,
				's2:ip_address'   => $ip_address,
				's3:subnet_mask'  => $subnet_mask,
			}});

			# Without both an IP and a subnet mask there is no network to compare against.
			next if ((not defined $ip_address) or (not defined $subnet_mask));

			### NOTE: I know I could match interface names, but that's not certain enough. It's
			###       possible (if unlikely) that the network name+number differs on our peer. So
			###       this is safer.
			# Loop through my peer's interfaces and see if we're sharing this one.
			# NetAddr::IP->new() returns 'undef' if it can't parse the address, so make sure we
			# got an object before using it.
			my $local_network = NetAddr::IP->new($ip_address."/".$subnet_mask);
			next if not defined $local_network;

			my $peer_match_found = 0;
			foreach my $peer_interface (sort {$a cmp $b} keys %{$anvil->data->{network}{$peer_short_host_name}{interface}}) {
				last if $peer_match_found;
				my $peer_ip_address  = $anvil->data->{network}{$peer_short_host_name}{interface}{$peer_interface}{ip};
				my $peer_subnet_mask = $anvil->data->{network}{$peer_short_host_name}{interface}{$peer_interface}{subnet_mask};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					peer_interface   => $peer_interface,
					peer_ip_address  => $peer_ip_address,
					peer_subnet_mask => $peer_subnet_mask,
				}});

				# A peer interface without an IP or subnet mask can't be a match.
				next if ((not defined $peer_ip_address) or (not defined $peer_subnet_mask));

				# This the matching network?
				next if $subnet_mask ne $peer_subnet_mask;

				# As above, skip this peer interface if the address can't be parsed instead of
				# dying on a method call against 'undef'.
				my $peer_network = NetAddr::IP->new($peer_ip_address."/".$peer_subnet_mask);
				next if not defined $peer_network;

				if ($peer_network->within($local_network)) {
					# Match, test access.
					$peer_match_found = 1;
					my $access = $anvil->Remote->test_access({
						target   => $peer_ip_address,
						password => $peer_password,
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }});
					if ($access) {
						# This network is good.
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0604", variables => {
							peer    => $peer_short_host_name,
							network => $this_network,
							peer_ip => $peer_ip_address,
						}});
					}
					else {
						# No access, wait and try it again.
						$waiting = 1;
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0605", variables => {
							peer    => $peer_short_host_name,
							network => $this_network,
							peer_ip => $peer_ip_address,
						}});
					}
				}
			}
		}

		if ($waiting) {
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0606", variables => { peer => $peer_short_host_name }});
			sleep 5;
		}
	}

	# All networks are up.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0607", variables => { peer => $peer_short_host_name }});

	return(0);
}

# This checks to verify that we're a node, and if so, if this tool is enabled. If it's disabled or this isn't
# a node, this method will exit.
#
# In order, this;
# - Exits '0' if this host's type isn't 'node', or if the node isn't in an Anvil! yet.
# - Records 'sys::anvil_uuid', 'sys::peer_host_uuid' and 'sys::peer_password' for the other functions.
# - If '--enable' or '--disable' was used, updates 'tool::anvil-safe-start::enabled' for this host (and
#   for the peer, unless '--local' was used) and exits '0'.
# - If '--status' was used, reports which nodes are enabled and exits '0'.
# - Exits '0' if another copy of this program is running.
# - Exits '0' if this node is disabled, unless '--force' was used.
# Returns '0' if the caller should proceed.
#
# Parameters;
# - $anvil; The Anvil::Tools handle.
sub prerun_checks {
	my ($anvil) = @_;

	$anvil->Database->get_hosts();
	$anvil->Database->get_anvils();

	my $host_uuid = $anvil->Get->host_uuid();
	my $host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		host_uuid => $host_uuid,
		host_type => $host_type,
	}});

	# An unknown (undefined) host type is treated the same as "not a node".
	if ((not defined $host_type) or ($host_type ne "node")) {
		# We're done.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0598"});
		$anvil->nice_exit({exit_code => 0});
	}

	my $anvil_uuid = $anvil->Cluster->get_anvil_uuid();
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { anvil_uuid => $anvil_uuid }});
	if (not $anvil_uuid) {
		# This is a node, but not in an Anvil! yet.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0603"});
		$anvil->nice_exit({exit_code => 0});
	}

	# Whichever of the two nodes isn't us is our peer.
	my $node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid};
	my $node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		node1_host_uuid => $node1_host_uuid,
		node2_host_uuid => $node2_host_uuid,
	}});

	$anvil->data->{sys}{anvil_uuid}     = $anvil_uuid;
	$anvil->data->{sys}{peer_host_uuid} = $host_uuid eq $node1_host_uuid ? $node2_host_uuid : $node1_host_uuid;
	$anvil->data->{sys}{peer_password}  = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_password};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		"sys::anvil_uuid"     => $anvil->data->{sys}{anvil_uuid},
		"sys::peer_host_uuid" => $anvil->data->{sys}{peer_host_uuid},
		"sys::peer_password"  => $anvil->Log->is_secure($anvil->data->{sys}{peer_password}),
	}});

	# Are we being asked to enable or disable?
	my $nodes   = [$host_uuid];
	my $set_to  = 1;
	my $message = "";
	if ($anvil->data->{switches}{enable}) {
		# We're enabling, which message will we use?
		$message = $anvil->data->{switches}{'local'} ? "log_0599" : "log_0600";
	}
	elsif ($anvil->data->{switches}{disable}) {
		# We're disabling. Which message?
		$set_to  = 0;
		$message = $anvil->data->{switches}{'local'} ? "log_0601" : "log_0602";
	}

	# If we're updating the settings, do so and then exit.
	if ($message) {
		if (not $anvil->data->{switches}{'local'}) {
			# Add our peer as well.
			push @{$nodes}, $anvil->data->{sys}{peer_host_uuid};
		}
		foreach my $target_host_uuid (@{$nodes}) {
			my ($variable_uuid) = $anvil->Database->insert_or_update_variables({
				debug                 => 3,
				variable_name         => "tool::anvil-safe-start::enabled",
				variable_value        => $set_to,
				variable_default      => 1,
				variable_description  => "striker_0286",
				variable_section      => "system",
				variable_source_uuid  => $target_host_uuid,
				variable_source_table => "hosts",
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
		}

		# Record that it's been enabled or disabled.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => $message});
		$anvil->nice_exit({exit_code => 0});
	}

	# Read my variables. Only the value and the variable's UUID are needed from the returned list.
	my ($local_enabled, $variable_uuid) = $anvil->Database->read_variable({
		debug                 => 3,
		variable_name         => "tool::anvil-safe-start::enabled",
		variable_source_table => "hosts",
		variable_source_uuid  => $host_uuid,
	});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		local_enabled => $local_enabled,
		variable_uuid => $variable_uuid,
	}});
	# No UUID means the value hasn't been recorded, so we default to 1.
	if (not $variable_uuid) {
		$local_enabled = 1;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_enabled => $local_enabled }});
	}

	# Have we just been asked for the status?
	if ($anvil->data->{switches}{status}) {
		# Yes, check our peer as well.
		my ($peer_enabled, $peer_variable_uuid) = $anvil->Database->read_variable({
			debug                 => 3,
			variable_name         => "tool::anvil-safe-start::enabled",
			variable_source_table => "hosts",
			variable_source_uuid  => $anvil->data->{sys}{peer_host_uuid},
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			peer_enabled  => $peer_enabled,
			variable_uuid => $peer_variable_uuid,
		}});
		# No UUID means the value hasn't been recorded, so we default to 1.
		if (not $peer_variable_uuid) {
			$peer_enabled = 1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_enabled => $peer_enabled }});
		}

		# What we tell the user slightly depends on which nodes are enabled.
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			local_enabled => $local_enabled,
			peer_enabled  => $peer_enabled,
		}});
		my $status_message = "";
		if (($local_enabled) && ($peer_enabled)) {
			# Both nodes are enabled.
			$status_message = "message_0227";
		}
		elsif ((not $local_enabled) && (not $peer_enabled)) {
			# Both nodes are disabled
			$status_message = "message_0228";
		}
		elsif ($local_enabled) {
			# We're enabled, the peer is disabled.
			$status_message = "message_0229";
		}
		else {
			# We're disabled, the peer is enabled.
			$status_message = "message_0230";
		}
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => $status_message});
		$anvil->nice_exit({exit_code => 0});
	}

	# Is another instance running?
	my $pids = $anvil->System->pids({
		debug        => 3,
		ignore_me    => 1,
		program_name => $THIS_FILE,
	});
	my $other_instances = @{$pids};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { other_instances => $other_instances }});
	if ($other_instances) {
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "message_0233"});
		$anvil->nice_exit({exit_code => 0});
	}

	# Last test, enabled or forced?
	if (not $local_enabled) {
		# Disabled. Forced?
		if ($anvil->data->{switches}{force}) {
			# Forced, run anyway.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "message_0232"});
			return(0);
		}
		else {
			# Exit.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "message_0231"});
			$anvil->nice_exit({exit_code => 0});
		}
	}

	return(0);
}