anvil/tools/anvil-safe-start

632 lines
23 KiB
Plaintext
Raw Normal View History

#!/usr/bin/perl
#
# This does boot-time sanity checks on nodes and then, if all is well, joins the cluster and boots servers.
#
# NOTE: Unlike M2, this is controlled by scancore's start, but only if scancore starts up within ten minutes
# of the node itself booting. This way, stopping/starting scancore won't call us repeatedly. This tool
# is enabled or disabled via the 'tool::anvil-safe-start::enabled' variable tied to the 'hosts' ->
# 'host_uuid' table.
#
# Exit codes;
# 0 = Normal exit.
# 1 = Any problem that causes an early exit.
#
# TODO:
# - Add job support
# - Make this work on DR hosts.
# - 'pcs quorum unblock' could be useful in sole-survivor cold starts.
#
use strict;
use warnings;
use Anvil::Tools;
use NetAddr::IP;
require POSIX;
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
$running_directory =~ s/^\./$ENV{PWD}/;
}
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
my $anvil = Anvil::Tools->new();
$anvil->Get->switches;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});
# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0))
{
# Not root
print $anvil->Words->string({key => "error_0005"})."\n";
$anvil->nice_exit({exit_code => 1});
}
# Connect to the database(s). If we have no connections, we'll proceed anyway as one of the 'run_once' tasks
# is to setup the database server.
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});
$anvil->data->{switches}{'job-uuid'} = "";
$anvil->data->{switches}{disable} = "";
$anvil->data->{switches}{enable} = "";
$anvil->data->{switches}{force} = "";
$anvil->data->{switches}{'local'} = "";
$anvil->data->{switches}{status} = "";
$anvil->Get->switches;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'},
'switches::disable' => $anvil->data->{switches}{disable},
'switches::enable' => $anvil->data->{switches}{enable},
'switches::force' => $anvil->data->{switches}{force},
'switches::local' => $anvil->data->{switches}{'local'},
'switches::status' => $anvil->data->{switches}{status},
}});
# If I have no databases, sleep until I do
if (not $anvil->data->{sys}{database}{connections})
{
# If this is a dashboard, try to configure and then connect to the local database. If this isn't a
# Wait until we have one.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "error_0075"});
until($anvil->data->{sys}{database}{connections})
{
sleep 10;
$anvil->refresh();
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# Keep waiting
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, secure => 0, key => "log_0439"});
}
}
}
### Process
# 1. Check if I am enabled and that no other copies are running.
# 2. Can I ping my peer on all three networks? Loop until true.
# - Wait here indefinately
# 3. ...
# 6. Using Start Groups/Delays (and ignoring 'clean' off VMs), boot servers.
# Check to see if we should run. Also checks/sets enable/disable requests.
prerun_checks($anvil);
# Wait until I can ping the peer on all three networks. This will not return until access is available on all
# networks. There is no timeout.
wait_for_access($anvil);
# Start pacemaker now.
start_pacemaker($anvil);
# Boot servers.
boot_servers($anvil);
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0281"});
$anvil->nice_exit({exit_code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
# This boots the servers.
sub boot_servers
{
my ($anvil) = @_;
# Call 'anvil-boot-server --server all' to boot the servers now.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0614"});
my $shell_call = $anvil->data->{path}{exe}{'anvil-boot-server'}." --server all";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
if ($return_code)
{
# What?! Fail out, we're done.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0275", variables => {
output => $output,
return_code => $return_code,
}});
$anvil->nice_exit({exit_code => 1});
}
return(0);
}
# Start pacemaker and wait until we're quorate.
sub start_pacemaker
{
my ($anvil) = @_;
my $anvil_uuid = $anvil->data->{sys}{anvil_uuid};
my $host_uuid = $anvil->Get->host_uuid();
my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
my $peer_host_uuid = $anvil->data->{sys}{peer_host_uuid};
my $peer_short_host_name = $anvil->data->{hosts}{host_uuid}{$peer_host_uuid}{short_host_name};
my $fenced_peer = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
anvil_uuid => $anvil_uuid,
host_uuid => $host_uuid,
short_host_name => $short_host_name,
peer_host_uuid => $peer_host_uuid,
peer_short_host_name => $peer_short_host_name,
}});
# Is pacemaker already running?
my ($problem) = $anvil->Cluster->parse_cib({debug => 3});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
if ($problem)
{
# Nope, start it.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0608"});
### TODO: A lot more testing is needed for degraded single-node start later.
### Should we use --all, or wait for our peer? For now, we wait.
#my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start --all";
my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
if ($return_code)
{
# What?! Fail out, we're done.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0256", variables => {
output => $output,
return_code => $return_code,
}});
$anvil->nice_exit({exit_code => 1});
}
### TODO: We may implement the logic to fence our peer (similar to cman's post_join_delay'
### logic) at a later time. For now, we'll wait forever for this to exit. This is why
### we set 'wait_for_peer', even though it's not used yet.
# Now wait up to two minutes for the cluster to start. If it's not up by then, we'll fence
# the peer and, if the fence succeeds, unblock quorum.
my $start_time = time;
my $wait_for_peer = $start_time + 120;
my $waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
start_time => $start_time,
wait_for_peer => $wait_for_peer,
}});
while ($waiting)
{
$waiting = 0;
my ($problem) = $anvil->Cluster->parse_cib({debug => 3});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
if ($problem)
{
# Can't parse the CIB yet, wait.
$waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
else
{
# Quorum, as reported in the CIB, sets 'have-quorum to '1' as soon as it
# starts, the retracts it. For this reason, we use 'parse_quorum()' to get
# the quorum directly from corosync/votequorum.
my ($problem) = $anvil->Cluster->parse_quorum({debug => 2});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
if ($problem)
{
# Corosync is down.
$waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
else
{
### NOTE: We don't worry about maintenance mode yet, as it shouldn't
### apply, but we may change that view later.
# See where we are.
my $node_name = $anvil->data->{cib}{parsed}{'local'}{name};
my $maintenance_mode = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{'maintenance-mode'};
my $in_ccm = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{in_ccm};
my $crmd = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{crmd};
my $join = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{'join'};
my $ready = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{ready};
my $quorate = $anvil->data->{quorum}{quorate};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:node_name' => $node_name,
's2:maintenance_mode' => $maintenance_mode,
's3:in_ccm/crmd/join' => $in_ccm."/".$crmd."/".$join,
's4:ready' => $ready,
's5:quorate' => $quorate,
}});
# Are we online?
if ($ready)
{
# We're ready, but do we have quorum?
if ($quorate)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0611", variables => { node_name => $node_name }});
}
else
{
# Nope
$waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
# Keep waiting, or fence the peer?
if (time > $wait_for_peer)
{
### TODO: See above, not implemented yet. Do we want to do this? If so:
# Time to fence. Use 'pcs stonith fence <peer>', verify it succeeded,
# then do 'pcs quorum unblock --force' to finish startup.
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0610", variables => { node_name => $node_name }});
}
}
else
{
# Not ready yet.
$waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0612", variables => {
node_name => $node_name,
in_ccm => $in_ccm,
crmd => $crmd,
'join' => $join,
}});
}
}
}
if ($waiting)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0609"});
sleep 5;
}
}
}
return(0);
}
# Check for which networks we have and verify that we can ping our peer on each. This function will not
# return until all networks are up.
sub wait_for_access
{
my ($anvil) = @_;
my $host_uuid = $anvil->Get->host_uuid();
my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
my $peer_host_uuid = $anvil->data->{sys}{peer_host_uuid};
my $peer_short_host_name = $anvil->data->{hosts}{host_uuid}{$peer_host_uuid}{short_host_name};
my $peer_password = $anvil->data->{sys}{peer_password};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
host_uuid => $host_uuid,
short_host_name => $short_host_name,
peer_host_uuid => $peer_host_uuid,
peer_short_host_name => $peer_short_host_name,
peer_password => $anvil->Log->is_secure($peer_password),
}});
my $waiting = 1;
while ($waiting)
{
# This will get set back to '1' if
$waiting = 0;
# Load IPs (again, to catch changes that might be delaying startup)
$anvil->Network->load_ips({
clear => 1,
host => $short_host_name,
host_uuid => $host_uuid,
});
$anvil->Network->load_ips({
clear => 1,
host => $peer_short_host_name,
host_uuid => $peer_host_uuid,
});
# Loop through our interfaces and then loop our peers. Test access over them and set
# 'waiting' back to '1' if the connection fails.
foreach my $interface (sort {$a cmp $b} keys %{$anvil->data->{network}{$short_host_name}{interface}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
interface => $interface,
waiting => $waiting,
}});
# Only care about our networks.
next if $waiting;
if (($interface !~ /^bcn/) && ($interface !~ /^sn/) && ($interface !~ /^ifn/))
{
# Not an interface we care about
next;
}
my $this_network = ($interface =~ /^(.*?)_/)[0];
my $ip_address = $anvil->data->{network}{$short_host_name}{interface}{$interface}{ip};
my $subnet_mask = $anvil->data->{network}{$short_host_name}{interface}{$interface}{subnet_mask};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:this_network' => $this_network,
's2:ip_address' => $ip_address,
's3:subnet_mask' => $subnet_mask,
}});
### NOTE: I know I could match interface names, but that's not certain enough. It's
### possible (if unlikely) that the network name+numbre differs on our peer. So
### this is safer.
# Loop through my peer's interfaces and see if we're sharing this one.
my $local_network = NetAddr::IP->new($ip_address."/".$subnet_mask);
my $peer_match_found = 0;
foreach my $peer_interface (sort {$a cmp $b} keys %{$anvil->data->{network}{$peer_short_host_name}{interface}})
{
last if $peer_match_found;
my $peer_ip_address = $anvil->data->{network}{$peer_short_host_name}{interface}{$peer_interface}{ip};
my $peer_subnet_mask = $anvil->data->{network}{$peer_short_host_name}{interface}{$peer_interface}{subnet_mask};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
peer_interface => $peer_interface,
peer_ip_address => $peer_ip_address,
peer_subnet_mask => $peer_subnet_mask,
}});
# This the matching network?
next if $subnet_mask ne $peer_subnet_mask;
my $peer_network = NetAddr::IP->new($peer_ip_address."/".$peer_subnet_mask);
if ($peer_network->within($local_network))
{
# Match, test access.
$peer_match_found = 1;
my $access = $anvil->Remote->test_access({
target => $peer_ip_address,
password => $peer_password,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }});
if ($access)
{
# This network is good.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0604", variables => {
peer => $peer_short_host_name,
network => $this_network,
peer_ip => $peer_ip_address,
}});
}
else
{
# No access, wait and try it again.
$waiting = 1;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0605", variables => {
peer => $peer_short_host_name,
network => $this_network,
peer_ip => $peer_ip_address,
}});
}
}
}
}
if ($waiting)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0606", variables => { peer => $peer_short_host_name }});
sleep 5;
}
}
# All networks are up.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0607", variables => { peer => $peer_short_host_name }});
return(0);
}
# This checks to verify that we're a node, and if so, if this tool is enabled. If it's disabled or this isn't
# a node, this method will exit.
sub prerun_checks
{
my ($anvil) = @_;
$anvil->Database->get_hosts();
$anvil->Database->get_anvils();
my $host_uuid = $anvil->Get->host_uuid();
my $host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
host_uuid => $host_uuid,
host_type => $host_type,
}});
if ($host_type ne "node")
{
# We're done.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0598"});
$anvil->nice_exit({exit_code => 0});
}
my $anvil_uuid = $anvil->Cluster->get_anvil_uuid();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { anvil_uuid => $anvil_uuid }});
if (not $anvil_uuid)
{
# This is a node, but not in an Anvil! yet.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0603"});
$anvil->nice_exit({exit_code => 0});
}
my $node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid};
my $node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
node1_host_uuid => $node1_host_uuid,
node2_host_uuid => $node2_host_uuid,
}});
$anvil->data->{sys}{anvil_uuid} = $anvil_uuid;
$anvil->data->{sys}{peer_host_uuid} = $host_uuid eq $node1_host_uuid ? $node2_host_uuid : $node1_host_uuid;
$anvil->data->{sys}{peer_password} = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_password};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"sys::anvil_uuid" => $anvil->data->{sys}{anvil_uuid},
"sys::peer_host_uuid" => $anvil->data->{sys}{peer_host_uuid},
"sys::peer_password" => $anvil->Log->is_secure($anvil->data->{sys}{peer_password}),
}});
# Are we being asked to enable or disable?
my $nodes = [$host_uuid];
my $set_to = 1;
my $message = "";
if ($anvil->data->{switches}{enable})
{
# We're enabling, which message will we use?
$message = $anvil->data->{switches}{'local'} ? "log_0599" : "log_0600";
}
elsif ($anvil->data->{switches}{disable})
{
# We're disabling. Which message?
$set_to = 0;
$message = $anvil->data->{switches}{'local'} ? "log_0601" : "log_0602";
}
# If we're updating the settings, do so and then exit.
if ($message)
{
if (not $anvil->data->{switches}{'local'})
{
# Add our peer as well.
push @{$nodes}, $anvil->data->{sys}{peer_host_uuid};
}
foreach my $host_uuid (@{$nodes})
{
my ($variable_uuid) = $anvil->Database->insert_or_update_variables({
debug => 3,
variable_name => "tool::anvil-safe-start::enabled",
variable_value => $set_to,
variable_default => 1,
variable_description => "striker_0286",
variable_section => "system",
variable_source_uuid => $host_uuid,
variable_source_table => "hosts",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
}
# Record that it's been enabled.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => $message});
$anvil->nice_exit({exit_code => 0});
}
# Read my variables.
my ($local_enabled, $variable_uuid, $mtime, $modified_date) = $anvil->Database->read_variable({
debug => 3,
variable_name => "tool::anvil-safe-start::enabled",
variable_source_table => "hosts",
variable_source_uuid => $host_uuid,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
local_enabled => $local_enabled,
variable_uuid => $variable_uuid,
}});
# No UUID means the value hasn't been recorded, so we default to 1.
if (not $variable_uuid)
{
$local_enabled = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_enabled => $local_enabled }});
}
# Have we just been asked for the status?
if ($anvil->data->{switches}{status})
{
# Yes, check our peer as well.
my ($peer_enabled, $variable_uuid, $mtime, $modified_date) = $anvil->Database->read_variable({
debug => 3,
variable_name => "tool::anvil-safe-start::enabled",
variable_source_table => "hosts",
variable_source_uuid => $anvil->data->{sys}{peer_host_uuid},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
peer_enabled => $peer_enabled,
variable_uuid => $variable_uuid,
}});
# No UUID means the value hasn't been recorded, so we default to 1.
if (not $variable_uuid)
{
$peer_enabled = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_enabled => $peer_enabled }});
}
# What we tell the use slightly depends on which nodes are enabled.
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
local_enabled => $local_enabled,
peer_enabled => $peer_enabled,
}});
my $message = "";
if (($local_enabled) && ($peer_enabled))
{
# Both nodes are enabled.
$message = "message_0227";
}
elsif ((not $local_enabled) && (not $peer_enabled))
{
# Both nodes are disabled
$message = "message_0228";
}
elsif ($local_enabled)
{
# We're enabled, the peer is disabled.
$message = "message_0229";
}
else
{
# We're disabled, the peer is enabled.
$message = "message_0230";
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => $message});
$anvil->nice_exit({exit_code => 0});
}
# Is another instance running?
my $pids = $anvil->System->pids({
debug => 3,
ignore_me => 1,
program_name => $THIS_FILE,
});
my $other_instances = @{$pids};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { other_instances => $other_instances }});
if ($other_instances)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "message_0233"});
$anvil->nice_exit({exit_code => 0});
}
# Last test, enabled or forced?
if (not $local_enabled)
{
# Disabled. Forced?
if ($anvil->data->{switches}{force})
{
# Forced, run anyway.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "message_0232"});
return(0);
}
else
{
# Exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "message_0231"});
$anvil->nice_exit({exit_code => 0});
}
}
return(0);
}