anvil/tools/unfence_pacemaker
Fabio M. Di Nitto 635f38b489 anvil-safe-stop: don´t use locked version of pcs
Add pcs_direct tool path and use it for anvil-safe-stop

Closes: #623

Signed-off-by: Fabio M. Di Nitto <fabbione@fabbione.net>
2024-03-30 07:06:10 +01:00

653 lines
22 KiB
Perl
Executable File

#!/usr/bin/perl
#
# Author: Madison Kelly (mkelly@alteeve.ca)
# Alteeve's Niche! Inc. - https://alteeve.com/w/
# Version: 0.0.1
# License: GPL v2+
#
# This program ties LINBIT's DRBD fencing into pacemaker's stonith. It provides a power-fence alternative to
# the default 'crm-{un,}fence-peer.sh' {un,}fence-handler.
#
# WARNING: This unfence handler is probably not safe to use outside of an Anvil! IA platform. It makes a lot
# of operational assumptions about the system and desired goals.
#
# TODO:
# -
### NOTE: This doesn't use Anvil::Tools on purpose. We want to be quick and depend on as few things as
### possible.
use strict;
use warnings;
use XML::Simple;
use Data::Dumper;
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
$running_directory =~ s/^\./$ENV{PWD}/;
}
my $conf = {
'log' => {
facility => "local0",
level => 2,
line_numbers => 1,
tag => $THIS_FILE,
},
# If a program isn't at the defined path, $ENV{PATH} will be searched.
path => {
exe => {
cibadmin => "/usr/sbin/cibadmin",
crm_attribute => "/usr/sbin/crm_attribute",
crm_error => "/usr/sbin/crm_error",
drbdadm => "/usr/sbin/drbdadm",
echo => "/usr/bin/echo",
getent => "/usr/bin/getent",
logger => "/usr/bin/logger",
pcs => "/usr/sbin/anvil-pcs-wrapper",
pcs_direct => "/usr/sbin/pcs",
},
},
# The script will set this.
cluster => {
target_node => "",
},
# These are the environment variables set by DRBD. See 'man drbd.conf'
# -> 'handlers'.
environment => {
# The resource triggering the fence.
'DRBD_RESOURCE' => defined $ENV{DRBD_RESOURCE} ? $ENV{DRBD_RESOURCE} : "",
# The resource minor number, or, in the case of volumes, numbers.
'DRBD_MINOR' => defined $ENV{DRBD_MINOR} ? $ENV{DRBD_MINOR} : "",
# This is the address format (ipv4, ipv6, etc)
'DRBD_PEER_AF' => defined $ENV{DRBD_PEER_AF} ? $ENV{DRBD_PEER_AF} : "",
# This is the IP address of the target node.
'DRBD_PEER_ADDRESS' => defined $ENV{DRBD_PEER_ADDRESS} ? $ENV{DRBD_PEER_ADDRESS} : "",
# This isn't set
'DRBD_PEERS' => defined $ENV{DRBD_PEERS} ? $ENV{DRBD_PEERS} : "",
### NOTE: Below here are undocumented variables. Don't expect them to always be useful.
# My node ID
'DRBD_MY_NODE_ID' => defined $ENV{DRBD_MY_NODE_ID} ? $ENV{DRBD_MY_NODE_ID} : "",
# The target's ID
'DRBD_PEER_NODE_ID' => defined $ENV{DRBD_PEER_NODE_ID} ? $ENV{DRBD_PEER_NODE_ID} : "",
},
};
# Find executables.
find_executables($conf);
# Something for the logs
to_log($conf, {message => "The Anvil! DRBD unfence handler has been invoked.", 'line' => __LINE__});
# These are the full host names of the nodes given their IDs.
foreach my $i (0..31)
{
my $key = "DRBD_NODE_ID_".$i;
if ((exists $ENV{$key}) && (defined $ENV{$key}))
{
$conf->{environment}{$key} = $ENV{$key};
my $level = $conf->{environment}{$key} eq "" ? 3 : 2;
to_log($conf, {message => "DRBD Environment variable: [$key] -> [".$conf->{environment}{$key}."]", 'line' => __LINE__, level => $level});
}
}
# Record the environment variables
foreach my $key (sort {$a cmp $b} keys %{$conf->{environment}})
{
# $conf->{environment}{DRBD_RESOURCE} -> [srv51-Workstation3]
my $level = $conf->{environment}{$key} eq "" ? 3 : 2;
to_log($conf, {message => "DRBD Environment variable: [$key] -> [".$conf->{environment}{$key}."]", 'line' => __LINE__, level => $level});
}
foreach my $key (sort {$a cmp $b} keys %ENV)
{
next if exists $conf->{environment}{$key};
my $level = $ENV{$key} eq "" ? 3 : 2;
to_log($conf, {message => "System Environment variable: [$key] -> [".$ENV{$key}."]", 'line' => __LINE__, level => 3});
}
# Make sure we at least have the target's IP.
if (not $conf->{environment}{DRBD_PEER_ADDRESS})
{
to_log($conf, {message => "Called without target's IP. Nothing to do, exiting. Were we called by 'pcs stonith list'?", 'line' => __LINE__, level => 1, priority => "alert"});
exit(1);
}
# This also checks that we're quorate and not in maintenance mode.
identify_peer($conf);
# If we're still alive, we now need to check the DRBD resource disk state locally.
get_drbd_status($conf);
# Is there a specific resource?
if ($conf->{environment}{DRBD_RESOURCE})
{
# Prevent the resource from running on the peer.
to_log($conf, {message => "We're being asked to unfence the specific resource: [".$conf->{environment}{DRBD_RESOURCE}."]. Will remove the node attribute blocking this server from running on: [".$conf->{cluster}{target_node}."].", 'line' => __LINE__});
remove_constraint($conf);
}
else
{
to_log($conf, {message => "We were not given a specific resource to unfence. Nothing to do.", 'line' => __LINE__});
exit(0);
}
# If we hit here, something very wrong happened.
exit(1);
#############################################################################################################
# Functions #
#############################################################################################################
# This removes a location constraint that prevents the resource / server from running on the peer node.
sub remove_constraint
{
my ($conf) = @_;
my $target_server = $conf->{environment}{DRBD_RESOURCE};
my $target_node = $conf->{cluster}{target_node};
to_log($conf, {message => "Will now create a location constraint against: [".$target_server."] preventing it from running on: [".$target_node."].", 'line' => __LINE__, level => 1});
# Check that the rule was set.
my $rule_name = "drbd-fenced_".$target_server;
my $rule_set = 1;
my $rule_found = 0;
my $rule_output = "";
my $shell_call = $conf->{path}{exe}{crm_attribute}." --type nodes --node ".$target_node." --name ".$rule_name." --query";
to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 1});
open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
while(<$file_handle>)
{
# This should not generate output.
chomp;
my $line = $_;
to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 1});
if (($line =~ /name=$rule_name/) && ($line =~ /value=0/))
{
$rule_set = 0;
to_log($conf, {message => "rule_set: [".$rule_set."]", 'line' => __LINE__, level => 1});
last;
}
else
{
$rule_output .= $line."\n";
}
}
close $file_handle;
if (not $rule_set)
{
# No need to unfence.
to_log($conf, {message => "The node attribute rule: [".$rule_name."] against the node: [".$target_node."] was not found. No need to unfence.", 'line' => __LINE__, level => 0, priority => "err"});
exit(0);
}
# Clear the node attribute
$shell_call = $conf->{path}{exe}{crm_attribute}." --type nodes --node ".$target_node." --name ".$rule_name." --update 0";
to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 1});
open ($file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
while(<$file_handle>)
{
# This should not generate output.
chomp;
my $line = $_;
to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 1});
}
close $file_handle;
# Check that the rule was set.
$rule_output = "";
$shell_call = $conf->{path}{exe}{crm_attribute}." --type nodes --node ".$target_node." --name ".$rule_name." --query";
to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 1});
open ($file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
while(<$file_handle>)
{
# This should not generate output.
chomp;
my $line = $_;
to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 1});
if (($line =~ /name=$rule_name/) && ($line =~ /value=0/))
{
$rule_set = 0;
to_log($conf, {message => "rule_set: [".$rule_set."]", 'line' => __LINE__, level => 1});
last;
}
else
{
$rule_output .= $line."\n";
}
}
close $file_handle;
if (not $rule_set)
{
# Success!
to_log($conf, {message => "The node attribute rule: [".$rule_name."] against the node: [".$target_node."] has been cleared successfully.", 'line' => __LINE__, level => 0, priority => "err"});
exit(0);
}
else
{
# Failed.
$rule_output =~ s/\n$//gs;
to_log($conf, {message => "The node attribute triggering the fence rule: [".$rule_name."] against the node: [".$target_node."] appears to have not been cleared. Expected a string with 'name=".$rule_name." value=0' but got: [".$rule_output."].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
return(0);
}
# This reads the status of all resources. If we're not all UpToDate, check if the peer is. If the peer is,
# abort. If not, proceed (someone is gouig to have a bad day, but maybe some servers will live)
sub get_drbd_status
{
my ($conf) = @_;
my $resource = "";
my $peer = "";
my $local_all_uptodate = 1;
my $peer_all_uptodate = 1;
my $shell_call = $conf->{path}{exe}{drbdadm}." status all";
to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2});
open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
while(<$file_handle>)
{
# This should not generate output.
chomp;
my $line = $_;
to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 3});
if (not $line)
{
$resource = "";
$peer = "";
to_log($conf, {message => "resource: [".$resource."], peer: [".$peer."]", 'line' => __LINE__, level => 3});
next;
}
if ($line =~ /^(\S+)\s+role/)
{
$resource = $1;
to_log($conf, {message => "resource: [".$resource."]", 'line' => __LINE__, level => 3});
next;
}
if ($line =~ /^\s+(.*?) role:/)
{
$peer = $1;
to_log($conf, {message => "peer: [".$peer."]", 'line' => __LINE__, level => 3});
next;
}
if ($resource)
{
if ($line =~ /disk:(.*)$/)
{
my $local_dstate = $1;
$local_dstate =~ s/\s.*$//;
to_log($conf, {message => "local_dstate: [".$local_dstate."]", 'line' => __LINE__, level => 2});
if (lc($local_dstate) ne "uptodate")
{
$local_all_uptodate = 0;
to_log($conf, {message => "local_all_uptodate: [".$local_all_uptodate."]", 'line' => __LINE__, level => 2});
}
next;
}
if ($line =~ /peer-disk:(.*)$/)
{
my $peer_dstate = $1;
$peer_dstate =~ s/\s.*$//;
to_log($conf, {message => "peer: [".$peer."], peer_dstate: [".$peer_dstate."]", 'line' => __LINE__, level => 2});
if (lc($peer_dstate) ne "uptodate")
{
$peer_all_uptodate = 0;
to_log($conf, {message => "peer_all_uptodate: [".$peer_all_uptodate."]", 'line' => __LINE__, level => 2});
}
next;
}
}
}
close $file_handle;
# If we're not all UpToDate, but the peer is, abort
to_log($conf, {message => "local_all_uptodate: [".$local_all_uptodate."], peer_all_uptodate: [".$peer_all_uptodate."]", 'line' => __LINE__, level => 2});
if ((not $local_all_uptodate) && ($peer_all_uptodate))
{
# We're not good
to_log($conf, {message => "This node has DRBD resources that are not UpToDate, but the peer is fully UpToDate. Aborting.", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
return(0);
}
# This identifies the pacemaker name of the target node. If it can't find the peer, it exits with '1'.
sub identify_peer
{
my ($conf) = @_;
# I know the target's (SN) IP, map it to a node.
my $target_host = "";
my $target_ip = $conf->{environment}{DRBD_PEER_ADDRESS};
# First, can we translate the IP to a host name?
my $shell_call = $conf->{path}{exe}{getent}." hosts ".$target_ip;
to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2});
open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
while(<$file_handle>)
{
# This should not generate output.
chomp;
my $line = $_;
to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 2});
if ($line =~ /^$target_ip\s+(.*)$/)
{
# This could be multiple names.
$target_host = $1;
to_log($conf, {message => "target_host: [".$target_host."]", 'line' => __LINE__, level => 2});
#to_log($conf, {message => ">> target_host: [".$target_host."]", 'line' => __LINE__, level => 2});
# Strip off any suffix, we only want the short name.
#$target_host =~ s/\..*//;
#to_log($conf, {message => "<< target_host: [".$target_host."]", 'line' => __LINE__, level => 2});
#last;
}
}
close $file_handle;
# If I got the host name, try to match it to a pacemaker node name.
if ($target_host)
{
# Get the current CIB (in an XML::Simple hash). This will exit if it fails to read the XML
# and convert it to an XML::Simple hash.
my $body = read_cib($conf);
# Parse the XML.
my $host_name = $ENV{HOSTNAME};
my $short_host_name = $ENV{HOSTNAME};
$short_host_name =~ s/\..*$//;
to_log($conf, {message => "host_name: [".$host_name."], short_host_name: [".$short_host_name."]", 'line' => __LINE__, level => 2});
foreach my $hash_ref (sort {$a cmp $b} @{$body->{configuration}{nodes}{node}})
{
my $node = $hash_ref->{uname};
my $id = $hash_ref->{id};
to_log($conf, {message => "node: [".$node."], id: [".$id."]", 'line' => __LINE__, level => 2});
foreach my $target_name (split/ /, $target_host)
{
to_log($conf, {message => ">> target_name: [".$target_name."]", 'line' => __LINE__, level => 2});
$target_name =~ s/\..*//;
to_log($conf, {message => "<< target_name: [".$target_name."]", 'line' => __LINE__, level => 2});
if ($node =~ /^$target_name/)
{
$conf->{cluster}{target_node} = $node;
to_log($conf, {message => "Found the pacemaker name of the target node: [".$conf->{cluster}{target_node}."]", 'line' => __LINE__, level => 1});
}
elsif ($node =~ /^$short_host_name/)
{
# This is me. Am I in maintenance mode?
if (exists $hash_ref->{instance_attributes})
{
# We've got some data..
my $name = defined $hash_ref->{instance_attributes}{nvpair}{name} ? $hash_ref->{instance_attributes}{nvpair}{name} : "";
my $value = defined $hash_ref->{instance_attributes}{nvpair}{value} ? $hash_ref->{instance_attributes}{nvpair}{value} : "";
to_log($conf, {message => "node: [".$node."] instance attribyte name: [".$name."], value: [".$value."]", 'line' => __LINE__, level => 1});
if (($name eq "maintenance") and ($value eq "on"))
{
# We're in maintenance mode, abort.
to_log($conf, {message => "This node is in maintenance mode. Not able to fence!", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
}
}
}
}
my $quorate = $body->{'have-quorum'};
to_log($conf, {message => "quorate: [".$quorate."]", 'line' => __LINE__, level => 1});
if (not $quorate)
{
to_log($conf, {message => "This not is not quorate. Refusing to fence the peer!", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
# See if the target node is already out of the cluster.
foreach my $hash_ref (@{$body->{status}{node_state}})
{
my $node = $hash_ref->{uname};
my $join = $hash_ref->{'join'};
my $expected = $hash_ref->{expected};
to_log($conf, {message => "node: [".$node."] join: [".$join."], expected: [".$expected."]", 'line' => __LINE__, level => 3});
if ($node eq $conf->{cluster}{target_node})
{
to_log($conf, {message => "Checking the status of target node: [".$node."].", 'line' => __LINE__, level => 1});
if (($join eq "down") && ($expected eq "down"))
{
# The node is out.
to_log($conf, {message => "The node: [".$node."] is already down. No actual fence needed.", 'line' => __LINE__, level => 1});
exit(7);
}
else
{
to_log($conf, {message => "The node: [".$node."] is: [".$join."/".$expected."] (join/expected). Proceeding with the fence action.", 'line' => __LINE__, level => 1});
}
}
}
}
# Did I find the target?
if (not $conf->{cluster}{target_node})
{
to_log($conf, {message => "Failed to find the pacemaker name of the target node. Unable to proceed!", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
return(0);
}
# This reads in the CIB XML and returns it as a single multi-line string.
sub read_cib
{
my ($conf) = @_;
my $xml_opened = 0;
my $xml_closed = 0;
my $body = "";
my $cib = '<?xml version="1.0" encoding="UTF-8"?>';
my $shell_call = $conf->{path}{exe}{cibadmin}." --local --query";
to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 3});
open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
while(<$file_handle>)
{
# This should not generate output.
chomp;
my $line = $_;
to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 3});
$cib .= "\n".$line;
if ($line =~ /Signon to CIB failed/i)
{
# Failed to connect, we're probably not in the cluster.
to_log($conf, {message => "This node does not appear to be in the cluster. Unable to get the CIB status.", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
if ($line =~ /^<cib .*?>$/)
{
$xml_opened = 1;
to_log($conf, {message => "xml_opened: [".$xml_opened."].", 'line' => __LINE__, level => 3});
}
if ($line =~ /^<\/cib>$/)
{
$xml_closed = 1;
to_log($conf, {message => "xml_closed: [".$xml_closed."].", 'line' => __LINE__, level => 3});
}
}
close $file_handle;
to_log($conf, {message => "cib: ==========\n".$cib."\n==========", 'line' => __LINE__, level => 3});
# Now parse the CIB XML if I read it OK.
to_log($conf, {message => "xml_opened: [".$xml_opened."], xml_closed: [".$xml_closed."].", 'line' => __LINE__, level => 3});
if (($xml_opened) && ($xml_closed))
{
# We're good
local $@;
my $xml = XML::Simple->new();
my $test = eval { $body = $xml->XMLin($cib, KeyAttr => { language => 'name', key => 'name' }, ForceArray => [ 'id' ]) };
if (not $test)
{
chomp $@;
my $error = "[ Error ] - The was a problem parsing: [".$cib."]. The error was:\n";
$error .= "===========================================================\n";
$error .= $@."\n";
$error .= "===========================================================\n";
to_log($conf, {message => $error, 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
}
else
{
# Failed to read the CIB XML.
to_log($conf, {message => "This node does not appear to be in the cluster. Unable to read the CIB XML properly.", 'line' => __LINE__, level => 2, priority => "err"});
exit(1);
}
return($body);
}
# This checks the given paths and, if something isn't found, it searches PATH trying to find it.
sub find_executables
{
my ($conf) = @_;
# Variables.
my $check = "";
my $bad = 0;
# If PATH isn't set, set it (could have been scrubbed by a caller).
if (not $ENV{PATH})
{
$ENV{PATH} = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin";
}
# Log entries can only happen if I've found 'logger', so an extra check will be made on 'to_log'
# calls.
my @dirs = split/:/, $ENV{PATH};
foreach my $exe (sort {$b cmp $a} keys %{$conf->{path}{exe}})
{
if ( not -e $conf->{path}{exe}{$exe} )
{
to_log($conf, {message => "The program: [".$exe."] is not at: [".$conf->{path}{exe}{$exe}."]. Looking for it now...", 'line' => __LINE__, level => 1});
foreach my $path (@dirs)
{
$check = "$path/$exe";
$check =~ s/\/\//\//g;
to_log($conf, {message => "Checking: [".$check."]", 'line' => __LINE__, level => 2});
if ( -e $check )
{
if (-e $conf->{path}{exe}{logger})
{
to_log($conf, {message => "Found it! Changed path for: [".$exe."] from: [".$conf->{path}{exe}{$exe}."] to: [".$check."]", 'line' => __LINE__, level => 1});
}
else
{
warn "DEBUG: Found it! Changed path for: [".$exe."] from: [".$conf->{path}{exe}{$exe}."] to: [".$check."]\n";
}
$conf->{path}{exe}{$exe} = $check;
}
else
{
to_log($conf, {message => "Not found.", 'line' => __LINE__, level => 2});
}
}
}
else
{
to_log($conf, {message => "Found!", 'line' => __LINE__, level => 3});
next;
}
# Make sure it exists now.
to_log($conf, {message => "Checking again if: [".$exe."] is at: [".$conf->{path}{exe}{$exe}."].", 'line' => __LINE__, level => 3});
if (not -e $conf->{path}{exe}{$exe})
{
$bad = 1;
if (-e $conf->{path}{exe}{logger})
{
to_log($conf, {message => "Failed to find executable: [".$exe."]. Unable to proceed.", 'line' => __LINE__, level => 0});
}
else
{
warn "Failed to find executable: [".$exe."]. Unable to proceed.\n";
}
}
}
if ($bad)
{
exit(1);
}
return(0);
}
# Log file entries
sub to_log
{
my ($conf, $parameters) = @_;
my $facility = defined $parameters->{facility} ? $parameters->{facility} : $conf->{'log'}{facility};
my $level = defined $parameters->{level} ? $parameters->{level} : 1;
my $line = defined $parameters->{'line'} ? $parameters->{'line'} : 0;
my $message = defined $parameters->{message} ? $parameters->{message} : "";
my $priority = defined $parameters->{priority} ? $parameters->{priority} : "";
# Leave if we don't care about this message
return if $level > $conf->{'log'}{level};
return if not $message;
# Build the message. We log the line
if (($conf->{'log'}{line_numbers}) && ($line))
{
$message = $line."; ".$message;
}
my $priority_string = $facility;
if ($priority)
{
$priority_string .= ".".$priority;
}
elsif ($level eq "0")
{
$priority_string .= ".notice";
}
elsif (($level eq "1") or ($level eq "2"))
{
$priority_string .= ".info";
}
else
{
$priority_string .= ".debug";
}
# Clean up the string for bash
$message =~ s/"/\\\"/gs;
#$message =~ s/\(/\\\(/gs;
my $shell_call = $conf->{path}{exe}{logger}." --priority ".$priority_string." --tag ".$conf->{'log'}{tag}." -- \"".$message."\"";
open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
while(<$file_handle>)
{
# This should not generate output.
chomp;
my $line = $_;
print "Unexpected logging output: [".$line."]\n";
}
close $file_handle;
return(0);
}