Local modifications to ClusterLabs/Anvil by Alteeve
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

624 lines
19 KiB

#!/usr/bin/perl
#
# Author: Madison Kelly (mkelly@alteeve.ca)
# Alteeve's Niche! Inc. - https://alteeve.com/w/
# Version: 0.0.1
# License: GPL v2+
#
# This program ties LINBIT's DRBD fencing into pacemaker's stonith. It provides a power-fence alternative to
# the default 'crm-{un,}fence-peer.sh' {un,}fence-handler.
#
# Exit Codes (as per; http://lists.linbit.com/pipermail/drbd-dev/2006-November/000538.html)
# - 3 -> peer is inconsistent
# - 4 -> peer is outdated (this handler outdated it) [ resource fencing ]
# - 5 -> peer was down / unreachable
# - 6 -> peer is primary
# - 7 -> peer got stonithed [ node fencing ]
# ===] From crm-fence-peer.9.sh [===
# drbd_fence_peer_exit_code is per the exit code
# convention of the DRBD "fence-peer" handler,
# obviously.
# 3: peer is already outdated or worse (e.g. inconsistent)
# 4: peer has been successfully fenced
# 5: peer not reachable, assumed to be dead
# 6: please outdate yourself, peer is known (or likely)
# to have better data, or is even currently primary.
# (actually, currently it is "peer is active primary now", but I'd like to
# change that meaning slightly towards the above meaning)
# 7: peer has been STONITHed, thus assumed to be properly fenced
# XXX IMO, this should rather be handled like 5, not 4.
# =========
#
# This program uses;
# - 1 = Something failed
# - 7 = Fence succeeded
# - 255 = End of program hit... should never happen.
#
# TODO:
# - Read the CIB; 'pcs status xml' or '/usr/sbin/cibadmin --local --query' ?
# -- Map the peer's name in pacemaker.
# -- Verify that stonith is enabled:
# -- Verify that the node is not in maintenance mode:
# -- Verify that we're quorate (a-la pacemaker):
# - Verify that the resource is 'resource-and-stonith'
# - Verify that the resource is 'UpToDate' (if not, should we suicide to give the other node priority, regardless of fence delay? what if both nodes have resources that are not UpToDate?)
# -
### NOTE: This doesn't use Anvil::Tools on purpose. We want to be quick and depend on as few things as
### possible.
use strict;
use warnings;
use Data::Dumper;
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

# Work out this program's name and the directory it was called from. If '$0' has no directory component (for
# example 'perl <name>'), the whole of '$0' is the name and the directory is '.', so that neither value is
# ever undefined. The name is used as the syslog tag below.
my $THIS_FILE         = $0;
my $running_directory = ".";
if ($0 =~ /^(.*)\/([^\/]*)$/)
{
	$running_directory = $1;
	$THIS_FILE         = $2;
}
# Convert a relative directory into an absolute one. An empty directory means the program sits directly in
# '/', so it is left alone.
if (($running_directory ne "") && ($running_directory !~ /^\//) && ($ENV{PWD}))
{
	# Drop a leading './' (or a lone '.') and prefix the current working directory.
	$running_directory =~ s/^\.(?:\/|$)//;
	$running_directory =  $running_directory eq "" ? $ENV{PWD} : $ENV{PWD}."/".$running_directory;
}

my $conf = {
	'log'		=>	{
		facility	=>	"local0",
		level		=>	2,
		line_numbers	=>	0,
		tag		=>	$THIS_FILE,
	},
	# If a program isn't at the defined path, $ENV{PATH} will be searched.
	path		=>	{
		exe		=>	{
			logger		=>	"/usr/bin/logger",
			stonith_admin	=>	"/usr/sbin/stonith_admin",
		},
	},
	# General settings.
	sys		=>	{
		# Local host name.
		host_name	=>	$ENV{HOSTNAME},
		target_name	=>	"",
	},
	# The script will set this.
	cluster		=>	{
		this_node	=>	"",
		target_node	=>	"",
	},
	# These are the environment variables set by DRBD. See 'man drbd.conf'
	# -> 'handlers'.
	environment	=>	{
		# The resource triggering the fence.
		'DRBD_RESOURCE'		=>	defined $ENV{DRBD_RESOURCE}     ? $ENV{DRBD_RESOURCE}     : "",
		# The resource minor number, or, in the case of volumes, numbers.
		'DRBD_MINOR'		=>	defined $ENV{DRBD_MINOR}        ? $ENV{DRBD_MINOR}        : "",
		# The peer(s) hostname(s), space separated.
		# DRBD_PEER_AF, DRBD_PEER_ADDRESS , DRBD_PEERS are the address family (e.g. ipv6), the peer's address and hostnames.
		'DRBD_PEER_AF'		=>	defined $ENV{DRBD_PEER_AF}      ? $ENV{DRBD_PEER_AF}      : "",
		'DRBD_PEER_ADDRESS'	=>	defined $ENV{DRBD_PEER_ADDRESS} ? $ENV{DRBD_PEER_ADDRESS} : "",
		'DRBD_PEERS'		=>	defined $ENV{DRBD_PEERS}        ? $ENV{DRBD_PEERS}        : "",
	},
};

# Find executables.
find_executables($conf);

# Something for the logs
to_log($conf, {message => "Attempting to fence the peer via pacemaker's stonith...", 'line' => __LINE__});

# Record the environment variables
foreach my $key (sort {$a cmp $b} keys %{$conf->{environment}})
{
	to_log($conf, {message => "DRBD Environment variable: [$key] -> [".$conf->{environment}{$key}."]", 'line' => __LINE__, level => 2});
}
foreach my $key (sort {$a cmp $b} keys %ENV)
{
	next if exists $conf->{environment}{$key};
	to_log($conf, {message => "System Environment variable: [$key] -> [".$ENV{$key}."]", 'line' => __LINE__, level => 3});
}

# NOTE(review): This unconditional exit makes everything below it unreachable. It looks like a deliberate
#               stop while the steps below are still being reworked, so it is left in place - confirm before
#               removing it.
exit(255);

# Who am I?
get_local_node_name($conf);

# Am I up to date?
get_local_resource_state($conf);

# Who else is here?
get_info_on_other_nodes($conf);

# Who shall I kill?
get_target($conf);

# Eject the target, if I can.
eject_target($conf);

# In case cman decided by itself to fence the node...
eventually_wait_for_fenced($conf); # May exit with exit code 7

# Only kill the target if the graceful eject did not work. This is important because fence_node does not want
# to be invoked in multiple instances in parallel.
kill_target($conf);

# End of program; this should never be reached.
exit(255);
#############################################################################################################
# Functions #
#############################################################################################################
# This checks that each program listed in 'path::exe' exists at its configured path. If one does not, the
# directories in $ENV{PATH} are searched in order and 'path::exe::<name>' is updated to the first match.
#
# Parameters;
# - $conf: The main configuration hash reference. 'path::exe' is read and may be updated.
#
# Returns '0' when every program was found. Exits the program with '1' if any program could not be found.
sub find_executables
{
	my ($conf) = @_;
	
	# Set to '1' if any program can't be found anywhere.
	my $bad = 0;
	
	# PATH may not be set at all when a handler is invoked, so don't split an undefined value.
	my @dirs = defined $ENV{PATH} ? split(/:/, $ENV{PATH}) : ();
	
	# Log entries can only happen if I've found 'logger', so an extra check is made before the important
	# 'to_log' calls, falling back to 'warn' when the logger is not there.
	foreach my $exe (sort {$b cmp $a} keys %{$conf->{path}{exe}})
	{
		to_log($conf, {message => "Checking if: [$exe] is at: [".$conf->{path}{exe}{$exe}."]", 'line' => __LINE__, level => 3});
		if (-e $conf->{path}{exe}{$exe})
		{
			to_log($conf, {message => "Found!", 'line' => __LINE__, level => 2});
			next;
		}
		
		to_log($conf, {message => "It is not!", 'line' => __LINE__, level => 3});
		foreach my $path (@dirs)
		{
			# An empty PATH entry would turn into '/<exe>', which is not what it means.
			next if $path eq "";
			
			my $check =  "$path/$exe";
			   $check =~ s/\/\//\//g;
			to_log($conf, {message => "Checking: [$check]", 'line' => __LINE__, level => 3});
			if (not -e $check)
			{
				to_log($conf, {message => "Not found.", 'line' => __LINE__, level => 3});
				next;
			}
			
			to_log($conf, {message => "Found!", 'line' => __LINE__, level => 3});
			if (-e $conf->{path}{exe}{logger})
			{
				to_log($conf, {message => "Changed path for: [$exe] from: [".$conf->{path}{exe}{$exe}."] to: [$check]", 'line' => __LINE__, level => 1});
			}
			else
			{
				warn "DEBUG: Changed path for: [$exe] from: [".$conf->{path}{exe}{$exe}."] to: [$check]\n";
			}
			$conf->{path}{exe}{$exe} = $check;
			
			# The first match wins, as it would in a shell. Without this, a later PATH entry
			# would silently replace an earlier one.
			last;
		}
		
		# Make sure it exists now.
		to_log($conf, {message => "Checking again if: [$exe] is at: [".$conf->{path}{exe}{$exe}."].", 'line' => __LINE__, level => 3});
		if (not -e $conf->{path}{exe}{$exe})
		{
			$bad = 1;
			if (-e $conf->{path}{exe}{logger})
			{
				to_log($conf, {message => "Failed to find executable: [$exe]. Unable to proceed.", 'line' => __LINE__, level => 0});
			}
			else
			{
				warn "Failed to find executable: [$exe]. Unable to proceed.\n";
			}
		}
	}
	
	# Report every missing program before giving up.
	if ($bad)
	{
		exit(1);
	}
	
	return(0);
}
# This kills (power fences) the remote node by calling 'fence_node -v <peer>'.
#
# Parameters;
# - $conf: The main configuration hash reference. The peer is read from 'environment::DRBD_PEERS' and the
#          program from 'path::exe::fence_node' (falling back to a PATH lookup of 'fence_node' if that is
#          not set).
#
# This does not return. It exits with '7' if the call exits with '0' and with '1' in all other cases.
#
# NOTE(review): The previous code reported failures via a legacy 'to_log(LOG_ERR(), ...)' call. Those are
#               treated as fatal here ('exit(1)', "Something failed") - confirm this matches the old logger.
sub kill_target
{
	my ($conf) = @_;
	
	my $remote_node = defined $conf->{environment}{DRBD_PEERS} ? $conf->{environment}{DRBD_PEERS} : "";
	my $fence_node  = defined $conf->{path}{exe}{fence_node}   ? $conf->{path}{exe}{fence_node}   : "fence_node";
	if ($remote_node eq "")
	{
		# Without a target, the call below would be meaningless.
		to_log($conf, {message => "No peer was given in 'DRBD_PEERS', unable to fence.", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	# Hug it and squeeze it and call it George.
	to_log($conf, {message => "Fencing target: [$remote_node]...", 'line' => __LINE__, level => 1});
	my $shell_call = $fence_node." -v ".$remote_node;
	to_log($conf, {message => "DEBUG: shell call: [$shell_call]", 'line' => __LINE__, level => 3});
	
	my $sc;
	if (not open($sc, "-|", $shell_call." 2>&1"))
	{
		to_log($conf, {message => "Failed to call: [$shell_call], error was: $!", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	# The output is only informational; the exit code decides the result. Note whether a success line is
	# seen so that a single summary can be logged, instead of calling every other line a failure.
	my $saw_success = 0;
	while (my $line = <$sc>)
	{
		chomp $line;
		to_log($conf, {message => "DEBUG: $line", 'line' => __LINE__, level => 3});
		if ($line =~ /fence .*? success/)
		{
			$saw_success = 1;
			to_log($conf, {message => "'fence_node $remote_node' appears to have succeeded!", 'line' => __LINE__, level => 1});
		}
	}
	close($sc);
	my $sc_exit = $?;
	
	if (not $saw_success)
	{
		to_log($conf, {message => "'fence_node $remote_node' appears to have failed!", 'line' => __LINE__, level => 1});
	}
	to_log($conf, {message => "DEBUG: Attempt to fence node: [$remote_node] exited with: [$sc_exit]", 'line' => __LINE__, level => 3});
	
	# Exit.
	if ($sc_exit)
	{
		to_log($conf, {message => "Attempt to fence: [$remote_node] failed!", 'line' => __LINE__, level => 0});
		exit(1);
	}
	
	to_log($conf, {message => "Fencing of: [$remote_node] succeeded!", 'line' => __LINE__, level => 1});
	exit(7);
}
# This ejects the remote node from the cluster with 'cman_tool kill -n <peer>', if the node list still shows
# it as a member.
#
# Parameters;
# - $conf: The main configuration hash reference. The peer is read from 'environment::DRBD_PEERS' and its
#          membership state from 'nodes::<peer>::member'.
#
# Returns '1' if an eject was attempted and '0' if the peer was not a member. Exits with '1' if the call
# could not be started.
#
# NOTE(review): The previous code reported a failed call via a legacy 'to_log(LOG_ERR(), ...)' call. That is
#               treated as fatal here ('exit(1)') - confirm this matches the old logger.
sub eject_target
{
	my ($conf) = @_;
	
	### I don't know if I really want to use/keep this.
	my $remote_node = defined $conf->{environment}{DRBD_PEERS} ? $conf->{environment}{DRBD_PEERS} : "";
	my $cman_tool   = defined $conf->{path}{exe}{cman_tool}    ? $conf->{path}{exe}{cman_tool}    : "cman_tool";
	
	# Look the state up without creating an entry for (or comparing against) an unknown node.
	my $member = "";
	if ((exists $conf->{nodes}{$remote_node}) && (defined $conf->{nodes}{$remote_node}{member}))
	{
		$member = $conf->{nodes}{$remote_node}{member};
	}
	
	# If the node is still a cluster member, kick it out.
	if ($member ne "M")
	{
		to_log($conf, {message => "Target node: [$remote_node] is *not* a cluster member (state: [$member]). Not ejecting.", 'line' => __LINE__, level => 1});
		return 0;
	}
	
	# It is, kick it out. If cluster comms are up, this will trigger a fence in a few moments, regardless
	# of what we do next.
	to_log($conf, {message => "Target node: [$remote_node] is a cluster member, attempting to eject.", 'line' => __LINE__, level => 1});
	my $shell_call = $cman_tool." kill -n ".$remote_node;
	to_log($conf, {message => "DEBUG: shell call: [$shell_call]", 'line' => __LINE__, level => 3});
	
	my $sc;
	if (not open($sc, "-|", $shell_call." 2>&1"))
	{
		to_log($conf, {message => "Failed to call: [$shell_call], error was: $!", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	while (my $line = <$sc>)
	{
		chomp $line;
		to_log($conf, {message => "DEBUG: line: [$line]", 'line' => __LINE__, level => 3});
	}
	close($sc);
	my $sc_exit = $?;
	to_log($conf, {message => "DEBUG: Attempt to force-remove node: [$remote_node] exited with: [$sc_exit]", 'line' => __LINE__, level => 3});
	
	return 1;
}
# This identifies the remote node. The peer name from DRBD is looked up in the node list and, if it is not
# there under its full name, its short host name (everything before the first '.') is tried.
#
# Parameters;
# - $conf: The main configuration hash reference. 'environment::DRBD_PEERS' is read and, if only the short
#          name matched, rewritten to the short name. 'nodes' is the list of known nodes.
#
# Returns '0' once the target is known. Exits with '1' if the peer is not in the node list.
sub get_target
{
	my ($conf) = @_;
	
	my $peer        = defined $conf->{environment}{DRBD_PEERS} ? $conf->{environment}{DRBD_PEERS} : "";
	my $remote_node = $peer;
	
	# Make sure I know my target.
	if (not exists $conf->{nodes}{$remote_node})
	{
		# Try the short name. The capture has to be put back; replacing with nothing would erase the
		# name and the lookup below could never succeed.
		$remote_node =~ s/^(.*?)\..*$/$1/;
		if (($remote_node eq "") or (not exists $conf->{nodes}{$remote_node}))
		{
			to_log($conf, {message => "I didn't see the other node: [$peer ($remote_node)] in cman's node list. I can't fence this node.", 'line' => __LINE__, level => 0});
			print "I didn't see the other node: [$peer ($remote_node)] in cman's node list. I can't fence this node.\n";
			print "Does the hostname in cluster.conf match the host names used in the DRBD resource files?\n";
			exit(1);
		}
		
		# Update the peer.
		$conf->{environment}{DRBD_PEERS} = $remote_node;
	}
	
	to_log($conf, {message => "I have identified my target: [$remote_node]", 'line' => __LINE__, level => 1});
	return(0);
}
# This uses 'cman_tool -a -F id,name,type,addr nodes' to get the information on the node(s) in the cluster.
#
# Parameters;
# - $conf: The main configuration hash reference. For each node reported, 'nodes::<name>::id',
#          'nodes::<name>::member' and 'nodes::<name>::address' are set.
#
# Returns '0'. Exits with '1' if the call could not be started.
#
# NOTE(review): The previous code reported a failed call via a legacy 'to_log(LOG_ERR(), ...)' call. That is
#               treated as fatal here ('exit(1)') - confirm this matches the old logger.
sub get_info_on_other_nodes
{
	my ($conf) = @_;
	
	my $cman_tool  = defined $conf->{path}{exe}{cman_tool} ? $conf->{path}{exe}{cman_tool} : "cman_tool";
	my $shell_call = $cman_tool." -a -F id,name,type,addr nodes";
	to_log($conf, {message => "DEBUG: shell call: [$shell_call]", 'line' => __LINE__, level => 3});
	
	my $sc;
	if (not open($sc, "-|", $shell_call." 2>&1"))
	{
		to_log($conf, {message => "Failed to call: [$shell_call], error was: $!", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	my $node_count = 0;
	while (my $line = <$sc>)
	{
		chomp $line;
		to_log($conf, {message => "DEBUG: output: $line", 'line' => __LINE__, level => 3});
		
		# STDERR is merged into the output, so skip anything that is not an 'id name type addr' line
		# rather than recording an error message as a node.
		my ($node_id, $node_name, $member, $address) = split(' ', $line);
		next if not defined $member;
		next if $node_id !~ /^\d+$/;
		$address = "" if not defined $address;
		
		to_log($conf, {message => "DEBUG: id: [$node_id], name: [$node_name], member: [$member], address: [$address]", 'line' => __LINE__, level => 3});
		$conf->{nodes}{$node_name}{member}  = $member;
		$conf->{nodes}{$node_name}{id}      = $node_id;
		$conf->{nodes}{$node_name}{address} = $address;
		$node_count++;
	}
	close($sc);
	my $sc_exit = $?;
	to_log($conf, {message => "DEBUG: Attempt to gather cluster member information exited with: [$sc_exit], nodes found: [$node_count]", 'line' => __LINE__, level => 3});
	
	return(0);
}
# This reads /proc/drbd and pulls out the local disk state of the minor(s) of the resource being fenced.
#
# Parameters;
# - $conf: The main configuration hash reference. The minor number(s) are read from
#          'environment::DRBD_MINOR' (a space separated list). 'sys::local_res_uptodate' is set to '1' if
#          the minors seen were all 'UpToDate' and to '0' otherwise (including when no minor was seen).
#
# Returns '0' when the local resource is UpToDate. Exits with '1' if no minor was given, if /proc/drbd can't
# be read or if the local resource is not UpToDate.
#
# NOTE(review): The previous code reported these failures via legacy 'to_log(LOG_ERR(), ...)' calls. Those
#               are treated as fatal here ('exit(1)'), in line with their "Unable to proceed" / "will not
#               fence peer" wording - confirm this matches the old logger.
sub get_local_resource_state
{
	my ($conf) = @_;
	
	# Minor may well be '0', so I need to check for an empty string here.
	my $minor = defined $conf->{environment}{DRBD_MINOR} ? $conf->{environment}{DRBD_MINOR} : "";
	to_log($conf, {message => "DEBUG: Checking the state of resource with minor number: [$minor]", 'line' => __LINE__, level => 3});
	if ($minor eq "")
	{
		to_log($conf, {message => "Resource minor number not defined! Unable to proceed.", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	# minor can be a space delimited list. Assume it is always a list and iterate over it
	my @minor = split(' ', $minor);
	to_log($conf, {message => "DEBUG: minor: [$minor]", 'line' => __LINE__, level => 3});
	
	my $proc_file = "/proc/drbd";
	to_log($conf, {message => "DEBUG: reading: [$proc_file]", 'line' => __LINE__, level => 3});
	my $sc;
	if (not open($sc, "<", $proc_file))
	{
		to_log($conf, {message => "Failed to read: [$proc_file], error was: $!", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	# Until a minor is seen to be UpToDate, assume it is not.
	$conf->{sys}{local_res_uptodate} = 0;
	LINE: while (my $line = <$sc>)
	{
		chomp $line;
		to_log($conf, {message => "DEBUG: line: [$line]", 'line' => __LINE__, level => 3});
		$line =~ s/^\s+//;
		foreach my $this_minor (@minor)
		{
			# The text between 'ds:' and the '/' is the local disk state.
			next if $line !~ /^\Q$this_minor\E: .*? ds:(.*?)\//;
			my $state = $1;
			to_log($conf, {message => "DEBUG: read state of minor: [$this_minor] as: [$state]", 'line' => __LINE__, level => 3});
			
			$conf->{sys}{local_res_uptodate} = $state eq "UpToDate" ? 1 : 0;
			to_log($conf, {message => "DEBUG: sys::local_res_uptodate: [".$conf->{sys}{local_res_uptodate}."]", 'line' => __LINE__, level => 3});
			
			# If one or more minors are not UpToDate, bail.
			last LINE if not $conf->{sys}{local_res_uptodate};
		}
	}
	close($sc);
	
	to_log($conf, {message => "DEBUG: UpToDate: [".$conf->{sys}{local_res_uptodate}."]", 'line' => __LINE__, level => 3});
	if (not $conf->{sys}{local_res_uptodate})
	{
		to_log($conf, {message => "Local resource: [".$conf->{environment}{DRBD_RESOURCE}."], minor: [@minor] is NOT 'UpToDate', will not fence peer.", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	return(0);
}
# This reads in and sets the local node's name, taken from the 'Node name: ...' line of 'cman_tool status'.
#
# Parameters;
# - $conf: The main configuration hash reference. The name is stored in 'sys::this_node'.
#
# Returns '0' when the name was found. Exits with '1' if the call could not be started or no name was found.
#
# NOTE(review): The previous code reported these failures via legacy 'to_log(LOG_ERR(), ...)' calls. Those
#               are treated as fatal here ('exit(1)') - confirm this matches the old logger.
sub get_local_node_name
{
	my ($conf) = @_;
	
	my $cman_tool  = defined $conf->{path}{exe}{cman_tool} ? $conf->{path}{exe}{cman_tool} : "cman_tool";
	my $shell_call = $cman_tool." status";
	to_log($conf, {message => "DEBUG: shell call: [$shell_call]", 'line' => __LINE__, level => 3});
	
	my $sc;
	if (not open($sc, "-|", $shell_call." 2>&1"))
	{
		to_log($conf, {message => "Failed to call: [$shell_call], error was: $!", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	# Start from an empty name so that the checks below never see an undefined value.
	$conf->{sys}{this_node} = "";
	while (my $line = <$sc>)
	{
		chomp $line;
		to_log($conf, {message => "DEBUG: line: [$line]", 'line' => __LINE__, level => 3});
		if ($line =~ /Node name: (.*)/)
		{
			$conf->{sys}{this_node} = $1;
			last;
		}
	}
	close($sc);
	my $sc_exit = $?;
	to_log($conf, {message => "DEBUG: Attempt to get local node name via 'cman_tool status' exited with: [$sc_exit]", 'line' => __LINE__, level => 3});
	to_log($conf, {message => "DEBUG: I am: [".$conf->{sys}{this_node}."]", 'line' => __LINE__, level => 3});
	
	if (not $conf->{sys}{this_node})
	{
		to_log($conf, {message => "Unable to find local node name.", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	return(0);
}
# Log file entries. This sends a message to syslog by calling the 'logger' program.
#
# Parameters;
# - $conf:       The main configuration hash reference. 'log::facility', 'log::level', 'log::line_numbers',
#                'log::tag' and 'path::exe::logger' are read.
# - $parameters: A hash reference with;
#   - facility: The syslog facility. Default is 'log::facility'.
#   - level:    The verbosity level of this message. Default is '1'. The message is dropped if this is higher
#               than 'log::level'.
#   - line:     The caller's line number, prefixed to the message when 'log::line_numbers' is set.
#   - message:  The text to log. An empty message is dropped.
#   - priority: The syslog priority. If not set, level '0' maps to 'notice', '1' and '2' to 'info' and
#               anything else to 'debug'.
#
# Returns '0' after the logger ran and nothing (undef / empty list) when the message was dropped. Dies if the
# logger call could not be started.
sub to_log
{
	my ($conf, $parameters) = @_;
	
	my $facility = defined $parameters->{facility} ? $parameters->{facility} : $conf->{'log'}{facility};
	my $level    = defined $parameters->{level}    ? $parameters->{level}    : 1;
	my $line     = defined $parameters->{'line'}   ? $parameters->{'line'}   : 0;
	my $message  = defined $parameters->{message}  ? $parameters->{message}  : "";
	my $priority = defined $parameters->{priority} ? $parameters->{priority} : "";
	
	# Leave if we don't care about this message. Only an empty message is dropped, so that a message of
	# '0' still gets logged.
	return if $level > $conf->{'log'}{level};
	return if $message eq "";
	
	# Build the message. We log the line number if asked to.
	if (($conf->{'log'}{line_numbers}) && ($line))
	{
		$message = $line."; ".$message;
	}
	
	my $priority_string = $facility;
	if ($priority)
	{
		$priority_string .= ".".$priority;
	}
	elsif ($level eq "0")
	{
		$priority_string .= ".notice";
	}
	elsif (($level eq "1") or ($level eq "2"))
	{
		$priority_string .= ".info";
	}
	else
	{
		$priority_string .= ".debug";
	}
	
	# The tag must never be empty or the next argument would be taken as the tag. Fall back to this
	# program's name.
	my $tag = $conf->{'log'}{tag};
	if ((not defined $tag) or ($tag eq ""))
	{
		$tag = ($0 =~ /([^\/]+)$/)[0];
		$tag = $0 if not defined $tag;
	}
	
	# The message can hold anything (environment values are logged), so it must not be pasted into a
	# shell command line, where '$VAR' and backticks would be expanded or executed. Instead, the logger
	# and its arguments are passed to the shell as positional parameters; the shell's only job is to
	# merge STDERR into STDOUT before replacing itself with the logger.
	my @command = ($conf->{path}{exe}{logger}, "--priority", $priority_string, "--tag", $tag, "--", $message);
	open (my $file_handle, "-|", "/bin/sh", "-c", 'exec "$0" "$@" 2>&1', @command) or die "Failed to call: [".join(" ", @command)."]. The error was: $!\n";
	while (my $output = <$file_handle>)
	{
		# This should not generate output.
		chomp $output;
		print "Unexpected logging output: [".$output."]\n";
	}
	close $file_handle;
	
	return(0);
}
# This calls 'fence_tool -n ls' and parses the state of the fence domain out of its output.
#
# Parameters;
# - $conf: The main configuration hash reference (used for logging).
#
# Returns a hash reference holding whichever of 'member_count', 'victim_count', 'victim_now', 'master_nodeid'
# and 'wait_state' were found in the output. Exits with '1' if the call could not be started.
#
# NOTE(review): The previous code reported a failed call via a legacy 'to_log(LOG_ERR(), ...)' call. That is
#               treated as fatal here ('exit(1)') - confirm this matches the old logger.
sub get_fenced_state
{
	my ($conf) = @_;
	
	my $shell_call = "fence_tool -n ls";
	my $sc;
	if (not open($sc, "-|", $shell_call))
	{
		to_log($conf, {message => "Failed to call: [$shell_call], error was: $!", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	# Each value is found by its label instead of by its line position, so a missing or extra line in the
	# output can't shift every value after it.
	my %pattern_for = (
		member_count	=>	qr/member count\s+([0-9]+)/,
		victim_count	=>	qr/victim count\s+([0-9]+)/,
		victim_now	=>	qr/victim now\s+([0-9]+)/,
		master_nodeid	=>	qr/master nodeid\s+([0-9]+)/,
		wait_state	=>	qr/wait state\s+(\w+)$/,
	);
	my %fence_state;
	while (my $line = <$sc>)
	{
		chomp $line;
		foreach my $key (sort {$a cmp $b} keys %pattern_for)
		{
			# Keep the first value seen for each label.
			next if exists $fence_state{$key};
			if ($line =~ $pattern_for{$key})
			{
				$fence_state{$key} = $1;
			}
		}
	}
	close($sc);
	
	return \%fence_state;
}
# This polls the fence domain state once a second until its wait state matches the requested one or the
# timeout runs out.
#
# Parameters;
# - $conf:         The main configuration hash reference.
# - $target_state: The 'wait_state' to wait for (the callers in this file use "fencing" and "none").
# - $time_seconds: The maximum number of polls, one second apart.
#
# Returns the last state hash reference read (see 'get_fenced_state'); the caller must check its 'wait_state'
# to tell a match from a timeout. If no poll was made at all, an empty hash reference is returned.
sub wait_for_fenced_status
{
	my ($conf, $target_state, $time_seconds) = @_;
	
	$target_state = "" if not defined $target_state;
	$time_seconds = 0  if not defined $time_seconds;
	
	# Always hand back a hash reference, so callers can look at 'wait_state' without checking first.
	my $fenced_state = {};
	
	# Counting down to 'greater than zero' (not just 'true') keeps a negative timeout from looping forever.
	while ($time_seconds > 0)
	{
		$fenced_state = get_fenced_state($conf);
		my $current_state = defined $fenced_state->{wait_state} ? $fenced_state->{wait_state} : "";
		if ($current_state eq $target_state)
		{
			return $fenced_state;
		}
		
		sleep(1);
		to_log($conf, {message => "DEBUG: Waiting for $target_state. Now $current_state", 'line' => __LINE__, level => 3});
		$time_seconds--;
	}
	
	return $fenced_state;
}
# This waits up to 30 seconds for the fence daemon to start fencing on its own and, if it does, up to 240
# seconds more for that fence to finish.
#
# Parameters;
# - $conf: The main configuration hash reference. The expected victim's node ID is read from
#          'nodes::<environment::DRBD_PEERS>::id'.
#
# Returns nothing if the fence daemon did not start fencing, leaving the caller to fence the peer itself.
# Exits with '7' if the fence daemon's fence completed. Exits with '1' if the daemon is fencing a different
# node than our peer, or if its fence did not complete in time.
#
# NOTE(review): The previous code reported the two failures via legacy 'to_log(LOG_ERR(), ...)' calls. Those
#               are treated as fatal here ('exit(1)'), so that a fence of the wrong node is never reported as
#               a success - confirm this matches the old logger.
sub eventually_wait_for_fenced
{
	my ($conf) = @_;
	
	my $fenced_state1 = wait_for_fenced_status($conf, "fencing", 30);
	my $wait_state1   = defined $fenced_state1->{wait_state} ? $fenced_state1->{wait_state} : "";
	if ($wait_state1 ne "fencing")
	{
		to_log($conf, {message => "DEBUG: Expected fencd to do a fence action, got $wait_state1", 'line' => __LINE__, level => 3});
		return;
	}
	
	# Make sure the node being shot is our peer. A missing or non-numeric ID on either side can't be
	# confirmed as a match, so it is treated as a mismatch.
	my $peer             = defined $conf->{environment}{DRBD_PEERS} ? $conf->{environment}{DRBD_PEERS} : "";
	my $to_fence_node_id = ((exists $conf->{nodes}{$peer}) && (defined $conf->{nodes}{$peer}{id})) ? $conf->{nodes}{$peer}{id} : "";
	my $victim_now       = defined $fenced_state1->{victim_now} ? $fenced_state1->{victim_now} : "";
	if (($victim_now !~ /^\d+$/) or ($to_fence_node_id !~ /^\d+$/) or ($victim_now != $to_fence_node_id))
	{
		to_log($conf, {message => "Fenced is shooting at $victim_now; Should shoot at $to_fence_node_id", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	my $fenced_state2 = wait_for_fenced_status($conf, "none", 240);
	my $wait_state2   = defined $fenced_state2->{wait_state} ? $fenced_state2->{wait_state} : "";
	if ($wait_state2 eq "none")
	{
		to_log($conf, {message => "Seems fenced was successfull", 'line' => __LINE__, level => 1});
		exit 7;
	}
	
	to_log($conf, {message => "Fenced failed".Dumper($fenced_state1).Dumper($fenced_state2), 'line' => __LINE__, level => 0, priority => "err"});
	exit(1);
}