#!/usr/bin/perl
# 
# Author: Madison Kelly (mkelly@alteeve.ca)
#         Alteeve's Niche! Inc. - https://alteeve.com/w/
# Version: 0.0.1
# License: GPL v2+
# 
# This program ties LINBIT's DRBD fencing into pacemaker's stonith. It provides a power-fence alternative to
# the default 'crm-{un,}fence-peer.sh' {un,}fence-handler.
# 
# WARNING: This fence handler is probably not safe to use outside of an Anvil! IA platform. It makes a lot of
#          operational assumptions about the system and desired goals.
# 
# Exit Codes (as per; http://lists.linbit.com/pipermail/drbd-dev/2006-November/000538.html)
# - 3 -> peer is inconsistent
# - 4 -> peer is outdated (this handler outdated it) [ resource fencing ]
# - 5 -> peer was down / unreachable
# - 6 -> peer is primary
# - 7 -> peer got stonithed [ node fencing ]
# ===] From crm-fence-peer.9.sh [===
#	drbd_fence_peer_exit_code is per the exit code
#	convention of the DRBD "fence-peer" handler,
#	obviously.
#	3: peer is already outdated or worse (e.g. inconsistent)
#	4: peer has been successfully fenced
#	5: peer not reachable, assumed to be dead
#	6: please outdate yourself, peer is known (or likely)
#	   to have better data, or is even currently primary.
#	   (actually, currently it is "peer is active primary now", but I'd like to
#	   change that meaning slightly towards the above meaning)
#	7: peer has been STONITHed, thus assumed to be properly fenced
#	   XXX IMO, this should rather be handled like 5, not 4.
# =========
# 
# This program uses;
# - 1 = Something failed or end of program hit (should not happen).
# - 7 = Fence succeeded
# 
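# An illustrative drbd.conf stanza that would invoke a handler like this one. The resource name and the
# install path below are assumptions for the sake of example, not requirements:
# 
#   resource srv01-example {
#       net {
#           fencing resource-and-stonith;
#       }
#       handlers {
#           fence-peer "/usr/sbin/fence_pacemaker";
#       }
#   }
# 
# (In DRBD 8.4 the 'fencing' option was set in the 'disk' section; DRBD 9 sets it in the 'net' section.)
# 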
# TODO: 
# - Read the CIB; 'pcs status xml' or '/usr/sbin/cibadmin --local --query' ?
# -- Map the peer's name in pacemaker.
# -- Verify that stonith is enabled:
# -- Verify that the node is not in maintenance mode:
# -- Verify that we're quorate (a-la pacemaker):
# - Verify that the resource is 'resource-and-stonith'
# - Verify that the resource is 'UpToDate' (if not, should we suicide to give the other node priority,
#   regardless of fence delay? what if both nodes have resources that are not UpToDate?)
# - 

### NOTE: This doesn't use Anvil::Tools on purpose. We want to be quick and depend on as few things as
###       possible.

use strict;
use warnings;
use XML::Simple;
use Data::Dumper;

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $THIS_FILE         = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}
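
# (For example, if this script were invoked as '/usr/sbin/fence_pacemaker', $THIS_FILE would become
#  'fence_pacemaker' and $running_directory would become '/usr/sbin'. The name and path here are
#  illustrative only.)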

my $conf = {
	'log' => {
		facility     => "local0",
		level        => 2,
		line_numbers => 1,
		tag          => $THIS_FILE,
	},
	# If a program isn't at the defined path, $ENV{PATH} will be searched.
	path => {
		exe => {
			cibadmin      => "/usr/sbin/cibadmin",
			crm_attribute => "/usr/sbin/crm_attribute",
			crm_error     => "/usr/sbin/crm_error",
			drbdadm       => "/usr/sbin/drbdadm",
			echo          => "/usr/bin/echo",
			getent        => "/usr/bin/getent",
			hostnamectl   => "/usr/bin/hostnamectl",
			logger        => "/usr/bin/logger",
			pcs           => "/usr/sbin/pcs",
		},
	},
	# The script will set this.
	cluster => {
		target_node  => "",
		fence_method => "stonith",	# Will change to 'constraint' if a specific DRBD resource is passed.
	},
	# These are the environment variables set by DRBD. See 'man drbd.conf' -> 'handlers'.
	environment => {
		# The resource triggering the fence.
		'DRBD_RESOURCE'     => defined $ENV{DRBD_RESOURCE}     ? $ENV{DRBD_RESOURCE}     : "",
		# The resource minor number or, in the case of volumes, numbers.
		'DRBD_MINOR'        => defined $ENV{DRBD_MINOR}        ? $ENV{DRBD_MINOR}        : "",
		# This is the address family (ipv4, ipv6, etc).
		'DRBD_PEER_AF'      => defined $ENV{DRBD_PEER_AF}      ? $ENV{DRBD_PEER_AF}      : "",
		# This is the IP address of the target node.
		'DRBD_PEER_ADDRESS' => defined $ENV{DRBD_PEER_ADDRESS} ? $ENV{DRBD_PEER_ADDRESS} : "",
		# This isn't set.
		'DRBD_PEERS'        => defined $ENV{DRBD_PEERS}        ? $ENV{DRBD_PEERS}        : "",
		### NOTE: Below here are undocumented variables. Don't expect them to always be useful.
		# My node ID.
		'DRBD_MY_NODE_ID'   => defined $ENV{DRBD_MY_NODE_ID}   ? $ENV{DRBD_MY_NODE_ID}   : "",
		# The target's ID.
		'DRBD_PEER_NODE_ID' => defined $ENV{DRBD_PEER_NODE_ID} ? $ENV{DRBD_PEER_NODE_ID} : "",
	},
};
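
# (A typical handler invocation environment from DRBD, with illustrative values:
#    DRBD_RESOURCE=srv51-Workstation3, DRBD_MINOR=51, DRBD_PEER_AF=ipv4, DRBD_PEER_ADDRESS=10.41.10.2,
#    DRBD_MY_NODE_ID=0, DRBD_PEER_NODE_ID=1
#  The values shown are examples only; see 'man drbd.conf' -> 'handlers' for the documented variables.)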

# Find executables.
find_executables($conf);

# Something for the logs.
to_log($conf, {message => "The Anvil! DRBD fence handler invoked.", 'line' => __LINE__});

# These are the full host names of the nodes, given their IDs.
foreach my $i (0..31)
{
	my $key = "DRBD_NODE_ID_".$i;
	if ((exists $ENV{$key}) && (defined $ENV{$key}))
	{
		$conf->{environment}{$key} = $ENV{$key};
		my $level = $conf->{environment}{$key} eq "" ? 3 : 2;
		to_log($conf, {message => "DRBD Environment variable: [".$key."] -> [".$conf->{environment}{$key}."]", 'line' => __LINE__, level => $level});
	}
}

# Record the environment variables.
foreach my $key (sort {$a cmp $b} keys %{$conf->{environment}})
{
	# Example: $conf->{environment}{DRBD_RESOURCE} -> [srv51-Workstation3]
	my $level = $conf->{environment}{$key} eq "" ? 3 : 2;
	to_log($conf, {message => "DRBD Environment variable: [".$key."] -> [".$conf->{environment}{$key}."]", 'line' => __LINE__, level => $level});
}
foreach my $key (sort {$a cmp $b} keys %ENV)
{
	next if exists $conf->{environment}{$key};
	my $level = $ENV{$key} eq "" ? 3 : 2;
	to_log($conf, {message => "System Environment variable: [".$key."] -> [".$ENV{$key}."]", 'line' => __LINE__, level => $level});
}

# Make sure we at least have the target's IP.
if (not $conf->{environment}{DRBD_PEER_ADDRESS})
{
	to_log($conf, {message => "Called without the target's IP. Nothing to do, exiting. Were we called by 'pcs stonith list'?", 'line' => __LINE__, level => 1, priority => "alert"});
	exit(1);
}

# This also checks that we're quorate and not in maintenance mode.
identify_peer($conf);

# If we're still alive, we now need to check the DRBD resource disk state locally.
get_drbd_status($conf);

# Is there a specific resource?
if ($conf->{environment}{DRBD_RESOURCE})
{
	# Prevent the resource from running on the peer.
	to_log($conf, {message => "We're being asked to fence the specific resource: [".$conf->{environment}{DRBD_RESOURCE}."]. Switching to location constraint fencing to prevent the resource from running on: [".$conf->{cluster}{target_node}."].", 'line' => __LINE__});
	$conf->{cluster}{fence_method} = "constraint";
	to_log($conf, {message => "cluster::fence_method: [".$conf->{cluster}{fence_method}."].", 'line' => __LINE__, level => 2});
}
else
{
	to_log($conf, {message => "We were not given a specific resource to fence. Requesting pacemaker to stonith. Node: [".$conf->{cluster}{target_node}."] should power off.", 'line' => __LINE__});
}

# Now, do the deed.
perform_fence($conf);

# If we hit here, something very wrong happened.
exit(1);


#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################

# This creates a location constraint that prevents the resource / server from running on the peer node.
sub create_constraint
{
	my ($conf) = @_;
	
	my $target_server = $conf->{environment}{DRBD_RESOURCE};
	my $target_node   = $conf->{cluster}{target_node};
	to_log($conf, {message => "Will now create a location constraint against: [".$target_server."] preventing it from running on: [".$target_node."].", 'line' => __LINE__, level => 1});
	
	# Make sure there's a rule to apply the node attribute against.
	my $rule_found = 0;
	my $rule_name  = "drbd-fenced_".$target_server;
	my $shell_call = $conf->{path}{exe}{pcs}." constraint location config show ".$target_server;
	to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 1});
	open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
	while(<$file_handle>)
	{
		# Look for our rule's expression in the constraint config.
		chomp;
		my $line = $_;
		to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 1});
		
		if ($line =~ /Expression: $rule_name /)
		{
			$rule_found = 1;
			to_log($conf, {message => "rule_found: [".$rule_found."]", 'line' => __LINE__, level => 1});
			last;
		}
	}
	close $file_handle;
	
	if (not $rule_found)
	{
		# We can't fence.
		to_log($conf, {message => "The fence rule: [".$rule_name."] was not found in the cluster, unable to fence by constraint.", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
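	
	# (For reference, the sort of pre-existing location rule the check above expects, shown roughly as it
	#  might have been created; the score and exact syntax here are illustrative:
	#    pcs constraint location <server> rule score=-INFINITY drbd-fenced_<server> eq 1
	#  Setting the node attribute 'drbd-fenced_<server>=1' on the target node, as is done below, is what
	#  activates the ban.)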
	
	# Set the node attribute.
	my $rule_set = 0;
	$shell_call = $conf->{path}{exe}{crm_attribute}." --type nodes --node ".$target_node." --name ".$rule_name." --update 1";
	to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 1});
	open ($file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
	while(<$file_handle>)
	{
		# This should not generate output.
		chomp;
		my $line = $_;
		to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 1});
	}
	close $file_handle;
	
	# Check that the rule was set.
	$rule_set = 0;
	my $rule_output = "";
	$shell_call = $conf->{path}{exe}{crm_attribute}." --type nodes --node ".$target_node." --name ".$rule_name." --query";
	to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 1});
	open ($file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
	while(<$file_handle>)
	{
		# Look for the attribute in the query output.
		chomp;
		my $line = $_;
		to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 1});
		
		if (($line =~ /name=$rule_name/) && ($line =~ /value=1/))
		{
			$rule_set = 1;
			to_log($conf, {message => "rule_set: [".$rule_set."]", 'line' => __LINE__, level => 1});
			last;
		}
		else
		{
			$rule_output .= $line."\n";
		}
	}
	close $file_handle;
	
	if (not $rule_set)
	{
		# We can't fence.
		$rule_output =~ s/\n$//gs;
		to_log($conf, {message => "The node attribute triggering the fence rule: [".$rule_name."] against the node: [".$target_node."] appears to have not been set. Expected a string with 'name=".$rule_name." value=1' but got: [".$rule_output."].", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	# The constraint is now placed. Wait up to a minute for the target's DRBD resource to leave the
	# 'Primary' role (in case the resource is running on the peer and needs to migrate here, which should
	# not happen, but it is best to check and be safe).
	my $stop_waiting = time + 60;
	my $peer_role    = "";
	my $waiting      = 1;
	while ($waiting)
	{
		# Check the peer's role.
		my $shell_call = $conf->{path}{exe}{drbdadm}." status ".$target_server;
		to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2});
		open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
		while(<$file_handle>)
		{
			# Look for the target node's role.
			chomp;
			my $line = $_;
			to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 3});
			
			if ($line =~ /$target_node role:(.*)$/)
			{
				$peer_role = $1;
				to_log($conf, {message => "peer_role: [".$peer_role."]", 'line' => __LINE__, level => 2});
				last;
			}
		}
		close $file_handle;
		
		if (lc($peer_role) ne "primary")
		{
			# Set the location constraint so that pacemaker doesn't migrate the server when it
			# comes back up.
			set_location_constraint($conf);
			
			# We're good, the fence is complete.
			to_log($conf, {message => "Resource: [".$target_server."] has been fenced via location constraint successfully!", 'line' => __LINE__, level => 1});
			
			exit(7);
		}
		
		if (time > $stop_waiting)
		{
			# Done waiting, failed.
			to_log($conf, {message => "The resource: [".$target_server."] on the fence target: [".$target_node."] is still in the 'Primary' role after a minute. Is the server running on the target? Is something else holding it open? Giving up on the location constraint fence attempt.", 'line' => __LINE__, level => 0, priority => "err"});
			exit(1);
		}
		
		# Check again in a couple of seconds.
		sleep 2;
	}
	
	return(0);
}

sub perform_fence
{
	my ($conf) = @_;
	
	if ($conf->{cluster}{fence_method} eq "constraint")
	{
		create_constraint($conf);
	}
	else
	{
		kill_target($conf);
	}
	
	return(0);
}

# This sets a location constraint so the server prefers our node.
sub set_location_constraint
{
	my ($conf) = @_;
	
	# Get the host names.
	my ($local_host, $peer_host) = get_hostname($conf);
	my $server = $conf->{environment}{DRBD_RESOURCE};
	to_log($conf, {message => "server: [".$server."], local_host: [".$local_host."], peer_host: [".$peer_host."]", 'line' => __LINE__, level => 2});
	
	if ((not $local_host) or (not $peer_host))
	{
		# We can't update the constraints.
		return(1);
	}
	
	to_log($conf, {message => "Setting the pacemaker location constraint so that: [".$server."] prefers this host.", 'line' => __LINE__, level => 1});
	my $shell_call = $conf->{path}{exe}{pcs}." constraint location ".$server." prefers ".$local_host."=200 ".$peer_host."=100";
	to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2});
	open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
	while(<$file_handle>)
	{
		# This should not generate output.
		chomp;
		my $line = $_;
		to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 2});
	}
	close($file_handle);
	
	return(0);
}

# This gets the local short host name and reconciles both node names against the names used in pacemaker.
sub get_hostname
{
	my ($conf) = @_;
	
	# This will store our name.
	$conf->{cluster}{local_node} = "";
	
	my $shell_call = $conf->{path}{exe}{hostnamectl}." --static";
	to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2});
	open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
	while(<$file_handle>)
	{
		# This should print the static host name.
		chomp;
		my $line = $_;
		to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 2});
		
		if ((not $line) or ($line =~ /\s/))
		{
			# We can't trust this, it could be an error like "Could not get property: Refusing
			# activation, D-Bus is shutting down.".
			last;
		}
		else
		{
			$conf->{cluster}{local_node} = $line;
			to_log($conf, {message => "cluster::local_node: [".$conf->{cluster}{local_node}."]", 'line' => __LINE__, level => 2});
			last;
		}
	}
	
	# If we didn't get the host name, try reading /etc/hostname.
	if (not $conf->{cluster}{local_node})
	{
		# Try reading the config file.
		my $file = "/etc/hostname";
		to_log($conf, {message => "Reading: [".$file."]", 'line' => __LINE__, level => 2});
		open (my $file_handle, "<", $file) or warn "Failed to read: [".$file."], error was: [".$!."]";
		while(<$file_handle>)
		{
			### NOTE: Log the line exactly as it was read, then strip the newline for the check
			###       below.
			my $line = $_;
			to_log($conf, {message => "line: [".$line."]", 'line' => __LINE__, level => 2});
			chomp $line;
			
			if ((not $line) or ($line =~ /\s/))
			{
				# We can't trust this.
				last;
			}
			else
			{
				$conf->{cluster}{local_node} = $line;
				to_log($conf, {message => "cluster::local_node: [".$conf->{cluster}{local_node}."]", 'line' => __LINE__, level => 2});
				last;
			}
		}
		close $file_handle;
	}
	
	# If we still didn't get the host name, try calling 'hostnamectl --transient'.
	if (not $conf->{cluster}{local_node})
	{
		my $shell_call = $conf->{path}{exe}{hostnamectl}." --transient";
		to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2});
		open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
		while(<$file_handle>)
		{
			# This should print the transient host name.
			chomp;
			my $line = $_;
			to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 2});
			
			if ((not $line) or ($line =~ /\s/))
			{
				# We can't trust this, it could be an error like "Could not get property:
				# Refusing activation, D-Bus is shutting down.".
				last;
			}
			else
			{
				$conf->{cluster}{local_node} = $line;
				to_log($conf, {message => "cluster::local_node: [".$conf->{cluster}{local_node}."]", 'line' => __LINE__, level => 2});
				last;
			}
		}
		close $file_handle;
	}
	
	# Make sure we've got a short host name.
	$conf->{cluster}{local_node} =~ s/\..*$//;
	to_log($conf, {message => "cluster::local_node: [".$conf->{cluster}{local_node}."]", 'line' => __LINE__, level => 2});
	
	my $peer_host  = $conf->{cluster}{target_node};
	my $local_host = $conf->{cluster}{local_node};
	to_log($conf, {message => "peer_host: [".$peer_host."], local_host: [".$local_host."]", 'line' => __LINE__, level => 2});
	
	# Last, look through the pacemaker CIB to make sure we're going to use the names used in pacemaker.
	# (read_cib() caches the raw CIB XML string in 'cluster::cib_xml'.)
	if ((not exists $conf->{cluster}{cib_xml}) or (not $conf->{cluster}{cib_xml}))
	{
		read_cib($conf);
	}
	
	if ($conf->{cluster}{cib_xml})
	{
		foreach my $line (split/\n/, $conf->{cluster}{cib_xml})
		{
			to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 2});
			
			if ($line =~ /<node .*>$/)
			{
				my $this_node_name = ($line =~ /uname="(.*?)"/)[0];
				to_log($conf, {message => "this_node_name: [".$this_node_name."]", 'line' => __LINE__, level => 2});
				
				if (($this_node_name eq $local_host) or ($this_node_name eq $peer_host))
				{
					# Name is accurate, we're good.
					next;
				}
				elsif ($this_node_name =~ /^$local_host\./)
				{
					# Update the host name.
					$conf->{cluster}{local_node} = $this_node_name;
					to_log($conf, {message => "cluster::local_node: [".$conf->{cluster}{local_node}."]", 'line' => __LINE__, level => 2});
				}
				elsif ($this_node_name =~ /^$peer_host\./)
				{
					# Update the host name.
					$conf->{cluster}{target_node} = $this_node_name;
					to_log($conf, {message => "cluster::target_node: [".$conf->{cluster}{target_node}."]", 'line' => __LINE__, level => 2});
				}
			}
		}
	}
	
	to_log($conf, {message => "cluster::local_node: [".$conf->{cluster}{local_node}."], cluster::target_node: [".$conf->{cluster}{target_node}."]", 'line' => __LINE__, level => 2});
	return($conf->{cluster}{local_node}, $conf->{cluster}{target_node});
}

# This reads the status of all resources. If we're not all UpToDate, check if the peer is. If the peer is,
# abort. If not, proceed (someone is going to have a bad day, but maybe some servers will live).
sub get_drbd_status
{
	my ($conf) = @_;
	
	my $resource           = "";
	my $peer               = "";
	my $local_all_uptodate = 1;
	my $peer_all_uptodate  = 1;
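	
	# (Example 'drbdadm status all' output that the parser below expects; the resource and peer names are
	#  illustrative:
	#    srv51-Workstation3 role:Primary
	#      disk:UpToDate
	#      node2 role:Secondary
	#        peer-disk:UpToDate
	#  A blank line separates resources, which is why an empty line resets the 'resource' and 'peer'
	#  trackers.)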
	my $shell_call = $conf->{path}{exe}{drbdadm}." status all";
	to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2});
	open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
	while(<$file_handle>)
	{
		# Parse out the resource, peer and disk states.
		chomp;
		my $line = $_;
		to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 3});
		
		if (not $line)
		{
			$resource = "";
			$peer     = "";
			to_log($conf, {message => "resource: [".$resource."], peer: [".$peer."]", 'line' => __LINE__, level => 3});
			next;
		}
		if ($line =~ /^(\S+)\s+role/)
		{
			$resource = $1;
			to_log($conf, {message => "resource: [".$resource."]", 'line' => __LINE__, level => 3});
			next;
		}
		if ($line =~ /^\s+(.*?) role:/)
		{
			$peer = $1;
			to_log($conf, {message => "peer: [".$peer."]", 'line' => __LINE__, level => 3});
			next;
		}
		if ($resource)
		{
			# Check 'peer-disk:' before 'disk:', as the 'disk:' regex would also match
			# 'peer-disk:' lines.
			if ($line =~ /peer-disk:(.*)$/)
			{
				my $peer_dstate = $1;
				$peer_dstate =~ s/\s.*$//;
				to_log($conf, {message => "peer: [".$peer."], peer_dstate: [".$peer_dstate."]", 'line' => __LINE__, level => 2});
				if (lc($peer_dstate) ne "uptodate")
				{
					$peer_all_uptodate = 0;
					to_log($conf, {message => "peer_all_uptodate: [".$peer_all_uptodate."]", 'line' => __LINE__, level => 2});
				}
				next;
			}
			if ($line =~ /disk:(.*)$/)
			{
				my $local_dstate = $1;
				$local_dstate =~ s/\s.*$//;
				to_log($conf, {message => "local_dstate: [".$local_dstate."]", 'line' => __LINE__, level => 2});
				if (lc($local_dstate) ne "uptodate")
				{
					$local_all_uptodate = 0;
					to_log($conf, {message => "local_all_uptodate: [".$local_all_uptodate."]", 'line' => __LINE__, level => 2});
				}
				next;
			}
		}
	}
	close $file_handle;
	
	# If we're not all UpToDate, but the peer is, abort.
	to_log($conf, {message => "local_all_uptodate: [".$local_all_uptodate."], peer_all_uptodate: [".$peer_all_uptodate."]", 'line' => __LINE__, level => 2});
	if ((not $local_all_uptodate) && ($peer_all_uptodate))
	{
		# We're not good.
		to_log($conf, {message => "This node has DRBD resources that are not UpToDate, but the peer is fully UpToDate. Aborting.", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	return(0);
}

# This identifies the pacemaker name of the target node. If it can't find the peer, it exits with '1'.
sub identify_peer
{
	my ($conf) = @_;
	
	# I know the target's (SN) IP, map it to a node.
	my $target_host = "";
	my $target_ip   = $conf->{environment}{DRBD_PEER_ADDRESS};
	
	# First, can we translate the IP to a host name?
	my $shell_call = $conf->{path}{exe}{getent}." hosts ".$target_ip;
	to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2});
	open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
	while(<$file_handle>)
	{
		# Look for the line with the target's IP.
		chomp;
		my $line = $_;
		to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 2});
		if ($line =~ /^$target_ip\s+(.*)$/)
		{
			# This could be multiple names.
			$target_host = $1;
			to_log($conf, {message => "target_host: [".$target_host."]", 'line' => __LINE__, level => 2});
			#to_log($conf, {message => ">> target_host: [".$target_host."]", 'line' => __LINE__, level => 2});
			
			# Strip off any suffix, we only want the short name.
			#$target_host =~ s/\..*//;
			#to_log($conf, {message => "<< target_host: [".$target_host."]", 'line' => __LINE__, level => 2});
			#last;
		}
	}
	close $file_handle;
	
	# If I got the host name, try to match it to a pacemaker node name.
	if ($target_host)
	{
		# Get the current CIB (in an XML::Simple hash). This will exit if it fails to read the XML
		# and convert it to an XML::Simple hash.
		my $body = read_cib($conf);
		
		# Parse the XML.
		my $host_name       = $ENV{HOSTNAME};
		my $short_host_name = $ENV{HOSTNAME};
		$short_host_name =~ s/\..*$//;
		to_log($conf, {message => "host_name: [".$host_name."], short_host_name: [".$short_host_name."]", 'line' => __LINE__, level => 2});
		
		foreach my $hash_ref (sort {$a cmp $b} @{$body->{configuration}{nodes}{node}})
		{
			my $node = $hash_ref->{uname};
			my $id   = $hash_ref->{id};
			to_log($conf, {message => "node: [".$node."], id: [".$id."]", 'line' => __LINE__, level => 2});
			foreach my $target_name (split/ /, $target_host)
			{
				to_log($conf, {message => ">> target_name: [".$target_name."]", 'line' => __LINE__, level => 2});
				$target_name =~ s/\..*//;
				to_log($conf, {message => "<< target_name: [".$target_name."]", 'line' => __LINE__, level => 2});
				if ($node =~ /^$target_name/)
				{
					$conf->{cluster}{target_node} = $node;
					to_log($conf, {message => "Found the pacemaker name of the target node: [".$conf->{cluster}{target_node}."]", 'line' => __LINE__, level => 1});
				}
				elsif ($node =~ /^$short_host_name/)
				{
					# This is me. Am I in maintenance mode?
					if (exists $hash_ref->{instance_attributes})
					{
						next if not exists $hash_ref->{instance_attributes}{nvpair};
						my $reference = ref($hash_ref->{instance_attributes}{nvpair});
						to_log($conf, {message => "Instance attribute reference type: [".$reference."]", 'line' => __LINE__, level => 3});
						if ($reference eq "ARRAY")
						{
							foreach my $array_ref (@{$hash_ref->{instance_attributes}{nvpair}})
							{
								my $id    = $array_ref->{id};
								my $name  = $array_ref->{name};
								my $value = $array_ref->{value};
								to_log($conf, {message => "Instance attribute ID: [".$id."], name: [".$name."], value: [".$value."]", 'line' => __LINE__, level => 3});
								if (($name eq "maintenance") and ($value eq "on"))
								{
									# We're in maintenance mode, abort.
									to_log($conf, {message => "This node is in maintenance mode. Not able to fence!", 'line' => __LINE__, level => 0, priority => "err"});
									exit(1);
								}
							}
						}
						elsif ($reference eq "HASH")
						{
							my $name  = defined $hash_ref->{instance_attributes}{nvpair}{name}  ? $hash_ref->{instance_attributes}{nvpair}{name}  : "";
							my $value = defined $hash_ref->{instance_attributes}{nvpair}{value} ? $hash_ref->{instance_attributes}{nvpair}{value} : "";
							to_log($conf, {message => "node: [".$node."] instance attribute name: [".$name."], value: [".$value."]", 'line' => __LINE__, level => 1});
							if (($name eq "maintenance") and ($value eq "on"))
							{
								# We're in maintenance mode, abort.
								to_log($conf, {message => "This node is in maintenance mode. Not able to fence!", 'line' => __LINE__, level => 0, priority => "err"});
								exit(1);
							}
						}
					}
				}
			}
		}
		
		my $quorate = $body->{'have-quorum'};
		to_log($conf, {message => "quorate: [".$quorate."]", 'line' => __LINE__, level => 1});
		if (not $quorate)
		{
			to_log($conf, {message => "This node is not quorate. Refusing to fence the peer!", 'line' => __LINE__, level => 0, priority => "err"});
			exit(1);
		}
		
		# See if the target node is already out of the cluster.
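		# (An illustrative 'node_state' entry from the CIB status section; attribute values vary:
		#    <node_state id="2" uname="node2" in_ccm="true" crmd="online" join="member" expected="member"/>
		#  When both 'join' and 'expected' are "down", the peer has already left the cluster.)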
		foreach my $hash_ref (@{$body->{status}{node_state}})
		{
			my $node     = $hash_ref->{uname};
			my $join     = $hash_ref->{'join'};
			my $expected = $hash_ref->{expected};
			to_log($conf, {message => "node: [".$node."] join: [".$join."], expected: [".$expected."]", 'line' => __LINE__, level => 3});
			if ($node eq $conf->{cluster}{target_node})
			{
				to_log($conf, {message => "Checking the status of target node: [".$node."].", 'line' => __LINE__, level => 1});
				if (($join eq "down") && ($expected eq "down"))
				{
					# Set the location constraint so that pacemaker doesn't migrate the
					# server when it comes back up.
					set_location_constraint($conf);
					
					# The node is out.
					to_log($conf, {message => "The node: [".$node."] is already down. No actual fence needed.", 'line' => __LINE__, level => 1});
					exit(7);
				}
				else
				{
					to_log($conf, {message => "The node: [".$node."] is: [".$join."/".$expected."] (join/expected). Proceeding with the fence action.", 'line' => __LINE__, level => 1});
				}
			}
		}
	}
	
	# Did I find the target?
	if (not $conf->{cluster}{target_node})
	{
		to_log($conf, {message => "Failed to find the pacemaker name of the target node. Unable to proceed!", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	return(0);
}

# This reads in the CIB XML, parses it, and returns the parsed XML::Simple hash reference. The raw XML
# string is cached in 'cluster::cib_xml'.
sub read_cib
{
	my ($conf) = @_;
	
	my $xml_opened = 0;
	my $xml_closed = 0;
	my $body       = "";
	my $cib        = '<?xml version="1.0" encoding="UTF-8"?>';
	my $shell_call = $conf->{path}{exe}{cibadmin}." --local --query";
	to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 3});
	open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
	while(<$file_handle>)
	{
		# Collect the CIB XML.
		chomp;
		my $line = $_;
		to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 3});
		
		$cib .= "\n".$line;
		if ($line =~ /Signon to CIB failed/i)
		{
			# Failed to connect, we're probably not in the cluster.
			to_log($conf, {message => "This node does not appear to be in the cluster. Unable to get the CIB status.", 'line' => __LINE__, level => 0, priority => "err"});
			exit(1);
		}
		if ($line =~ /^<cib .*?>$/)
		{
			$xml_opened = 1;
			to_log($conf, {message => "xml_opened: [".$xml_opened."].", 'line' => __LINE__, level => 3});
		}
		if ($line =~ /^<\/cib>$/)
		{
			$xml_closed = 1;
			to_log($conf, {message => "xml_closed: [".$xml_closed."].", 'line' => __LINE__, level => 3});
		}
	}
	close $file_handle;
	to_log($conf, {message => "cib: ==========\n".$cib."\n==========", 'line' => __LINE__, level => 3});
	
	# Now parse the CIB XML if I read it OK.
	to_log($conf, {message => "xml_opened: [".$xml_opened."], xml_closed: [".$xml_closed."].", 'line' => __LINE__, level => 3});
	if (($xml_opened) && ($xml_closed))
	{
		# We're good.
		local $@;
		my $xml  = XML::Simple->new();
		my $test = eval { $body = $xml->XMLin($cib, KeyAttr => { language => 'name', key => 'name' }, ForceArray => [ 'id' ]) };
		if (not $test)
		{
			chomp $@;
			my $error =  "[ Error ] - There was a problem parsing: [".$cib."]. The error was:\n";
			   $error .= "===========================================================\n";
			   $error .= $@."\n";
			   $error .= "===========================================================\n";
			to_log($conf, {message => $error, 'line' => __LINE__, level => 0, priority => "err"});
			exit(1);
		}
	}
	else
	{
		# Failed to read the CIB XML.
		to_log($conf, {message => "This node does not appear to be in the cluster. Unable to read the CIB XML properly.", 'line' => __LINE__, level => 0, priority => "err"});
		exit(1);
	}
	
	# Cache the parsed CIB and the raw XML.
	$conf->{cluster}{cib}     = $body;
	$conf->{cluster}{cib_xml} = $cib;
	
	return($body);
}

# This checks the given paths and, if something isn't found, it searches PATH trying to find it.
sub find_executables
{
	my ($conf) = @_;
	
	# Variables.
	my $check = "";
	my $bad   = 0;
	
	# If PATH isn't set, set it (could have been scrubbed by a caller).
	if (not $ENV{PATH})
	{
		$ENV{PATH} = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin";
	}
	
	# Log entries can only happen if I've found 'logger', so an extra check will be made on 'to_log'
	# calls.
	my @dirs = split/:/, $ENV{PATH};
	foreach my $exe (sort {$b cmp $a} keys %{$conf->{path}{exe}})
	{
		if (not -e $conf->{path}{exe}{$exe})
		{
			to_log($conf, {message => "The program: [".$exe."] is not at: [".$conf->{path}{exe}{$exe}."]. Looking for it now...", 'line' => __LINE__, level => 1});
			foreach my $path (@dirs)
			{
				$check =  "$path/$exe";
				$check =~ s/\/\//\//g;
				to_log($conf, {message => "Checking: [".$check."]", 'line' => __LINE__, level => 2});
				if ( -e $check )
				{
					if (-e $conf->{path}{exe}{logger})
					{
						to_log($conf, {message => "Found it! Changed path for: [".$exe."] from: [".$conf->{path}{exe}{$exe}."] to: [".$check."]", 'line' => __LINE__, level => 1});
					}
					else
					{
						warn "DEBUG: Found it! Changed path for: [".$exe."] from: [".$conf->{path}{exe}{$exe}."] to: [".$check."]\n";
					}
					$conf->{path}{exe}{$exe} = $check;
				}
				else
				{
					to_log($conf, {message => "Not found.", 'line' => __LINE__, level => 2});
				}
			}
		}
		else
		{
			to_log($conf, {message => "Found: [".$exe."] at: [".$conf->{path}{exe}{$exe}."]!", 'line' => __LINE__, level => 3});
			next;
		}
		
		# Make sure it exists now.
		to_log($conf, {message => "Checking again if: [".$exe."] is at: [".$conf->{path}{exe}{$exe}."].", 'line' => __LINE__, level => 3});
		if (not -e $conf->{path}{exe}{$exe})
		{
			$bad = 1;
			if (-e $conf->{path}{exe}{logger})
			{
				to_log($conf, {message => "Failed to find executable: [".$exe."]. Unable to proceed.", 'line' => __LINE__, level => 0});
			}
			else
			{
				warn "Failed to find executable: [".$exe."]. Unable to proceed.\n";
			}
		}
	}
	if ($bad)
	{
		exit(1);
	}
	
	return(0);
}

# This checks to see if 'cluster::target_node' is out of the pacemaker cluster.
sub check_peer_is_fenced
{
	my ($conf) = @_;
	
	my $body = read_cib($conf);
	foreach my $hash_ref (@{$body->{status}{node_state}})
	{
		my $node     = $hash_ref->{uname};
		my $join     = $hash_ref->{'join'};
		my $expected = $hash_ref->{expected};
		to_log($conf, {message => "node: [".$node."] join: [".$join."], expected: [".$expected."]", 'line' => __LINE__, level => 3});
		if ($node eq $conf->{cluster}{target_node})
		{
			to_log($conf, {message => "Checking the status of target node: [".$node."].", 'line' => __LINE__, level => 1});
			if (($join eq "down") && ($expected eq "down"))
			{
				# The node is out.
				to_log($conf, {message => "The node: [".$node."] has been fenced successfully.", 'line' => __LINE__, level => 1});
				
				# Call 'drbdadm adjust all', as it seems DRBD's in-memory config can drift
				# from the on-disk config, causing 'incompatible <fence option>' errors when
				# the peer returns.
				to_log($conf, {message => "Reloading DRBD config from disk to ensure in-memory and on-disk configs match.", 'line' => __LINE__, level => 1});
				my $shell_call = $conf->{path}{exe}{drbdadm}." adjust all";
				to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2});
				open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
				while(<$file_handle>)
				{
					# This should not generate output.
					chomp;
					my $line = $_;
					to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 2});
				}
				close $file_handle;
				
				# Set the location constraint so that pacemaker doesn't migrate the server
				# when it comes back up.
				set_location_constraint($conf);
				
				to_log($conf, {message => "Fence completed successfully!", 'line' => __LINE__, level => 1});
				exit(7);
			}
			else
			{
				to_log($conf, {message => "The node: [".$node."] is: [".$join."/".$expected."] (join/expected). It has not yet been fenced.", 'line' => __LINE__, level => 1});
			}
		}
	}
	
	return(0);
}

# This asks pacemaker to fence (stonith) the remote node, then waits for the fence to complete.
sub kill_target
{
	my ($conf) = @_;
	
	# Ask pacemaker to fence the target.
	my $shell_call = $conf->{path}{exe}{pcs}." stonith fence ".$conf->{cluster}{target_node}."; ".$conf->{path}{exe}{echo}." rc:\$?";
	to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2});
	open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
	while(<$file_handle>)
	{
		# Log any output.
		chomp;
		my $line = $_;
		to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 2});
	}
	close $file_handle;
	
	### NOTE: Should we just wait forever?
	# Now loop and wait for it to be stonithed. We'll wait up to five minutes.
	my $start_time = time;
	my $end_time   = $start_time + 300;
	my $fenced     = 0;
	to_log($conf, {message => "start_time: [".$start_time."], end_time: [".$end_time."]", 'line' => __LINE__, level => 2});
	until ($fenced)
	{
		# This will exit the program if the peer has been fenced.
		check_peer_is_fenced($conf);
		if (time > $end_time)
		{
			# Done waiting, failed.
			to_log($conf, {message => "The node has not been fenced after five minutes. Giving up.", 'line' => __LINE__, level => 0, priority => "err"});
			exit(1);
		}
		else
		{
			sleep 5;
		}
	}
	
	# This should not be reachable.
	return(0);
}

# Log file entries.
sub to_log
{
	my ($conf, $parameters) = @_;
	
	my $facility = defined $parameters->{facility} ? $parameters->{facility} : $conf->{'log'}{facility};
	my $level    = defined $parameters->{level}    ? $parameters->{level}    : 1;
	my $line     = defined $parameters->{'line'}   ? $parameters->{'line'}   : 0;
	my $message  = defined $parameters->{message}  ? $parameters->{message}  : "";
	my $priority = defined $parameters->{priority} ? $parameters->{priority} : "";
	
	# Leave if we don't care about this message.
	return if $level > $conf->{'log'}{level};
	return if not $message;
	
	# Build the message. We prefix the PID and the line number to make it easier to separate parallel
	# runs and to find where a given entry was logged.
	if (($conf->{'log'}{line_numbers}) && ($line))
	{
		$message = "[".$$."]:".$line."; ".$message;
	}
	
	my $priority_string = $facility;
	if ($priority)
	{
		$priority_string .= ".".$priority;
	}
	elsif ($level eq "0")
	{
		$priority_string .= ".notice";
	}
	elsif (($level eq "1") or ($level eq "2"))
	{
		$priority_string .= ".info";
	}
	else
	{
		$priority_string .= ".debug";
	}
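	
	# (For example, a level 1 call with no explicit priority would run something like:
	#    logger --priority local0.info --tag <$THIS_FILE> -- "[<pid>]:<line>; <message>"
	#  The placeholders in angle brackets are filled in at run time.)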
	
	# Clean up the string for bash.
	$message =~ s/"/\\\"/gs;
	#$message =~ s/\(/\\\(/gs;
	
	my $shell_call = $conf->{path}{exe}{logger}." --priority ".$priority_string." --tag ".$conf->{'log'}{tag}." -- \"".$message."\"";
	open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
	while(<$file_handle>)
	{
		# This should not generate output.
		chomp;
		my $line = $_;
		print "Unexpected logging output: [".$line."]\n";
	}
	close $file_handle;
	
	return(0);
}