#!/usr/bin/perl # # Author: Madison Kelly (mkelly@alteeve.ca) # Alteeve's Niche! Inc. - https://alteeve.com/w/ # Version: 0.0.1 # License: GPL v2+ # # This program ties LINBIT's DRBD fencing into pacemaker's stonith. It provides a power-fence alternative to # the default 'crm-{un,}fence-peer.sh' {un,}fence-handler. # # WARNING: This unfence handler is probably not safe to use outside of an Anvil! IA platform. It makes a lot # of operational assumptions about the system and desired goals. # # TODO: # - ### NOTE: This doesn't use Anvil::Tools on purpose. We want to be quick and depend on as few things as ### possible. use strict; use warnings; use XML::Simple; use Data::Dumper; # Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete. $| = 1; my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0]; my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0]; if (($running_directory =~ /^\./) && ($ENV{PWD})) { $running_directory =~ s/^\./$ENV{PWD}/; } my $conf = { 'log' => { facility => "local0", level => 2, line_numbers => 1, tag => $THIS_FILE, }, # If a program isn't at the defined path, $ENV{PATH} will be searched. path => { exe => { cibadmin => "/usr/sbin/cibadmin", crm_attribute => "/usr/sbin/crm_attribute", crm_error => "/usr/sbin/crm_error", drbdadm => "/usr/sbin/drbdadm", echo => "/usr/bin/echo", getent => "/usr/bin/getent", logger => "/usr/bin/logger", pcs => "/usr/sbin/pcs", }, }, # The script will set this. cluster => { target_node => "", }, # These are the environment variables set by DRBD. See 'man drbd.conf' # -> 'handlers'. environment => { # The resource triggering the fence. 'DRBD_RESOURCE' => defined $ENV{DRBD_RESOURCE} ? $ENV{DRBD_RESOURCE} : "", # The resource minor number, or, in the case of volumes, numbers. 'DRBD_MINOR' => defined $ENV{DRBD_MINOR} ? $ENV{DRBD_MINOR} : "", # This is the address format (ipv4, ipv6, etc) 'DRBD_PEER_AF' => defined $ENV{DRBD_PEER_AF} ? $ENV{DRBD_PEER_AF} : "", # This is the IP address of the target node. 'DRBD_PEER_ADDRESS' => defined $ENV{DRBD_PEER_ADDRESS} ? $ENV{DRBD_PEER_ADDRESS} : "", # This isn't set 'DRBD_PEERS' => defined $ENV{DRBD_PEERS} ? $ENV{DRBD_PEERS} : "", ### NOTE: Below here are undocumented variables. Don't expect them to always be useful. # My node ID 'DRBD_MY_NODE_ID' => defined $ENV{DRBD_MY_NODE_ID} ? $ENV{DRBD_MY_NODE_ID} : "", # The target's ID 'DRBD_PEER_NODE_ID' => defined $ENV{DRBD_PEER_NODE_ID} ? $ENV{DRBD_PEER_NODE_ID} : "", }, }; # Find executables. find_executables($conf); # Something for the logs to_log($conf, {message => "The Anvil! DRBD unfence handler has been invoked.", 'line' => __LINE__}); # These are the full host names of the nodes given their IDs. foreach my $i (0..31) { my $key = "DRBD_NODE_ID_".$i; if ((exists $ENV{$key}) && (defined $ENV{$key})) { $conf->{environment}{$key} = $ENV{$key}; my $level = $conf->{environment}{$key} eq "" ? 3 : 2; to_log($conf, {message => "DRBD Environment variable: [$key] -> [".$conf->{environment}{$key}."]", 'line' => __LINE__, level => $level}); } } # Record the environment variables foreach my $key (sort {$a cmp $b} keys %{$conf->{environment}}) { # $conf->{environment}{DRBD_RESOURCE} -> [srv51-Workstation3] my $level = $conf->{environment}{$key} eq "" ? 3 : 2; to_log($conf, {message => "DRBD Environment variable: [$key] -> [".$conf->{environment}{$key}."]", 'line' => __LINE__, level => $level}); } foreach my $key (sort {$a cmp $b} keys %ENV) { next if exists $conf->{environment}{$key}; my $level = $ENV{$key} eq "" ? 3 : 2; to_log($conf, {message => "System Environment variable: [$key] -> [".$ENV{$key}."]", 'line' => __LINE__, level => 3}); } # Make sure we at least have the target's IP. if (not $conf->{environment}{DRBD_PEER_ADDRESS}) { to_log($conf, {message => "Called without target's IP. Nothing to do, exiting. Were we called by 'pcs stonith list'?", 'line' => __LINE__, level => 1, priority => "alert"}); exit(1); } # This also checks that we're quorate and not in maintenance mode. identify_peer($conf); # If we're still alive, we now need to check the DRBD resource disk state locally. get_drbd_status($conf); # Is there a specific resource? if ($conf->{environment}{DRBD_RESOURCE}) { # Prevent the resource from running on the peer. to_log($conf, {message => "We're being asked to unfence the specific resource: [".$conf->{environment}{DRBD_RESOURCE}."]. Will remove the node attribute blocking this server from running on: [".$conf->{cluster}{target_node}."].", 'line' => __LINE__}); remove_constraint($conf); } else { to_log($conf, {message => "We were not given a specific resource to unfence. Nothing to do.", 'line' => __LINE__}); exit(0); } # If we hit here, something very wrong happened. exit(1); ############################################################################################################# # Functions # ############################################################################################################# # This removes a location constraint that prevents the resource / server from running on the peer node. sub remove_constraint { my ($conf) = @_; my $target_server = $conf->{environment}{DRBD_RESOURCE}; my $target_node = $conf->{cluster}{target_node}; to_log($conf, {message => "Will now create a location constraint against: [".$target_server."] preventing it from running on: [".$target_node."].", 'line' => __LINE__, level => 1}); # Check that the rule was set. my $rule_name = "drbd-fenced_".$target_server; my $rule_set = 1; my $rule_found = 0; my $rule_output = ""; my $shell_call = $conf->{path}{exe}{crm_attribute}." --type nodes --node ".$target_node." --name ".$rule_name." --query"; to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2}); open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n"; while(<$file_handle>) { # This should not generate output. chomp; my $line = $_; to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 2}); if (($line =~ /name=$rule_name/) && ($line =~ /value=0/)) { $rule_set = 0; to_log($conf, {message => "rule_set: [".$rule_set."]", 'line' => __LINE__, level => 2}); last; } else { $rule_output .= $line."\n"; } } close $file_handle; if (not $rule_set) { # No need to unfence. to_log($conf, {message => "The node attribute rule: [".$rule_name."] against the node: [".$target_node."] was not found. No need to unfence.", 'line' => __LINE__, level => 0, priority => "err"}); exit(0); } # Clear the node attribute $shell_call = $conf->{path}{exe}{crm_attribute}." --type nodes --node ".$target_node." --name ".$rule_name." --update 0"; to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2}); open ($file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n"; while(<$file_handle>) { # This should not generate output. chomp; my $line = $_; to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 3}); } close $file_handle; # Check that the rule was set. $rule_output = ""; $shell_call = $conf->{path}{exe}{crm_attribute}." --type nodes --node ".$target_node." --name ".$rule_name." --query"; to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2}); open ($file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n"; while(<$file_handle>) { # This should not generate output. chomp; my $line = $_; to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 2}); if (($line =~ /name=$rule_name/) && ($line =~ /value=0/)) { $rule_set = 0; to_log($conf, {message => "rule_set: [".$rule_set."]", 'line' => __LINE__, level => 2}); last; } else { $rule_output .= $line."\n"; } } close $file_handle; if (not $rule_set) { # Success! to_log($conf, {message => "The node attribute rule: [".$rule_name."] against the node: [".$target_node."] has been cleared successfully.", 'line' => __LINE__, level => 0, priority => "err"}); exit(0); } else { # Failed. $rule_output =~ s/\n$//gs; to_log($conf, {message => "The node attribute triggering the fence rule: [".$rule_name."] against the node: [".$target_node."] appears to have not been cleared. Expected a string with 'name=".$rule_name." value=0' but got: [".$rule_output."].", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } return(0); } # This reads the status of all resources. If we're not all UpToDate, check if the peer is. If the peer is, # abort. If not, proceed (someone is gouig to have a bad day, but maybe some servers will live) sub get_drbd_status { my ($conf) = @_; my $resource = ""; my $peer = ""; my $local_all_uptodate = 1; my $peer_all_uptodate = 1; my $shell_call = $conf->{path}{exe}{drbdadm}." status all"; to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2}); open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n"; while(<$file_handle>) { # This should not generate output. chomp; my $line = $_; to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 3}); if (not $line) { $resource = ""; $peer = ""; to_log($conf, {message => "resource: [".$resource."], peer: [".$peer."]", 'line' => __LINE__, level => 3}); next; } if ($line =~ /^(\S+)\s+role/) { $resource = $1; to_log($conf, {message => "resource: [".$resource."]", 'line' => __LINE__, level => 3}); next; } if ($line =~ /^\s+(.*?) role:/) { $peer = $1; to_log($conf, {message => "peer: [".$peer."]", 'line' => __LINE__, level => 3}); next; } if ($resource) { if ($line =~ /disk:(.*)$/) { my $local_dstate = $1; $local_dstate =~ s/\s.*$//; to_log($conf, {message => "local_dstate: [".$local_dstate."]", 'line' => __LINE__, level => 2}); if (lc($local_dstate) ne "uptodate") { $local_all_uptodate = 0; to_log($conf, {message => "local_all_uptodate: [".$local_all_uptodate."]", 'line' => __LINE__, level => 2}); } next; } if ($line =~ /peer-disk:(.*)$/) { my $peer_dstate = $1; $peer_dstate =~ s/\s.*$//; to_log($conf, {message => "peer: [".$peer."], peer_dstate: [".$peer_dstate."]", 'line' => __LINE__, level => 2}); if (lc($peer_dstate) ne "uptodate") { $peer_all_uptodate = 0; to_log($conf, {message => "peer_all_uptodate: [".$peer_all_uptodate."]", 'line' => __LINE__, level => 2}); } next; } } } close $file_handle; # If we're not all UpToDate, but the peer is, abort to_log($conf, {message => "local_all_uptodate: [".$local_all_uptodate."], peer_all_uptodate: [".$peer_all_uptodate."]", 'line' => __LINE__, level => 2}); if ((not $local_all_uptodate) && ($peer_all_uptodate)) { # We're not good to_log($conf, {message => "This node has DRBD resources that are not UpToDate, but the peer is fully UpToDate. Aborting.", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } return(0); } # This identifies the pacemaker name of the target node. If it can't find the peer, it exits with '1'. sub identify_peer { my ($conf) = @_; # I know the target's (SN) IP, map it to a node. my $target_host = ""; my $target_ip = $conf->{environment}{DRBD_PEER_ADDRESS}; # First, can we translate the IP to a host name? my $shell_call = $conf->{path}{exe}{getent}." hosts ".$target_ip; to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 2}); open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n"; while(<$file_handle>) { # This should not generate output. chomp; my $line = $_; to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 2}); if ($line =~ /^$target_ip\s+(.*)$/) { # This could be multiple names. $target_host = $1; to_log($conf, {message => "target_host: [".$target_host."]", 'line' => __LINE__, level => 2}); #to_log($conf, {message => ">> target_host: [".$target_host."]", 'line' => __LINE__, level => 2}); # Strip off any suffix, we only want the short name. #$target_host =~ s/\..*//; #to_log($conf, {message => "<< target_host: [".$target_host."]", 'line' => __LINE__, level => 2}); #last; } } close $file_handle; # If I got the host name, try to match it to a pacemaker node name. if ($target_host) { # Get the current CIB (in an XML::Simple hash). This will exit if it fails to read the XML # and convert it to an XML::Simple hash. my $body = read_cib($conf); # Parse the XML. my $host_name = $ENV{HOSTNAME}; my $short_host_name = $ENV{HOSTNAME}; $short_host_name =~ s/\..*$//; to_log($conf, {message => "host_name: [".$host_name."], short_host_name: [".$short_host_name."]", 'line' => __LINE__, level => 2}); foreach my $hash_ref (sort {$a cmp $b} @{$body->{configuration}{nodes}{node}}) { my $node = $hash_ref->{uname}; my $id = $hash_ref->{id}; to_log($conf, {message => "node: [".$node."], id: [".$id."]", 'line' => __LINE__, level => 2}); foreach my $target_name (split/ /, $target_host) { to_log($conf, {message => ">> target_name: [".$target_name."]", 'line' => __LINE__, level => 2}); $target_name =~ s/\..*//; to_log($conf, {message => "<< target_name: [".$target_name."]", 'line' => __LINE__, level => 2}); if ($node =~ /^$target_name/) { $conf->{cluster}{target_node} = $node; to_log($conf, {message => "Found the pacemaker name of the target node: [".$conf->{cluster}{target_node}."]", 'line' => __LINE__, level => 1}); } elsif ($node =~ /^$short_host_name/) { # This is me. Am I in maintenance mode? if (exists $hash_ref->{instance_attributes}) { # We've got some data.. my $name = defined $hash_ref->{instance_attributes}{nvpair}{name} ? $hash_ref->{instance_attributes}{nvpair}{name} : ""; my $value = defined $hash_ref->{instance_attributes}{nvpair}{value} ? $hash_ref->{instance_attributes}{nvpair}{value} : ""; to_log($conf, {message => "node: [".$node."] instance attribyte name: [".$name."], value: [".$value."]", 'line' => __LINE__, level => 1}); if (($name eq "maintenance") and ($value eq "on")) { # We're in maintenance mode, abort. to_log($conf, {message => "This node is in maintenance mode. Not able to fence!", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } } } } } my $quorate = $body->{'have-quorum'}; to_log($conf, {message => "quorate: [".$quorate."]", 'line' => __LINE__, level => 1}); if (not $quorate) { to_log($conf, {message => "This not is not quorate. Refusing to fence the peer!", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } # See if the target node is already out of the cluster. foreach my $hash_ref (@{$body->{status}{node_state}}) { my $node = $hash_ref->{uname}; my $join = $hash_ref->{'join'}; my $expected = $hash_ref->{expected}; to_log($conf, {message => "node: [".$node."] join: [".$join."], expected: [".$expected."]", 'line' => __LINE__, level => 3}); if ($node eq $conf->{cluster}{target_node}) { to_log($conf, {message => "Checking the status of target node: [".$node."].", 'line' => __LINE__, level => 1}); if (($join eq "down") && ($expected eq "down")) { # The node is out. to_log($conf, {message => "The node: [".$node."] is already down. No actual fence needed.", 'line' => __LINE__, level => 1}); exit(7); } else { to_log($conf, {message => "The node: [".$node."] is: [".$join."/".$expected."] (join/expected). Proceeding with the fence action.", 'line' => __LINE__, level => 1}); } } } } # Did I find the target? if (not $conf->{cluster}{target_node}) { to_log($conf, {message => "Failed to find the pacemaker name of the target node. Unable to proceed!", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } return(0); } # This reads in the CIB XML and returns it as a single multi-line string. sub read_cib { my ($conf) = @_; my $xml_opened = 0; my $xml_closed = 0; my $body = ""; my $cib = ''; my $shell_call = $conf->{path}{exe}{cibadmin}." --local --query"; to_log($conf, {message => "Calling: [".$shell_call."]", 'line' => __LINE__, level => 3}); open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n"; while(<$file_handle>) { # This should not generate output. chomp; my $line = $_; to_log($conf, {message => "Output: [".$line."]", 'line' => __LINE__, level => 3}); $cib .= "\n".$line; if ($line =~ /Signon to CIB failed/i) { # Failed to connect, we're probably not in the cluster. to_log($conf, {message => "This node does not appear to be in the cluster. Unable to get the CIB status.", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } if ($line =~ /^$/) { $xml_opened = 1; to_log($conf, {message => "xml_opened: [".$xml_opened."].", 'line' => __LINE__, level => 3}); } if ($line =~ /^<\/cib>$/) { $xml_closed = 1; to_log($conf, {message => "xml_closed: [".$xml_closed."].", 'line' => __LINE__, level => 3}); } } close $file_handle; to_log($conf, {message => "cib: ==========\n".$cib."\n==========", 'line' => __LINE__, level => 3}); # Now parse the CIB XML if I read it OK. to_log($conf, {message => "xml_opened: [".$xml_opened."], xml_closed: [".$xml_closed."].", 'line' => __LINE__, level => 3}); if (($xml_opened) && ($xml_closed)) { # We're good local $@; my $xml = XML::Simple->new(); my $test = eval { $body = $xml->XMLin($cib, KeyAttr => { language => 'name', key => 'name' }, ForceArray => [ 'id' ]) }; if (not $test) { chomp $@; my $error = "[ Error ] - The was a problem parsing: [".$cib."]. The error was:\n"; $error .= "===========================================================\n"; $error .= $@."\n"; $error .= "===========================================================\n"; to_log($conf, {message => $error, 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } } else { # Failed to read the CIB XML. to_log($conf, {message => "This node does not appear to be in the cluster. Unable to read the CIB XML properly.", 'line' => __LINE__, level => 2, priority => "err"}); exit(1); } return($body); } # This checks the given paths and, if something isn't found, it searches PATH trying to find it. sub find_executables { my ($conf) = @_; # Variables. my $check = ""; my $bad = 0; # If PATH isn't set, set it (could have been scrubbed by a caller). if (not $ENV{PATH}) { $ENV{PATH} = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin"; } # Log entries can only happen if I've found 'logger', so an extra check will be made on 'to_log' # calls. my @dirs = split/:/, $ENV{PATH}; foreach my $exe (sort {$b cmp $a} keys %{$conf->{path}{exe}}) { if ( not -e $conf->{path}{exe}{$exe} ) { to_log($conf, {message => "The program: [".$exe."] is not at: [".$conf->{path}{exe}{$exe}."]. Looking for it now...", 'line' => __LINE__, level => 1}); foreach my $path (@dirs) { $check = "$path/$exe"; $check =~ s/\/\//\//g; to_log($conf, {message => "Checking: [".$check."]", 'line' => __LINE__, level => 2}); if ( -e $check ) { if (-e $conf->{path}{exe}{logger}) { to_log($conf, {message => "Found it! Changed path for: [".$exe."] from: [".$conf->{path}{exe}{$exe}."] to: [".$check."]", 'line' => __LINE__, level => 1}); } else { warn "DEBUG: Found it! Changed path for: [".$exe."] from: [".$conf->{path}{exe}{$exe}."] to: [".$check."]\n"; } $conf->{path}{exe}{$exe} = $check; } else { to_log($conf, {message => "Not found.", 'line' => __LINE__, level => 2}); } } } else { to_log($conf, {message => "Found!", 'line' => __LINE__, level => 3}); next; } # Make sure it exists now. to_log($conf, {message => "Checking again if: [".$exe."] is at: [".$conf->{path}{exe}{$exe}."].", 'line' => __LINE__, level => 3}); if (not -e $conf->{path}{exe}{$exe}) { $bad = 1; if (-e $conf->{path}{exe}{logger}) { to_log($conf, {message => "Failed to find executable: [".$exe."]. Unable to proceed.", 'line' => __LINE__, level => 0}); } else { warn "Failed to find executable: [".$exe."]. Unable to proceed.\n"; } } } if ($bad) { exit(1); } return(0); } # Log file entries sub to_log { my ($conf, $parameters) = @_; my $facility = defined $parameters->{facility} ? $parameters->{facility} : $conf->{'log'}{facility}; my $level = defined $parameters->{level} ? $parameters->{level} : 1; my $line = defined $parameters->{'line'} ? $parameters->{'line'} : 0; my $message = defined $parameters->{message} ? $parameters->{message} : ""; my $priority = defined $parameters->{priority} ? $parameters->{priority} : ""; # Leave if we don't care about this message return if $level > $conf->{'log'}{level}; return if not $message; # Build the message. We log the line if (($conf->{'log'}{line_numbers}) && ($line)) { $message = $line."; ".$message; } my $priority_string = $facility; if ($priority) { $priority_string .= ".".$priority; } elsif ($level eq "0") { $priority_string .= ".notice"; } elsif (($level eq "1") or ($level eq "2")) { $priority_string .= ".info"; } else { $priority_string .= ".debug"; } # Clean up the string for bash $message =~ s/"/\\\"/gs; #$message =~ s/\(/\\\(/gs; my $shell_call = $conf->{path}{exe}{logger}." --priority ".$priority_string." --tag ".$conf->{'log'}{tag}." -- \"".$message."\""; open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n"; while(<$file_handle>) { # This should not generate output. chomp; my $line = $_; print "Unexpected logging output: [".$line."]\n"; } close $file_handle; return(0); }