#!/usr/bin/perl
#
# This is the resource agent used to manage servers on the Anvil! Intelligent Availability platform.
#
# License: GNU General Public License (GPL) v2+
# (c) 1997-2018 - Alteeve's Niche! Inc.
#
# WARNING: This is a pretty purpose-specific resource agent. No effort was made to test this on an rgmanager
# cluster or on any configuration outside how the Anvil! m3 uses it. If you plan to adapt it to
# another purpose, let us know and we'll try to help.
#
# Based on: https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc
#
# Error types from pacemaker's perspective;
#
# - Soft Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource
# in-place - usually by restarting the resource on the same node.
# - Hard Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource
# which failed with this error by restarting the resource on a different node.
# - Fatal Error - This is a cluster-wide error, it would make no sense to recover such a resource on a
# different node, let alone in-place. When a resource fails with this error, Pacemaker will
# attempt to shut down the resource, and wait for administrator intervention.
#
# Exit codes;
# 0 - OCF_SUCCESS
# - The action completed successfully. This is the expected return code for any successful start, stop,
# migrate_to, meta_data, help, and usage action.
# - For monitor, however, a modified convention applies:
# - If the server is running we return, OCF_SUCCESS. If not running and gracefully stopped or migrated
# off, return OCF_NOT_RUNNING.
#
# 1 - OCF_ERR_GENERIC
# - The action returned a generic error. This is used only when none of the more specific error codes,
# defined below, accurately describes the problem.
# - Pacemaker interprets this exit code as a soft error.
#
# 2 - OCF_ERR_ARGS
# - The resource’s configuration is not valid on this machine. This can happen if the serve fails to boot
# because of a missing bridge, for example.
#
# 3 - OCF_ERR_UNIMPLEMENTED
# - The resource agent was instructed to execute an action that we do not implement.
# - Not all resource agent actions are mandatory. We don't implement 'promote' or 'demote'. We do implement
# 'migrate_to', 'migrate_from', and 'notify'. If we're misconfigured as a master/slave resource, for
# example, then will alert the user about this misconfiguration by returning OCF_ERR_UNIMPLEMENTED.
#
# 4 - OCF_ERR_PERM
# - The action failed due to insufficient permissions. This may be due to a node not being able to open a
# definition file or resource config.
# - Pacemaker interprets this exit code as a hard error.
#
# 5 - OCF_ERR_INSTALLED
# - The action failed because a required component is missing on the node where the action was executed.
# This may be due to a required binary not being executable, or a the DRBD resource config file not
# existing.
# - Pacemaker interprets this exit code as a hard error.
#
# 6 - OCF_ERR_CONFIGURED
# - The action failed because the user misconfigured the resource in pacemaker. For example, the user may
# have configured an alphanumeric string for a parameter that really should be an integer.
# - Pacemaker interprets this exit code as a fatal error.
#
# 7 - OCF_NOT_RUNNING
# - The resource was found not to be running. This is an exit code that may be returned by the monitor
# action exclusively. Note that this implies that the resource has either gracefully shut down, or has
# never been started.
#
# 8 - OCF_RUNNING_MASTER
# 9 - OCF_FAILED_MASTER
# - These OCF exit codes are not used here.
#
# NOTE: We don't use Anvil::Tools to keep overhead low and to keep this agent independent as possible.
use strict;
use warnings;
use XML::Simple;
use JSON;
use Math::BigInt;
use Data::Dumper;
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
$running_directory =~ s/^\./$ENV{PWD}/;
}
my $conf = {
'log' => {
facility => "local0",
level => 2,
line_numbers => 1,
tag => "ocf:alteeve:".$THIS_FILE,
},
# If a program isn't at the defined path, $ENV{PATH} will be searched.
path => {
config => {
definition => "/mnt/anvil/definitions/#!NAME!#.xml",
},
exe => {
brctl => "/usr/sbin/brctl",
cibadmin => "/usr/sbin/cibadmin",
crm_error => "/usr/sbin/crm_error",
drbdadm => "/usr/sbin/drbdadm",
drbdsetup => "/usr/sbin/drbdsetup",
echo => "/usr/bin/echo",
free => "/usr/bin/free",
getent => "/usr/bin/getent",
logger => "/usr/bin/logger",
stonith_admin => "/usr/sbin/stonith_admin",
virsh => "/usr/bin/virsh",
},
},
environment => {
# This is the name of the server we're managing. # Example values:
OCF_RESKEY_name => defined $ENV{OCF_RESKEY_name} ? $ENV{OCF_RESKEY_name} : "", # srv01-c7
# This is our node name
OCF_RESKEY_CRM_meta_on_node => defined $ENV{OCF_RESKEY_CRM_meta_on_node} ? $ENV{OCF_RESKEY_CRM_meta_on_node} : "", # m3-a02n01.alteeve.com
# This says "UUID", but it's the node ID.
OCF_RESKEY_CRM_meta_on_node_uuid => defined $ENV{OCF_RESKEY_CRM_meta_on_node_uuid} ? $ENV{OCF_RESKEY_CRM_meta_on_node_uuid} : "", # 1
# This is the timeout for the called action in millisecond.
OCF_RESKEY_CRM_meta_timeout => defined $ENV{OCF_RESKEY_CRM_meta_timeout} ? $ENV{OCF_RESKEY_CRM_meta_timeout} : "", # 20000
# If this is set, we'll bump our log level as well.
PCMK_debug => defined $ENV{PCMK_debug} ? $ENV{PCMK_debug} : "", # 0
# These are other variables that are set, but we don't currently care about them
OCF_EXIT_REASON_PREFIX => defined $ENV{OCF_EXIT_REASON_PREFIX} ? $ENV{OCF_EXIT_REASON_PREFIX} : "", # ocf-exit-reason:
OCF_RA_VERSION_MAJOR => defined $ENV{OCF_RA_VERSION_MAJOR} ? $ENV{OCF_RA_VERSION_MAJOR} : "", # 1
OCF_RA_VERSION_MINOR => defined $ENV{OCF_RA_VERSION_MINOR} ? $ENV{OCF_RA_VERSION_MINOR} : "", # 0
OCF_RESKEY_crm_feature_set => defined $ENV{OCF_RESKEY_crm_feature_set} ? $ENV{OCF_RESKEY_crm_feature_set} : "", # 3.0.12
OCF_RESOURCE_INSTANCE => defined $ENV{OCF_RESOURCE_INSTANCE} ? $ENV{OCF_RESOURCE_INSTANCE} : "", # srv01-c7
OCF_RESOURCE_PROVIDER => defined $ENV{OCF_RESOURCE_PROVIDER} ? $ENV{OCF_RESOURCE_PROVIDER} : "", # alteeve
OCF_RESOURCE_TYPE => defined $ENV{OCF_RESOURCE_TYPE} ? $ENV{OCF_RESOURCE_TYPE} : "", # server
OCF_ROOT => defined $ENV{OCF_ROOT} ? $ENV{OCF_ROOT} : "", # /usr/lib/ocf
},
};
# If pacemaker is in debug, so are we,
if ($conf->{environment}{PCMK_debug})
{
$conf->{'log'}{level} = 3;
}
# Find executables.
find_executables($conf);
# Get any command line switches.
get_switches($conf);
### TEST: to be removed later
if ($conf->{switches}{test})
{
$conf->{environment}{OCF_RESKEY_name} = "srv01-c7";
$conf->{environment}{OCF_RESKEY_CRM_meta_on_node} = "m3-a02n01.alteeve.com";
$conf->{environment}{OCF_RESKEY_CRM_meta_timeout} = 20000;
}
# Something for the logs
to_log($conf, {message => "ocf:alteeve:server invoked.", 'line' => __LINE__, level => 2});
# This is for debugging.
show_environment($conf, 2);
### What are we being asked to do?
# start - Starts the resource.
# stop - Shuts down the resource.
# monitor - (status aliases here) Queries the resource for its state.
# meta-data - Dumps the resource agent metadata.
# promote - Turns a resource into the Master role (Master/Slave resources only).
# demote - Turns a resource into the Slave role (Master/Slave resources only).
# migrate_to - migration target
# migrate_from - Implement live migration of resources.
# validate-all - Validates a resource’s configuration.
# help - (usage maps here) Displays a usage message when the resource agent is invoked from the command line, rather than by the cluster manager.
# notify - Inform resource about changes in state of other clones.
if ($conf->{switches}{start})
{
# Start the server
start_server($conf);
}
elsif ($conf->{switches}{stop})
{
# Stop the server
stop_server($conf);
}
elsif (($conf->{switches}{monitor}) or ($conf->{switches}{status}))
{
# Report the status of the server.
server_status($conf);
}
elsif (($conf->{switches}{metadaata}) or ($conf->{switches}{'meta-data'}))
{
show_metadata($conf);
}
elsif ($conf->{switches}{promote})
{
# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
to_log($conf, {message => "We were asked to promote: [".$conf->{environment}{OCF_RESKEY_name}."], which makes no sense and is not supported. Ignoreing.", 'line' => __LINE__, level => 0, priority => "err"});
exit(3);
}
elsif ($conf->{switches}{demote})
{
# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
to_log($conf, {message => "We were asked to demote: [".$conf->{environment}{OCF_RESKEY_name}."], which makes no sense and is not supported. Ignoreing.", 'line' => __LINE__, level => 0, priority => "err"});
exit(3);
}
elsif (($conf->{switches}{migrate_to}) && ($conf->{switches}{migrate_from}))
{
# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
migrate_server($conf);
}
elsif ($conf->{switches}{'validate-all'})
{
# Validate our local config and setup.
validate_all($conf);
exit(0);
}
elsif (($conf->{switches}{help}) && ($conf->{switches}{usage}))
{
# Show the usage information
show_usage($conf);
exit(0);
}
elsif ($conf->{switches}{notify})
{
# We don't implement this
to_log($conf, {message => "We were asked to notify, but this is not a promotable (we're stateless) agent. Ignoring.", 'line' => __LINE__, level => 0, priority => "warn"});
exit(3);
}
else
{
# We were called in some unexpected way. Log an error, show usage and exit.
show_environment($conf, 0);
to_log($conf, {message => "We were invoked with an unexpected (or no) command. Environment variables and arguments below.", 'line' => __LINE__, level => 0, priority => "warn"});
exit(1);
}
# If we hit here, something very wrong happened.
exit(255);
#############################################################################################################
# Functions #
#############################################################################################################
=cut
STATES
The State field lists what state each domain is currently in. A domain can be in one of the following
possible states:
running - The domain is currently running on a CPU
idle - The domain is idle, and not running or runnable. This can be caused because the domain is
waiting on IO (a traditional wait state) or has gone to sleep because there was nothing else
for it to do.
paused - The domain has been paused, usually occurring through the administrator running virsh suspend.
When in a paused state the domain will still consume allocated resources like memory, but will
not be eligible for scheduling by the hypervisor.
in shutdown - The domain is in the process of shutting down, i.e. the guest operating system has been
notified and should be in the process of stopping its operations gracefully.
shut off - The domain is not running. Usually this indicates the domain has been shut down completely, or
has not been started.
crashed - The domain has crashed, which is always a violent ending. Usually this state can only occur if
the domain has been configured not to restart on crash.
pmsuspended - The domain has been suspended by guest power management, e.g. entered into s3 state.
=cut
# This boots the server if possible.
sub start_server
{
my ($conf) = @_;
# Start procedure;
# 1. Read the XML definition file and find the backing storage and bridges. Soft error if read fails.
# 2. Make sure the name matches.
# 3. Make sure we have enough free RAM.
# 4. Make sure the emulator exists (can be an issue after migrating from an different gen Anvil!).
# 5.1. Make sure optical drives with mounted data have the disk present. Soft error if not.
# 5.2. Find any backing DRBD devices
# 6. For each DRBD device;
# 6.1. Make sure the backing LV is ACTIVE. Soft error if not.
# 6.2. Check if the drbd resource is up. If not, up it.
# 6.3. Make sure the backing disk is UpToDate. Soft error if not.
# 6.4. Make sure the backing device is 'Connected' or 'Connecting'. Call a connect if not.
# 7. Make sure all bridges exist and soft error if not.
# 8. Start the server.
my $server = $conf->{environment}{OCF_RESKEY_name};
to_log($conf, {message => "We've been asked to start the server: [$server].", 'line' => __LINE__, level => 2});
validate_all($conf);
# If we're still alive, we're ready to boot.
to_log($conf, {message => "Sanity checks passed, ready to start: [$server].", 'line' => __LINE__, level => 2});
my $definition_file = $conf->{path}{config}{definition};
$definition_file =~ s/#!NAME!#/$server/;
to_log($conf, {message => "definition_file: [$definition_file].", 'line' => __LINE__, level => 2});
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." create $definition_file");
if ($return_code)
{
# If this fails, we want to exit with OCF_ERR_CONFIGURED (6) so that pacemaker doesn't try to
# also start the server on another node, because we don't know the state of it here.
to_log($conf, {message => "All tests passed, yet the attempt to boot the server: [$server] exited with a non-zero return code: [$return_code]. The server is in an unknown state, so exiting with a fatal error. Human intervention is now required. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(6);
}
# Verify that it started.
sleep 2;
$return_code = undef;
$output = undef;
($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." list");
if ($return_code)
{
# If this fails, we want to exit with OCF_ERR_CONFIGURED (6) so that pacemaker doesn't try to
# also start the server on another node, because we don't know the state of it here.
to_log($conf, {message => "It appears that the call to boot the server: [$server] worked, but the call to list running servers exited with a non-zero return code: [$return_code]. The server is in an unknown state, so exiting with a fatal error. Human intervention is now required. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(6);
}
foreach my $line (split/\n/, $output)
{
$line =~ s/^\s+//;
$line =~ s/\s+$//;
$line =~ s/\s+/ /g;
if ($line =~ /^(\d+) $server (.*)$/)
{
my $state = $2;
to_log($conf, {message => "server: [$server], state: [$state]", 'line' => __LINE__, level => 2});
if ($state eq "running")
{
# Success!
to_log($conf, {message => "The server: [$server] has started successfully.", 'line' => __LINE__, level => 2});
exit(0);
}
else
{
# WTF?
to_log($conf, {message => "The server: [$server] should have been started, but it's state is: [$state]. Human intervention is required!", 'line' => __LINE__, level => 1, priority => "err"});
exit(6);
}
last;
}
}
# If we're still alive, then we didn't see the server in the list of running servers, which is really weird.
to_log($conf, {message => "The server: [$server] should have been started, but it wasn't found in the list of running servers.", 'line' => __LINE__, level => 1, priority => "err"});
exit(1);
}
# This shuts down the server if possible.
sub stop_server
{
my ($conf) = @_;
# Stopping the server is simply a question of "is the server running?" and, if so, stop it.
my $server = $conf->{environment}{OCF_RESKEY_name};
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." list");
if ($return_code)
{
# Looks like virsh isn't running.
to_log($conf, {message => "The attempt to list the running servers returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
my $found = 0;
foreach my $line (split/\n/, $output)
{
$line =~ s/^\s+//;
$line =~ s/\s+$//;
$line =~ s/\s+/ /g;
if ($line =~ /^(\d+) $server (.*)$/)
{
my $state = $2;
$found = 1;
to_log($conf, {message => "server: [$server], state: [$state]", 'line' => __LINE__, level => 2});
if ($state eq "running")
{
# The server is running, shut it down.
to_log($conf, {message => "The server: [$server] is running. We will ask it to shut down now.", 'line' => __LINE__, level => 2});
}
elsif ($state eq "paused")
{
# The server is paused. Resume it, wait a few, then proceed with the shutdown.
to_log($conf, {message => "The server: [$server] is paused. Resuming it now so that it can react to the shutdown request.", 'line' => __LINE__, level => 2});
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." resume $server");
if ($return_code)
{
# Looks like virsh isn't running.
to_log($conf, {message => "The attempt to resume the server: [$server] returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
to_log($conf, {message => "Pausing for a moment to give the server time to resume.", 'line' => __LINE__, level => 2});
sleep 3;
}
elsif ($state eq "pmsuspended")
{
# The server is paused. Resume it, wait a few, then proceed with the shutdown.
to_log($conf, {message => "The server: [$server] is asleep. Waking it now so that it can react to the shutdown request.", 'line' => __LINE__, level => 2});
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." dompmwakeup $server");
if ($return_code)
{
# Looks like virsh isn't running.
to_log($conf, {message => "The attempt to wake the server: [$server] returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
to_log($conf, {message => "Pausing for half a minute to give the server time to wake up.", 'line' => __LINE__, level => 2});
sleep 30;
}
elsif (($state eq "in shutdown") or ($state eq "shut off"))
{
# The server is already shutting down
to_log($conf, {message => "The server: [$server] is already shutting down.", 'line' => __LINE__, level => 2});
exit(0);
}
elsif (($state eq "idle") or ($state eq "crashed"))
{
# The server needs to be destroyed.
to_log($conf, {message => "The server: [$server] is hung. Its state is: [$state]. We will force it off now.", 'line' => __LINE__, level => 2});
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." destroy $server");
if ($return_code)
{
# Looks like virsh isn't running.
to_log($conf, {message => "The attempt to force-off the server: [$server] returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
to_log($conf, {message => "The server: [$server] is now off.", 'line' => __LINE__, level => 2});
exit(0);
}
else
{
# WTF?
to_log($conf, {message => "The server: [$server] is running, but it is in an unexpected state: [$state]. Human intervention is required!", 'line' => __LINE__, level => 1, priority => "err"});
exit(6);
}
last;
}
}
# If we didn't see it, it's off and undefined.
if (not $found)
{
to_log($conf, {message => "The server: [$server] was not listed on this node, so it is not running here.", 'line' => __LINE__, level => 2});
exit(0);
}
# If we're alive, it is time to stop the server
$return_code = undef;
$output = undef;
($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." shutdown $server");
to_log($conf, {message => "Asking the server: [$server] to shut down now. Please be patient.", 'line' => __LINE__, level => 1});
if ($return_code)
{
# Looks like virsh isn't running.
to_log($conf, {message => "The attempt to shut down the server: [$server] returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
# Now loop until we see the server either vanish from virsh or enter "shut off" state. We wait
# forever and let pacemaker kill us if we time out.
while (1)
{
my $found = 0;
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." list");
if ($return_code)
{
# Looks like virsh isn't running.
to_log($conf, {message => "The attempt to list the running servers returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
foreach my $line (split/\n/, $output)
{
$line =~ s/^\s+//;
$line =~ s/\s+$//;
$line =~ s/\s+/ /g;
if ($line =~ /^(\d+) $server (.*)$/)
{
my $state = $2;
$found = 1;
to_log($conf, {message => "server: [$server], state: [$state]", 'line' => __LINE__, level => 2});
if ($state eq "shut off")
{
# We're down.
to_log($conf, {message => "The server: [$server] is now off.", 'line' => __LINE__, level => 2});
exit(0);
}
last;
}
}
# If we didn't find the server, it's off and undefined now.
if (not $found)
{
to_log($conf, {message => "The server: [$server] is no longer listed. It is now off.", 'line' => __LINE__, level => 2});
exit(0);
}
to_log($conf, {message => "The server: [$server] is not off yet, waiting a few seconds and then we'll check again.", 'line' => __LINE__, level => 2});
sleep 5;
}
exit(0);
}
# This checks the status of the server.
sub server_status
{
my ($conf) = @_;
# If the named server is running, return OCF_SUCCESS (0), otherwise OCF_NOT_RUNNING (7). If the
# server is failed, return OCF_ERR_GENERIC (1).
my $state = "";
my $server = $conf->{environment}{OCF_RESKEY_name};
### NOTE: When pacemaker is first starting, virsh won't be up right away. So if we get a return code
### of '1', we'll try again up to 50% of 'environment::OCF_RESKEY_CRM_meta_timeout'.
if (not $conf->{environment}{OCF_RESKEY_CRM_meta_timeout})
{
# Set a sane default of 20 seconds.
$conf->{environment}{OCF_RESKEY_CRM_meta_timeout} = 20000;
to_log($conf, {message => "The environment variable 'OCF_RESKEY_CRM_meta_timeout' was not set, so setting it to: [".$conf->{environment}{OCF_RESKEY_CRM_meta_timeout}."].", 'line' => __LINE__, level => 1, priority => "warn"});
}
my $return_code = undef;
my $output = "";
my $current_time = time;
my $timeout = $current_time + int(($conf->{environment}{OCF_RESKEY_CRM_meta_timeout} /= 1000) / 2);
my $waiting = 1;
to_log($conf, {message => "current_time: [$current_time], timeout: [$timeout].", 'line' => __LINE__, level => 2});
while($waiting)
{
# Make the call
($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." list");
to_log($conf, {message => "return_code: [$return_code].", 'line' => __LINE__, level => 2});
if (not $return_code)
{
$waiting = 0;
to_log($conf, {message => "waiting: [$waiting].", 'line' => __LINE__, level => 2});
}
elsif (time > $timeout)
{
# We've waited long enough.
$waiting = 0;
to_log($conf, {message => "The 'virsh' call exited with the return code: [$return_code]. The 'libvirtd' may have failed to start. We won't wait any longer.", 'line' => __LINE__, level => 1, priority => "warn"});
}
else
{
to_log($conf, {message => "The 'virsh' call exited with the return code: [$return_code]. The 'libvirtd' service might be starting, so we will check again shortly.", 'line' => __LINE__, level => 2});
sleep 2;
}
}
# If I got a non-zero return code, something went wrong with the virsh call.
if ($return_code)
{
to_log($conf, {message => "It would appear that libvirtd is not operating (or not operating correctly). Expected the return code '0' but got: [$return_code].", 'line' => __LINE__, level => 0, priority => "err"});
if ($output)
{
to_log($conf, {message => "Output of: [".$conf->{path}{exe}{virsh}." list] follows;", 'line' => __LINE__, level => 0, priority => "err"});
to_log($conf, {message => "Output: [$output]", 'line' => __LINE__, level => 0, priority => "err"});
}
exit(1);
}
# If we're still alive, process the output
foreach my $line (split/\n/, $output)
{
$line =~ s/^\s+//;
$line =~ s/\s+$//;
$line =~ s/\s+/ /g;
if ($line =~ /^(\d+) $server (.*)$/)
{
$state = $2;
to_log($conf, {message => "server: [$server], state: [$state]", 'line' => __LINE__, level => 2});
last;
}
}
# If there is a state, see what the state is.
if ($state)
{
# What is the state?
# (See the comment below the 'FUNCTIONS' divider above the first function for a full list of states.)
if (($state eq "running") or ($state eq "paused") or ($state eq "pmsuspended") or ($state eq "in shutdown"))
{
to_log($conf, {message => "The server: [$server] is: [$state], which is OK.", 'line' => __LINE__, level => 1});
exit(0);
}
elsif ($state eq "shut off")
{
to_log($conf, {message => "The server: [$server] is: [$state].", 'line' => __LINE__, level => 1});
exit(7);
}
elsif (($state eq "idle") or ($state eq "crashed"))
{
to_log($conf, {message => "The server: [$server] is in a bad state: [$state]!", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
else
{
# WTF?
to_log($conf, {message => "The server: [$server] is in an unexpected state: [$state]!", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
}
else
{
# Not running. Exit with OCF_NOT_RUNNING
to_log($conf, {message => "The server: [$server] is not running on this node.", 'line' => __LINE__, level => 1});
exit(7);
}
exit(0);
}
# Migrate the server
sub migrate_server
{
my ($conf) = @_;
# If we were given 'migrate_to', then just verify that the node name makes sense. If we were given
# 'migrate_from', we need to find the peer.
# Return failed until this is actually implemented.
exit(1);
}
# Validation checks that we have the definition XML, resource config and that needed apps are installed.
sub validate_all
{
my ($conf) = @_;
to_log($conf, {message => "Running validation tests...", 'line' => __LINE__, level => 2});
# Read in the server's definition file (if found and readable).
read_server_definition($conf);
to_log($conf, {message => "- Server definition was read.", 'line' => __LINE__, level => 2});
# Does the internal server name match?
validate_name($conf);
to_log($conf, {message => "- Server name is valid.", 'line' => __LINE__, level => 2});
# Make sure the emulator it wants is the one we have.
validate_emulator($conf);
to_log($conf, {message => "- Eumlator is valid.", 'line' => __LINE__, level => 2});
# These tests are only needed if we're about to boot the server
if ($conf->{switches}{start})
{
# Check that we have enough RAM.
validate_ram($conf);
to_log($conf, {message => "- Sufficient RAM is available.", 'line' => __LINE__, level => 2});
}
# Validate bridges
validate_bridges($conf);
to_log($conf, {message => "- Network bridge(s) are available.", 'line' => __LINE__, level => 2});
# Validate storage (Disks and optical media)
validate_storage($conf);
to_log($conf, {message => "- Storage is valid and ready.", 'line' => __LINE__, level => 2});
return(0);
}
# This ensures that the bridges the server connects to exist on this node.
sub validate_bridges
{
my ($conf) = @_;
# Find the Optical drives and DRBD devices.
foreach my $device_ref (@{$conf->{server}{definition_xml}->{devices}})
{
foreach my $interface_ref (@{$device_ref->{interface}})
{
foreach my $source_ref (@{$interface_ref->{source}})
{
my $bridge = $source_ref->{bridge};
$conf->{server}{bridges}{$bridge} = 1;
to_log($conf, {message => "server::bridges::${bridge}: [".$conf->{server}{bridges}{$bridge}."].", 'line' => __LINE__, level => 3});
}
}
}
# Get a list of available bridges.
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{brctl}." show");
my $first_line = 1;
foreach my $line (split/\n/, $output)
{
to_log($conf, {message => "line: [$line].", 'line' => __LINE__, level => 3});
if ($first_line)
{
# We skip the first line instead of parse the string to avoid getting caught by
# translations.
$first_line = 0;
}
elsif ($line =~ /^(\S+)\s/)
{
my $bridge = $1;
$conf->{'local'}{bridge}{$bridge} = 1;
to_log($conf, {message => "local::bridge::${bridge}: [$bridge].", 'line' => __LINE__, level => 3});
}
}
# Verify bridges now
foreach my $bridge (sort {$a cmp $b} keys %{$conf->{server}{bridges}})
{
if ($conf->{'local'}{bridge}{$bridge})
{
to_log($conf, {message => "The bridge: [$bridge] is available for this server.", 'line' => __LINE__, level => 2});
}
else
{
# Missing bridge.
to_log($conf, {message => "The server wants to connect to the bridge: [$bridge] which we do not have on this node.", 'line' => __LINE__, level => 0, priority => "err"});
exit(5);
}
}
return(0);
}
# This looks up the disks and optical media connected to this server.
sub validate_storage
{
my ($conf) = @_;
# Find the bridge(s) this server uses.
foreach my $device_ref (@{$conf->{server}{definition_xml}->{devices}})
{
foreach my $disk_ref (@{$device_ref->{disk}})
{
my $type = $disk_ref->{device};
to_log($conf, {message => "type: [$type].", 'line' => __LINE__, level => 2});
if ($type eq "disk")
{
foreach my $source_ref (@{$disk_ref->{source}})
{
my $device_path = $source_ref->{dev};
$conf->{server}{disks}{$device_path} = "check";
to_log($conf, {message => "server::disks::${device_path}: [".$conf->{server}{disks}{$device_path}."].", 'line' => __LINE__, level => 2});
}
}
elsif ($type eq "cdrom")
{
foreach my $source_ref (@{$disk_ref->{source}})
{
my $file = $source_ref->{file};
$conf->{server}{optical}{$file} = 1;
to_log($conf, {message => "server::optical::${file}: [".$conf->{server}{optical}{$file}."].", 'line' => __LINE__, level => 2});
}
}
}
}
# Verify optical disks now
validate_storage_optical($conf);
# Verify DRBD devices now
validate_storage_drbd($conf);
return(0);
}
# THis makes sure that the needed backing DRBD devices are on this node. If so, and if they are not up, they
# will be brought up. If that fails, it errors out.
sub validate_storage_drbd
{
my ($conf) = @_;
# Read in the DRBD configuration XML.
my ($return_code, $drbd_body) = shell_call($conf, $conf->{path}{exe}{drbdadm}." dump-xml");
if ($return_code)
{
# Something went wrong.
to_log($conf, {message => "The attempt to read the DRBD configuration returned a non-zero code: [$return_code]. The returned output (if any) was: [$drbd_body].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
# Parse the XML
my $drbd_xml = "";
my $xml = XML::Simple->new();
eval { $drbd_xml = $xml->XMLin($drbd_body, KeyAttr => ["name", "vnr"], ForceArray => 1) };
if ($@)
{
chomp $@;
my $error = "[ Error ] - The was a problem parsing: [$drbd_body]. The error was:\n";
$error .= "===========================================================\n";
$error .= $@."\n";
$error .= "===========================================================\n";
to_log($conf, {message => $error, 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
foreach my $resource (sort {$a cmp $b} keys %{$drbd_xml->{resource}})
{
to_log($conf, {message => "resource: [$resource].", 'line' => __LINE__, level => 2});
my $peer = "";
my $local = "";
foreach my $connection_ref (@{$drbd_xml->{resource}->{$resource}->{connection}})
{
my $protocol = $connection_ref->{section}->{net}->{option}->{protocol}->{value};
my $fencing = $connection_ref->{section}->{net}->{option}->{fencing}->{value};
to_log($conf, {message => "protocol: [$resource], fencing: [$fencing].", 'line' => __LINE__, level => 3});
# If this isn't set to 'resource-and-stonith', it's a DR connection and we'll ignore
# it.
next if $fencing ne "resource-and-stonith";
# Look at the hosts
foreach my $host (sort {$a cmp $b} keys %{$connection_ref->{host}})
{
my $address = $connection_ref->{host}->{$host}->{address}->[0]->{content};
my $port = $connection_ref->{host}->{$host}->{address}->[0]->{port};
my $short_hostname = $host;
$short_hostname =~ s/\..*$//;
my $local_hostname = $conf->{environment}{OCF_RESKEY_CRM_meta_on_node};
to_log($conf, {message => "host: [$host ($short_hostname)], address: [$address:$port], local_hostname: [$local_hostname].", 'line' => __LINE__, level => 2});
# Is this me or the peer?
if (($local_hostname eq $short_hostname) or ($local_hostname =~ /^$short_hostname\./))
{
# This is us.
$local = $host;
to_log($conf, {message => "Recording the local connection details for the resource: [$resource] -> [$address:$port].", 'line' => __LINE__, level => 2});
$conf->{server}{drbd}{'local'}{hostname} = $host,
$conf->{server}{drbd}{'local'}{short_hostname} = $short_hostname,
$conf->{server}{drbd}{'local'}{address} = $address,
$conf->{server}{drbd}{'local'}{port} = $port,
}
else
{
# This is our peer
$peer = $host;
to_log($conf, {message => "Recording the peer's connection details for the resource: [$resource] -> [$address:$port].", 'line' => __LINE__, level => 2});
$conf->{server}{drbd}{peer}{hostname} = $host,
$conf->{server}{drbd}{peer}{short_hostname} = $short_hostname,
$conf->{server}{drbd}{peer}{address} = $address,
$conf->{server}{drbd}{peer}{port} = $port,
}
}
}
to_log($conf, {message => "local: [$local], peer: [$peer].", 'line' => __LINE__, level => 2});
foreach my $volume (sort {$a cmp $b} keys %{$drbd_xml->{resource}->{$resource}->{host}->{$local}->{volume}})
{
my $backing_device = $drbd_xml->{resource}->{$resource}->{host}->{$local}->{volume}->{$volume}->{disk}->[0];
my $device_path = $drbd_xml->{resource}->{$resource}->{host}->{$local}->{volume}->{$volume}->{device}->[0]->{content};
my $device_minor = $drbd_xml->{resource}->{$resource}->{host}->{$local}->{volume}->{$volume}->{device}->[0]->{minor};
to_log($conf, {message => "volume: [$volume], backing_device: [$backing_device], device_path: [$device_path], device_minor: [$device_minor].", 'line' => __LINE__, level => 3});
$conf->{server}{drbd}{'local'}{device}{$device_path}{lv} = $backing_device;
$conf->{server}{drbd}{'local'}{device}{$device_path}{minor} = $device_minor;
to_log($conf, {message => "server::drbd::local::device::${device_path}::lv: [".$conf->{server}{drbd}{'local'}{device}{$device_path}{lv}."], server::drbd::local::device::${device_path}::minor: [".$conf->{server}{drbd}{'local'}{device}{$device_path}{minor}."].", 'line' => __LINE__, level => 2});
# Map the resource name to the local drbd device path.
$conf->{resource}{$resource}{lv} = $backing_device;
$conf->{resource}{$resource}{path} = $device_path;
$conf->{device_path}{$device_path}{resource} = $resource;
to_log($conf, {message => "resource::${resource}::path: [".$conf->{resource}{$resource}{path}."], resource::${resource}::lv: [".$conf->{resource}{$resource}{lv}."], device_path::${device_path}::resource: [".$conf->{device_path}{$device_path}{resource}."].", 'line' => __LINE__, level => 2});
}
}
foreach my $device_path (sort {$a cmp $b} keys %{$conf->{server}{disks}})
{
to_log($conf, {message => "Checking that the DRBD device: [$device_path] is ready.", 'line' => __LINE__, level => 2});
to_log($conf, {message => "server::drbd::local::device::${device_path}::lv: [".$conf->{server}{drbd}{'local'}{device}{$device_path}{lv}."].", 'line' => __LINE__, level => 2});
if (not $conf->{server}{drbd}{'local'}{device}{$device_path}{lv})
{
# The backing LV doesn't exist.
to_log($conf, {message => "The server wants to use: [$device_path] as a hard drive, but we couldn't find the backing logical volume on this node.", 'line' => __LINE__, level => 0, priority => "err"});
exit(5);
}
elsif (not -e $conf->{server}{drbd}{'local'}{device}{$device_path}{lv})
{
# The backing LV doesn't exist.
to_log($conf, {message => "The server wants to use: [$device_path] as a hard drive, but the backing logical volume: [".$conf->{server}{drbd}{'local'}{device}{$device_path}{lv}."] doesn't exist on this node.", 'line' => __LINE__, level => 0, priority => "err"});
exit(5);
}
else
{
to_log($conf, {message => "The server wants to use: [$device_path] as a hard drive, which is backed by the logical volume: [".$conf->{server}{drbd}{'local'}{device}{$device_path}{lv}."]. Checking that these are ready.", 'line' => __LINE__, level => 1});
}
}
# Now read in the status of the drbd devices
$return_code = undef;
($return_code, my $status_json) = shell_call($conf, $conf->{path}{exe}{drbdsetup}." status --json");
if ($return_code)
{
# Something went wrong.
to_log($conf, {message => "The attempt to read the DRBD status returned a non-zero code: [$return_code]. The returned output (if any) was: [$status_json].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
# If DRBD is not up, the returned JSON output will not actually exist.
if ($status_json =~ /No currently configured DRBD found/si)
{
to_log($conf, {message => "DRBD is not loaded. Bringing it up now.", 'line' => __LINE__, level => 2});
foreach my $device_path (sort {$a cmp $b} keys %{$conf->{server}{disks}})
{
my $resource = $conf->{device_path}{$device_path}{resource};
to_log($conf, {message => "Bringing up the resource: [$resource] for the server's: [".$device_path."] disk.", 'line' => __LINE__, level => 2});
($return_code, my $drbdadm_output) = shell_call($conf, $conf->{path}{exe}{drbdadm}." up $resource");
if ($return_code)
{
# Something went wrong.
to_log($conf, {message => "The attempt to start the DRBD resource: [$resource] returned a non-zero code: [$return_code]. The returned output (if any) was: [$drbdadm_output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
}
# Give them a few seconds to start.
to_log($conf, {message => "Pausing briefly to give the resources time to start.", 'line' => __LINE__, level => 0});
sleep 3;
# Check DRBD setup again
$return_code = undef;
$status_json = undef;
to_log($conf, {message => "Checking the DRBD status again.", 'line' => __LINE__, level => 0});
($return_code, $status_json) = shell_call($conf, $conf->{path}{exe}{drbdsetup}." status --json");
if ($return_code)
{
# Something went wrong.
to_log($conf, {message => "The attempt to read the DRBD status after bringing up the resource(s) for this server returned a non-zero code: [$return_code]. The returned output (if any) was: [$status_json].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
# If DRBD is still not up, we're done.
if ($status_json =~ /No currently configured DRBD found/si)
{
to_log($conf, {message => "The attempt to read the DRBD status after bringing up the resource(s) appears to have failed.", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
}
# Process the JSON data. If any disks are not seen, they won't be set to 'ok', which we'll catch next.
check_drbd_status($conf, $status_json);
# Make sure I saw all disks.
my $check_again = 0;
foreach my $device_path (sort {$a cmp $b} keys %{$conf->{server}{disks}})
{
if ($conf->{server}{disks}{$device_path} eq "check")
{
# Failed to see it, see if we can bring it up.
$check_again = 1;
my $resource = $conf->{device_path}{$device_path}{resource};
to_log($conf, {message => "The DRBD resource: [$resource] backing the device: [$device_path] was not seen in the 'drbdsetup' status data. Attempting to bringing it up now.", 'line' => __LINE__, level => 2});
($return_code, my $drbdadm_output) = shell_call($conf, $conf->{path}{exe}{drbdadm}." up $resource");
if ($return_code)
{
# Something went wrong.
to_log($conf, {message => "The attempt to start the DRBD resource: [$resource] returned a non-zero code: [$return_code]. The returned output (if any) was: [$drbdadm_output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
}
to_log($conf, {message => "check_again: [$check_again].", 'line' => __LINE__, level => 2});
if ($check_again)
{
# Give the resource a few seconds to start.
to_log($conf, {message => "Pausing briefly to give the resources time to start.", 'line' => __LINE__, level => 2});
sleep 3;
# Check again.
$return_code = undef;
$status_json = undef;
to_log($conf, {message => "Checking the DRBD status again.", 'line' => __LINE__, level => 2});
($return_code, $status_json) = shell_call($conf, $conf->{path}{exe}{drbdsetup}." status --json");
if ($return_code)
{
# Something went wrong.
to_log($conf, {message => "The attempt to read the DRBD status after bringing up the resource(s) for this server returned a non-zero code: [$return_code]. The returned output (if any) was: [$status_json].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
# Check again.
check_drbd_status($conf, $status_json);
}
}
# Do I need to check again?
if ($check_again)
{
foreach my $device_path (sort {$a cmp $b} keys %{$conf->{server}{disks}})
{
if ($conf->{server}{disks}{$device_path} eq "check")
{
# Failed.
my $resource = $conf->{device_path}{$device_path}{resource};
to_log($conf, {message => "The DRBD resource: [$resource] backing the device: [$device_path] was not able to start.", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
}
}
return(0);
}
# This processes the DRBD setup JSON data
sub check_drbd_status
{
my ($conf, $status_json) = @_;
my $json = JSON->new->allow_nonref;
my $drbd_status = $json->decode($status_json);
foreach my $resource_ref (@{$drbd_status})
{
my $resource = $resource_ref->{name};
my $device_path = $conf->{resource}{$resource}{path};
my $logical_volume = $conf->{resource}{$resource}{lv};
to_log($conf, {message => "resource: [$resource], device_path: [$device_path], logical_volume: [$logical_volume].", 'line' => __LINE__, level => 2});
if ((exists $conf->{server}{disks}{$device_path}) && ($conf->{server}{disks}{$device_path} eq "check"))
{
### This disk is in use by this server, check it.
to_log($conf, {message => "The local replicated disk: [$device_path] is used by this server. Checking it out now.", 'line' => __LINE__, level => 2});
# First, are any of the local volumes not UpToDate?
foreach my $device_ref (@{$resource_ref->{devices}})
{
# Are we UpToDate (or SyncSource)?
if ((lc($device_ref->{'disk-state'}) ne "uptodate") && (lc($device_ref->{'disk-state'}) ne "syncsource"))
{
# If we've been asked to start, refuse.
if ($conf->{switches}{start})
{
to_log($conf, {message => "The DRBD resource: [$resource] volume: [".$device_ref->{volume}."] locat disk state is: [".$device_ref->{'disk-state'}."]. Unsafe to boot the server unless the disk state is UpToDate.", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
}
else
{
to_log($conf, {message => "The DRBD resource: [$resource] volume: [".$device_ref->{volume}."] locat disk state is: [".$device_ref->{'disk-state'}."], good.", 'line' => __LINE__, level => 2});
}
}
# Is this a connection we care about?
foreach my $connection_ref (@{$resource_ref->{connections}})
{
# Is the peer's role Primary?
to_log($conf, {message => "Checking connection to: [".$connection_ref->{name}."].", 'line' => __LINE__, level => 2});
if (lc($connection_ref->{'peer-role'}) eq "primary")
{
# Don't boot here
if ($conf->{switches}{start})
{
to_log($conf, {message => "The DRBD resource: [$resource] on the peer: [".$connection_ref->{name}."] is 'Primary'. Refusing to boot.", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
}
}
# If we're here, it's OK.
$conf->{server}{disks}{$device_path} = "ok";
to_log($conf, {message => "server::disks::${device_path}: [".$conf->{server}{disks}{$device_path}."].", 'line' => __LINE__, level => 2});
}
else
{
to_log($conf, {message => "Ignoring the local replicated disk: [$device_path], it is not used by this server.", 'line' => __LINE__, level => 2});
}
}
return(0);
}
# This makes sure that any media in the server's optical drive exists here and is readable.
sub validate_storage_optical
{
my ($conf) = @_;
foreach my $file (sort {$a cmp $b} keys %{$conf->{server}{optical}})
{
to_log($conf, {message => "Checking that the optical disc image: [$file] exists.", 'line' => __LINE__, level => 2});
# If the file doesn't exist, exit with OCF_ERR_INSTALLED (5). If we can't read it, exit with
# OCF_ERR_PERM (4).
if (not -e $file)
{
# It doesn't exist. Exit with OCF_ERR_INSTALLED (5).
to_log($conf, {message => "The server has the ISO: [$file] mounted in its optical drive, but that file doesn't exist on this system.", 'line' => __LINE__, level => 0, priority => "err"});
exit(5);
}
elsif (not -r $file)
{
# We can't read it. Exit with OCF_ERR_PERM (4).
to_log($conf, {message => "The server has the ISO: [$file] mounted in its optical drive, which we have, but we can't read it. Check permissions and for SELinux denials.", 'line' => __LINE__, level => 0, priority => "err"});
exit(4);
}
else
{
# We're OK.
to_log($conf, {message => "The server has the ISO: [$file] mounted in its optical drive, which we have.", 'line' => __LINE__, level => 2});
}
}
return(0);
}
# This verifies that the requested emulator exists and can be used.
sub validate_emulator
{
my ($conf) = @_;
# What emulator is this using?
my $emulator = $conf->{server}{definition_xml}->{devices}->[0]->{emulator}->[0];
to_log($conf, {message => "emulator: [$emulator]", 'line' => __LINE__, level => 2});
if (not -e $emulator)
{
# It doesn't exist. Exit with OCF_ERR_INSTALLED (5).
to_log($conf, {message => "The server wants to use the emulator: [$emulator] which doesn't exist on this node. Was this server migrated from a different generation Anvil! system? Please update '...' in the server's definition file: [".$conf->{server}{definition_file}."].", 'line' => __LINE__, level => 0, priority => "err"});
exit(5);
}
if (not -x $emulator)
{
# We can't execute it. Exit with OCF_ERR_PERM (4).
to_log($conf, {message => "The server wants to use the emulator: [$emulator] which exists, but we can't run. Please check permissions and for SELinux denials.", 'line' => __LINE__, level => 0, priority => "err"});
exit(4);
}
return(0);
}
# This makes sure the name we see in the definition file matches what we expect.
sub validate_name
{
my ($conf) = @_;
my $server = $conf->{environment}{OCF_RESKEY_name};
if ($server ne $conf->{server}{definition_xml}->{name}->[0])
{
to_log($conf, {message => "The configured server name: [$server] does not match the name of the server in the definition file: [".$conf->{server}{definition_xml}->{name}."]!", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
return(0);
}
# This checks that there is enough RAM to run this server.
sub validate_ram
{
my ($conf) = @_;
# How mcuh RAM does the server need?
my $server_ram_value = $conf->{server}{definition_xml}->{memory}->[0]->{content};
my $server_ram_units = $conf->{server}{definition_xml}->{memory}->[0]->{unit};
to_log($conf, {message => "server_ram_value: [$server_ram_value], server_ram_units: [$server_ram_units].", 'line' => __LINE__, level => 2});
# Convert to bytes
my $server_ram_bytes = $server_ram_value;
if ($server_ram_units =~ /^k/i) { $server_ram_bytes = ($server_ram_value * (2 ** 10)); }
elsif ($server_ram_units =~ /^m/i) { $server_ram_bytes = ($server_ram_value * (2 ** 20)); }
elsif ($server_ram_units =~ /^g/i) { $server_ram_bytes = ($server_ram_value * (2 ** 30)); }
elsif ($server_ram_units =~ /^t/i) { $server_ram_bytes = ($server_ram_value * (2 ** 40)); }
elsif ($server_ram_units =~ /^p/i) { $server_ram_bytes = Math::BigInt->new('2')->bpow('50')->bmul($server_ram_value); }
elsif ($server_ram_units =~ /^e/i) { $server_ram_bytes = Math::BigInt->new('2')->bpow('60')->bmul($server_ram_value); }
elsif ($server_ram_units =~ /^z/i) { $server_ram_bytes = Math::BigInt->new('2')->bpow('70')->bmul($server_ram_value); }
elsif ($server_ram_units =~ /^y/i) { $server_ram_bytes = Math::BigInt->new('2')->bpow('80')->bmul($server_ram_value); }
to_log($conf, {message => "server_ram_bytes: [$server_ram_bytes].", 'line' => __LINE__, level => 3});
# How much RAM do we have available?
my $available = 0;
my ($free_rc, $free_output) = shell_call($conf, $conf->{path}{exe}{free}." --bytes");
foreach my $line (split/\n/, $free_output)
{
to_log($conf, {message => "line: [$line].", 'line' => __LINE__, level => 3});
if ($line =~ /Mem:\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)$/)
{
my $total = $1;
my $used = $2;
my $free = $3;
my $shared = $4;
my $cache = $5;
$available = $6;
to_log($conf, {message => "total: [$total], used: [$used], free: [$free], shared: [$shared], cache: [$cache], available: [$available]", 'line' => __LINE__, level => 3});
}
}
to_log($conf, {message => "server_ram_bytes: [".comma($conf, $server_ram_bytes)." bytes].", 'line' => __LINE__, level => 2});
to_log($conf, {message => "available: ...... [".comma($conf, $available)." bytes].", 'line' => __LINE__, level => 2});
if ($server_ram_bytes > $available)
{
# Not enough free memory.
to_log($conf, {message => "The configured server name: [".$conf->{environment}{OCF_RESKEY_name}."] needs: [".comma($conf, $server_ram_bytes)." bytes] of RAM, but only: [".comma($conf, $available)." bytes] are available!", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
return(0);
}
# This reads the XML definition data into an XML data hash.
sub read_server_definition
{
my ($conf) = @_;
my $server = $conf->{environment}{OCF_RESKEY_name};
my $definition_file = $conf->{path}{config}{definition};
$definition_file =~ s/#!NAME!#/$server/;
my $server_xml = "";
to_log($conf, {message => "server: [$server], definition_file: [$definition_file]", 'line' => __LINE__, level => 3});
# If the file doesn't exist, return OCF_ERR_INSTALLED (5). If the file exists but we can't read it,
# return OCF_ERR_PERM (4).
if (not -e $definition_file)
{
to_log($conf, {message => "The definition file: [$definition_file] for the server: [$server] does not exist here!", 'line' => __LINE__, level => 0, priority => "err"});
exit(5);
}
elsif (not -r $definition_file)
{
to_log($conf, {message => "The definition file: [$definition_file] for the server: [$server] can not be read!", 'line' => __LINE__, level => 0, priority => "err"});
exit(4);
}
# Still alive? Read it in.
my ($definition_xml) = read_file($conf, $definition_file);
to_log($conf, {message => "definition_xml: [$definition_xml]", 'line' => __LINE__, level => 3});
my $xml = XML::Simple->new();
eval { $server_xml = $xml->XMLin($definition_xml, KeyAttr => {}, ForceArray => 1) };
if ($@)
{
chomp $@;
my $error = "[ Error ] - The was a problem parsing: [$definition_file]. The error was:\n";
$error .= "===========================================================\n";
$error .= $@."\n";
$error .= "===========================================================\n";
to_log($conf, {message => $error, 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
$conf->{server}{definition_xml} = $server_xml;
$conf->{server}{definition_file} = $definition_file;
return(0);
}
# This reads in a file and returns the contents as a single string variable.
sub read_file
{
my ($conf, $file) = @_;
my $body = "";
open (my $file_handle, "<".$file) or to_log($conf, {message => "Failed to read: [$file]. The error was: $!", 'line' => __LINE__, level => 0, priority => "err", exit_code => 1});
while(<$file_handle>)
{
# This should not generate output.
chomp;
my $line = $_;
to_log($conf, {message => "Output: [$line]", 'line' => __LINE__, level => 3});
$body .= $line."\n";
}
close $file_handle;
to_log($conf, {message => "body: [$body]", 'line' => __LINE__, level => 3});
return($body);
}
# This makes a system call and returns the return code and the output as string variable.
sub shell_call
{
my ($conf, $shell_call) = @_;
$shell_call .= " 2>&1; ".$conf->{path}{exe}{echo}." return_code:\$?";
my $return_code = 9999;
my $output = "";
to_log($conf, {message => "Calling: [$shell_call]", 'line' => __LINE__, level => 2});
open (my $file_handle, $shell_call." 2>&1 |") or to_log($conf, {message => "Failed to call: [".$shell_call."]. The error was: $!", 'line' => __LINE__, level => 0, priority => "err", exit_code => 1});
while(<$file_handle>)
{
# This should not generate output.
chomp;
my $line = $_;
to_log($conf, {message => "line: [$line]", 'line' => __LINE__, level => 3});
if ($line =~ /^return_code:(\d+)$/)
{
$return_code = $1;
to_log($conf, {message => "return_code: [$return_code]", 'line' => __LINE__, level => 3});
next;
}
$output .= $line."\n";
to_log($conf, {message => "Output: [$line]", 'line' => __LINE__, level => 3});
}
close $file_handle;
to_log($conf, {message => "return_code: [$return_code], output: [$output]", 'line' => __LINE__, level => 3});
return($return_code, $output);
}
# This logs the details of this call.
sub show_environment
{
my ($conf, $level) = @_;
foreach my $key (sort {$a cmp $b} keys %{$conf->{switches}})
{
next if $key eq "raw";
next if $conf->{switches}{$key} eq "";
to_log($conf, {message => "Command line switch: [$key] -> [".$conf->{switches}{$key}."]", 'line' => __LINE__, level => $level});
}
foreach my $key (sort {$a cmp $b} keys %{$conf->{environment}})
{
next if $conf->{environment}{$key} eq "";
to_log($conf, {message => "OCF Environment variable: [$key] -> [".$conf->{environment}{$key}."]", 'line' => __LINE__, level => $level});
}
foreach my $key (sort {$a cmp $b} keys %ENV)
{
next if exists $conf->{environment}{$key};
to_log($conf, {message => "System Environment variable: [$key] -> [".$ENV{$key}."]", 'line' => __LINE__, level => ($level + 1)});
}
return(0);
}
# This just prints a quick usage message for now.
sub show_usage
{
my ($conf) = @_;
print "TODO: How to use this...\n";
exit(0);
}
# This prints out the metadata and exits.
sub show_metadata
{
my ($conf) = @_;
# This is a pretty simple agent, by design. We only take a server name for now.
print '
0.1
This resource agent manages KVM+qemu virtual servers on an Anvil! m3 Intelligent Availability(tm) system.
It manages underlying components like DRBD 9 storage resources, brodge connections and so forth.
Anvil! m3 server resource agent
This is the name of the server as reported by virsh.
Server name
';
exit(0);
}
# This gathers command line switches and stores them in 'swithes::'.
sub get_switches
{
my ($conf) = @_;
my $last_argument = "";
$conf->{switches}{raw} = "";
foreach my $argument (@ARGV)
{
to_log($conf, {message => "argument: [$argument]", 'line' => __LINE__, level => 3});
if ($last_argument eq "raw")
{
# Don't process anything.
$conf->{switches}{raw} .= " ".$argument;
to_log($conf, {message => "switches::raw: [".$conf->{switches}{raw}."]", 'line' => __LINE__, level => 3});
}
elsif ($argument =~ /^-/)
{
# If the argument is just '--', appeand everything after it to 'raw'.
if ($argument eq "--")
{
$last_argument = "raw";
$conf->{switches}{raw} = "";
to_log($conf, {message => "switches::raw: [".$conf->{switches}{raw}."]", 'line' => __LINE__, level => 3});
}
else
{
($last_argument) = ($argument =~ /^-{1,2}(.*)/)[0];
to_log($conf, {message => "last_argument: [$last_argument]", 'line' => __LINE__, level => 3});
if ($last_argument =~ /=/)
{
# Break up the variable/value.
($last_argument, my $value) = (split /=/, $last_argument, 2);
$conf->{switches}{$last_argument} = $value;
to_log($conf, {message => "switches::${last_argument}: [".$conf->{switches}{$last_argument}."]", 'line' => __LINE__, level => 3});
}
else
{
$conf->{switches}{$last_argument} = "#!SET!#";
to_log($conf, {message => "switches::${last_argument}: [".$conf->{switches}{$last_argument}."]", 'line' => __LINE__, level => 3});
}
}
}
else
{
if ($last_argument)
{
$conf->{switches}{$last_argument} = $argument;
to_log($conf, {message => "switches::${last_argument}: [".$conf->{switches}{$last_argument}."]", 'line' => __LINE__, level => 3});
$last_argument = "";
to_log($conf, {message => "last_argument: [$last_argument]", 'line' => __LINE__, level => 3});
}
else
{
# Got a value without an argument. That's OK.
$conf->{switches}{$argument} = "#!SET!#";
to_log($conf, {message => "switches::${argument}: [".$conf->{switches}{$argument}."]", 'line' => __LINE__, level => 3});
}
}
}
# Clean up the initial space added to 'raw'.
to_log($conf, {message => "switches::raw: [".$conf->{switches}{raw}."]", 'line' => __LINE__, level => 3});
if ($conf->{switches}{raw})
{
$conf->{switches}{raw} =~ s/^ //;
to_log($conf, {message => "switches::raw: [".$conf->{switches}{raw}."]", 'line' => __LINE__, level => 3});
}
return(0);
}
# This adds commas to long numbers.
sub comma
{
my ($conf, $number) = @_;
return undef if not defined $number;
# Strip out any existing commas.
$number =~ s/,//g;
# Record and remove the sign, if present.
my $sign = "";
if ($number =~ /^\+/)
{
$number =~ s/^\+//g;
$sign = "+";
}
elsif ($number =~ /^\-/)
{
$number =~ s/^\-//g;
$sign = "-";
}
# Split on the left-most period.
my ($whole, $decimal) = split/\./, $number, 2;
$whole = "" if not defined $whole;
$decimal = "" if not defined $decimal;
# Now die if either number has a non-digit character in it.
if (($whole =~ /\D/) or ($decimal =~ /\D/))
{
to_log($conf, {message => "We were asked to insert commas into a dumber that is not actually a number: [$number]. This is likely a symptom of a larger problem.", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
local($_) = $whole ? $whole : "";
1 while s/^(-?\d+)(\d{3})/$1,$2/;
$whole = $_;
my $return = $decimal ? $whole.".".$decimal : $whole;
if ($sign)
{
$return = $sign.$return;
}
return ($return);
}
# Log file entries
sub to_log
{
my ($conf, $parameters) = @_;
my $facility = defined $parameters->{facility} ? $parameters->{facility} : $conf->{'log'}{facility};
my $level = defined $parameters->{level} ? $parameters->{level} : 1;
my $line = defined $parameters->{'line'} ? $parameters->{'line'} : 0;
my $message = defined $parameters->{message} ? $parameters->{message} : "";
my $priority = defined $parameters->{priority} ? $parameters->{priority} : "";
my $exit_code = defined $parameters->{exit_code} ? $parameters->{exit_code} : "";
# Leave if we don't care about this message
return if $level > $conf->{'log'}{level};
return if not $message;
# Build the message. We log the line
if (($conf->{'log'}{line_numbers}) && ($line))
{
$message = $line."; ".$message;
}
my $priority_string = $facility;
if ($priority)
{
$priority_string .= ".".$priority;
}
elsif ($level eq "0")
{
$priority_string .= ".notice";
}
elsif (($level eq "1") or ($level eq "2"))
{
$priority_string .= ".info";
}
else
{
$priority_string .= ".debug";
}
# Clean up the string for bash
$message =~ s/"/\\\"/gs;
#$message =~ s/\(/\\\(/gs;
my $shell_call = $conf->{path}{exe}{logger}." --priority ".$priority_string." --tag ".$conf->{'log'}{tag}." -- \"".$message."\"";
open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n";
while(<$file_handle>)
{
# This should not generate output.
chomp;
my $line = $_;
print "Unexpected logging output: [".$line."]\n";
}
close $file_handle;
if ($exit_code =~ /^\d+$/)
{
exit($exit_code);
}
return(0);
}
# This checks the given paths and, if something isn't found, it searches PATH trying to find it.
sub find_executables
{
my ($conf) = @_;
# Variables.
my $check = "";
my $bad = 0;
# Log entries can only happen if I've found 'logger', so an extra check will be made on 'to_log'
# calls.
my @dirs = split/:/, $ENV{PATH};
foreach my $exe (sort {$b cmp $a} keys %{$conf->{path}{exe}})
{
if ( not -e $conf->{path}{exe}{$exe} )
{
to_log($conf, {message => "The program: [$exe] is not at: [".$conf->{path}{exe}{$exe}."]. Looking for it now.", 'line' => __LINE__, level => 1});
foreach my $path (@dirs)
{
$check = "$path/$exe";
$check =~ s/\/\//\//g;
to_log($conf, {message => "Checking: [$check]", 'line' => __LINE__, level => 2});
if ( -e $check )
{
if (-e $conf->{path}{exe}{logger})
{
to_log($conf, {message => "Found it! Changed path for: [$exe] from: [".$conf->{path}{exe}{$exe}."] to: [$check]", 'line' => __LINE__, level => 1});
}
else
{
warn "DEBUG: Found it! Changed path for: [$exe] from: [".$conf->{path}{exe}{$exe}."] to: [$check]\n";
}
$conf->{path}{exe}{$exe} = $check;
}
else
{
to_log($conf, {message => "Not found.", 'line' => __LINE__, level => 2});
}
}
}
else
{
to_log($conf, {message => "Found!", 'line' => __LINE__, level => 3});
next;
}
# Make sure it exists now.
to_log($conf, {message => "Checking again if: [$exe] is at: [".$conf->{path}{exe}{$exe}."].", 'line' => __LINE__, level => 3});
if (not -e $conf->{path}{exe}{$exe})
{
$bad = 1;
if (-e $conf->{path}{exe}{logger})
{
to_log($conf, {message => "Failed to find executable: [$exe]. Unable to proceed.", 'line' => __LINE__, level => 0});
}
else
{
warn "Failed to find executable: [$exe]. Unable to proceed.\n";
}
}
}
if ($bad)
{
exit(1);
}
return(0);
}