#!/usr/bin/perl
#
# This is the resource agent used to manage servers on the Anvil! Intelligent Availability platform.
#
# License: GNU General Public License (GPL) v2+
# (c) 1997-2018 - Alteeve's Niche! Inc.
#
# WARNING: This is a pretty purpose-specific resource agent. No effort was made to test this on an rgmanager
# cluster or on any configuration outside how the Anvil! m3 uses it. If you plan to adapt it to
# another purpose, let us know and we'll try to help.
#
# Based on: https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc
#
# Error types from pacemaker's perspective;
#
# - Soft Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource
# in-place - usually by restarting the resource on the same node.
# - Hard Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource
# which failed with this error by restarting the resource on a different node.
# - Fatal Error - This is a cluster-wide error, it would make no sense to recover such a resource on a
# different node, let alone in-place. When a resource fails with this error, Pacemaker will
# attempt to shut down the resource, and wait for administrator intervention.
#
# Exit codes;
# 0 - OCF_SUCCESS
# - The action completed successfully. This is the expected return code for any successful start, stop,
# migrate_to, meta_data, help, and usage action.
# - For monitor, however, a modified convention applies:
# - If the server is running, we return OCF_SUCCESS. If it is not running because it was gracefully
# stopped or migrated off, we return OCF_NOT_RUNNING.
#
# 1 - OCF_ERR_GENERIC
# - The action returned a generic error. This is used only when none of the more specific error codes,
# defined below, accurately describes the problem.
# - Pacemaker interprets this exit code as a soft error.
#
# 2 - OCF_ERR_ARGS
# - The resource’s configuration is not valid on this machine. This can happen if the server fails to boot
# because of a missing bridge, for example.
#
# 3 - OCF_ERR_UNIMPLEMENTED
# - The resource agent was instructed to execute an action that we do not implement.
# - Not all resource agent actions are mandatory. We don't implement 'promote', 'demote' or 'notify'. We do
# implement 'migrate_to' and 'migrate_from'. If we're misconfigured as a master/slave resource, for
# example, then we will alert the user about this misconfiguration by returning OCF_ERR_UNIMPLEMENTED.
#
# 4 - OCF_ERR_PERM
# - The action failed due to insufficient permissions. This may be due to a node not being able to open a
# definition file or resource config.
# - Pacemaker interprets this exit code as a hard error.
#
# 5 - OCF_ERR_INSTALLED
# - The action failed because a required component is missing on the node where the action was executed.
# This may be due to a required binary not being executable, or the DRBD resource config file not
# existing.
# - Pacemaker interprets this exit code as a hard error.
#
# 6 - OCF_ERR_CONFIGURED
# - The action failed because the user misconfigured the resource in pacemaker. For example, the user may
# have configured an alphanumeric string for a parameter that really should be an integer.
# - Pacemaker interprets this exit code as a fatal error.
#
# 7 - OCF_NOT_RUNNING
# - The resource was found not to be running. This is an exit code that may be returned by the monitor
# action exclusively. Note that this implies that the resource has either gracefully shut down, or has
# never been started.
#
# 8 - OCF_RUNNING_MASTER
# 9 - OCF_FAILED_MASTER
# - These OCF exit codes are not used here.
#
# NOTE: This agent loads Anvil::Tools (see the 'use' lines below) and relies on it for logging, switch
# parsing and clean exits, so it is not independent of the rest of the Anvil! software.
use strict;
use warnings;
use Anvil::Tools;
use XML::Simple;
use JSON;
use Math::BigInt;
use Data::Dumper;
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

# Work out the name of this program and the directory it was run from, using '$0'. When '$0' has no
# directory component (for example, when run as 'perl server --monitor' from the agent's own directory),
# neither regex below matches. In that case we fall back to '$0' itself for the name and to '.' for the
# directory, instead of leaving them undefined (which caused 'uninitialized value' warnings and an undefined
# 'source' in every log entry).
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
if (not defined $THIS_FILE)
{
	$THIS_FILE = $0;
}
# Everything before the last '/' is the directory. The file name is not interpolated into the pattern, so
# regex metacharacters in the file name can't break the match.
my $running_directory = ($0 =~ /^(.*)\/[^\/]*$/)[0];
if (not defined $running_directory)
{
	$running_directory = ".";
}
# If the directory starts with '.', the leading '.' is replaced with the current working directory.
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# NOTE: Setting 'log_level' and 'log_secure' here will get overridden in the main loop. Use the Log methods
#       in the loop as well to override defaults in code.
my $anvil = Anvil::Tools->new();
$anvil->Log->level({set => 2});
$anvil->Log->secure({set => 1});
### Read or set the environment variables
# Each variable named below is copied from %ENV into 'environment::<name>'. A variable that is not defined in
# %ENV is stored as an empty string. The trailing comments show example values.
foreach my $variable (
	# This is the name of the server we're managing.
	"OCF_RESKEY_name",			# srv01-c7
	# This is our node name.
	"OCF_RESKEY_CRM_meta_on_node",		# el8-a01n01.digimer.ca
	# This says "UUID", but it's the node ID.
	"OCF_RESKEY_CRM_meta_on_node_uuid",	# 1
	# This is the timeout for the called action, in milliseconds.
	"OCF_RESKEY_CRM_meta_timeout",		# 20000
	# If this is set, we'll bump our log level as well.
	"PCMK_debug",				# 0
	# These are other variables that are set, but we don't currently care about them.
	"OCF_EXIT_REASON_PREFIX",		# ocf-exit-reason:
	"OCF_RA_VERSION_MAJOR",			# 1
	"OCF_RA_VERSION_MINOR",			# 0
	"OCF_RESKEY_crm_feature_set",		# 3.0.12
	"OCF_RESOURCE_INSTANCE",		# srv01-c7
	"OCF_RESOURCE_PROVIDER",		# alteeve
	"OCF_RESOURCE_TYPE",			# server
	"OCF_ROOT",				# /usr/lib/ocf
	# These are set during a migration.
	"OCF_RESKEY_CRM_meta_migrate_source",	# el8-a01n01.digimer.ca
	"OCF_RESKEY_CRM_meta_migrate_target",	# el8-a01n02.digimer.ca
	"OCF_RESKEY_CRM_meta_record_pending",	# true
)
{
	$anvil->data->{environment}{$variable} = defined $ENV{$variable} ? $ENV{$variable} : "";
}

# If pacemaker is in debug, so are we.
$anvil->Log->level({set => 3}) if $anvil->data->{environment}{PCMK_debug};
# Get any command line switches.
$anvil->Get->switches;

# Something for the logs, unless we were asked for the metadata.
# NOTE(review): 'metadaata' looks like a misspelling of 'metadata'; it is kept as-is because it is the switch
#               name this program actually checks for.
unless (($anvil->data->{switches}{metadaata}) or ($anvil->data->{switches}{'meta-data'}))
{
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 3, key => "log_0298"});
}

### TEST: to be removed later
# Each test switch overrides some of the environment values read earlier. The presets are applied in order,
# so if several test switches are given, the later ones win.
#   test1 - Migrate the test server from node 1 to node 2.
#   test2 - Migrate the test server from node 2 to node 1.
#   test3 - Boot the test server locally.
#   test4 - Shut down the test server locally.
{
	my @test_presets = (
		[test1 => {
			OCF_RESKEY_name                    => "test_server",
			OCF_RESKEY_CRM_meta_timeout        => 20000,
			OCF_RESKEY_CRM_meta_on_node        => "el8-a01n01.digimer.ca",
			OCF_RESKEY_CRM_meta_migrate_source => "el8-a01n01.digimer.ca",
			OCF_RESKEY_CRM_meta_migrate_target => "el8-a01n02.digimer.ca",
		}],
		[test2 => {
			OCF_RESKEY_name                    => "test_server",
			OCF_RESKEY_CRM_meta_timeout        => 20000,
			OCF_RESKEY_CRM_meta_on_node        => "el8-a01n02.digimer.ca",
			OCF_RESKEY_CRM_meta_migrate_source => "el8-a01n02.digimer.ca",
			OCF_RESKEY_CRM_meta_migrate_target => "el8-a01n01.digimer.ca",
		}],
		[test3 => {
			OCF_RESKEY_name             => "test_server",
			OCF_RESKEY_CRM_meta_on_node => "el8-a01n01.digimer.ca",
		}],
		[test4 => {
			OCF_RESKEY_name             => "test_server",
			OCF_RESKEY_CRM_meta_on_node => "el8-a01n01.digimer.ca",
		}],
	);
	foreach my $preset (@test_presets)
	{
		my ($switch, $overrides) = @{$preset};
		next if not $anvil->data->{switches}{$switch};
		foreach my $variable (sort keys %{$overrides})
		{
			$anvil->data->{environment}{$variable} = $overrides->{$variable};
		}
	}
}

# This is for debugging. The details of this call are logged at level 3 whatever action was requested.
show_environment($anvil, 3);
### What are we being asked to do?
# start        - Starts the resource.
# stop         - Shuts down the resource.
# monitor      - (status aliases here) Queries the resource for its state.
# meta-data    - Dumps the resource agent metadata.
# promote      - Turns a resource into the Master role (Master/Slave resources only).
# demote       - Turns a resource into the Slave role (Master/Slave resources only).
# migrate_to   - migration target
# migrate_from - Implement live migration of resources.
# validate-all - Validates a resource’s configuration.
# help         - (usage maps here) Displays a usage message when the resource agent is invoked from the
#                command line, rather than by the cluster manager.
# notify       - Inform resource about changes in state of other clones.
#
# The first matching switch wins. The start, stop, status, metadata, migrate and usage handlers exit the
# program themselves.
my $requested = $anvil->data->{switches};
if ($requested->{start})
{
	# Boot the server.
	start_server($anvil);
}
elsif ($requested->{stop})
{
	# Shut the server down.
	stop_server($anvil);
}
elsif (($requested->{monitor}) or ($requested->{status}))
{
	# Report the status of the server.
	server_status($anvil);
}
elsif (($requested->{metadaata}) or ($requested->{'meta-data'}))
{
	# Print the agent's metadata.
	show_metadata($anvil);
}
elsif ($requested->{promote})
{
	# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
	$anvil->Log->entry({
		source    => $THIS_FILE,
		line      => __LINE__,
		'print'   => 1,
		level     => 1,
		key       => "log_0299",
		variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} },
	});
	$anvil->nice_exit({exit_code => 3});
}
elsif ($requested->{demote})
{
	# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
	$anvil->Log->entry({
		source    => $THIS_FILE,
		line      => __LINE__,
		'print'   => 1,
		level     => 1,
		key       => "log_0300",
		variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} },
	});
	$anvil->nice_exit({exit_code => 3});
}
elsif (($requested->{migrate_to}) or ($requested->{migrate_from}))
{
	# Both migration directions are handled by the same function.
	migrate_server($anvil);
}
elsif ($requested->{'validate-all'})
{
	# Validate our local config and setup.
	validate_all($anvil);
	$anvil->nice_exit({exit_code => 0});
}
elsif (($requested->{help}) or ($requested->{usage}))
{
	# Show the usage information
	show_usage($anvil);
	$anvil->nice_exit({exit_code => 0});
}
elsif ($requested->{notify})
{
	# We don't implement this, so we return OCF_ERR_UNIMPLEMENTED (3)
	$anvil->Log->entry({
		source  => $THIS_FILE,
		line    => __LINE__,
		'print' => 1,
		level   => 0,
		key     => "log_0301",
	});
	$anvil->nice_exit({exit_code => 3});
}
else
{
	# We were called in some unexpected way. Log an error, log how we were called and exit with
	# OCF_ERR_GENERIC (1).
	$anvil->Log->entry({
		source  => $THIS_FILE,
		line    => __LINE__,
		'print' => 1,
		level   => 0,
		key     => "log_0302",
	});
	show_environment($anvil, 0);
	$anvil->nice_exit({exit_code => 1});
}

# If we hit here, something very wrong happened.
$anvil->nice_exit({exit_code => 255});
#############################################################################################################
# Functions #
#############################################################################################################
=cut
STATES
The State field lists what state each domain is currently in. A domain can be in one of the following
possible states:
running - The domain is currently running on a CPU
idle - The domain is idle, and not running or runnable. This can be caused because the domain is
waiting on IO (a traditional wait state) or has gone to sleep because there was nothing else
for it to do.
paused - The domain has been paused, usually occurring through the administrator running virsh suspend.
When in a paused state the domain will still consume allocated resources like memory, but will
not be eligible for scheduling by the hypervisor.
in shutdown - The domain is in the process of shutting down, i.e. the guest operating system has been
notified and should be in the process of stopping its operations gracefully.
shut off - The domain is not running. Usually this indicates the domain has been shut down completely, or
has not been started.
crashed - The domain has crashed, which is always a violent ending. Usually this state can only occur if
the domain has been configured not to restart on crash.
pmsuspended - The domain has been suspended by guest power management, e.g. entered into s3 state.
=cut
# This boots the server if possible.
#
# Parameters;
#   $anvil - The Anvil::Tools handle created at the top of this program.
#
# Returns;
#   Exits the program via 'nice_exit' instead of returning to the caller.
#
# NOTE: The boot logic itself is not written yet. Only the validation call exists, so this currently always
#       ends by logging 'log_0311' and exiting with OCF_ERR_GENERIC (1). A bare 'die' that used to follow the
#       validation call was removed; it ended the program with a non-OCF exit status and made the error log
#       and the 'nice_exit' call below unreachable.
sub start_server
{
	my ($anvil) = @_;
	
	# Planned start procedure (only the 'validate_all' call is implemented so far);
	# 1.   Read the XML definition file and find the backing storage and bridges. Soft error if read fails.
	# 2.   Make sure the name matches.
	# 3.   Make sure we have enough free RAM.
	# 4.   Make sure the emulator exists (can be an issue after migrating from a different gen Anvil!).
	# 5.1. Make sure optical drives with mounted data have the disk present. Soft error if not.
	# 5.2. Find any backing DRBD devices
	# 6.   For each DRBD device;
	# 6.1. Make sure the backing LV is ACTIVE. Soft error if not.
	# 6.2. Check if the drbd resource is up. If not, up it.
	# 6.3. Make sure the backing disk is UpToDate. Soft error if not.
	# 6.4. Make sure the backing device is 'Connected' or 'Connecting'. Call a connect if not.
	# 7.   Make sure all bridges exist and soft error if not.
	# 8.   Start the server.
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0303", variables => { server => $server }});
	
	# Make sure things are sane.
	validate_all($anvil);
	
	# If we're still alive, then we didn't see the server in the list of running servers, which is really
	# weird. Report the failure with a proper OCF exit code.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0311", variables => { server => $server }});
	$anvil->nice_exit({exit_code => 1});
}
# This shuts down the server if possible.
#
# The plan is to check whether the server is running and, if so, stop it, then stop the DRBD resource on both
# nodes. None of that is implemented yet; the server name is read and the program exits with OCF_SUCCESS (0).
# NOTE(review): Reporting success without stopping anything is only safe while this agent is not in use.
sub stop_server
{
	my $anvil       = shift;
	my $server_name = $anvil->data->{environment}{OCF_RESKEY_name};
	
	$anvil->nice_exit({exit_code => 0});
}
# This checks the status of the server.
#
# The intended contract is to exit with OCF_SUCCESS (0) if the named server is running, OCF_NOT_RUNNING (7)
# if it is not, and OCF_ERR_GENERIC (1) if it has failed. The check itself is not implemented yet; after
# making sure a timeout is set, this always exits with 0.
sub server_status
{
	my $anvil  = shift;
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	my $state  = "";
	
	### NOTE: When pacemaker is first starting, virsh won't be up right away. So if we get a return code
	###       of '1', we'll try again up to 50% of 'environment::OCF_RESKEY_CRM_meta_timeout'.
	my $timeout_is_set = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} ? 1 : 0;
	if (not $timeout_is_set)
	{
		# Fall back to a sane default of 20 seconds (the value is in milliseconds) and warn about it.
		# NOTE(review): The log variable is named 'logout'; confirm that this is the name the
		#               'log_0331' string expects.
		$anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} = 20000;
		$anvil->Log->entry({
			source    => $THIS_FILE,
			line      => __LINE__,
			'print'   => 1,
			level     => 1,
			priority  => "warn",
			key       => "log_0331",
			variables => { logout => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} },
		});
	}
	
	$anvil->nice_exit({exit_code => 0});
}
# Migrate the server.
#
# Plan; for 'migrate_to', make sure the storage is UpToDate on the peer for all backing resources (the
# target's bridges can't be checked, but the migration will fail if one is missing). For 'migrate_from', the
# server is being pulled towards us, so both bridges and storage can be checked.
#
# None of those checks exist yet. The server, source and target are logged, then 'log_0360' is logged and the
# program exits with OCF_SUCCESS (0).
sub migrate_server
{
	my $anvil  = shift;
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	my $source = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source};
	my $target = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target};
	
	$anvil->Log->variables({
		source => $THIS_FILE,
		line   => __LINE__,
		level  => 2,
		list   => {
			server => $server,
			source => $source,
			target => $target,
		},
	});
	
	# If we made it here, we succeeded.
	$anvil->Log->entry({
		source  => $THIS_FILE,
		line    => __LINE__,
		'print' => 1,
		level   => 1,
		key     => "log_0360",
	});
	$anvil->nice_exit({exit_code => 0});
}
# Validation is meant to check that we have the definition XML and resource config, and that the needed apps
# are installed. At present it logs 'log_0361', asks Anvil::Tools for the server's status and returns '0'.
sub validate_all
{
	my $anvil = shift;
	
	$anvil->Log->entry({
		source  => $THIS_FILE,
		line    => __LINE__,
		'print' => 1,
		level   => 2,
		key     => "log_0361",
	});
	
	### TODO: When we have actual Anvil! systems, connect to the peers (nodes / DR) for this host and see
	###       if the server is running elsewhere.
	
	# Read in and parse the server's XML.
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	$anvil->Server->get_status({debug => 2, server => $server});
	
	return 0;
}
# This is meant to ensure that the bridges the server connects to exist on this node.
#
# NOTE: Not implemented yet; no checks are made and '0' is always returned.
sub validate_bridges
{
	my $anvil = shift;
	
	return 0;
}
# This is meant to look up the disks and optical media connected to this server.
#
# NOTE: Not implemented yet; nothing is looked up and '0' is always returned.
sub validate_storage
{
	my $anvil = shift;
	
	return 0;
}
# This is meant to make sure that the needed backing DRBD devices are on this node, bring them up if they are
# not up, and error out if that fails.
#
# NOTE: Not implemented yet; no checks are made and '0' is always returned.
sub validate_storage_drbd
{
	my $anvil = shift;
	
	return 0;
}
# This is meant to process the DRBD setup JSON data.
#
# Parameters;
#   $anvil       - The Anvil::Tools handle.
#   $status_json - The JSON to process. For now it is only logged (at level 3).
#
# Returns '0'. No processing is implemented yet.
sub check_drbd_status
{
	my $anvil       = shift;
	my $status_json = shift;
	
	$anvil->Log->variables({
		source => $THIS_FILE,
		line   => __LINE__,
		level  => 3,
		list   => { status_json => $status_json },
	});
	
	return 0;
}
# This is meant to make sure that any media in the server's optical drive exists here and is readable.
#
# NOTE: Not implemented yet; no checks are made and '0' is always returned.
sub validate_storage_optical
{
	my $anvil = shift;
	
	return 0;
}
# This is meant to verify that the requested emulator exists and can be used.
#
# NOTE: Not implemented yet; no checks are made and '0' is always returned.
sub validate_emulator
{
	my $anvil = shift;
	
	return 0;
}
# This is meant to make sure the name we see in the definition file matches what we expect.
#
# NOTE: Not implemented yet; no checks are made and '0' is always returned.
sub validate_name
{
	my $anvil = shift;
	
	return 0;
}
# This is meant to check that there is enough RAM to run this server.
#
# NOTE: Not implemented yet; no checks are made and '0' is always returned.
sub validate_ram
{
	my $anvil = shift;
	
	return 0;
}
### TODO: Make sure the appropriate SN ports are opened.
# This is meant to stop (drbdadm down <server>) the storage for a given server on both nodes.
#
# Parameters;
#   $anvil    - The Anvil::Tools handle.
#   $task     - Accepted but not used yet.
#   $resource - Accepted but not used yet.
#
# NOTE: Not implemented yet; nothing is done and '0' is always returned.
sub manage_drbd_resource
{
	my $anvil    = shift;
	my $task     = shift;
	my $resource = shift;
	
	return 0;
}
# This is meant to read the XML definition data into an XML data hash.
#
# NOTE: Not implemented yet. The server's name is logged (at level 3) and '0' is returned.
sub read_server_definition
{
	my $anvil  = shift;
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	
	$anvil->Log->variables({
		source => $THIS_FILE,
		line   => __LINE__,
		level  => 3,
		list   => { server => $server },
	});
	
	return 0;
}
# This logs the details of this call.
#
# Parameters;
#   $anvil - The Anvil::Tools handle.
#   $level - The log level passed to each 'Log->variables' call.
#
# Logged, in order; the non-empty switches (except 'raw'), the non-empty 'environment::' values, every %ENV
# variable that is not tracked under 'environment::', and each element of @ARGV. Returns '0'.
sub show_environment
{
	my ($anvil, $level) = @_;
	
	# Command line switches.
	foreach my $switch (sort keys %{$anvil->data->{switches}})
	{
		next if $switch eq "raw";
		my $value = $anvil->data->{switches}{$switch};
		next if $value eq "";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "switches::${switch}" => $value }});
	}
	
	# The environment variables we copied in at start-up.
	foreach my $variable (sort keys %{$anvil->data->{environment}})
	{
		my $value = $anvil->data->{environment}{$variable};
		next if $value eq "";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "environment::${variable}" => $value }});
	}
	
	# Everything else in the process environment.
	foreach my $variable (sort keys %ENV)
	{
		if (not exists $anvil->data->{environment}{$variable})
		{
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "ENV::${variable}" => $ENV{$variable} }});
		}
	}
	
	# The raw command line arguments.
	foreach my $argument (@ARGV)
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "ARGV" => $argument }});
	}
	
	return 0;
}
# This is meant to print a quick usage message. No message is written yet, so for now it simply exits with
# OCF_SUCCESS (0).
sub show_usage
{
	my $anvil = shift;
	
	### TODO: How to use this...
	
	$anvil->nice_exit({exit_code => 0});
}
# This prints out the metadata and exits.
#
# Parameters;
#   $anvil - The Anvil::Tools handle; used here only to exit cleanly.
#
# The metadata XML is printed to STDOUT, then the program exits with OCF_SUCCESS (0). The only change to the
# emitted text is the corrected spelling of 'bridge' in the long description (it read 'brodge').
#
# NOTE(review): The 'notify' action is advertised below, but the dispatcher in this program answers 'notify'
#               with OCF_ERR_UNIMPLEMENTED (3). Confirm which of the two is intended.
# NOTE(review): The agent name uses the prefix 'ocs:'; confirm whether 'ocf:' was meant.
sub show_metadata
{
	my ($anvil) = @_;
	
	# This is a pretty simple agent, by design. We only take a server name for now.
	print '<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ocs:alteeve:server" version="0.1">
<version>1.0</version>
<longdesc lang="en">
This resource agent manages KVM+qemu virtual servers on an Anvil! m3 Intelligent Availability(tm) system.
It manages underlying components like DRBD 9 storage resources, bridge connections and so forth.
</longdesc>
<shortdesc lang="en">Anvil! m3 server resource agent</shortdesc>
<parameters>
<parameter name="name" unique="1" required="1">
<longdesc lang="en">
This is the name of the server as reported by virsh.
</longdesc>
<shortdesc lang="en">Server name</shortdesc>
<content type="string"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="30" />
<action name="stop" timeout="60" />
<action name="monitor" timeout="10" />
<action name="notify" timeout="20" />
<action name="migrate_to" timeout="600" />
<action name="migrate_from" timeout="600" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="20" />
</actions>
</resource-agent>
';
	
	$anvil->nice_exit({exit_code => 0});
}