934c9b1286
Signed-off-by: Digimer <digimer@alteeve.ca>
1319 lines
55 KiB
Perl
Executable File
1319 lines
55 KiB
Perl
Executable File
#!/usr/bin/perl
|
||
#
|
||
# This is the resource agent used to manage servers on the Anvil! Intelligent Availability platform.
|
||
#
|
||
# License: GNU General Public License (GPL) v2+
|
||
# (c) 1997-2019 - Alteeve's Niche! Inc.
|
||
#
|
||
# WARNING: This is a pretty purpose-specific resource agent. No effort was made to test this on an rgmanager
|
||
# cluster or on any configuration outside how the Anvil! m3 uses it. If you plan to adapt it to
|
||
# another purpose, let us know and we'll try to help.
|
||
#
|
||
# Based on: https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc
|
||
#
|
||
# Error types from pacemaker's perspective;
|
||
#
|
||
# - Soft Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource
|
||
# in-place - usually by restarting the resource on the same node.
|
||
# - Hard Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource
|
||
# which failed with this error by restarting the resource on a different node.
|
||
# - Fatal Error - This is a cluster-wide error, it would make no sense to recover such a resource on a
|
||
# different node, let alone in-place. When a resource fails with this error, Pacemaker will
|
||
# attempt to shut down the resource, and wait for administrator intervention.
|
||
#
|
||
# Exit codes;
|
||
# 0 - OCF_SUCCESS
|
||
# - The action completed successfully. This is the expected return code for any successful start, stop,
|
||
# migrate_to, meta_data, help, and usage action.
|
||
# - For monitor, however, a modified convention applies:
|
||
# - If the server is running we return, OCF_SUCCESS. If not running and gracefully stopped or migrated
|
||
# off, return OCF_NOT_RUNNING.
|
||
#
|
||
# 1 - OCF_ERR_GENERIC
|
||
# - The action returned a generic error. This is used only when none of the more specific error codes,
|
||
# defined below, accurately describes the problem.
|
||
# - Pacemaker interprets this exit code as a soft error.
|
||
#
|
||
# 2 - OCF_ERR_ARGS
|
||
#   - The resource’s configuration is not valid on this machine. This can happen if the server fails to boot
|
||
# because of a missing bridge, for example.
|
||
#
|
||
# 3 - OCF_ERR_UNIMPLEMENTED
|
||
# - The resource agent was instructed to execute an action that we do not implement.
|
||
# - Not all resource agent actions are mandatory. We don't implement 'promote' or 'demote'. We do implement
|
||
# 'migrate_to', 'migrate_from', and 'notify'. If we're misconfigured as a master/slave resource, for
|
||
#     example, then we will alert the user about this misconfiguration by returning OCF_ERR_UNIMPLEMENTED.
|
||
#
|
||
# 4 - OCF_ERR_PERM
|
||
# - The action failed due to insufficient permissions. This may be due to a node not being able to open a
|
||
# definition file or resource config.
|
||
# - Pacemaker interprets this exit code as a hard error.
|
||
#
|
||
# 5 - OCF_ERR_INSTALLED
|
||
# - The action failed because a required component is missing on the node where the action was executed.
|
||
#     This may be due to a required binary not being executable, or the DRBD resource config file not
|
||
# existing.
|
||
# - Pacemaker interprets this exit code as a hard error.
|
||
#
|
||
# 6 - OCF_ERR_CONFIGURED
|
||
# - The action failed because the user misconfigured the resource in pacemaker. For example, the user may
|
||
# have configured an alphanumeric string for a parameter that really should be an integer.
|
||
# - Pacemaker interprets this exit code as a fatal error.
|
||
#
|
||
# 7 - OCF_NOT_RUNNING
|
||
# - The resource was found not to be running. This is an exit code that may be returned by the monitor
|
||
# action exclusively. Note that this implies that the resource has either gracefully shut down, or has
|
||
# never been started.
|
||
#
|
||
# 8 - OCF_RUNNING_MASTER
|
||
# 9 - OCF_FAILED_MASTER
|
||
# - These OCF exit codes are not used here.
|
||
#
|
||
|
||
# NOTE: This agent loads Anvil::Tools (see the 'use' lines below) and relies on it for logging, DRBD and
#       server management.
|
||
use strict;
|
||
use warnings;
|
||
use Anvil::Tools;
|
||
use XML::Simple;
|
||
use JSON;
|
||
use Math::BigInt;
|
||
use Data::Dumper;
|
||
|
||
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

# Split '$0' into the directory we were called from and the program name. When '$0' has no directory 
# component (ie: the agent was run as 'perl server'), the whole of '$0' is the program name and the running
# directory is '.'. Doing this with a single match avoids interpolating the program name into a second regex,
# where an undefined or metacharacter-laden name would misbehave.
my ($running_directory, $THIS_FILE) = ($0 =~ /^(.*)\/(.*)$/) ? ($1, $2) : (".", $0);

# Expand a leading '.' into the current working directory, but only when it is a whole path component ('.' or
# './foo'). A leading '..' is left alone; blindly replacing its first dot would produce a bogus path.
if (($running_directory =~ /^\.(?:\/|$)/) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}
|
||
|
||
# NOTE: Setting 'log_level' and 'log_secure' here will get overridden in the main loop. Use the Log methods
|
||
# in the loop as well to override defaults in code.
|
||
# Create the Anvil::Tools handle used by everything below, then set the starting log level and turn on secure
# logging.
my $anvil = Anvil::Tools->new();
$anvil->Log->level({set => 2});
$anvil->Log->secure({set => 1});

### Read or Set the environment variables
# Each variable named below is copied from %ENV into 'environment::<name>'. A variable that is not set in the
# environment is stored as an empty string. The trailing comments show example values.
foreach my $variable (
	"OCF_RESKEY_name",                     # The name of the server we're managing;   srv01-c7
	"OCF_RESKEY_CRM_meta_on_node",         # Our node name;                           el8-a01n01.digimer.ca
	"OCF_RESKEY_CRM_meta_on_node_uuid",    # Says "UUID", but it's the node ID;       1
	"OCF_RESKEY_CRM_meta_timeout",         # Timeout for the called action, in ms;    20000
	"PCMK_debug",                          # If set, we bump our log level as well;   0
	# These are other variables that are set, but we don't currently care about them.
	"OCF_EXIT_REASON_PREFIX",              # ocf-exit-reason:
	"OCF_RA_VERSION_MAJOR",                # 1
	"OCF_RA_VERSION_MINOR",                # 0
	"OCF_RESKEY_crm_feature_set",          # 3.0.12
	"OCF_RESOURCE_INSTANCE",               # srv01-c7
	"OCF_RESOURCE_PROVIDER",               # alteeve
	"OCF_RESOURCE_TYPE",                   # server
	"OCF_ROOT",                            # /usr/lib/ocf
	# These are set during a migration.
	"OCF_RESKEY_CRM_meta_migrate_source",  # el8-a01n01.digimer.ca
	"OCF_RESKEY_CRM_meta_migrate_target",  # el8-a01n02.digimer.ca
	"OCF_RESKEY_CRM_meta_record_pending",  # true
)
{
	$anvil->data->{environment}{$variable} = defined $ENV{$variable} ? $ENV{$variable} : "";
}

# If pacemaker is in debug, so are we.
$anvil->Log->level({set => 3}) if $anvil->data->{environment}{PCMK_debug};
|
||
|
||
# Originally, this was designed to start and stop a server's DRBD resources on demand. Early testing appears
# to show this prone to higher risk of fencing if something goes wrong. As such, we're changing the default 
# behaviour to leave DRBD resources up. Set this to '1' (here or by switch) to revert back to the old 
# behaviour.
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources} = 0;

# Get any command line switches.
$anvil->Get->switches;

# The 'stop_drbd_resources' switch re-enables the old behaviour. It has to set the same key that 
# stop_server() tests before calling stop_drbd_resource(). It must not touch 
# 'OCF_RESKEY_CRM_meta_migrate_source', which holds the migration source node name.
if ($anvil->data->{switches}{stop_drbd_resources})
{
	$anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources} = 1;
}
|
||
|
||
# Something for the logs
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 3, key => "log_0298"});

### TEST: to be removed later
# Each test switch overrides a set of 'environment::<name>' values so the agent can be exercised by hand. The
# profiles are applied in order, so if more than one test switch is given, the later one wins for any value
# they share.
#  - test1: Migrate 'test_server' from el8-a01n01 to el8-a01n02.
#  - test2: Migrate 'test_server' from el8-a01n02 to el8-a01n01.
#  - test3: Boot or shutdown of 'test_server'.
foreach my $test (
	{
		switch    => "test1",
		overrides => {
			OCF_RESKEY_name                    => "test_server",
			OCF_RESKEY_CRM_meta_timeout        => 20000,
			OCF_RESKEY_CRM_meta_on_node        => "el8-a01n01.digimer.ca",
			OCF_RESKEY_CRM_meta_migrate_source => "el8-a01n01.digimer.ca",
			OCF_RESKEY_CRM_meta_migrate_target => "el8-a01n02.digimer.ca",
		},
	},
	{
		switch    => "test2",
		overrides => {
			OCF_RESKEY_name                    => "test_server",
			OCF_RESKEY_CRM_meta_timeout        => 20000,
			OCF_RESKEY_CRM_meta_on_node        => "el8-a01n02.digimer.ca",
			OCF_RESKEY_CRM_meta_migrate_source => "el8-a01n02.digimer.ca",
			OCF_RESKEY_CRM_meta_migrate_target => "el8-a01n01.digimer.ca",
		},
	},
	{
		switch    => "test3",
		overrides => {
			OCF_RESKEY_name             => "test_server",
			OCF_RESKEY_CRM_meta_on_node => "el8-a01n01.digimer.ca",
		},
	},
)
{
	next if not $anvil->data->{switches}{$test->{switch}};
	
	foreach my $variable (sort keys %{$test->{overrides}})
	{
		$anvil->data->{environment}{$variable} = $test->{overrides}{$variable};
	}
}
|
||
|
||
# This is for debugging. Every action dumps the environment at log level 3.
# NOTE(review): This was an if/else on the monitor / status / meta-data switches, but both branches made this
#               exact call, so the test had no effect. If the other actions were meant to log at a different
#               level, that distinction needs to be restored here - confirm the intended level.
show_environment($anvil, 3);
|
||
|
||
### What are we being asked to do?
# start         - Starts the resource.
# stop          - Shuts down the resource.
# monitor       - (status aliases here) Queries the resource for its state.
# meta-data     - Dumps the resource agent metadata.
# promote       - Turns a resource into the Master role (Master/Slave resources only). Not implemented.
# demote        - Turns a resource into the Slave role (Master/Slave resources only). Not implemented.
# migrate_to    - Live migration of the server; handled by migrate_server().
# migrate_from  - Live migration of the server; handled by migrate_server().
# validate-all  - Validates a resource’s configuration.
# help          - (usage maps here) Displays a usage message when the resource agent is invoked from the 
#                 command line, rather than by the cluster manager.
# notify        - Inform resource about changes in state of other clones. Not implemented.
#
# Exactly one branch below is expected to run. The first matching switch wins, and each branch either exits
# itself or calls a handler; the final 'exit 255' is only reached if a handler returns.
if ($anvil->data->{switches}{start})
{
	# Start the server
	start_server($anvil);
}
elsif ($anvil->data->{switches}{stop})
{
	# Stop the server
	stop_server($anvil);
}
elsif (($anvil->data->{switches}{monitor}) or ($anvil->data->{switches}{status}))
{
	# Report the status of the server.
	server_status($anvil);
}
elsif (($anvil->data->{switches}{'meta-data'}) or 
       ($anvil->data->{switches}{metadata})    or 
       ($anvil->data->{switches}{metadaata}))
{
	# Show the agent's metadata. The misspelt 'metadaata' is what was historically accepted, so it is kept
	# for compatibility next to the correctly spelt 'metadata'.
	show_metadata($anvil);
}
elsif ($anvil->data->{switches}{promote})
{
	# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0299", variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} }});
	$anvil->nice_exit({exit_code => 3});
}
elsif ($anvil->data->{switches}{demote})
{
	# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0300", variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} }});
	$anvil->nice_exit({exit_code => 3});
}
elsif (($anvil->data->{switches}{migrate_to}) or ($anvil->data->{switches}{migrate_from}))
{
	# Migrate the server. Both directions are handled by the same function, which checks the switches 
	# itself.
	migrate_server($anvil);
}
elsif ($anvil->data->{switches}{'validate-all'})
{
	# Validate our local config and setup.
	validate_all($anvil);
	$anvil->nice_exit({exit_code => 0});
}
elsif (($anvil->data->{switches}{help}) or ($anvil->data->{switches}{usage}))
{
	# Show the usage information
	show_usage($anvil);
	$anvil->nice_exit({exit_code => 0});
}
elsif ($anvil->data->{switches}{notify})
{
	# We don't implement this, so we return OCF_ERR_UNIMPLEMENTED (3)
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "log_0301"});
	$anvil->nice_exit({exit_code => 3});
}
else
{
	# We were called in some unexpected way. Log an error, show the environment and exit with 
	# OCF_ERR_GENERIC (1).
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "log_0302"});
	show_environment($anvil, 0);
	$anvil->nice_exit({exit_code => 1});
}

# If we hit here, something very wrong happened.
$anvil->nice_exit({exit_code => 255});
|
||
|
||
|
||
#############################################################################################################
|
||
# Functions #
|
||
#############################################################################################################
|
||
|
||
=pod
|
||
|
||
STATES
|
||
|
||
The State field lists what state each domain is currently in. A domain can be in one of the following
|
||
possible states:
|
||
|
||
running - The domain is currently running on a CPU
|
||
idle - The domain is idle, and not running or runnable. This can be caused because the domain is
|
||
waiting on IO (a traditional wait state) or has gone to sleep because there was nothing else
|
||
for it to do.
|
||
paused - The domain has been paused, usually occurring through the administrator running virsh suspend.
|
||
When in a paused state the domain will still consume allocated resources like memory, but will
|
||
not be eligible for scheduling by the hypervisor.
|
||
in shutdown - The domain is in the process of shutting down, i.e. the guest operating system has been
|
||
notified and should be in the process of stopping its operations gracefully.
|
||
shut off - The domain is not running. Usually this indicates the domain has been shut down completely, or
|
||
has not been started.
|
||
crashed - The domain has crashed, which is always a violent ending. Usually this state can only occur if
|
||
the domain has been configured not to restart on crash.
|
||
pmsuspended - The domain has been suspended by guest power management, e.g. entered into s3 state.
|
||
=cut
|
||
|
||
# This boots the server if possible.
# 
# Takes the Anvil::Tools handle. It does not return to the caller on the normal paths; each outcome ends in
# a call to nice_exit().
sub start_server
{
	my ($anvil) = @_;
	
	# What happens here, in order;
	# 1. Run validate_all() as a sanity check.
	# 2. Run find_server(), which exits on its own if the server is found to be running somewhere.
	# 3. Run start_drbd_resource() to bring up the backing DRBD resource(s), if needed.
	# 4. Ask Server->boot() to start the server and exit according to the result.
	# 
	# NOTE(review): The original design notes for this function list the intended pre-boot checks as;
	#               definition file readable, name matches, enough free RAM, emulator exists, optical media
	#               present, backing LVs active, DRBD UpToDate and connected, bridges exist. Those are 
	#               presumably done inside validate_all() - confirm against that function.
	my $server_name = $anvil->data->{environment}{OCF_RESKEY_name};
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0303", variables => { server => $server_name }});
	
	# Make sure things are sane.
	validate_all($anvil);
	
	# Is the server already running somewhere?
	find_server($anvil);
	
	# Start the resource, if needed.
	start_drbd_resource($anvil);
	
	# Still alive? Boot!
	my ($booted) = $anvil->Server->boot({debug => 2, server => $server_name});
	if (not $booted)
	{
		# The boot failed. Report where the server was last seen, if anywhere, and exit with 
		# OCF_ERR_CONFIGURED (6).
		my $last_host = $anvil->data->{server}{location}{$server_name}{host};
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0310", variables => { 
			server  => $server_name, 
			'state' => defined $last_host ? $last_host : "#!string!unit_0003!#",
		}});
		$anvil->nice_exit({exit_code => 6});
	}
	else
	{
		# Success!
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0309", variables => { server => $server_name }});
		$anvil->nice_exit({exit_code => 0});
	}
	
	# Both branches above exit, so getting here would be really weird. Log it and exit with 
	# OCF_ERR_GENERIC (1).
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0311", variables => { server => $server_name }});
	$anvil->nice_exit({exit_code => 1});
}
|
||
|
||
# This stops the DRBD resource(s) that ran under a server.
# 
# Takes the Anvil::Tools handle and always returns '0'. For every resource listed under 
# 'server::local::<server>::resource', the peer's side is taken down first, then the local side.
sub stop_drbd_resource
{
	my ($anvil) = @_;
	
	my $server_name = $anvil->data->{environment}{OCF_RESKEY_name};
	my $local_host  = $anvil->_short_host_name;
	my $peer_host   = $anvil->data->{drbd}{config}{$local_host}{peer};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		server => $server_name, 
		host   => $local_host, 
		peer   => $peer_host, 
	}});
	
	# Stop DRBD, one resource at a time.
	for my $drbd_resource (sort keys %{$anvil->data->{server}{'local'}{$server_name}{resource}})
	{
		my $peer_address = $anvil->data->{drbd}{config}{$local_host}{resource}{$drbd_resource}{connection}{$peer_host}{ip_address};
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0408", variables => { 
			server   => $server_name,
			peer     => $peer_host,
			peer_ip  => $peer_address,
			resource => $drbd_resource,
		}});
		
		# The same 'down' request is used for both sides; only the peer's call names a target.
		my %down_request = (
			resource => $drbd_resource, 
			task     => "down", 
		);
		
		# Bring the peer's resource down.
		$anvil->DRBD->manage_resource({%down_request, target => $peer_address});
		
		# Bring the local resource down
		$anvil->DRBD->manage_resource({%down_request});
	}
	
	return(0);
}
|
||
|
||
# This starts the drbd resource(s) for the requested server, if needed.
# 
# Takes the Anvil::Tools handle and always returns '0'. If every resource under 
# 'server::local::<server>::resource' already reports a role of 'primary' or 'secondary', nothing is done.
# Otherwise, each resource is brought up locally and on the peer, we wait for it to become usable, and then
# (unless 'auto-promote' is set) each resource is promoted to primary.
sub start_drbd_resource
{
	my ($anvil) = @_;
	
	my $server_name = $anvil->data->{environment}{OCF_RESKEY_name};
	my $local_host  = $anvil->_short_host_name;
	my $peer_host   = $anvil->data->{drbd}{config}{$local_host}{peer};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		server => $server_name, 
		host   => $local_host, 
		peer   => $peer_host, 
	}});
	
	# Do we need startup? We stop looking at the first resource that is neither primary nor secondary.
	my $need_startup = 0;
	$anvil->DRBD->get_status({debug => 3});
	for my $drbd_resource (sort keys %{$anvil->data->{server}{'local'}{$server_name}{resource}})
	{
		my $current_role = $anvil->data->{drbd}{status}{$local_host}{resource}{$drbd_resource}{role};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { role => $current_role }});
		
		if ((lc($current_role) eq "secondary") || (lc($current_role) eq "primary"))
		{
			# This resource is up locally already.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0434", variables => { 
				resource => $drbd_resource,
				role     => $current_role,
			}});
		}
		else
		{
			$need_startup = 1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $need_startup }});
			last;
		}
	}
	
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $need_startup }});
	unless ($need_startup)
	{
		# Everything is up already, we're done.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0431"});
		return(0);
	}
	
	# Start DRBD, one resource at a time.
	for my $drbd_resource (sort keys %{$anvil->data->{server}{'local'}{$server_name}{resource}})
	{
		my $peer_address = $anvil->data->{drbd}{config}{$local_host}{resource}{$drbd_resource}{connection}{$peer_host}{ip_address};
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0419", variables => { 
			server   => $server_name,
			peer     => $peer_host,
			peer_ip  => $peer_address,
			resource => $drbd_resource,
		}});
		
		# The same 'up' request is used for both sides; only the peer's call names a target.
		my %up_request = (
			resource => $drbd_resource, 
			task     => "up", 
		);
		
		# Bring the local resource up
		$anvil->DRBD->manage_resource({%up_request});
		
		# Bring the peer's resource up.
		$anvil->DRBD->manage_resource({%up_request, target => $peer_address});
		
		# Now wait for it to be connected or UpToDate. The status is re-read on every pass, with a one 
		# second pause between passes. There is no upper bound on how long this waits.
		my $waiting = 1;
		while ($waiting)
		{
			$anvil->DRBD->get_status({debug => 3});
			
			my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$drbd_resource}{connection}{$peer_host}{'connection-state'};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				connection_state => $connection_state,
			}});
			
			my $all_ready = 1;
			for my $volume (sort keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$drbd_resource}{devices}{volume}})
			{
				my $disk_state        = $anvil->data->{drbd}{status}{$local_host}{resource}{$drbd_resource}{devices}{volume}{$volume}{'disk-state'};
				my $replication_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$drbd_resource}{connection}{$peer_host}{volume}{$volume}{'replication-state'};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					disk_state        => $disk_state,
					replication_state => $replication_state, 
				}});
				
				# A volume counts as ready when any one of these holds; the disk is UpToDate, 
				# the replication state starts with 'Sync', or the connection is 'connected'.
				# Otherwise, we need to keep waiting.
				# NOTE(review): The comment this replaces read as "not connected OR not UpToDate
				#               means keep waiting", which is stricter than what the code tests.
				#               The test is reproduced unchanged - confirm which was intended.
				unless ((lc($disk_state) eq "uptodate") || ($replication_state =~ /^Sync/i) || (lc($connection_state) eq "connected"))
				{
					$all_ready = 0;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
				}
			}
			
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
			if ($all_ready)
			{
				$waiting = 0;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
			}
			else
			{
				sleep 1;
			}
		}
	}
	
	# If auto-promote is set, there is nothing left to do.
	return(0) if $anvil->data->{drbd}{config}{$local_host}{'auto-promote'};
	
	# Otherwise, promote the resource(s) ourselves.
	for my $drbd_resource (sort keys %{$anvil->data->{server}{'local'}{$server_name}{resource}})
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0420", variables => { 
			server   => $server_name,
			resource => $drbd_resource,
		}});
		
		# Make the local resource primary.
		$anvil->DRBD->manage_resource({
			resource => $drbd_resource, 
			task     => "primary", 
		});
	}
	
	return(0);
}
|
||
|
||
# This uses the DRBD information to find other peers and see if the server is running on them.
# 
# Takes the Anvil::Tools handle. Server->find() is called against every address under 
# 'drbd::config::<short host name>::ip_addresses', and then 'server::location::<server>' is checked.
# 
# Returns '0' if the server was not found running anywhere. Otherwise this exits and does not return;
#  0 - OCF_SUCCESS;        The server is already running on this host.
#  5 - OCF_ERR_INSTALLED;  The server is running on the peer. This is a hard error, so pacemaker will try to
#                          start the resource on the other node.
#  6 - OCF_ERR_CONFIGURED; The server is running on a DR host (host name ends in 'dr<digits>'). This is a 
#                          fatal error, so pacemaker will not try to start it anywhere else.
sub find_server
{
	my ($anvil) = @_;
	
	my $server     = $anvil->data->{environment}{OCF_RESKEY_name};
	my $local_host = $anvil->_short_host_name;
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0415", variables => { server => $server }});
	
	# Look for servers on each address known to the DRBD config.
	foreach my $ip_address (sort {$a cmp $b} keys %{$anvil->data->{drbd}{config}{$local_host}{ip_addresses}})
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { ip_address => $ip_address }});
		$anvil->Server->find({
			debug       => 3,
			target      => $ip_address, 
			remote_user => "root", 
		});
	}
	
	# Record everything that was found, for debugging.
	foreach my $this_server (sort {$a cmp $b} keys %{$anvil->data->{server}{location}})
	{
		my $this_status = $anvil->data->{server}{location}{$this_server}{status};
		my $this_host   = $anvil->data->{server}{location}{$this_server}{host};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			this_server => $this_server, 
			status      => $this_status, 
			host        => $this_host, 
		}});
	}
	
	if ((exists $anvil->data->{server}{location}{$server}) && ($anvil->data->{server}{location}{$server}{host}))
	{
		# The server is running. If it is running here, exit with success. If it's running elsewhere,
		# exit with a failure.
		my $status     = $anvil->data->{server}{location}{$server}{status};
		my $running_on = $anvil->data->{server}{location}{$server}{host};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			status => $status, 
			host   => $running_on, 
		}});
		
		if ($running_on eq $anvil->_host_name)
		{
			# Already running, we're good, and we're done.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0416", variables => { server => $server }});
			$anvil->nice_exit({exit_code => 0});
		}
		elsif ($running_on =~ /dr\d+$/)
		{
			# The server is running elsewhere. If the peer host is DR, exit with 
			# OCF_ERR_CONFIGURED (6) so that pacemaker doesn't try to also start the server on
			# the other node, because we don't know the state of it here.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0417", variables => { 
				server => $server, 
				host   => $running_on,
			}});
			$anvil->nice_exit({exit_code => 6});
		}
		else
		{
			# It looks like it's running on the peer. So we'll exit OCF_ERR_INSTALLED (5) to tell
			# pacemaker to try to start it on our peer. This must not be '6'; that is a fatal 
			# error and would stop pacemaker from trying the peer at all.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0418", variables => { 
				server => $server, 
				host   => $running_on,
			}});
			$anvil->nice_exit({exit_code => 5});
		}
	}
	
	return(0);
}
|
||
|
||
# This shuts down the server if possible.
# 
# Takes the Anvil::Tools handle. This does not return on the paths visible here; it exits with 
# OCF_ERR_GENERIC (1) if Server->shutdown() reports a failure, and with OCF_SUCCESS (0) otherwise.
sub stop_server
{
	my ($anvil) = @_;
	
	# Stopping the server is simply a question of asking Server->shutdown() to stop it. Once stopped, the
	# DRBD resource(s) are stopped as well, but only if 
	# 'environment::OCF_RESKEY_CRM_meta_stop_drbd_resources' is set.
	my $server_name = $anvil->data->{environment}{OCF_RESKEY_name};
	
	# Check the storage and read in the server's status before trying to stop it.
	$anvil->System->check_storage({debug => 2});
	$anvil->Server->get_status({debug => 2, server => $server_name});
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0313", variables => { server => $server_name }});
	my $stopped = $anvil->Server->shutdown({debug => 2, server => $server_name});
	
	# If something went wrong, the details should be in the logs.
	$anvil->nice_exit({exit_code => 1}) if not $stopped;
	
	# Now stop the DRBD resource(s), if we've been asked to.
	my $stop_drbd = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		'environment::OCF_RESKEY_CRM_meta_stop_drbd_resources' => $stop_drbd,
	}});
	stop_drbd_resource($anvil) if $stop_drbd;
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0324", variables => { server => $server_name }});
	$anvil->nice_exit({exit_code => 0});
}
|
||
|
||
# This checks the status of the server.
# 
# Takes the Anvil::Tools handle and exits via nice_exit(); it does not return.
# 
# NOTE(review): The intended contract is OCF_SUCCESS (0) when the named server is running, 
#               OCF_NOT_RUNNING (7) when it is not, and OCF_ERR_GENERIC (1) when it has failed. The body 
#               below only defaults the timeout and then always exits '0'; '$state' and '$server' are set
#               but never used. The actual check looks to be missing - confirm.
sub server_status
{
	my ($anvil) = @_;
	
	my $state       = "";
	my $server      = $anvil->data->{environment}{OCF_RESKEY_name};
	my $environment = $anvil->data->{environment};
	
	### NOTE: When pacemaker is first starting, virsh won't be up right away. So if we get a return code
	###       of '1', we'll try again up to 50% of 'environment::OCF_RESKEY_CRM_meta_timeout'.
	unless ($environment->{OCF_RESKEY_CRM_meta_timeout})
	{
		# No timeout was given, so set a sane default of 20 seconds and say so in the logs.
		$environment->{OCF_RESKEY_CRM_meta_timeout} = 20000;
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0331", variables => { timeout => $environment->{OCF_RESKEY_CRM_meta_timeout} }});
	}
	
	$anvil->nice_exit({exit_code => 0});
}
|
||
|
||
# Migrate the server.
#
# When 'switches::migrate_to' is set, the server is pushed from this host to the migration target. When
# 'switches::migrate_from' is set, the server is pulled to this host. If neither switch is set, no migration
# is attempted and the call ends with exit code '1'.
#
# Parameters;
#   $anvil - The Anvil! tools object, used for data access, logging, the remote access test and the DRBD and
#            server calls.
#
# This always ends by calling 'nice_exit' with one of;
#   0 - The server was migrated, or it was already found running on the target.
#   1 - The server could not be found, or the migration call did not report success.
#   5 - The target could not be reached, or a DRBD resource did not connect to the peer in time.
# The 'validate_all' call made along the way may end the program with its own exit codes.
sub migrate_server
{
	my ($anvil) = @_;
	
	### NOTE: For now, we're not going to block if the target is not UpToDate. There are times when a
	###       user might want to do this (ie: sync will be done soon and the need to evacuate the node
	###       ASAP is high). Maybe we'll enforce this and require a '--force' switch later?
	# If we were given 'migrate_to', we need to make sure the storage is UpToDate on the peer for all
	# backing resources. We can't check the target's bridges, but the migration will fail if one is
	# missing.
	# If we're given 'migrate_from', we're pulling the server towards us, so we can check both bridges
	# and storage.
	my $server       = $anvil->data->{environment}{OCF_RESKEY_name};
	my $source       = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source};
	my $target       = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target};
	my $meta_on_node = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		server                   => $server,
		source                   => $source,
		target                   => $target,
		meta_on_node             => $meta_on_node,
		'switches::migrate_to'   => $anvil->data->{switches}{migrate_to},
		'switches::migrate_from' => $anvil->data->{switches}{migrate_from},
	}});
	
	# The virsh call will depend on if we're pushing or pulling. '$migrated' stays '0' unless the
	# migration call reports success.
	my $migrated = 0;
	if ($anvil->data->{switches}{migrate_to})
	{
		# Can I even connect to the target?
		my ($access) = $anvil->Remote->test_access({debug => 2, target => $target});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }});
		if (not $access)
		{
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0429", variables => {
				server => $server,
				target => $target,
			}});
			### TODO: I wonder if this should be exit'ed with '6'?
			$anvil->nice_exit({exit_code => 5});
		}
		
		# Find the server
		$anvil->Server->find({debug => 3});
		my $server_host   = defined $anvil->data->{server}{location}{$server}{host}   ? $anvil->data->{server}{location}{$server}{host}   : "";
		my $server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			server_host   => $server_host,
			server_status => $server_status,
		}});
		
		# Is it already on the target?
		if (not $server_status)
		{
			# Maybe...
			$anvil->Server->find({debug => 3, target => $target});
			$server_host   = defined $anvil->data->{server}{location}{$server}{host}   ? $anvil->data->{server}{location}{$server}{host}   : "";
			$server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				server_host   => $server_host,
				server_status => $server_status,
			}});
			
			if (($server_host eq $target) && (($server_status) && ($server_status eq "running")))
			{
				# Already over there, we're done.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0430", variables => {
					server => $server,
					target => $target,
				}});
				$anvil->nice_exit({exit_code => 0});
			}
		}
		
		if (not $server_host)
		{
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0344", variables => { server => $server }});
			$anvil->nice_exit({exit_code => 1});
		}
		
		# Get a view of the servers locally and our peer.
		validate_all($anvil);
		
		# Get the DRBD status.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0387"});
		$anvil->DRBD->get_status({debug => 2});
		
		# Make sure all resource(s) are ready for the server.
		my $all_up_to_date = 1;
		my $host           = $anvil->_short_host_name;
		my $peer_name      = $anvil->data->{drbd}{config}{$host}{peer};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			host      => $host,
			peer_name => $peer_name,
		}});
		foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{'local'}{$server}{resource}})
		{
			my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'};
			my $peer_node_id     = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'};
			
			# There may be no connection data at all for this peer; treat that as "not connected"
			# instead of passing 'undef' to 'lc()'.
			$connection_state = "" if not defined $connection_state;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
				peer_node_id     => $peer_node_id,
				resource         => $resource,
				connection_state => $connection_state,
			}});
			if (lc($connection_state) ne "connected")
			{
				# Try to bring the resource up on the peer now.
				$anvil->DRBD->manage_resource({
					resource => $resource,
					task     => "up",
					target   => $target,
				});
				
				# We'll give it 20 seconds, polling once per second.
				my $wait = 20;
				while($wait)
				{
					$anvil->DRBD->get_status({debug => 2});
					$connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'};
					$connection_state = "" if not defined $connection_state;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
						resource         => $resource,
						connection_state => $connection_state,
					}});
					
					if (lc($connection_state) eq "connected")
					{
						# It's up!
						$wait = 0;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'wait' => $wait }});
					}
					else
					{
						$wait--;
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'wait' => $wait }});
						
						if (not $wait)
						{
							# We're done waiting.
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0428", variables => {
								server           => $server,
								target           => $target,
								resource         => $resource,
								connection_state => $connection_state,
							}});
							### TODO: I wonder if this should be exit'ed with '6'?
							$anvil->nice_exit({exit_code => 5});
						}
						
						# Pause between polls so the counter really is a count of seconds.
						sleep 1;
					}
				}
			}
			foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}})
			{
				my $peer_disk_state   = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'peer-disk-state'};
				my $percent_in_sync   = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'percent-in-sync'};
				my $replication_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
					volume            => $volume,
					peer_disk_state   => $peer_disk_state,
					percent_in_sync   => $percent_in_sync,
					replication_state => $replication_state,
				}});
				
				if (lc($peer_disk_state) ne "uptodate")
				{
					$all_up_to_date = 0;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_up_to_date => $all_up_to_date }});
				}
			}
		}
		
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_up_to_date => $all_up_to_date }});
		if (not $all_up_to_date)
		{
			### TODO: If we decide later to block migration to Inconsistent peers, here's where we'd do it.
		}
		
		# If we're still alive, we're ready to migrate.
		($migrated) = $anvil->Server->migrate({
			debug  => 2,
			server => $server,
			target => $target
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }});
	}
	elsif ($anvil->data->{switches}{migrate_from})
	{
		# Pull the server here. Start by verifying it's on the 'meta_on_node' host.
		# Scan locally and on our peer
		$anvil->Server->find({debug => 2});
		$anvil->Server->find({debug => 2, target => $meta_on_node, refresh => 0});
		
		# Convert the host to a short name, in case the node's name is the short version. If the host
		# name has no domain part the regex doesn't match, so the name is used as-is rather than leaving
		# the short name undefined.
		my $host       = defined $anvil->data->{server}{location}{$server}{host}   ? $anvil->data->{server}{location}{$server}{host}   : "";
		my $short_host = $host =~ /^(.*?)\..*$/ ? $1 : $host;
		my $status     = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			host       => $host,
			short_host => $short_host,
			status     => $status,
			target     => $target,
		}});
		
		my $server_host   = defined $anvil->data->{server}{location}{$server}{host}   ? $anvil->data->{server}{location}{$server}{host}   : "";
		my $server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			server_host   => $server_host,
			server_status => $server_status,
		}});
		
		# This is called after a migration. If that is the case here, the target will be us. Just
		# make sure it is running and, if so, return '0'. The 'meta_on_node' is the new host.
		if (($target eq $anvil->_host_name) or ($target eq $anvil->_short_host_name) or ($target eq $meta_on_node))
		{
			# If it's running, we're successfully out.
			if ((($host eq $target) or ($short_host eq $target)) && ($status eq "running"))
			{
				# Success!
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0347", variables => { server => $server }});
				$anvil->nice_exit({exit_code => 0});
			}
			
			# If we're still alive, we'll proceed as if we're pulling the server to us, and maybe
			# that will work.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0348", variables => { server => $server }});
		}
		
		# Validate as if we were about to boot the server.
		validate_all($anvil);
		
		# Call the pull migration.
		($migrated) = $anvil->Server->migrate({
			debug  => 2,
			server => $server,
			source => $source,
			target => $target
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }});
	}
	
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }});
	if (not $migrated)
	{
		# Exit
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0357"});
		$anvil->nice_exit({exit_code => 1});
	}
	
	# If we made it here, we succeeded.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0360"});
	$anvil->nice_exit({exit_code => 0});
}
|
||
|
||
# Runs the full set of validation steps for the server named by 'environment::OCF_RESKEY_name'; storage and
# server state are loaded, then the name, emulator, RAM (start and migrate_from only), bridges and storage
# are each checked in turn by their own subroutine.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' once every step has been called.
sub validate_all
{
	my $anvil = shift;
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0361"});
	
	### TODO: When we have actual Anvil! systems, connect to the peers (nodes / DR) for this host and see
	###       if the server is running elsewhere.
	
	# Load the storage data, then read in and parse the server's XML.
	$anvil->System->check_storage({debug => 3});
	my $server_name = $anvil->data->{environment}{OCF_RESKEY_name};
	$anvil->Server->get_status({debug => 2, server => $server_name});
	
	# Does the definition file carry the name we expect (and was the XML read at all)?
	validate_name($anvil);
	
	# Is the emulator the server asks for the one we have?
	validate_emulator($anvil);
	
	# The free RAM only matters when the server is about to be booted here.
	my $about_to_boot = (($anvil->data->{switches}{start}) or ($anvil->data->{switches}{migrate_from}));
	validate_ram($anvil) if $about_to_boot;
	
	# Bridges first, then storage (disks and optical media).
	validate_bridges($anvil);
	validate_storage($anvil);
	
	return 0;
}
|
||
|
||
# This ensures that the bridges the server connects to exist on this node.
#
# For each network interface in the server's on-disk definition, the bridge it names is looked up in the
# list of bridges collected by 'System->get_bridges'. If a bridge is not there, or is not marked as found,
# an error is logged and the program ends with exit code '5'.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' if every bridge was found.
sub validate_bridges
{
	my ($anvil) = @_;
	
	# Get my bridge list
	$anvil->System->get_bridges({debug => 3});
	
	# Walk the server's network interfaces and check the bridge each one connects to.
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	foreach my $mac (sort {$a cmp $b} keys %{$anvil->data->{server}{'local'}{$server}{from_disk}{device}{interface}})
	{
		my $bridge = $anvil->data->{server}{'local'}{$server}{from_disk}{device}{interface}{$mac}{bridge};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { bridge => $bridge }});
		
		# See if we have this bridge. It must be looked up by its own name; accepting any bridge on the
		# host that is marked as found would let a server with a missing bridge pass this check. An
		# interface with no bridge name is treated as a missing bridge.
		my $found = 0;
		if ((defined $bridge) && 
		    ($bridge ne "") && 
		    (exists $anvil->data->{'local'}{network}{bridges}{bridge}{$bridge}) && 
		    ($anvil->data->{'local'}{network}{bridges}{bridge}{$bridge}{found}))
		{
			$found = 1;
		}
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { found => $found }});
		
		if ($found)
		{
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 3, key => "log_0368", variables => { bridge => $bridge }});
		}
		else
		{
			# Missing bridge.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0369", variables => { bridge => $bridge }});
			$anvil->nice_exit({exit_code => 5});
		}
	}
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0366"});
	
	return(0);
}
|
||
|
||
# Checks the optical media connected to the server, then hands off to 'validate_storage_drbd' for the disks.
#
# The optical media check is skipped when migrating the server off ('migrate_to') or stopping it. A disc
# image that does not exist ends the program with exit code '5', one that can't be read with '4'.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' when all checks passed.
sub validate_storage
{
	my ($anvil) = @_;
	
	### TODO: When checking on a running server, use 'from_memory'.
	# Use the in-memory definition when it reports a host, otherwise the on-disk definition.
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	my $source = $anvil->data->{server}{'local'}{$server}{from_memory}{host} ? "from_memory" : "from_disk";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
		server => $server,
		source => $source, 
	}});
	
	### TODO: If we're called with a status check and find an ISO file missing, eject it instead of 
	###       failing. For now, we just fault out.
	# Do the optical discs in the drive exist? We skip this check if we're migrating off or shutting down
	# the server.
	my $check_optical = ((exists $anvil->data->{server}{'local'}{$server}{$source}{device}{cdrom}) && (not (($anvil->data->{switches}{migrate_to}) or ($anvil->data->{switches}{stop}))));
	if ($check_optical)
	{
		foreach my $optical_target (sort {$a cmp $b} keys %{$anvil->data->{server}{'local'}{$server}{$source}{device}{cdrom}{target}})
		{
			# A drive with no disc in it has no path; nothing to check.
			my $iso_path = $anvil->data->{server}{'local'}{$server}{$source}{device}{cdrom}{target}{$optical_target}{path};
			next if not $iso_path;
			
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { file => $iso_path }});
			if ((-e $iso_path) && (-r $iso_path))
			{
				# We're OK.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0400", variables => { file => $iso_path }});
			}
			elsif (not -e $iso_path)
			{
				# It doesn't exist. Exit with OCF_ERR_INSTALLED (5).
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0398", variables => { file => $iso_path }});
				$anvil->nice_exit({exit_code => 5});
			}
			else
			{
				# It's there, but we can't read it. Exit with OCF_ERR_PERM (4).
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0399", variables => { file => $iso_path }});
				$anvil->nice_exit({exit_code => 4});
			}
		}
	}
	
	# Verify DRBD devices now
	validate_storage_drbd($anvil);
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0367"});
	
	return 0;
}
|
||
|
||
# This makes sure that each disk device used by the server maps to a DRBD resource, and that the logical 
# volume backing each DRBD device exists on this node. A backing logical volume that is not active is 
# activated. Any failure ends the program with exit code '5'.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' when all checks passed.
sub validate_storage_drbd
{
	my ($anvil) = @_;
	
	# Disks are always read from the on-disk definition here.
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	my $source = "from_disk";
	my $host   = $anvil->_short_host_name;
	
	# Did I find a resource for each disk?
	foreach my $path (sort {$a cmp $b} keys %{$anvil->data->{server}{'local'}{$server}{device}})
	{
		if (not $path)
		{
			next;
		}
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
			"server::local::${server}::device::${path}::resource" => $anvil->data->{server}{'local'}{$server}{device}{$path}{resource},
		}});
		
		# No resource for this device is fatal.
		my $has_resource = $anvil->data->{server}{'local'}{$server}{device}{$path}{resource} ? 1 : 0;
		if (not $has_resource)
		{
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0414", variables => { drbd_device => $path }});
			$anvil->nice_exit({exit_code => 5});
		}
	}
	
	foreach my $disk_target (sort {$a cmp $b} keys %{$anvil->data->{server}{'local'}{$server}{$source}{device}{disk}{target}})
	{
		my $device   = $anvil->data->{server}{'local'}{$server}{$source}{device}{disk}{target}{$disk_target}{path};
		my $resource = $anvil->data->{drbd}{config}{$host}{drbd_path}{$device}{resource};
		my $lv       = $anvil->data->{drbd}{config}{$host}{drbd_path}{$device}{on};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
			host          => $host,
			drbd_device   => $device,
			drbd_resource => $resource,
			on_lv         => $lv,
		}});
		
		# Is the logical volume here and active?
		my $lv_known = (($lv) && (exists $anvil->data->{lvm}{'local'}{lv}{$lv})) ? 1 : 0;
		if (not $lv_known)
		{
			# LV not found
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0374", variables => { drbd_device => $device, lv_path => $lv }});
			$anvil->nice_exit({exit_code => 5});
		}
		elsif (not $anvil->data->{lvm}{'local'}{lv}{$lv}{active})
		{
			# The LV exists but is not active, so try to activate it.
			my $active = $anvil->System->activate_lv({debug => 2, path => $lv});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active => $active }});
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, key => "log_0413", variables => { lv_path => $lv }});
			
			if (not $active)
			{
				# Activation failed.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0375", variables => { drbd_device => $device, lv_path => $lv }});
				$anvil->nice_exit({exit_code => 5});
			}
		}
		
		# LV is good if I am still alive.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, key => "log_0376", variables => { 
			drbd_device => $device,
			lv_path     => $lv,
		}});
	}
	
	### NOTE: Checking/managing firewall ports is an expensive operation, so DRBD ports are permanently 
	###       opened when a resource is created.
	
	return 0;
}
|
||
|
||
# Confirms that the emulator named in the server's on-disk definition exists and is executable. A missing 
# emulator ends the program with exit code '5', a non-executable one with '4'.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' when the emulator is usable.
sub validate_emulator
{
	my $anvil = shift;
	
	# Which emulator does the definition ask for?
	my $server      = $anvil->data->{environment}{OCF_RESKEY_name};
	my $binary_path = $anvil->data->{server}{'local'}{$server}{from_disk}{info}{emulator};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
		emulator                                              => $binary_path,
		"server::local::${server}::from_disk::info::emulator" => $anvil->data->{server}{'local'}{$server}{from_disk}{info}{emulator},
	}});
	
	# Not on disk at all? Exit with OCF_ERR_INSTALLED (5).
	my $exists = -e $binary_path;
	if (not $exists)
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0401", variables => { 
			emulator        => $binary_path,
			definition_file => $anvil->data->{server}{definition_file},
		}});
		$anvil->nice_exit({exit_code => 5});
	}
	
	# There, but not executable? Exit with OCF_ERR_PERM (4).
	my $executable = -x $binary_path;
	if (not $executable)
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0402", variables => { emulator => $binary_path }});
		$anvil->nice_exit({exit_code => 4});
	}
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0364"});
	
	return 0;
}
|
||
|
||
# Confirms that the server's definition XML was read and that the name inside it matches the name we were 
# asked to manage ('environment::OCF_RESKEY_name'). Either failure ends the program with exit code '1'.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' when the names match.
sub validate_name
{
	my $anvil = shift;
	
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
		server                                            => $server,
		"server::local::${server}::from_disk::info::name" => $anvil->data->{server}{'local'}{$server}{from_disk}{info}{name},
	}});
	
	# No XML means the definition wasn't read; the server probably doesn't exist.
	my $have_xml = $anvil->data->{server}{'local'}{$server}{from_disk}{xml} ? 1 : 0;
	if (not $have_xml)
	{
		# NOTE(review): The name logged here comes from 'server::definition_xml', not from the 
		#               'server::local::<server>::from_disk' data used everywhere else - confirm that
		#               this is still populated.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0403", variables => { 
			server => $server,
			name   => $anvil->data->{server}{definition_xml}->{name},
		}});
		$anvil->nice_exit({exit_code => 1});
	}
	
	# Does the definition file use the name we expect?
	my $names_match = ($server ne $anvil->data->{server}{'local'}{$server}{from_disk}{info}{name}) ? 0 : 1;
	if (not $names_match)
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0403", variables => { 
			server => $server,
			name   => $anvil->data->{server}{'local'}{$server}{from_disk}{info}{name},
		}});
		$anvil->nice_exit({exit_code => 1});
	}
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0363"});
	
	return 0;
}
|
||
|
||
# This checks that there is enough RAM to run this server.
#
# The RAM the server needs is read from its on-disk definition and compared against the free memory 
# reported by 'System->get_free_memory'. If the server needs more than is free, an error is logged and the 
# program ends with exit code '1'.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' when there is enough free RAM.
sub validate_ram
{
	my ($anvil) = @_;
	
	# How much RAM does the server need and how much do we have free?
	my $server           = $anvil->data->{environment}{OCF_RESKEY_name};
	my $server_ram_bytes = $anvil->data->{server}{'local'}{$server}{from_disk}{memory};
	my $available        = $anvil->System->get_free_memory({debug => 3});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
		server_ram_bytes => $anvil->Convert->add_commas({number => $server_ram_bytes})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $server_ram_bytes}).")",
		available        => $anvil->Convert->add_commas({number => $available})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $available}).")",
	}});
	if ($server_ram_bytes > $available)
	{
		# Not enough free memory. Note that both 'available_ram' values must be built from 
		# '$available'; using the server's RAM for the human-readable size would report the needed and
		# free amounts as equal.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0404", variables => { 
			name                => $server,
			ram                 => $anvil->Convert->bytes_to_human_readable({'bytes' => $server_ram_bytes}),
			ram_bytes           => $anvil->Convert->add_commas({number => $server_ram_bytes}),
			available_ram       => $anvil->Convert->bytes_to_human_readable({'bytes' => $available}),
			available_ram_bytes => $anvil->Convert->add_commas({number => $available}),
		}});
		$anvil->nice_exit({exit_code => 1});
	}
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0365"});
	
	return(0);
}
|
||
|
||
### TODO: Make sure the appropriate SN ports are opened.
# Placeholder for managing the DRBD storage behind a server (ie: 'drbdadm down <server>' on both nodes). 
# Nothing is implemented yet; the arguments are accepted and ignored.
#
# Parameters;
#   $anvil    - The Anvil! tools object.
#   $task     - The task to perform (currently unused).
#   $resource - The resource to act on (currently unused).
#
# Returns '0'.
sub manage_drbd_resource
{
	my $anvil    = shift;
	my $task     = shift;
	my $resource = shift;
	
	return 0;
}
|
||
|
||
# Placeholder for reading the server's XML definition into a data hash. At present it only logs the name of 
# the server taken from 'environment::OCF_RESKEY_name'.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0'.
sub read_server_definition
{
	my $anvil = shift;
	
	my $server_name = $anvil->data->{environment}{OCF_RESKEY_name};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { server => $server_name }});
	
	return 0;
}
|
||
|
||
# Logs the details of this call; the switches, the collected environment values, the remaining process 
# environment variables and the command line arguments.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#   $level - The log level to record the values at.
#
# Returns '0'.
sub show_environment
{
	my ($anvil, $level) = @_;
	
	# Switches; the 'raw' entry and empty values are not logged.
	foreach my $switch (sort {$a cmp $b} keys %{$anvil->data->{switches}})
	{
		if (($switch ne "raw") && ($anvil->data->{switches}{$switch} ne ""))
		{
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "switches::${switch}" => $anvil->data->{switches}{$switch} }});
		}
	}
	
	# Environment values we collected; empty values are not logged.
	foreach my $name (sort {$a cmp $b} keys %{$anvil->data->{environment}})
	{
		if ($anvil->data->{environment}{$name} ne "")
		{
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "environment::${name}" => $anvil->data->{environment}{$name} }});
		}
	}
	
	# Everything else in the process environment that wasn't covered above.
	foreach my $name (sort {$a cmp $b} keys %ENV)
	{
		if (not exists $anvil->data->{environment}{$name})
		{
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "ENV::${name}" => $ENV{$name} }});
		}
	}
	
	# Finally, the command line arguments, in the order given.
	foreach my $argument (@ARGV)
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "ARGV" => $argument }});
	}
	
	return 0;
}
|
||
|
||
# Placeholder for the usage message. Nothing is printed yet; the program simply ends with exit code '0'.
#
# Parameters;
#   $anvil - The Anvil! tools object.
sub show_usage
{
	my $anvil = shift;
	
	### TODO: How to use this...
	
	$anvil->nice_exit({exit_code => 0});
}
|
||
|
||
# This prints out the resource agent's metadata XML to STDOUT and then ends the program with exit code '0'.
#
# Parameters;
#   $anvil - The Anvil! tools object.
sub show_metadata
{
	my ($anvil) = @_;
	
	# This is a pretty simple agent, by design. We only take a server name for now.
	# NOTE(review): The agent name below uses the prefix 'ocs:'; this may be a typo for 'ocf:' - confirm 
	#               before changing, as it is part of the emitted metadata.
	print '<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ocs:alteeve:server" version="0.1">
<version>1.0</version>
<longdesc lang="en">
This resource agent manages KVM+qemu virtual servers on an Anvil! m3 Intelligent Availability(tm) system.
It manages underlying components like DRBD 9 storage resources, bridge connections and so forth.
</longdesc>
<shortdesc lang="en">Anvil! m3 server resource agent</shortdesc>
<parameters>
<parameter name="name" unique="1" required="1">
<longdesc lang="en">
This is the name of the server as reported by virsh.
</longdesc>
<shortdesc lang="en">Server name</shortdesc>
<content type="string"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="30" />
<action name="stop" timeout="60" />
<action name="monitor" timeout="10" />
<action name="notify" timeout="20" />
<action name="migrate_to" timeout="600" />
<action name="migrate_from" timeout="600" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="20" />
</actions>
</resource-agent>
';
	
	$anvil->nice_exit({exit_code => 0});
}
|