#!/usr/bin/perl
#
# This is the resource agent used to manage servers on the Anvil! Intelligent Availability platform.
#
# License: GNU General Public License (GPL) v2+
# (c) 1997-2020 - Alteeve's Niche! Inc.
#
# WARNING: This is a pretty purpose-specific resource agent. No effort was made to test this on an rgmanager
# cluster or on any configuration outside how the Anvil! m3 uses it. If you plan to adapt it to
# another purpose, let us know and we'll try to help.
#
# Based on: https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc
#
# Error types from pacemaker's perspective;
#
# - Soft Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource
# in-place, usually by restarting the resource on the same node.
# - Hard Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource
# which failed with this error by restarting the resource on a different node.
# - Fatal Error - This is a cluster-wide error, it would make no sense to recover such a resource on a
# different node, let alone in-place. When a resource fails with this error, Pacemaker will
# attempt to shut down the resource, and wait for administrator intervention.
#
# Exit codes;
# 0 - OCF_SUCCESS
# - The action completed successfully. This is the expected return code for any successful start, stop,
# migrate_to, meta_data, help, and usage action.
# - For monitor, however, a modified convention applies:
# - If the server is running, we return OCF_SUCCESS. If it is not running because it was gracefully
# stopped or migrated off, we return OCF_NOT_RUNNING.
#
# 1 - OCF_ERR_GENERIC
# - The action returned a generic error. This is used only when none of the more specific error codes,
# defined below, accurately describes the problem.
# - Pacemaker interprets this exit code as a soft error.
#
# 2 - OCF_ERR_ARGS
# - The resource’s configuration is not valid on this machine. This can happen if the server fails to boot
# because of a missing bridge, for example.
#
# 3 - OCF_ERR_UNIMPLEMENTED
# - The resource agent was instructed to execute an action that we do not implement.
# - Not all resource agent actions are mandatory. We don't implement 'promote', 'demote', or 'notify'. We
# do implement 'migrate_to' and 'migrate_from'. If we're misconfigured as a master/slave resource, for
# example, then we will alert the user about this misconfiguration by returning OCF_ERR_UNIMPLEMENTED.
#
# 4 - OCF_ERR_PERM
# - The action failed due to insufficient permissions. This may be due to a node not being able to open a
# definition file or resource config.
# - Pacemaker interprets this exit code as a hard error.
#
# 5 - OCF_ERR_INSTALLED
# - The action failed because a required component is missing on the node where the action was executed.
# This may be due to a required binary not being executable, or the DRBD resource config file not
# existing.
# - Pacemaker interprets this exit code as a hard error.
#
# 6 - OCF_ERR_CONFIGURED
# - The action failed because the user misconfigured the resource in pacemaker. For example, the user may
# have configured an alphanumeric string for a parameter that really should be an integer.
# - Pacemaker interprets this exit code as a fatal error.
#
# 7 - OCF_NOT_RUNNING
# - The resource was found not to be running. This is an exit code that may be returned by the monitor
# action exclusively. Note that this implies that the resource has either gracefully shut down, or has
# never been started.
#
# 8 - OCF_RUNNING_MASTER
# 9 - OCF_FAILED_MASTER
# - These OCF exit codes are not used here.
#
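#
# For reference, when testing this agent by hand (rather than via pacemaker), you can set the environment
# variables that pacemaker would normally provide. A minimal sketch, assuming the agent is installed as
# /usr/lib/ocf/resource.d/alteeve/server and manages a server named 'srv07-el6';
#
#   OCF_ROOT=/usr/lib/ocf \
#   OCF_RESKEY_name=srv07-el6 \
#   OCF_RESKEY_CRM_meta_timeout=20000 \
#   /usr/lib/ocf/resource.d/alteeve/server --monitor
#
# The install path and server name above are examples only; pacemaker sets these (and more) variables
# automatically when it invokes the agent.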
# NOTE: We try to keep overhead low and this agent as independent as possible, though we do use
# Anvil::Tools.
use strict;
use warnings;
use Anvil::Tools;
use XML::Simple;
use JSON;
use Math::BigInt;
use Data::Dumper;
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
# The name of this file is just 'server', which isn't helpful, so we manually set it to our RA name.
my $THIS_FILE = "ocf:alteeve:server";
my $running_directory = ($0 =~ /^(.*?)\/server/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
$running_directory =~ s/^\./$ENV{PWD}/;
}
# NOTE: Setting 'log_level' and 'log_secure' here will get overridden in the main loop. Use the Log methods
# in the loop as well to override defaults in code.
my $anvil = Anvil::Tools->new();
$anvil->Log->level({set => 2});
$anvil->Log->secure({set => 1});
# If we can connect to a database, we'll set/clear the 'migrating' flag during migrations
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# No databases, exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "error_0003"});
$anvil->nice_exit({exit_code => 1});
}
### Read or set the environment variables (the trailing comments show example values).
# This is the name of the server we're managing.
$anvil->data->{environment}{OCF_RESKEY_name} = defined $ENV{OCF_RESKEY_name} ? $ENV{OCF_RESKEY_name} : ""; # srv07-el6
# This is our node name
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node} = defined $ENV{OCF_RESKEY_CRM_meta_on_node} ? $ENV{OCF_RESKEY_CRM_meta_on_node} : ""; # mk-a02n01.digimer.ca
# This says "UUID", but it's the node ID.
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node_uuid} = defined $ENV{OCF_RESKEY_CRM_meta_on_node_uuid} ? $ENV{OCF_RESKEY_CRM_meta_on_node_uuid} : ""; # 1
# This is the timeout for the called action, in milliseconds.
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} = defined $ENV{OCF_RESKEY_CRM_meta_timeout} ? $ENV{OCF_RESKEY_CRM_meta_timeout} : ""; # 20000
# If this is set, we'll bump our log level as well.
$anvil->data->{environment}{PCMK_debug} = defined $ENV{PCMK_debug} ? $ENV{PCMK_debug} : ""; # 0
# These are other variables that are set, but we don't currently care about them
$anvil->data->{environment}{OCF_EXIT_REASON_PREFIX} = defined $ENV{OCF_EXIT_REASON_PREFIX} ? $ENV{OCF_EXIT_REASON_PREFIX} : ""; # ocf-exit-reason:
$anvil->data->{environment}{OCF_RA_VERSION_MAJOR} = defined $ENV{OCF_RA_VERSION_MAJOR} ? $ENV{OCF_RA_VERSION_MAJOR} : ""; # 1
$anvil->data->{environment}{OCF_RA_VERSION_MINOR} = defined $ENV{OCF_RA_VERSION_MINOR} ? $ENV{OCF_RA_VERSION_MINOR} : ""; # 0
$anvil->data->{environment}{OCF_RESKEY_crm_feature_set} = defined $ENV{OCF_RESKEY_crm_feature_set} ? $ENV{OCF_RESKEY_crm_feature_set} : ""; # 3.0.12
$anvil->data->{environment}{OCF_RESOURCE_INSTANCE} = defined $ENV{OCF_RESOURCE_INSTANCE} ? $ENV{OCF_RESOURCE_INSTANCE} : ""; # srv01-c7
$anvil->data->{environment}{OCF_RESOURCE_PROVIDER} = defined $ENV{OCF_RESOURCE_PROVIDER} ? $ENV{OCF_RESOURCE_PROVIDER} : ""; # alteeve
$anvil->data->{environment}{OCF_RESOURCE_TYPE} = defined $ENV{OCF_RESOURCE_TYPE} ? $ENV{OCF_RESOURCE_TYPE} : ""; # server
$anvil->data->{environment}{OCF_ROOT} = defined $ENV{OCF_ROOT} ? $ENV{OCF_ROOT} : ""; # /usr/lib/ocf
# These are set during a migration
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} = defined $ENV{OCF_RESKEY_CRM_meta_migrate_source} ? $ENV{OCF_RESKEY_CRM_meta_migrate_source} : ""; # mk-a02n01.digimer.ca
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} = defined $ENV{OCF_RESKEY_CRM_meta_migrate_target} ? $ENV{OCF_RESKEY_CRM_meta_migrate_target} : ""; # mk-a02n02.digimer.ca
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_record_pending} = defined $ENV{OCF_RESKEY_CRM_meta_record_pending} ? $ENV{OCF_RESKEY_CRM_meta_record_pending} : ""; # true
# Any variable=value arguments in the resource are set under 'OCF_RESKEY_CRM_meta_'
foreach my $key (sort {$a cmp $b} keys %ENV)
{
next if $key !~ /^OCF_RESKEY_CRM_meta_/;
$anvil->data->{environment}{$key} = $ENV{$key};
}
# If pacemaker is in debug, so are we.
if ($anvil->data->{environment}{PCMK_debug})
{
$anvil->Log->level({set => 3});
}
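# (PCMK_debug is typically set in /etc/sysconfig/pacemaker; any true value raises our log level to 3.)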
# Originally, this was designed to start and stop a server's DRBD resources on demand. Early testing showed
# this to be prone to a higher risk of fencing if something goes wrong. As such, we're changing the default
# behaviour to leave DRBD resources up. Set this to '1' (here or by switch) to revert to the old
# behaviour.
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources} = 0;
# Get any command line switches.
$anvil->Get->switches({debug => 2});
if ($anvil->data->{switches}{stop_drbd_resources})
{
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources} = 1;
}
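# In other words, invoking this agent with '--stop --stop_drbd_resources' restores the old behaviour of
# taking the backing DRBD resources down when the server stops.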
# Something for the logs
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 3, key => "log_0298"});
### TEST: to be removed later
if ($anvil->data->{switches}{test1})
{
$anvil->data->{switches}{migrate_to} = "#!SET!#";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} = "mk-a02n02";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} = "mk-a02n01";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_name} = "migrate_to";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node} = "mk-a02n01";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} = "600000";
$anvil->data->{environment}{OCF_RESKEY_name} = "srv07-el6";
print "Running test 1; Migrate: [".$anvil->data->{environment}{OCF_RESKEY_name}."] from: [".$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source}."] to: [".$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target}."]\n";
}
if ($anvil->data->{switches}{test2})
{
$anvil->data->{switches}{migrate_to} = "#!SET!#";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} = "mk-a02n01";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} = "mk-a02n02";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_name} = "migrate_to";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node} = "mk-a02n01";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} = "600000";
$anvil->data->{environment}{OCF_RESKEY_name} = "srv07-el6";
print "Running test 2; Migrate: [".$anvil->data->{environment}{OCF_RESKEY_name}."] from: [".$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source}."] to: [".$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target}."]\n";
}
if ($anvil->data->{switches}{test3})
{
$anvil->data->{environment}{OCF_RESKEY_name} = $anvil->data->{switches}{server};
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node} = "mk-a02n01";
print "Running test 3; Boot or shutdown of: [".$anvil->data->{environment}{OCF_RESKEY_name}."].\n";
}
if ($anvil->data->{switches}{test4})
{
$anvil->data->{switches}{monitor} = "#!SET!#";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_name} = "monitor";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_fail} = "block";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} = "20000";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node} = "mk-a02n01";
$anvil->data->{environment}{OCF_RESKEY_name} = "srv07-el6";
print "Status check of: [".$anvil->data->{environment}{OCF_RESKEY_name}."] on: [".$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node}."].\n";
}
# This is for debugging.
if (not $anvil->data->{switches}{monitor})
{
show_environment($anvil, 3);
}
### What are we being asked to do? (An example resource definition follows this list.)
# start        - Starts the resource.
# stop         - Shuts down the resource.
# monitor      - (status aliases here) Queries the resource for its state.
# meta-data    - Dumps the resource agent metadata.
# promote      - Turns a resource into the Master role (Master/Slave resources only).
# demote       - Turns a resource into the Slave role (Master/Slave resources only).
# migrate_to   - Pushes the resource to the migration target (live migration).
# migrate_from - Completes a live migration of the resource.
# validate-all - Validates a resource’s configuration.
# help         - (usage maps here) Displays a usage message when the resource agent is invoked from the command line, rather than by the cluster manager.
# notify       - Informs the resource about changes in the state of other clones.
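#
# As a point of reference, a server resource using this agent is typically defined in pacemaker along
# these lines (the resource/server name and the monitor interval here are examples only);
#
#   pcs resource create srv07-el6 ocf:alteeve:server name="srv07-el6" \
#      meta allow-migrate="true" op monitor interval="60"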
if ($anvil->data->{switches}{migrate_to})
{
# Migrate the server to the target node.
migrate_server($anvil);
}
elsif ($anvil->data->{switches}{migrate_from})
{
# This is called after a migration is complete, so we're basically just doing a status check.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0529", variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} }});
server_status($anvil);
}
elsif ($anvil->data->{switches}{start})
{
# Start the server
start_server($anvil);
}
elsif ($anvil->data->{switches}{stop})
{
# Stop the server
stop_server($anvil);
}
elsif (($anvil->data->{switches}{monitor}) or ($anvil->data->{switches}{status}))
{
# Report the status of the server.
server_status($anvil);
}
elsif (($anvil->data->{switches}{metadata}) or ($anvil->data->{switches}{'meta-data'}))
{
show_metadata($anvil);
}
elsif ($anvil->data->{switches}{promote})
{
# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0299", variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} }});
$anvil->nice_exit({exit_code => 3});
}
elsif ($anvil->data->{switches}{demote})
{
# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0300", variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} }});
$anvil->nice_exit({exit_code => 3});
}
elsif ($anvil->data->{switches}{'validate-all'})
{
# Validate our local config and setup.
validate_all($anvil);
$anvil->nice_exit({exit_code => 0});
}
elsif (($anvil->data->{switches}{help}) or ($anvil->data->{switches}{usage}))
{
# Show the usage information
show_usage($anvil);
$anvil->nice_exit({exit_code => 0});
}
elsif ($anvil->data->{switches}{notify})
{
# We don't implement this
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level =>0, key => "log_0301"});
$anvil->nice_exit({exit_code => 3});
}
else
{
# We were called in some unexpected way. Log an error, show usage and exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level =>0, key => "log_0302"});
show_environment($anvil, 0);
$anvil->nice_exit({exit_code => 1});
}
# If we hit here, something very wrong happened.
$anvil->nice_exit({exit_code => 255});
#############################################################################################################
# Functions #
#############################################################################################################
# When called with "start", this verifies that 'libvirtd' and 'drbd' are running (and starts them if
# not). When called with "stop", a check is made on both nodes; if all VMs are gone, "libvirtd" and
# "drbd" are stopped.
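# Note on the 'systemctl status <unit>' return codes checked below; per systemctl(1) (following the LSB
# convention), '0' means the unit is active and '3' means it is not running, which is why those two
# values are tested.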
sub check_daemons
{
my ($anvil, $task) = @_;
my $problem = $anvil->Cluster->parse_cib();
if ($problem)
{
# Pacemaker isn't running, or some other problem. Someone must have called this script
# directly or something.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0133"});
$anvil->nice_exit({exit_code => 1});
}
# Is the peer running? We'll use this to know whether to try and start daemons on the peer.
my $peer_name = $anvil->data->{cib}{parsed}{peer}{name};
my $peer_ready = $anvil->data->{cib}{parsed}{peer}{ready};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
peer_name => $peer_name,
peer_ready => $peer_ready,
}});
if ($task eq "start")
{
### It doesn't look like we need to start drbd. Up'ing the first resource works without it.
#foreach my $daemon ("libvirtd.service", "drbd.service")
foreach my $daemon ("libvirtd.service")
{
my $running_local = 0;
my $running_peer = 0;
my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
return_code => $return_code,
}});
if ($return_code eq "3")
{
# It is stopped, start it.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0482", variables => { daemon => $daemon }});
my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." start ".$daemon});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
return_code => $return_code,
}});
my $loops = 0;
my $running = 0;
until ($running)
{
my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
return_code => $return_code,
}});
if ($return_code eq "0")
{
# It's running
$running = 1;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0483", variables => { daemon => $daemon }});
}
else
{
$loops++;
if ($loops > 5)
{
# Give up
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0134", variables => { daemon => $daemon }});
$anvil->nice_exit({exit_code => 1});
}
else
{
# Wait for a second.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0484", variables => { daemon => $daemon }});
sleep 1;
}
}
}
}
elsif ($return_code eq "0")
{
# Running, nothing to do.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0485", variables => { daemon => $daemon }});
}
if ($peer_ready)
{
my ($output, $error, $return_code) = $anvil->Remote->call({
target => $peer_name,
shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
if ($return_code eq "3")
{
# Stopped, start it.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0486", variables => {
daemon => $daemon,
host => $peer_name,
}});
my ($output, $error, $return_code) = $anvil->Remote->call({
target => $peer_name,
shell_call => $anvil->data->{path}{exe}{systemctl}." start ".$daemon,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
my $loops = 0;
my $running = 0;
until ($running)
{
my ($output, $error, $return_code) = $anvil->Remote->call({
target => $peer_name,
shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
if ($return_code eq "0")
{
$running = 1;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0487", variables => {
daemon => $daemon,
host => $peer_name,
}});
}
else
{
$loops++;
if ($loops > 5)
{
### TODO: We may want to NOT die here, if
### we're booting a server (though we
### will if we're migrating).
# Give up
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0135", variables => {
daemon => $daemon,
host => $peer_name,
}});
$anvil->nice_exit({exit_code => 1});
}
else
{
# Wait for a second.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0488", variables => {
daemon => $daemon,
host => $peer_name,
}});
sleep 1;
}
}
}
}
elsif ($return_code eq "0")
{
# Running, nothing to do.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0489", variables => {
daemon => $daemon,
host => $peer_name,
}});
}
}
}
}
=cut # It's simpler (and thus safer) to not stop daemons.
if ($task eq "stop")
{
print "Stopping daemons\n";
my $stop = 0;
# Check both nodes if a server is running on either node.
my $local_vm_count = 0;
my $remote_vm_count = 0;
# Call virsh list --all
my ($local_output, $local_return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{virsh}." list --all"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
local_output => $local_output,
local_return_code => $local_return_code,
}});
if (not $local_return_code)
{
# Parse output
foreach my $line (split/\n/, $local_output)
{
$line = $anvil->Words->clean_spaces({ string => $line });
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { line => $line }});
if ($line =~ /(\d+)\s+(.*?)\s+running/)
{
$local_vm_count++;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { local_vm_count => $local_vm_count }});
}
}
}
my ($remote_output, $remote_error, $remote_return_code) = $anvil->Remote->call({
target => $peer_name,
shell_call => $anvil->data->{path}{exe}{virsh}." list --all",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
remote_output => $remote_output,
remote_error => $remote_error,
remote_return_code => $remote_return_code,
}});
if (not $remote_return_code)
{
# Parse output
foreach my $line (split/\n/, $remote_output)
{
$line = $anvil->Words->clean_spaces({ string => $line });
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { line => $line }});
if ($line =~ /(\d+)\s+(.*?)\s+running/)
{
$remote_vm_count++;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { remote_vm_count => $remote_vm_count }});
}
}
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
local_vm_count => $local_vm_count,
remote_vm_count => $remote_vm_count,
}});
if ((not $local_vm_count) && (not $remote_vm_count))
{
if ($peer_ready)
{
# No servers running on either node.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0490"});
}
else
{
# No servers running here and the peer is not in the cluster.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0491"});
}
foreach my $daemon ("libvirtd.service", "drbd.service")
{
my $running_local = 0;
my $running_peer = 0;
my ($local_output, $local_return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
local_output => $local_output,
local_return_code => $local_return_code,
}});
if ($local_return_code eq "3")
{
# Already stopped.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0492", variables => { daemon => $daemon }});
}
elsif ($local_return_code eq "0")
{
# Running, stop it.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0493", variables => { daemon => $daemon }});
my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." stop ".$daemon});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
return_code => $return_code,
}});
}
my ($remote_output, $remote_error, $remote_return_code) = $anvil->Remote->call({
target => $peer_name,
shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
remote_output => $remote_output,
remote_error => $remote_error,
remote_return_code => $remote_return_code,
}});
if ($remote_return_code eq "3")
{
# Already stopped.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0494", variables => {
daemon => $daemon,
host => $peer_name,
}});
}
elsif ($remote_return_code eq "0")
{
# Running, stop it.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0495", variables => {
daemon => $daemon,
host => $peer_name,
}});
my ($output, $error, $return_code) = $anvil->Remote->call({
target => $peer_name,
shell_call => $anvil->data->{path}{exe}{systemctl}." stop ".$daemon,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
}
}
}
else
{
# Servers are still running, don't stop the daemons.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0496"});
}
}
=cut
return(0);
}
=cut
STATES
The State field lists what state each domain is currently in. A domain can be in one of the following
possible states:
running - The domain is currently running on a CPU
idle - The domain is idle, and not running or runnable. This can be caused because the domain is
waiting on IO (a traditional wait state) or has gone to sleep because there was nothing else
for it to do.
paused - The domain has been paused, usually occurring through the administrator running virsh suspend.
When in a paused state the domain will still consume allocated resources like memory, but will
not be eligible for scheduling by the hypervisor.
in shutdown - The domain is in the process of shutting down, i.e. the guest operating system has been
notified and should be in the process of stopping its operations gracefully.
shut off - The domain is not running. Usually this indicates the domain has been shut down completely, or
has not been started.
crashed - The domain has crashed, which is always a violent ending. Usually this state can only occur if
the domain has been configured not to restart on crash.
pmsuspended - The domain has been suspended by guest power management, e.g. entered into s3 state.
=cut
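# For context, the 'virsh list --all' output parsed by this agent looks roughly like;
#
#    Id   Name        State
#   ---------------------------
#    1    srv07-el6   running
#    -    srv08-m2    shut off
#
# The exact spacing varies between libvirt versions; the parsing here only relies on whitespace
# separation between the ID, name and state columns. (The names above are examples.)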
# This boots the server if possible.
sub start_server
{
my ($anvil) = @_;
# Before we do anything, make sure that 'libvirtd' and 'drbd' services are running.
check_daemons($anvil, "start");
# Start procedure;
# 1. Read the XML definition file and find the backing storage and bridges. Soft error if read fails.
# 2. Make sure the name matches.
# 3. Make sure we have enough free RAM.
# 4. Make sure the emulator exists (can be an issue after migrating from a different generation of Anvil!).
# 5.1. Make sure optical drives with mounted data have the disk present. Soft error if not.
# 5.2. Find any backing DRBD devices
# 6. For each DRBD device;
# 6.1. Make sure the backing LV is ACTIVE. Soft error if not.
# 6.2. Check if the drbd resource is up. If not, up it.
# 6.3. Make sure the backing disk is UpToDate. Soft error if not.
# 6.4. Make sure the backing device is 'Connected' or 'Connecting'. Call a connect if not.
# 7. Make sure all bridges exist and soft error if not.
# 8. Start the server.
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0303", variables => { server => $server }});
# Make sure things are sane.
validate_all($anvil);
# Is the server already running somewhere?
find_server($anvil);
# Start the resource, if needed.
start_drbd_resource($anvil);
# Still alive? Boot!
my ($success) = $anvil->Server->boot_virsh({debug => 2, server => $server});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { success => $success }});
if ($success)
{
# Success!
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0309", variables => { server => $server }});
$anvil->nice_exit({exit_code => 0});
}
else
{
# WTF?
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0310", variables => {
server => $server,
'state' => defined $anvil->data->{server}{location}{$server}{host} ? $anvil->data->{server}{location}{$server}{host} : "#!string!unit_0003!#",
}});
$anvil->nice_exit({exit_code => 6});
}
# If we're still alive, then we didn't see the server in the list of running servers, which is really weird.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0311", variables => { server => $server }});
$anvil->nice_exit({exit_code => 1});
}
# This stops the DRBD resource(s) that ran under a server.
sub stop_drbd_resource
{
my ($anvil) = @_;
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $host = $anvil->Get->short_host_name;
my $peer = $anvil->data->{drbd}{config}{$host}{peer};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
host => $host,
peer => $peer,
}});
# Start DRBD locally.
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
my $peer_ip = $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0408", variables => {
server => $server,
peer => $peer,
peer_ip => $peer_ip,
resource => $resource,
}});
# Bring the peer's resource down.
$anvil->DRBD->manage_resource({
debug => 3,
resource => $resource,
task => "down",
target => $peer_ip,
});
# Bring the local resource down
$anvil->DRBD->manage_resource({
debug => 3,
resource => $resource,
task => "down",
});
}
return(0);
}
# This starts the drbd resource(s) for the requested server, if needed.
sub start_drbd_resource
{
my ($anvil) = @_;
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $host = $anvil->Get->short_host_name;
my $peer = $anvil->data->{drbd}{config}{$host}{peer};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
host => $host,
peer => $peer,
}});
# Do we need startup?
my $startup_needed = 0;
$anvil->DRBD->get_status({debug => 3});
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
# Is the current resource up locally already? If it is, we're done.
my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
resource => $resource,
role => $role,
}});
if ((lc($role) ne "secondary") && (lc($role) ne "primary"))
{
$startup_needed = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $startup_needed }});
last;
}
else
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0434", variables => {
resource => $resource,
role => $role,
}});
}
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { startup_needed => $startup_needed }});
if (not $startup_needed)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0431"});
return(0);
}
# Start DRBD locally.
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
my $peer_ip = $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0419", variables => {
server => $server,
peer => $peer,
peer_ip => $peer_ip,
resource => $resource,
}});
# Bring the local resource up
$anvil->DRBD->manage_resource({
resource => $resource,
task => "up",
});
# Bring the peer's resource up.
$anvil->DRBD->manage_resource({
resource => $resource,
task => "up",
target => $peer_ip,
});
# Now wait for it to be connected or UpToDate...
my $waiting = 1;
while($waiting)
{
$anvil->DRBD->get_status({debug => 3});
my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer}{'connection-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
connection_state => $connection_state,
}});
my $all_ready = 1;
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}})
{
my $disk_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'};
my $replication_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer}{volume}{$volume}{'replication-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
disk_state => $disk_state,
replication_state => $replication_state,
}});
# If the peer isn't connected (directly, or by being in sync), or this volume
# isn't UpToDate, we need to keep waiting.
if ((lc($disk_state) ne "uptodate") && ($replication_state !~ /^Sync/i) && (lc($connection_state) ne "connected"))
{
$all_ready = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
}
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_ready => $all_ready }});
if ($all_ready)
{
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
if ($waiting)
{
sleep 1;
}
}
}
# If auto-promote isn't set, promote the resource.
if (not $anvil->data->{drbd}{config}{$host}{'auto-promote'})
{
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0420", variables => {
server => $server,
resource => $resource,
}});
# Make the local resource primary.
$anvil->DRBD->manage_resource({
resource => $resource,
task => "primary",
});
}
}
return(0);
}
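# For reference, the DRBD->manage_resource() calls above amount to running 'drbdadm up <resource>'
# (locally, and over SSH on the peer) and, when auto-promote is disabled, 'drbdadm primary <resource>'.
# This describes the intent of the 'task' values passed in, not the method's internals.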
# This uses the DRBD information to find other peers and see if the server is running on them.
sub find_server
{
my ($anvil) = @_;
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $host = $anvil->Get->short_host_name;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0415", variables => { server => $server }});
foreach my $ip_address (sort {$a cmp $b} keys %{$anvil->data->{drbd}{config}{$host}{ip_addresses}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { ip_address => $ip_address }});
$anvil->Server->find({
debug => 3,
target => $ip_address,
remote_user => "root",
});
}
foreach my $this_server (sort {$a cmp $b} keys %{$anvil->data->{server}{location}})
{
my $status = $anvil->data->{server}{location}{$this_server}{status};
my $host = $anvil->data->{server}{location}{$this_server}{host};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
this_server => $this_server,
status => $status,
host => $host,
}});
}
if ((exists $anvil->data->{server}{location}{$server}) && ($anvil->data->{server}{location}{$server}{host}))
{
# The server is running. If it is running here, exit with success. If it's running elsewhere,
# exit with a failure.
my $status = $anvil->data->{server}{location}{$server}{status};
my $host = $anvil->data->{server}{location}{$server}{host};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
status => $status,
host => $host,
}});
if ($host eq $anvil->Get->host_name)
{
# Already running, we're good, and we're done.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0416", variables => { server => $server }});
$anvil->nice_exit({exit_code => 0});
}
elsif ($host =~ /dr(\d+)$/)
{
# The server is running elsewhere. If the peer host is DR, exit with
# OCF_ERR_CONFIGURED (6) so that pacemaker doesn't try to also start the server on
# the other node, because we don't know the state of it here.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0417", variables => {
server => $server,
host => $host,
}});
$anvil->nice_exit({exit_code => 6});
}
else
{
# It looks like it's running on the peer. So we'll exit OCF_ERR_INSTALLED (5) to tell
# pacemaker to try to start it on our peer.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0418", variables => {
server => $server,
host => $host,
}});
$anvil->nice_exit({exit_code => 5});
}
}
return(0);
}
# This shuts down the server if possible.
sub stop_server
{
my ($anvil) = @_;
# Stopping the server is simply a question of "is the server running?" and, if so, stop it. Once
# stopped, we stop the DRBD resource on both nodes.
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server => $server }});
# Read in and parse the server's XML.
$anvil->System->check_storage({debug => 3});
$anvil->Server->get_status({debug => 3, server => $server});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0313", variables => { server => $server }});
my $success = $anvil->Server->shutdown_virsh({debug => 3, server => $server});
if (not $success)
{
# Something went wrong. Details should be in the logs.
$anvil->nice_exit({exit_code => 1});
}
# Now stop the DRBD resource(s).
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'environment::OCF_RESKEY_CRM_meta_stop_drbd_resources' => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources},
}});
if ($anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources})
{
stop_drbd_resource($anvil);
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0324", variables => { server => $server }});
$anvil->nice_exit({exit_code => 0});
}
# This checks the status of the server.
sub server_status
{
my ($anvil) = @_;
# If the named server is running, return OCF_SUCCESS (rc: 0), otherwise OCF_NOT_RUNNING (rc: 7). If
# the server is failed, return OCF_ERR_GENERIC (1).
my $state = "";
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0521", variables => { server => $server }});
if (not $anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout})
{
# Set a sane default of 20 seconds.
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} = 20000;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "log_0331", variables => { timeout => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} }});
}
# Is 'libvirtd' running? We'll wait up to half the timeout for it to start (in case it _just_ started)
# before timing out.
my $wait_until = time + ($anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} / 2000); # Divide by 2000 to convert milliseconds to seconds and halve the total.
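# (For example, with the default 20000 ms timeout this works out to 'time + 10' seconds.)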
my $look_for_pid = 0;
my $libvirtd_wait = 1;
my $warning_shown = 0;
while($libvirtd_wait)
{
my $running = $anvil->System->check_daemon({daemon => "libvirtd.service"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { running => $running }});
if ($running)
{
$libvirtd_wait = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { libvirtd_wait => $libvirtd_wait }});
}
else
{
if (not $warning_shown)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0522", variables => { wait_time => ($wait_until - time) }});
$warning_shown = 1;
}
sleep 1;
if (time > $wait_until)
{
# Libvirtd isn't running, try to find the PID of the server (in case it's
# running and libvirtd isn't).
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0057"});
$look_for_pid = 1;
$libvirtd_wait = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
look_for_pid => $look_for_pid,
libvirtd_wait => $libvirtd_wait,
}});
}
}
}
# If libvirtd wasn't running, we'll manually look for a PID.
if ($look_for_pid)
{
my $server_up = 0;
my $shell_call = $anvil->data->{path}{exe}{ps}." aux";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
return_code => $return_code,
}});
foreach my $line (split/\n/, $output)
{
next if $line !~ /qemu-kvm/;
$line = $anvil->Words->clean_spaces({ string => $line });
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { line => $line }});
if ($line =~ /guest=(.*?),/)
{
my $this_server = $1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { this_server => $this_server }});
if ($this_server eq $server)
{
# Found it.
$server_up = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { server_up => $server_up }});
last;
}
}
}
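# For context, a matching process line for a libvirt-launched guest typically contains a fragment like;
#   /usr/libexec/qemu-kvm -name guest=srv07-el6,debug-threads=on ...
# which is what the 'guest=(.*?),' capture above keys on. The emulator path can differ by distribution.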
if ($server_up)
{
# The server is running. Exit with OCF_SUCCESS (rc 0);
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0523"});
$anvil->nice_exit({exit_code => 0});
}
else
{
# The server is not running. Exit with OCF_NOT_RUNNING (rc: 7)
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0524"});
$anvil->nice_exit({exit_code => 7});
}
}
else
{
# Parse the virsh state. If it's listed as 'crashed', return OCF_ERR_GENERIC (rc: 1). If it's
# 'in shutdown', 'loop' gets set to 1 and this will loop indefinitely. We don't put a timer
# on it, we let pacemaker handle that.
my $loop = 1;
while($loop)
{
$loop = 0;
my $found = 0;
my $shell_call = $anvil->data->{path}{exe}{virsh}." list --all";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
return_code => $return_code,
}});
foreach my $line (split/\n/, $output)
{
$line = $anvil->Words->clean_spaces({ string => $line });
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { line => $line }});
if ($line =~ /\s\Q$server\E\s+(.*)/)
{
my $state = $1;
$found = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
found => $found,
'state' => $state,
}});
=cut
* Server states;
running - The domain is currently running on a CPU
idle - The domain is idle, and not running or runnable. This can be caused because the domain is waiting on IO (a traditional wait state) or has gone to sleep because there was nothing else for it to do.
paused - The domain has been paused, usually occurring through the administrator running virsh suspend. When in a paused state the domain will still consume allocated resources like memory, but will not be eligible for scheduling by the hypervisor.
in shutdown - The domain is in the process of shutting down, i.e. the guest operating system has been notified and should be in the process of stopping its operations gracefully.
shut off - The domain is not running. Usually this indicates the domain has been shut down completely, or has not been started.
crashed - The domain has crashed, which is always a violent ending. Usually this state can only occur if the domain has been configured not to restart on crash.
pmsuspended - The domain has been suspended by guest power management, e.g. entered into s3 state.
=cut
### TODO: Should we treat 'idle' same as crashed?
if ($state eq "crashed")
{
# Woops. Exit with OCF_ERR_GENERIC (rc: 1).
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0058", variables => { server => $server }});
$anvil->nice_exit({exit_code => 1});
}
elsif ($state eq "in shutdown")
{
# Wait.
$loop = 1;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0525"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { loop => $loop }});
sleep 2;
last;
}
elsif ($state eq "shut off")
{
# Exit with OCF_NOT_RUNNING (rc: 7);
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0526"});
$anvil->nice_exit({exit_code => 7});
}
else
{
# In some fashion or another, the server is running. Exit with OCF_SUCCESS (rc: 0)
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0527", variables => { 'state' => $state }});
$anvil->nice_exit({exit_code => 0});
}
}
}
# If it wasn't found at all, exit.
if (not $found)
{
# Exit with OCF_NOT_RUNNING (rc: 7);
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0526"});
$anvil->nice_exit({exit_code => 7});
}
}
}
# This method should never return. Just in case it does though, exit with generic error.
$anvil->nice_exit({exit_code => 1});
}
# Migrate the server
sub migrate_server
{
my ($anvil) = @_;
### NOTE: For now, we're not going to block if the target is not UpToDate. There are times when a
### user might want to do this (ie: sync will be done soon and the need to evacuate the node
### ASAP is high). Maybe we'll enforce this and require a '--force' switch later?
# If we were given 'migrate_to', we need to make sure the storage is UpToDate on the peer for all
# backing resources. We can't check the target's bridges, but the migration will fail if one is
# missing.
# If we're given 'migrate_from', we're pulling the server towards us, so we can check both bridges
# and storage.
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $source = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source};
my $target = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target};
my $meta_on_node = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
source => $source,
target => $target,
meta_on_node => $meta_on_node,
}});
# Make sure switches are at least defined.
$anvil->data->{switches}{migrate_to} = "" if not defined $anvil->data->{switches}{migrate_to};
$anvil->data->{switches}{migrate_from} = "" if not defined $anvil->data->{switches}{migrate_from};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::migrate_to' => $anvil->data->{switches}{migrate_to},
'switches::migrate_from' => $anvil->data->{switches}{migrate_from},
}});
# Log what we're doing.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0528", variables => {
server => $server,
target_host => $target,
}});
# Before migrating, make sure the daemons are running on the peer.
check_daemons($anvil, "start");
# The actual migration command will involve enabling dual primary, then beginning the migration. The
# virsh call will depend on if we're pushing or pulling. Once the migration completes, regardless of
# success or failure, dual primary will be disabled again.
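# For reference, the push form of the live migration that Server->migrate_virsh() wraps is roughly;
#
#   virsh migrate --undefinesource --live srv07-el6 qemu+ssh://<target>/system
#
# This is a sketch only; the exact switches are handled inside migrate_virsh(), along with toggling
# 'drbdadm net-options --allow-two-primaries=yes <resource>' around the call.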
my $migrated = 0;
if ($target)
{
# Can I even connect to the target?
my ($access) = $anvil->Remote->test_access({debug => 3, target => $target});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }});
if (not $access)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0429", variables => {
server => $server,
target => $target,
}});
### TODO: I wonder if this should be exit'ed with '6'?
$anvil->nice_exit({exit_code => 5});
}
# Find the server
$anvil->Server->find({debug => 3});
my $server_host = defined $anvil->data->{server}{location}{$server}{host} ? $anvil->data->{server}{location}{$server}{host} : "";
my $server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server_host => $server_host,
server_status => $server_status,
}});
# Is it already on the target?
if (not $server_status)
{
# Maybe...
$anvil->Server->find({debug => 3, target => $target});
$server_host = defined $anvil->data->{server}{location}{$server}{host} ? $anvil->data->{server}{location}{$server}{host} : "";
$server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server_host => $server_host,
server_status => $server_status,
}});
if (($server_host eq $target) && (($server_status) && ($server_status eq "running")))
{
# Already over there, we're done.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0430", variables => {
server => $server,
target => $target,
}});
$anvil->nice_exit({exit_code => 0});
}
}
if (not $server_host)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0344", variables => { server => $server }});
$anvil->nice_exit({exit_code => 1});
}
# Get a view of the servers locally and our peer.
validate_all($anvil);
# Get the DRBD status.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0387"});
$anvil->DRBD->get_status({debug => 3});
# Make sure all resource(s) are ready for the server.
my $all_up_to_date = 1;
my $host = $anvil->Get->short_host_name;
my $peer_name = $anvil->data->{drbd}{config}{$host}{peer};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
host => $host,
peer_name => $peer_name,
}});
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'};
my $peer_node_id = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
peer_node_id => $peer_node_id,
resource => $resource,
connection_state => $connection_state,
}});
if (lc($connection_state) ne "connected")
{
# Try to bring the resource up on the peer now.
$anvil->DRBD->manage_resource({
resource => $resource,
task => "up",
target => $target,
});
# We'll give it 20 seconds.
my $wait = 20;
while($wait)
{
$anvil->DRBD->get_status({debug => 3});
$connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
resource => $resource,
connection_state => $connection_state,
}});
if (lc($connection_state) eq "connected")
{
# It's up!
$wait = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'wait' => $wait }});
}
else
{
# Not up yet; wait a second before checking again.
sleep 1;
$wait--;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'wait' => $wait }});
if (not $wait)
{
# We're done waiting.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0428", variables => {
server => $server,
target => $target,
resource => $resource,
connection_state => $connection_state,
}});
### TODO: I wonder if this should be exit'ed with '6'?
$anvil->nice_exit({exit_code => 5});
}
}
}
}
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}})
{
my $peer_disk_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'peer-disk-state'};
my $percent_in_sync = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'percent-in-sync'};
my $replication_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
volume => $volume,
peer_disk_state => $peer_disk_state,
percent_in_sync => $percent_in_sync,
replication_state => $replication_state,
}});
if (lc($peer_disk_state) ne "uptodate")
{
$all_up_to_date = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_up_to_date => $all_up_to_date }});
}
}
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_up_to_date => $all_up_to_date }});
if (not $all_up_to_date)
{
### TODO: If we decide later to block migration to Inconsistent peers, here's where we'd do it.
}
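# (Illustrative: peer disk states can be checked by hand with 'drbdadm status <resource>'; a healthy
# peer volume reports 'peer-disk:UpToDate'. As the TODO above notes, migration to an Inconsistent
# peer is currently allowed; this empty block is the hook if that policy ever changes.)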
# If we're still alive, we're ready to migrate.
($migrated) = $anvil->Server->migrate_virsh({
debug => 2,
server => $server,
source => $source,
target => $target
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }});
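# (Illustrative sketch; the exact flags live in Server->migrate_virsh(). A libvirt live migration of
# this shape is, roughly:
#   virsh migrate --live <server> qemu+ssh://<target>/system
# where <server> and <target> come from the OCF environment variables parsed above.)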
}
elsif ($source)
{
### NOTE: Pacemaker doesn't seem to ever pull servers.
# Pull the server here. Start by verifying it's on the 'meta_on_node' host.
# Scan locally and on our peer
$anvil->Server->find({debug => 3});
$anvil->Server->find({debug => 3, target => $meta_on_node, refresh => 0});
my $host = defined $anvil->data->{server}{location}{$server}{host} ? $anvil->data->{server}{location}{$server}{host} : "";
my $short_host = ($host =~ /^(.*?)\..*$/)[0];
my $status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
host => $host,
short_host => $short_host,
status => $status,
target => $target,
}});
# Re-read the server's host and status (the node's name may be the short host name).
my $server_host = defined $anvil->data->{server}{location}{$server}{host} ? $anvil->data->{server}{location}{$server}{host} : "";
my $server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server_host => $server_host,
server_status => $server_status,
}});
# This is called after a migration. If that is the case here, the target will be us. Just
# make sure it is running and, if so, return '0'. The 'meta_on_node' is the new host.
if (($target eq $anvil->Get->host_name) or ($target eq $anvil->Get->short_host_name) or ($target eq $meta_on_node))
{
# If it's running, we're successfully out.
if ((($host eq $target) or ($short_host eq $target)) && ($status eq "running"))
{
# Success!
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0347", variables => { server => $server }});
$anvil->nice_exit({exit_code => 0});
}
# If we're still alive, we'll proceed as if we're pulling the server to us, and maybe
# that will work.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0348", variables => { server => $server }});
}
# Validate as if we were about to boot the server.
validate_all($anvil);
# Call the pull migration.
($migrated) = $anvil->Server->migrate_virsh({
debug => 2,
server => $server,
source => $source,
target => $target
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }});
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }});
if (not $migrated)
{
# Exit
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0357"});
$anvil->nice_exit({exit_code => 1});
}
# If we made it here, we succeeded.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0360"});
$anvil->nice_exit({exit_code => 0});
}
# Validation checks that we have the definition XML, resource config and that needed apps are installed.
sub validate_all
{
my ($anvil) = @_;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0361"});
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $source = defined $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} ? $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} : "";
my $target = defined $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} ? $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
source => $source,
target => $target,
}});
# Read in and parse the server's XML.
$anvil->System->check_storage({debug => 3});
$anvil->Server->get_status({debug => 2, server => $server});
# Is the name in the definition file what we expect (and did we read the XML data at all)?
validate_name($anvil);
# Make sure the emulator it wants is the one we have.
validate_emulator($anvil);
# These tests are only needed if we're about to boot the server
if (($anvil->data->{switches}{start}) or ($source))
{
# Check that we have enough RAM.
validate_ram($anvil);
}
# Validate bridges
validate_bridges($anvil);
# Validate storage (Disks and optical media)
validate_storage($anvil);
return(0);
}
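# (Quick reference for the exit codes used by the validation subs below, following the OCF convention
# described at the top of this file: 1 = OCF_ERR_GENERIC for name or RAM problems, 4 = OCF_ERR_PERM
# for unreadable or non-executable files, 5 = OCF_ERR_INSTALLED for missing bridges, media, emulators
# or DRBD/LV components.)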
# This ensures that the bridges the server connects to exist on this node.
sub validate_bridges
{
my ($anvil) = @_;
# Get my bridge list
$anvil->Get->bridges({debug => 3});
# Find the Optical drives and DRBD devices.
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
foreach my $mac (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{from_disk}{device}{interface}})
{
# See if we have this bridge
my $found = 0;
my $bridge = $anvil->data->{server}{$local_host}{$server}{from_disk}{device}{interface}{$mac}{bridge};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { bridge => $bridge }});
foreach my $interface_name (sort {$a cmp $b} keys %{$anvil->data->{$local_host}{network}{bridges}{bridge}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { interface_name => $interface_name }});
if ((exists $anvil->data->{$local_host}{network}{bridges}{bridge}{$interface_name}) && ($anvil->data->{$local_host}{network}{bridges}{bridge}{$interface_name}{found}))
{
$found = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { found => $found }});
last;
}
}
if ($found)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, key => "log_0368", variables => { bridge => $bridge }});
}
else
{
# Missing bridge.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0369", variables => { bridge => $bridge }});
$anvil->nice_exit({exit_code => 5});
}
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0366"});
return(0);
}
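# (Illustrative: a bridge reported by Get->bridges() should also be visible on the host with the
# standard iproute2 tools, e.g. 'ip link show <bridge>' or 'bridge link show'.)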
# This looks up the disks and optical media connected to this server.
sub validate_storage
{
my ($anvil) = @_;
# When checking on a running server, use 'from_virsh'.
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $target = defined $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} ? $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
target => $target,
}});
my $local_host = $anvil->Get->short_host_name();
my $xml_source = "from_disk";
if ($anvil->data->{server}{$local_host}{$server}{from_virsh}{host})
{
$xml_source = "from_virsh";
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
server => $server,
xml_source => $xml_source,
}});
### TODO: If we're called with a status check and find an ISO file missing, eject the disc instead of
### failing. For now, we just fault out.
# Do the optical discs in the drives exist? Ideally we'd eject a missing disc when about to boot and
# fail only when about to migrate but, for now, we fail in either case. We skip this check entirely if
# we're migrating off or shutting down the server.
if ((exists $anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}) && (not $target) && (not $anvil->data->{switches}{stop}))
{
foreach my $device_target (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}{target}})
{
if ($anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}{target}{$device_target}{path})
{
my $file = $anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}{target}{$device_target}{path};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { file => $file }});
if (not -e $file)
{
# It doesn't exist. Exit with OCF_ERR_INSTALLED (5).
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0398", variables => { file => $file }});
$anvil->nice_exit({exit_code => 5});
}
elsif (not -r $file)
{
# We can't read it. Exit with OCF_ERR_PERM (4).
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0399", variables => { file => $file }});
$anvil->nice_exit({exit_code => 4});
}
else
{
# We're OK.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0400", variables => { file => $file }});
}
}
}
}
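# (Illustrative sketch of the TODO above: a missing ISO could be ejected with the standard libvirt
# call 'virsh change-media <server> <device_target> --eject' instead of exiting. Not implemented yet.)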
# Verify DRBD devices now
validate_storage_drbd($anvil);
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0367"});
return(0);
}
# This makes sure that the needed backing DRBD devices are on this node. If so, and if they are not up, they
# will be brought up. If that fails, it errors out.
sub validate_storage_drbd
{
my ($anvil) = @_;
# Now check storage.
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $xml_source = "from_disk";
my $host = $anvil->Get->short_host_name;
my $local_host = $anvil->Get->short_host_name();
# Did I find a resource for each disk?
foreach my $device_path (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{device}})
{
next if not $device_path;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"server::${local_host}::${server}::device::${device_path}::resource" => $anvil->data->{server}{$local_host}{$server}{device}{$device_path}{resource},
}});
if (not $anvil->data->{server}{$local_host}{$server}{device}{$device_path}{resource})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0414", variables => { drbd_device => $device_path }});
$anvil->nice_exit({exit_code => 5});
}
}
foreach my $device_target (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{disk}{target}})
{
my $drbd_device = $anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{disk}{target}{$device_target}{path};
my $drbd_resource = defined $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{resource} ? $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{resource} : "";
my $on_lv = defined $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{on} ? $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{on} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
host => $host,
drbd_device => $drbd_device,
drbd_resource => $drbd_resource,
on_lv => $on_lv,
}});
if (not $drbd_resource)
{
# See if we can find the resource in the 'by-res' hash.
$drbd_resource = defined $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{resource} ? $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{resource} : "";
$on_lv = defined $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{backing_lv} ? $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{backing_lv} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
drbd_resource => $drbd_resource,
on_lv => $on_lv,
}});
}
# Is the logical volume here and active?
if ((not $on_lv) or (not exists $anvil->data->{lvm}{$local_host}{lv}{$on_lv}))
{
# LV not found
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0374", variables => { drbd_device => $drbd_device, lv_path => $on_lv }});
$anvil->nice_exit({exit_code => 5});
}
elsif (not $anvil->data->{lvm}{$local_host}{lv}{$on_lv}{active})
{
# LV not active. If we're starting the server or we're the migration target, try to
# activate it.
my $active = $anvil->System->activate_lv({debug => 3, path => $on_lv});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active => $active }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, key => "log_0413", variables => { lv_path => $on_lv }});
if (not $active)
{
# Boo :(
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0375", variables => { drbd_device => $drbd_device, lv_path => $on_lv }});
$anvil->nice_exit({exit_code => 5});
}
}
# LV is good if I am still alive.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, key => "log_0376", variables => {
drbd_device => $drbd_device,
lv_path => $on_lv,
}});
}
### NOTE: Checking/managing firewall ports is an expensive operation, so DRBD ports are permanently
### opened when a resource is created.
return(0);
}
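# (Illustrative: System->activate_lv() above is assumed to wrap LVM activation, conceptually
# 'lvchange --activate y <lv_path>'; the backing LV must be active before DRBD can attach to it.)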
# This verifies that the requested emulator exists and can be used.
sub validate_emulator
{
my ($anvil) = @_;
# What emulator is this using?
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $emulator = $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{emulator};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
emulator => $emulator,
"server::${local_host}::${server}::from_disk::info::emulator" => $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{emulator}
}});
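# (Illustrative: this value comes from the <emulator> element of the libvirt domain XML, commonly
# '/usr/libexec/qemu-kvm' on Enterprise Linux hosts. The two checks below mirror 'test -e' and
# 'test -x'.)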
if (not -e $emulator)
{
# It doesn't exist. Exit with OCF_ERR_INSTALLED (5).
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0401", variables => {
emulator => $emulator,
definition_file => $anvil->data->{server}{definition_file},
}});
$anvil->nice_exit({exit_code => 5});
}
if (not -x $emulator)
{
# We can't execute it. Exit with OCF_ERR_PERM (4).
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0402", variables => { emulator => $emulator }});
$anvil->nice_exit({exit_code => 4});
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0364"});
return(0);
}
# This makes sure the name we see in the definition file matches what we expect.
sub validate_name
{
my ($anvil) = @_;
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
server => $server,
"server::${local_host}::${server}::from_disk::info::name" => $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{name},
}});
# If we failed to read the XML, the server probably doesn't exist.
if (not $anvil->data->{server}{$local_host}{$server}{from_disk}{xml})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0403", variables => {
server => $server,
name => $anvil->data->{server}{definition_xml}->{name},
}});
$anvil->nice_exit({exit_code => 1});
}
# Is the name in the definition file what we expect?
if ($server ne $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{name})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0403", variables => {
server => $server,
name => $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{name},
}});
$anvil->nice_exit({exit_code => 1});
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0363"});
return(0);
}
# This checks that there is enough RAM to run this server.
sub validate_ram
{
my ($anvil) = @_;
# How much RAM does the server need, and how much do we have free?
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $server_ram_bytes = $anvil->data->{server}{$local_host}{$server}{from_disk}{memory};
my $available = $anvil->Get->free_memory({debug => 3});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
server_ram_bytes => $anvil->Convert->add_commas({number => $server_ram_bytes})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $server_ram_bytes}).")",
available => $anvil->Convert->add_commas({number => $available})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $available}).")",
}});
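# (Worked example with hypothetical numbers: a server defined with 8 GiB needs 8,589,934,592 bytes;
# if Get->free_memory() reports less than that, the check below exits with OCF_ERR_GENERIC (1).)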
if ($server_ram_bytes > $available)
{
# Not enough free memory.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0404", variables => {
name => $server,
ram => $anvil->Convert->bytes_to_human_readable({'bytes' => $server_ram_bytes}),
ram_bytes => $anvil->Convert->add_commas({number => $server_ram_bytes}),
available_ram => $anvil->Convert->bytes_to_human_readable({'bytes' => $available}),
available_ram_bytes => $anvil->Convert->add_commas({number => $available}),
}});
$anvil->nice_exit({exit_code => 1});
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0365"});
return(0);
}
### TODO: Make sure the appropriate SN ports are opened.
# This stops (drbdadm down <server>) the storage for a given server on both nodes.
sub manage_drbd_resource
{
my ($anvil, $task, $resource) = @_;
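# (Currently a stub. Per the comment above, the intended implementation would run the equivalent of
# 'drbdadm up <resource>' or 'drbdadm down <resource>' on both nodes; for now this is a no-op.)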
return(0);
}
# This reads the XML definition data into an XML data hash.
sub read_server_definition
{
my ($anvil) = @_;
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
server => $server,
}});
return(0);
}
# This logs the details of this call.
sub show_environment
{
my ($anvil, $level) = @_;
foreach my $key (sort {$a cmp $b} keys %{$anvil->data->{switches}})
{
next if $key eq "raw";
next if $anvil->data->{switches}{$key} eq "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "switches::${key}" => $anvil->data->{switches}{$key} }});
}
foreach my $key (sort {$a cmp $b} keys %{$anvil->data->{environment}})
{
next if $anvil->data->{environment}{$key} eq "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "environment::${key}" => $anvil->data->{environment}{$key} }});
}
foreach my $key (sort {$a cmp $b} keys %ENV)
{
next if exists $anvil->data->{environment}{$key};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "ENV::${key}" => $ENV{$key} }});
}
foreach my $value (@ARGV)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "ARGV" => $value }});
}
return(0);
}
# This will print a quick usage message; for now, it simply exits.
sub show_usage
{
my ($anvil) = @_;
### TODO: How to use this...
$anvil->nice_exit({exit_code => 0});
}
# This prints out the metadata and exits.
sub show_metadata
{
my ($anvil) = @_;
# This is a pretty simple agent, by design. We only take a server name for now.
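# (Illustrative usage from pacemaker's side; the resource and server IDs 'srv01' are hypothetical:
#   pcs resource create srv01 ocf:alteeve:server name=srv01 meta allow-migrate=true op monitor interval=60
# 'allow-migrate=true' tells pacemaker it may use the migrate_to/migrate_from actions declared below.)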
print '<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ocs:alteeve:server" version="0.1">
<version>1.0</version>
<longdesc lang="en">
This resource agent manages KVM+qemu virtual servers on an Anvil! m3 Intelligent Availability(tm) system.
It manages underlying components like DRBD 9 storage resources, bridge connections and so forth.
</longdesc>
<shortdesc lang="en">Anvil! m3 server resource agent</shortdesc>
<parameters>
<parameter name="name" unique="1" required="1">
<longdesc lang="en">
This is the name of the server as reported by virsh.
</longdesc>
<shortdesc lang="en">Server name</shortdesc>
<content type="string"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="30" />
<action name="stop" timeout="60" />
<action name="monitor" timeout="10" />
<action name="notify" timeout="20" />
<action name="migrate_to" timeout="600" />
<action name="migrate_from" timeout="600" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="20" />
</actions>
</resource-agent>
';
$anvil->nice_exit({exit_code => 0});
}