#!/usr/bin/perl
#
# This is the resource agent used to manage servers on the Anvil! Intelligent Availability platform.
#
# License: GNU General Public License (GPL) v2+
# (c) 1997-2023 - Alteeve's Niche! Inc.
#
# WARNING: This is a pretty purpose-specific resource agent. No effort was made to test this on an rgmanager
# cluster or on any configuration outside how the Anvil! m3 uses it. If you plan to adapt it to
# another purpose, let us know and we'll try to help.
#
# NOTE: This method, for the sake of speed and reliability, does not connect to the Anvil! database. If you
# do work on this RA, be sure that a check is made for database connections before SQL calls are made
# in module methods.
#
# Based on: https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc
#
# Error types from pacemaker's perspective;
#
# - Soft Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource
# in place, usually by restarting the resource on the same node.
# - Hard Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource
# which failed with this error by restarting the resource on a different node.
# - Fatal Error - This is a cluster-wide error, it would make no sense to recover such a resource on a
# different node, let alone in-place. When a resource fails with this error, Pacemaker will
# attempt to shut down the resource, and wait for administrator intervention.
#
# Exit codes;
# 0 - OCF_SUCCESS
# - The action completed successfully. This is the expected return code for any successful start, stop,
# migrate_to, meta_data, help, and usage action.
# - For monitor, however, a modified convention applies:
# - If the server is running, we return OCF_SUCCESS. If not running and gracefully stopped or migrated
# off, return OCF_NOT_RUNNING.
#
# 1 - OCF_ERR_GENERIC
# - The action returned a generic error. This is used only when none of the more specific error codes,
# defined below, accurately describes the problem.
# - Pacemaker interprets this exit code as a soft error.
#
# 2 - OCF_ERR_ARGS
# - The resource’s configuration is not valid on this machine. This can happen if the server fails to boot
# because of a missing bridge, for example.
#
# 3 - OCF_ERR_UNIMPLEMENTED
# - The resource agent was instructed to execute an action that we do not implement.
# - Not all resource agent actions are mandatory. We don't implement 'promote' or 'demote'. We do implement
# 'migrate_to', 'migrate_from', and 'notify'. If we're misconfigured as a master/slave resource, for
# example, then we will alert the user about this misconfiguration by returning OCF_ERR_UNIMPLEMENTED.
#
# 4 - OCF_ERR_PERM
# - The action failed due to insufficient permissions. This may be due to a node not being able to open a
# definition file or resource config.
# - Pacemaker interprets this exit code as a hard error.
#
# 5 - OCF_ERR_INSTALLED
# - The action failed because a required component is missing on the node where the action was executed.
# This may be due to a required binary not being executable, or the DRBD resource config file not
# existing.
# - Pacemaker interprets this exit code as a hard error.
#
# 6 - OCF_ERR_CONFIGURED
# - The action failed because the user misconfigured the resource in pacemaker. For example, the user may
# have configured an alphanumeric string for a parameter that really should be an integer.
# - Pacemaker interprets this exit code as a fatal error.
#
# 7 - OCF_NOT_RUNNING
# - The resource was found not to be running. This is an exit code that may be returned by the monitor
# action exclusively. Note that this implies that the resource has either gracefully shut down, or has
# never been started.
#
# 8 - OCF_RUNNING_MASTER
# 9 - OCF_FAILED_MASTER
# - These OCF exit codes are not used here.
#
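# For quick reference while reading the exit calls below, the numeric codes map to these OCF names
# (a hypothetical lookup hash, shown for clarity only; the agent itself returns the numeric codes
# directly);
=cut
my %ocf_exit_code_names = (
	0 => "OCF_SUCCESS",
	1 => "OCF_ERR_GENERIC",
	2 => "OCF_ERR_ARGS",
	3 => "OCF_ERR_UNIMPLEMENTED",
	4 => "OCF_ERR_PERM",
	5 => "OCF_ERR_INSTALLED",
	6 => "OCF_ERR_CONFIGURED",
	7 => "OCF_NOT_RUNNING",
);
=cut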
# NOTE: We keep module use minimal to keep overhead low and this agent as independent as possible.
use strict;
use warnings;
use Anvil::Tools;
use XML::Simple;
use JSON;
use Math::BigInt;
use Data::Dumper;
# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;
# The name of this file is just 'server', which isn't helpful, so we manually set it to our RA name.
my $THIS_FILE = "ocf:alteeve:server";
my $running_directory = ($0 =~ /^(.*?)\/server/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
$running_directory =~ s/^\./$ENV{PWD}/;
}
# NOTE: Setting 'log_level' and 'log_secure' here will get overridden in the main loop. Use the Log methods
# in the loop as well to override defaults in code.
my $anvil = Anvil::Tools->new();
# Log PIDs
$anvil->data->{defaults}{'log'}{pids} = 1;
### Read or Set the environment variables. Example values are noted in the trailing comments below.
# This is the name of the server we're managing.
$anvil->data->{environment}{OCF_RESKEY_name} = defined $ENV{OCF_RESKEY_name} ? $ENV{OCF_RESKEY_name} : "";
# This is our node name
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node} = defined $ENV{OCF_RESKEY_CRM_meta_on_node} ? $ENV{OCF_RESKEY_CRM_meta_on_node} : "";
# This says "UUID", but it's the node ID.
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node_uuid} = defined $ENV{OCF_RESKEY_CRM_meta_on_node_uuid} ? $ENV{OCF_RESKEY_CRM_meta_on_node_uuid} : ""; # Not used here, contains the pacemaker node ID
# This is the timeout for the called action, in milliseconds.
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} = defined $ENV{OCF_RESKEY_CRM_meta_timeout} ? $ENV{OCF_RESKEY_CRM_meta_timeout} : ""; # 60000
# If this is set, we'll bump our log level as well.
$anvil->data->{environment}{PCMK_debug} = defined $ENV{PCMK_debug} ? $ENV{PCMK_debug} : "0"; # Disable debug by default
# These are other variables that are set, but we don't currently care about them
$anvil->data->{environment}{OCF_EXIT_REASON_PREFIX} = defined $ENV{OCF_EXIT_REASON_PREFIX} ? $ENV{OCF_EXIT_REASON_PREFIX} : "ocf-exit-reason:";
$anvil->data->{environment}{OCF_RA_VERSION_MAJOR} = defined $ENV{OCF_RA_VERSION_MAJOR} ? $ENV{OCF_RA_VERSION_MAJOR} : ""; # 1
$anvil->data->{environment}{OCF_RA_VERSION_MINOR} = defined $ENV{OCF_RA_VERSION_MINOR} ? $ENV{OCF_RA_VERSION_MINOR} : ""; # 0
$anvil->data->{environment}{OCF_RESKEY_crm_feature_set} = defined $ENV{OCF_RESKEY_crm_feature_set} ? $ENV{OCF_RESKEY_crm_feature_set} : ""; # Pacemaker OCF version - 3.7.1
$anvil->data->{environment}{OCF_RESOURCE_INSTANCE} = defined $ENV{OCF_RESOURCE_INSTANCE} ? $ENV{OCF_RESOURCE_INSTANCE} : ""; # Name of server / resource being acted on.
$anvil->data->{environment}{OCF_RESOURCE_PROVIDER} = defined $ENV{OCF_RESOURCE_PROVIDER} ? $ENV{OCF_RESOURCE_PROVIDER} : "alteeve";
$anvil->data->{environment}{OCF_RESOURCE_TYPE} = defined $ENV{OCF_RESOURCE_TYPE} ? $ENV{OCF_RESOURCE_TYPE} : "server";
$anvil->data->{environment}{OCF_ROOT} = defined $ENV{OCF_ROOT} ? $ENV{OCF_ROOT} : "/usr/lib/ocf";
$anvil->data->{environment}{OCF_RESKEY_log_level} = defined $ENV{OCF_RESKEY_log_level} ? $ENV{OCF_RESKEY_log_level} : "";
$anvil->data->{environment}{OCF_RESKEY_log_secure} = defined $ENV{OCF_RESKEY_log_secure} ? $ENV{OCF_RESKEY_log_secure} : 0;
# These are set during a migration
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} = defined $ENV{OCF_RESKEY_CRM_meta_migrate_source} ? $ENV{OCF_RESKEY_CRM_meta_migrate_source} : "";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} = defined $ENV{OCF_RESKEY_CRM_meta_migrate_target} ? $ENV{OCF_RESKEY_CRM_meta_migrate_target} : "";
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_record_pending} = defined $ENV{OCF_RESKEY_CRM_meta_record_pending} ? $ENV{OCF_RESKEY_CRM_meta_record_pending} : "";
# When run from 'pcs', environment variables are scrubbed, which can cause us problems.
if (not $ENV{PATH})
{
$ENV{PATH} = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin";
}
# Any variable=value arguments in the resource are set under 'OCF_RESKEY_CRM_meta_'
foreach my $key (sort {$a cmp $b} keys %ENV)
{
#$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { "ENV{".$key."}" => $ENV{$key} }});
next if $key !~ /^OCF_/;
$anvil->data->{environment}{$key} = $ENV{$key};
}
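# For example (hypothetical resource name), a resource created with;
#
#   pcs resource create srv01-test ocf:alteeve:server name="srv01-test" meta allow-migrate="true" op monitor interval="60"
#
# will cause pacemaker to invoke this agent with 'OCF_RESKEY_name=srv01-test' in the environment,
# which the loop above picks up.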
# If pacemaker is in debug, so are we.
if ($anvil->data->{environment}{PCMK_debug})
{
$anvil->Log->level({set => 2});
}
### TODO: Use the running log level in anvil-provision-server to set these log levels in the pcs call.
if (($anvil->data->{environment}{OCF_RESKEY_log_level}) && ($anvil->data->{environment}{OCF_RESKEY_log_level} >= 1) && ($anvil->data->{environment}{OCF_RESKEY_log_level} <= 3))
{
$anvil->Log->level({set => $anvil->data->{environment}{OCF_RESKEY_log_level}});
}
if ($anvil->data->{environment}{OCF_RESKEY_log_secure} == 1)
{
$anvil->Log->secure({set => 1});
}
# Originally, this was designed to start and stop a server's DRBD resources on demand. Early testing appears
# to show this prone to higher risk of fencing if something goes wrong. As such, we're changing the default
# behaviour to leave DRBD resources up. Set this to '1' (here or by switch) to revert back to the old
# behaviour.
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources} = 0;
# We're used by anvil-boot-server and anvil-stop-server. They don't set environment variables, but instead
# use switches. Pick those up, if passed.
$anvil->data->{switches}{migrate_to} = ""; # Sets 'meta_migrate_target'
$anvil->data->{switches}{'migrate-to'} = "";
$anvil->data->{switches}{migrate_from} = ""; # Sets 'meta_migrate_source' When set without 'migrate_to', does a status check after migration
$anvil->data->{switches}{'migrate-from'} = "";
$anvil->data->{switches}{server} = ""; # Sets 'name'.
$anvil->data->{switches}{start} = "";
$anvil->data->{switches}{stop} = "";
$anvil->data->{switches}{monitor} = "";
$anvil->Get->switches();
if (($anvil->data->{switches}{'migrate-to'}) && not ($anvil->data->{switches}{migrate_to}))
{
$anvil->data->{switches}{migrate_to} = $anvil->data->{switches}{'migrate-to'};
}
if (($anvil->data->{switches}{'migrate-from'}) && not ($anvil->data->{switches}{migrate_from}))
{
$anvil->data->{switches}{migrate_from} = $anvil->data->{switches}{'migrate-from'};
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"switches::migrate_to" => $anvil->data->{switches}{migrate_to},
"switches::migrate_from" => $anvil->data->{switches}{migrate_from},
"switches::server" => $anvil->data->{switches}{server},
"switches::start" => $anvil->data->{switches}{start},
"switches::stop" => $anvil->data->{switches}{stop},
"switches::monitor" => $anvil->data->{switches}{monitor},
}});
if ($anvil->data->{switches}{stop_drbd_resources})
{
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources} = 1;
}
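# For example, this should stop a server and bring its DRBD resource(s) down with it (assuming the
# switch is passed with underscores, as read above);
#
#   /usr/lib/ocf/resource.d/alteeve/server --server <server_name> --stop --stop_drbd_resources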
# Something for the logs
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0298"});
=cut Manual calls;
# Start a server;
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --start
# Stop a server
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --stop
# Monitor a server
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --monitor
# Migrate (run on current host)
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --migrate-to <target_node_name> --migrate-from <current_node_name>
=cut
foreach my $key (sort {$a cmp $b} keys %{$anvil->data->{environment}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"environment::${key}" => $anvil->data->{environment}{$key},
}});
}
foreach my $key (sort {$a cmp $b} keys %{$anvil->data->{switches}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"switches::${key}" => $anvil->data->{switches}{$key},
}});
}
# Set environment variables from switches, if otherwise not set.
if (($anvil->data->{switches}{server}) && (not $anvil->data->{environment}{OCF_RESKEY_name}))
{
$anvil->data->{environment}{OCF_RESKEY_name} = $anvil->data->{switches}{server};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"environment::OCF_RESKEY_name" => $anvil->data->{environment}{OCF_RESKEY_name},
}});
}
elsif ((not $anvil->data->{switches}{server}) && ($anvil->data->{environment}{OCF_RESKEY_name}))
{
$anvil->data->{switches}{server} = $anvil->data->{environment}{OCF_RESKEY_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"switches::server" => $anvil->data->{switches}{server},
}});
}
# In some cases, 'OCF_RESKEY_name' isn't set, but 'OCF_RESOURCE_INSTANCE' is. If this is the case, copy the
# data over.
if ((not $anvil->data->{environment}{OCF_RESKEY_name}) && ($anvil->data->{environment}{OCF_RESOURCE_INSTANCE}))
{
$anvil->data->{environment}{OCF_RESKEY_name} = $anvil->data->{environment}{OCF_RESOURCE_INSTANCE};
$anvil->data->{switches}{server} = $anvil->data->{environment}{OCF_RESOURCE_INSTANCE};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"environment::OCF_RESKEY_name" => $anvil->data->{environment}{OCF_RESKEY_name},
"switches::server" => $anvil->data->{switches}{server},
}});
}
if (($anvil->data->{switches}{migrate_to}) && (not $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target}))
{
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} = $anvil->data->{switches}{migrate_to};
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node} = $anvil->data->{switches}{migrate_to};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"environment::OCF_RESKEY_CRM_meta_migrate_target" => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target},
}});
}
if (($anvil->data->{switches}{migrate_from}) && (not $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source}))
{
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} = $anvil->data->{switches}{migrate_from};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"environment::OCF_RESKEY_CRM_meta_migrate_source" => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source},
}});
}
# This is for debugging.
if (not $anvil->data->{switches}{monitor})
{
show_environment($anvil, 3);
}
# If there's a test fail file, return '1' to cause pacemaker to fail this resource.
if ($anvil->data->{environment}{OCF_RESKEY_name})
{
my $test_fail_file = "/tmp/".$anvil->data->{environment}{OCF_RESKEY_name}.".fail";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_fail_file => $test_fail_file }});
if (-e $test_fail_file)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0150", variables => { fail_file => $test_fail_file }});
$anvil->nice_exit({exit_code => 1});
}
}
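# To test how pacemaker handles a failure of this resource (hypothetical server name shown), create
# the fail file and wait for the next monitor action;
#
#   touch /tmp/srv01-test.fail
#
# Remove the file again to let the resource recover.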
### What are we being asked to do?
# start        - Starts the resource.
# stop         - Shuts down the resource.
# monitor      - ('status' aliases here) Queries the resource for its state.
# meta-data    - Dumps the resource agent metadata.
# promote      - Turns a resource into the Master role (Master/Slave resources only).
# demote       - Turns a resource into the Slave role (Master/Slave resources only).
# migrate_to   - Migrates the resource to the migration target node.
# migrate_from - Completes a live migration of the resource on the new host.
# validate-all - Validates a resource's configuration.
# help         - ('usage' maps here) Displays a usage message when the resource agent is invoked from the command line, rather than by the cluster manager.
# notify       - Informs the resource about changes in the state of other clones.
if ($anvil->data->{switches}{migrate_to})
{
# Migrate the server to the target node.
migrate_server($anvil);
}
elsif ($anvil->data->{switches}{migrate_from})
{
# This is called after a migration is complete, so we're basically just doing a status check. In case
# the TCP port isn't open in the firewall yet though, we'll check that now.
$anvil->Network->manage_firewall();
# Now do the status check.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0529", variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} }});
server_status($anvil);
}
elsif ($anvil->data->{switches}{start})
{
# Start the server
start_server($anvil);
}
elsif ($anvil->data->{switches}{stop})
{
# Stop the server
stop_server($anvil);
}
elsif (($anvil->data->{switches}{monitor}) or ($anvil->data->{switches}{status}))
{
# Report the status of the server.
server_status($anvil);
}
elsif (($anvil->data->{switches}{metadata}) or ($anvil->data->{switches}{'meta-data'}))
{
show_metadata($anvil);
}
elsif ($anvil->data->{switches}{promote})
{
# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0299", variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} }});
$anvil->nice_exit({exit_code => 3});
}
elsif ($anvil->data->{switches}{demote})
{
# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0300", variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} }});
$anvil->nice_exit({exit_code => 3});
}
elsif ($anvil->data->{switches}{'validate-all'})
{
# Validate our local config and setup.
validate_all($anvil);
$anvil->nice_exit({exit_code => 0});
}
elsif (($anvil->data->{switches}{help}) or ($anvil->data->{switches}{usage}))
{
# Show the usage information
show_usage($anvil);
$anvil->nice_exit({exit_code => 0});
}
elsif ($anvil->data->{switches}{notify})
{
### NOTE: See issue 392 (https://github.com/ClusterLabs/anvil/issues/392) to explain why we switched
### the RC from 3 to 0.
# We don't implement this
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, key => "log_0301"});
$anvil->nice_exit({exit_code => 0});
}
else
{
# We were called in some unexpected way. Log an error, show usage and exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, key => "log_0302"});
show_environment($anvil, 0);
$anvil->nice_exit({exit_code => 1});
}
# If we hit here, something very wrong happened.
$anvil->nice_exit({exit_code => 255});
#############################################################################################################
# Functions #
#############################################################################################################
# When called with "start", this verifies that 'libvirtd' and 'drbd' are running (and starts them if
# not). When called with "stop", a check is made on both nodes; if all VMs are gone, "libvirtd" and
# "drbd" are stopped.
sub check_daemons
{
my ($anvil, $task) = @_;
my $problem = $anvil->Cluster->parse_cib({debug => 3});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { problem => $problem }});
if ($problem)
{
# Pacemaker isn't running, or some other problem. Someone must have called this script
# directly or something.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0133"});
$anvil->nice_exit({exit_code => 1});
}
# Is the peer running? We'll use this to know whether to try and start daemons on the peer.
my $peer_name = $anvil->data->{cib}{parsed}{peer}{name};
my $peer_ready = $anvil->data->{cib}{parsed}{peer}{ready};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
peer_name => $peer_name,
peer_ready => $peer_ready,
}});
if ($task eq "start")
{
### It doesn't look like we need to start drbd. Up'ing the first resource works without it.
### libvirtd also starts on-demand.
=cut
foreach my $daemon ("libvirtd.service", "drbd.service")
{
my $running_local = 0;
my $running_peer = 0;
my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
return_code => $return_code,
}});
if ($return_code eq "3")
{
# It is stopped, start it..
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0482", variables => { daemon => $daemon }});
my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." start ".$daemon});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
return_code => $return_code,
}});
my $loops = 0;
my $running = 0;
until ($running)
{
my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
return_code => $return_code,
}});
if ($return_code eq "0")
{
# It's running
$running = 1;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0483", variables => { daemon => $daemon }});
}
else
{
$loops++;
if ($loops > 5)
{
# Give up
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0134", variables => { daemon => $daemon }});
$anvil->nice_exit({exit_code => 1});
}
else
{
# Wait for a second.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0484", variables => { daemon => $daemon }});
sleep 1;
}
}
}
}
elsif ($return_code eq "0")
{
# Running, nothing to do.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0485", variables => { daemon => $daemon }});
}
if ($peer_ready)
{
my ($output, $error, $return_code) = $anvil->Remote->call({
target => $peer_name,
shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
if ($return_code eq "3")
{
# Stopped, start it..
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0486", variables => {
daemon => $daemon,
host => $peer_name,
}});
my ($output, $error, $return_code) = $anvil->Remote->call({
target => $peer_name,
shell_call => $anvil->data->{path}{exe}{systemctl}." start ".$daemon,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
my $loops = 0;
my $running = 0;
until ($running)
{
my ($output, $error, $return_code) = $anvil->Remote->call({
target => $peer_name,
shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
if ($return_code eq "0")
{
$running = 1;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0487", variables => {
daemon => $daemon,
host => $peer_name,
}});
}
else
{
$loops++;
if ($loops > 5)
{
### TODO: We may want to NOT die here, if
### we're booting a server (though we
### will if we're migrating).
# Give up
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0135", variables => {
daemon => $daemon,
host => $peer_name,
}});
$anvil->nice_exit({exit_code => 1});
}
else
{
# Wait for a second.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0488", variables => {
daemon => $daemon,
host => $peer_name,
}});
sleep 1;
}
}
}
}
elsif ($return_code eq "0")
{
# Running, nothing to do.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0489", variables => {
daemon => $daemon,
host => $peer_name,
}});
}
}
}
=cut
}
return(0);
}
=cut
STATES
The State field lists what state each domain is currently in. A domain can be in one of the following
possible states:
running - The domain is currently running on a CPU
idle - The domain is idle, and not running or runnable. This can be caused because the domain is
waiting on IO (a traditional wait state) or has gone to sleep because there was nothing else
for it to do.
paused - The domain has been paused, usually occurring through the administrator running virsh suspend.
When in a paused state the domain will still consume allocated resources like memory, but will
not be eligible for scheduling by the hypervisor.
in shutdown - The domain is in the process of shutting down, i.e. the guest operating system has been
notified and should be in the process of stopping its operations gracefully.
shut off - The domain is not running. Usually this indicates the domain has been shut down completely, or
has not been started.
crashed - The domain has crashed, which is always a violent ending. Usually this state can only occur if
the domain has been configured not to restart on crash.
pmsuspended - The domain has been suspended by guest power management, e.g. entered into s3 state.
=cut
# This boots the server if possible.
sub start_server
{
my ($anvil) = @_;
# Before we do anything, make sure that 'libvirtd' and 'drbd' services are running.
check_daemons($anvil, "start");
# Start procedure;
# 1. Read the XML definition file and find the backing storage and bridges. Soft error if read fails.
# 2. Make sure the name matches.
# 3. Make sure we have enough free RAM.
# 4. Make sure the emulator exists (can be an issue after migrating from a different-generation Anvil!).
# 5.1. Make sure optical drives with mounted data have the disk present. Soft error if not.
# 5.2. Find any backing DRBD devices
# 6. For each DRBD device;
# 6.1. Make sure the backing LV is ACTIVE. Soft error if not.
# 6.2. Check if the drbd resource is up. If not, up it.
# 6.3. Make sure the backing disk is UpToDate. Soft error if not.
# 6.4. Make sure the backing device is 'Connected' or 'Connecting'. Call a connect if not.
# 7. Make sure all bridges exist and soft error if not.
# 8. Start the server.
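# (For step 6, the disk and connection states this agent reads via DRBD->get_status() can also be
# eyeballed manually with 'drbdadm status <resource>', if you're debugging a failed start.)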
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0303", variables => { server => $server }});
if ((not $anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node}) && ($anvil->data->{switches}{target}))
{
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node} = $anvil->data->{switches}{target};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"environment::OCF_RESKEY_CRM_meta_on_node" => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node},
}});
}
# Make sure things are sane.
validate_all($anvil);
# Is the server already running somewhere?
find_server($anvil);
# Start the resource, if needed.
start_drbd_resource($anvil);
# Get a database connection so that we can record the server's state as it boots.
$anvil->Database->connect({
check_for_resync => 0,
retry => 0,
sensitive => 1,
});
# Still alive? Boot!
my ($success) = $anvil->Server->boot_virsh({debug => 2, server => $server});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { success => $success }});
if ($success)
{
# Make sure server constraints favour us.
my $local_name = $anvil->data->{cib}{parsed}{'local'}{name};
my $peer_name = $anvil->data->{cib}{parsed}{peer}{name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
local_name => $local_name,
peer_name => $peer_name,
}});
my $problem = $anvil->Cluster->_set_server_constraint({
debug => 2,
server => $server,
preferred_node => $local_name,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
# Success!
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0309", variables => { server => $server }});
$anvil->nice_exit({exit_code => 0});
}
else
{
# WTF?
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0310", variables => {
server => $server,
'state' => defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : "#!string!unit_0003!#",
}});
$anvil->nice_exit({exit_code => 6});
}
# If we're still alive, then we didn't see the server in the list of running servers, which is really weird.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0311", variables => { server => $server }});
$anvil->nice_exit({exit_code => 1});
}
# This stops the DRBD resource(s) that ran under a server.
sub stop_drbd_resource
{
my ($anvil) = @_;
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $host = $anvil->Get->short_host_name;
my $peer = $anvil->data->{drbd}{config}{$host}{peer};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
host => $host,
peer => $peer,
}});
# Stop the DRBD resource.
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
my $peer_ip = $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0408", variables => {
server => $server,
peer => $peer,
peer_ip => $peer_ip,
resource => $resource,
}});
# Bring the peer's resource down.
$anvil->DRBD->manage_resource({
debug => 3,
resource => $resource,
task => "down",
target => $peer_ip,
});
# Bring the local resource down
$anvil->DRBD->manage_resource({
debug => 3,
resource => $resource,
task => "down",
});
}
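# (Conceptually, each manage_resource() call above is the equivalent of running
# 'drbdadm down <resource>' on the peer and then locally.)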
return(0);
}
# This starts the drbd resource(s) for the requested server, if needed.
sub start_drbd_resource
{
my ($anvil) = @_;
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $host = $anvil->Get->short_host_name;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
host => $host,
}});
# Do we need startup?
my $local_startup_needed = 0;
$anvil->DRBD->get_status({debug => 3});
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
# Is the current resource up locally already?
my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:resource' => $resource,
's2:role' => $role,
}});
if ((lc($role) ne "secondary") && (lc($role) ne "primary"))
{
$local_startup_needed = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }});
last;
}
else
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0434", variables => {
resource => $resource,
role => $role,
}});
}
}
# Do I need to start the DRBD resource locally? If so, do so.
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }});
if ($local_startup_needed)
{
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
# Bring the local resource up
$anvil->DRBD->manage_resource({
debug => 2,
resource => $resource,
task => "up",
});
# Now wait for it to come up.
my $waiting = 1;
my $wait_until = time + 5;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:time' => time,
's2:wait_until' => $wait_until,
}});
while($waiting)
{
$anvil->DRBD->get_status({debug => 3});
my $all_up = 1;
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}})
{
my $disk_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'};
$disk_state = defined $disk_state ? lc($disk_state) : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:resource' => $resource,
's2:volume' => $volume,
's3:disk_state' => $disk_state,
}});
if (($disk_state ne "inconsistent") &&
($disk_state ne "outdated") &&
($disk_state ne "consistent") &&
($disk_state ne "uptodate"))
{
$all_up = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_up => $all_up }});
}
}
if ($all_up)
{
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
if ($waiting)
{
sleep 1;
}
elsif (time > $wait_until)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0138"});
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
}
}
# If auto-promote isn't set, promote the resource.
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"drbd::config::${local_host}::auto-promote" => $anvil->data->{drbd}{config}{$local_host}{'auto-promote'},
}});
if (not $anvil->data->{drbd}{config}{$local_host}{'auto-promote'})
{
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0420", variables => {
server => $server,
resource => $resource,
}});
# Make the local resource primary.
$anvil->DRBD->manage_resource({
resource => $resource,
task => "primary",
});
}
}
}
else
{
# Call an adjust to reset the config, in case something like 'allow-two-primaries' was
# enabled on one node but not the other.
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0745", variables => {
server => $server,
resource => $resource,
}});
$anvil->DRBD->manage_resource({
resource => $resource,
task => "adjust",
});
}
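# (The 'adjust' task mirrors 'drbdadm adjust <resource>', re-applying the on-disk
# configuration to the running resource.)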
}
### NOTE: We always check the peer now, in case its resource is down and ours happens to be up.
# See if we're inconsistent and, if so, if we can connect our peers.
if (0)
{
sleep 2;
$anvil->DRBD->get_status({debug => 3});
my $peer_startup_needed = 1;
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
# Is the current resource up locally already?
my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:resource' => $resource,
's2:role' => $role,
}});
# Check all volumes.
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}})
{
my $disk_state = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disk_state => $disk_state }});
if ((lc($disk_state) eq "consistent") or
(lc($disk_state) eq "outdated") or
(lc($disk_state) eq "failed") or
(not $disk_state))
{
# This will trigger trying to ssh into peer(s) and up'ing their resource.
$peer_startup_needed = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }});
last;
}
}
}
}
# Do we need to start the resource on our peers?
#$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }});
#if (not $peer_startup_needed)
#{
# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0431"});
# return(0);
#}
# Start DRBD on the peer(s), if we can.
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }});
foreach my $host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}})
{
my $is_local = $anvil->Network->is_local({host => $host});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:host' => $host,
's2:is_local' => $is_local,
}});
my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$host}{'connection-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { connection_state => $connection_state }});
if (lc($connection_state) ne "connected")
{
# Try to connect to the peer and up this resource.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0694", variables => {
host => $host,
resource => $resource,
connection_state => $connection_state,
}});
my ($access) = $anvil->Remote->test_access({target => $host});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }});
if ($access)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0695", variables => {
host => $host,
resource => $resource,
}});
$anvil->DRBD->manage_resource({
debug => 2,
resource => $resource,
task => "up",
target => $host,
});
}
else
{
# No access
my $all_uptodate = 1;
foreach my $volume (sort {$a <=> $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$host}{volume}})
{
my $peer_disk_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$host}{volume}{$volume}{'peer-disk-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
volume => $volume,
peer_disk_state => $peer_disk_state,
}});
if (lc($peer_disk_state) ne "uptodate")
{
$all_uptodate = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_uptodate => $all_uptodate }});
}
}
if ($all_uptodate)
{
# Booting should be fine
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0747", variables => { host => $host }});
}
else
{
# Booting might fail...
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0136", variables => { host => $host }});
}
}
}
}
}
# Loop until all our resources are Connected or UpToDate
my $waiting = 1;
my $wait_until = time + 5;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:time' => time,
's2:wait_until' => $wait_until,
}});
while($waiting)
{
sleep 1;
$anvil->DRBD->get_status({debug => 3});
my $all_resources_ok = 1;
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
# This is set to '1' if either the volumes are UpToDate or Sync'ing.
$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{ok} = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"drbd::status::${local_host}::resource::${resource}::ok" => $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{ok},
}});
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}})
{
# This will be used to mark if a volume is being sync'ed later, if needed.
$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok} = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"drbd::status::${local_host}::resource::${resource}::devices::volume::${volume}::ok" => $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok},
}});
my $disk_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'};
$disk_state = defined $disk_state ? lc($disk_state) : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disk_state => $disk_state }});
if ($disk_state eq "uptodate")
{
$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"drbd::status::${local_host}::resource::${resource}::devices::volume::${volume}::ok" => $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok},
}});
}
}
if (not $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{ok})
{
# See if we're a SyncTarget
foreach my $connection (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}})
{
my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$connection}{'connection-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
connection => $connection,
connection_state => $connection_state,
}});
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$connection}{volume}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"drbd::status::${local_host}::resource::${resource}::devices::volume::${volume}::ok" => $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok},
}});
next if $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok};
my $replication_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$connection}{volume}{$volume}{'replication-state'};
$replication_state = defined $replication_state ? lc($replication_state) : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:volume' => $volume,
's2:replication_state' => $replication_state,
}});
if ($replication_state =~ /sync/)
{
# We're good to go.
$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"drbd::status::${local_host}::resource::${resource}::devices::volume::${volume}::ok" => $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok},
}});
}
}
}
}
# Loop through all volumes on all resources and see if they're OK. If they are, mark the resource as OK.
my $resource_ok = 1;
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}})
{
my $volume_ok = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
volume => $volume,
volume_ok => $volume_ok,
}});
if (not $volume_ok)
{
$resource_ok = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_ok => $resource_ok }});
}
}
if (not $resource_ok)
{
$all_resources_ok = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_resources_ok => $all_resources_ok }});
}
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_resources_ok => $all_resources_ok }});
if ($all_resources_ok)
{
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
elsif (time > $wait_until)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0137"});
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
}
return(0);
}
### TODO: Rework this to use Server->connect_to_libvirtd and phase out Server->find().
# This uses the DRBD information to find other peers and see if the server is running on them.
sub find_server
{
my ($anvil) = @_;
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $host = $anvil->Get->short_host_name;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0415", variables => { server => $server }});
foreach my $ip_address (sort {$a cmp $b} keys %{$anvil->data->{drbd}{config}{$host}{ip_addresses}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { ip_address => $ip_address }});
# Can we access this IP?
my $access = $anvil->Remote->test_access({target => $ip_address});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }});
if ($access)
{
$anvil->Server->find({
debug => 3,
target => $ip_address,
remote_user => "root",
});
}
}
foreach my $this_server (sort {$a cmp $b} keys %{$anvil->data->{server}{location}})
{
my $status = $anvil->data->{server}{location}{$this_server}{status};
my $host = $anvil->data->{server}{location}{$this_server}{host_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
this_server => $this_server,
status => $status,
host => $host,
}});
}
if ((exists $anvil->data->{server}{location}{$server}) && ($anvil->data->{server}{location}{$server}{host_name}))
{
# The server is running. If it is running here, exit with success. If it's running elsewhere,
# exit with a failure.
my $status = $anvil->data->{server}{location}{$server}{status};
my $host = $anvil->data->{server}{location}{$server}{host_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
status => $status,
host => $host,
}});
if ($host eq $anvil->Get->host_name)
{
# Already running, we're good, and we're done.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0416", variables => { server => $server }});
$anvil->nice_exit({exit_code => 0});
}
elsif ($host =~ /dr(\d+)$/)
{
# The server is running elsewhere. If the peer host is DR, exit with
# OCF_ERR_CONFIGURED (6) so that pacemaker doesn't try to also start the server on
# the other node, because we don't know the state of it here.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0417", variables => {
server => $server,
host => $host,
}});
$anvil->nice_exit({exit_code => 6});
}
else
{
# It looks like it's running on the peer. So we'll exit OCF_ERR_CONFIGURED (6) so that
# pacemaker doesn't try to start it here while it's running on our peer.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0418", variables => {
server => $server,
host => $host,
}});
$anvil->nice_exit({exit_code => 6});
}
}
return(0);
}
# This shuts down the server if possible.
sub stop_server
{
my ($anvil) = @_;
# Stopping the server is simply a question of "is the server running?" and, if so, stop it. Once
# stopped, and if enabled, we stop the DRBD resource on both nodes.
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0582", variables => { server => $server }});
# Read in and parse the server's XML.
$anvil->System->check_storage();
$anvil->Server->get_status({debug => 2, server => $server});
# Get a connection so that we can mark it as being in shutdown and shut off.
$anvil->Database->connect({
check_for_resync => 0,
retry => 0,
sensitive => 1,
});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0313", variables => { server => $server }});
my $success = $anvil->Server->shutdown_virsh({server => $server});
if (not $success)
{
# Something went wrong. Details should be in the logs.
$anvil->nice_exit({exit_code => 1});
}
# Now stop the DRBD resource(s).
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'environment::OCF_RESKEY_CRM_meta_stop_drbd_resources' => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources},
}});
if ($anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources})
{
stop_drbd_resource($anvil);
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0324", variables => { server => $server }});
$anvil->nice_exit({exit_code => 0});
}
# This checks the status of the server.
sub server_status
{
my ($anvil) = @_;
# If the named server is running, return OCF_SUCCESS (rc: 0), otherwise OCF_NOT_RUNNING (rc: 7). If
# the server is failed, return OCF_ERR_GENERIC (1).
my $state = "";
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0521", variables => { server => $server }});
if (not $anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout})
{
# Set a sane default of 60 seconds.
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} = 60000;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "log_0331", variables => { timeout => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} }});
}
# Is 'libvirtd' running? We'll wait up to half the timeout for it to start (in case it _just_ started)
# before timing out.
my $wait_until = time + ($anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} / 2000); # Divide by 2,000 to convert milliseconds to seconds and halve the total (ie: 60,000 ms -> 30 s).
my $look_for_pid = 0;
my $libvirtd_wait = 1;
my $warning_shown = 0;
while($libvirtd_wait)
{
my $running = $anvil->System->check_daemon({daemon => "libvirtd.service"});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { running => $running }});
if ($running)
{
$libvirtd_wait = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { libvirtd_wait => $libvirtd_wait }});
}
else
{
# On EL8 and above, libvirtd starts on demand, so it not being up yet isn't necessarily an error.
if (not $warning_shown)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0522", variables => { wait_time => ($wait_until - time) }});
$warning_shown = 1;
}
sleep 1;
if (time > $wait_until)
{
# Libvirtd isn't running, try to find the PID of the server (in case it's
# running and libvirtd isn't)
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, priority => "alert", key => "warning_0057"});
$look_for_pid = 1;
$libvirtd_wait = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
look_for_pid => $look_for_pid,
libvirtd_wait => $libvirtd_wait,
}});
}
}
}
# If libvirtd wasn't running, we'll manually look for a PID.
if ($look_for_pid)
{
my $server_up = 0;
my $shell_call = $anvil->data->{path}{exe}{ps}." aux";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
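# A matching process line typically looks like this (trimmed, hypothetical server name);
#
#   qemu ... /usr/libexec/qemu-kvm -name guest=srv01-test,debug-threads=on ...
#
# The 'guest=<name>,' portion is what we extract below.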
foreach my $line (split/\n/, $output)
{
next if $line !~ /qemu-kvm/;
$line = $anvil->Words->clean_spaces({ string => $line });
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }});
if ($line =~ /guest=(.*?),/)
{
my $this_server = $1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { this_server => $this_server }});
if ($this_server eq $server)
{
# Found it.
$server_up = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_up => $server_up }});
last;
}
}
}
if ($server_up)
{
# The server is running. Exit with OCF_SUCCESS (rc 0);
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0523"});
$anvil->nice_exit({exit_code => 0});
}
else
{
# The server is not running. Exit with OCF_NOT_RUNNING (rc: 7)
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0524"});
$anvil->nice_exit({exit_code => 7});
}
}
else
{
# Parse the virsh state. If it's listed as 'crashed', return OCF_ERR_GENERIC (rc: 1). If it's
# 'in shutdown', 'loop' gets set to 1 and this will loop indefinitely. We don't put a timer
# on it, we let pacemaker handle that.
my $loop = 1;
while($loop)
{
$loop = 0;
my $found = 0;
my $shell_call = $anvil->data->{path}{exe}{setsid}." --wait ".$anvil->data->{path}{exe}{virsh}." list --all";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
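# The output being parsed looks like;
#
#    Id   Name         State
#   ----------------------------
#    1    srv01-test   running
#    -    srv02-test   shut off
#
# The regex below captures everything after the server's name as its state.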
foreach my $line (split/\n/, $output)
{
$line = $anvil->Words->clean_spaces({ string => $line });
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }});
if ($line =~ /\s\Q$server\E\s+(.*)/)
{
my $state = $1;
$found = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
found => $found,
'state' => $state,
}});
=cut
* Server states;
running - The domain is currently running on a CPU
idle - The domain is idle, and not running or runnable. This can be caused because the domain is waiting on IO (a traditional wait state) or has gone to sleep because there was nothing else for it to do.
paused - The domain has been paused, usually occurring through the administrator running virsh suspend. When in a paused state the domain will still consume allocated resources like memory, but will not be eligible for scheduling by the hypervisor.
in shutdown - The domain is in the process of shutting down, i.e. the guest operating system has been notified and should be in the process of stopping its operations gracefully.
shut off - The domain is not running. Usually this indicates the domain has been shut down completely, or has not been started.
crashed - The domain has crashed, which is always a violent ending. Usually this state can only occur if the domain has been configured not to restart on crash.
pmsuspended - The domain has been suspended by guest power management, e.g. entered into s3 state.
=cut
### TODO: Should we treat 'idle' the same as crashed?
if ($state eq "crashed")
{
# Woops. Exit with OCF_ERR_GENERIC (rc: 1).
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0058", variables => { server => $server }});
$anvil->nice_exit({exit_code => 1});
}
elsif ($state eq "in shutdown")
{
# Wait.
$loop = 1;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0525", variables => { server_name => $server }});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { loop => $loop }});
sleep 1;
last;
}
elsif ($state eq "shut off")
{
# Exit with OCF_NOT_RUNNING (rc: 7);
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0526", variables => { server_name => $server }});
$anvil->nice_exit({exit_code => 7});
}
else
{
# In some fashion or another, the server is running. Exit with OCF_SUCCESS (rc: 0)
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0527", variables => {
'state' => $state,
server_name => $server,
}});
$anvil->nice_exit({exit_code => 0});
}
}
}
# If it wasn't found at all, exit.
if (not $found)
{
# Exit with OCF_NOT_RUNNING (rc: 7);
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0526", variables => { server_name => $server }});
$anvil->nice_exit({exit_code => 7});
}
}
}
# This method should never return. Just in case it does though, exit with generic error.
$anvil->nice_exit({exit_code => 1});
}
# Migrate the server
sub migrate_server
{
my ($anvil) = @_;
# Do a sensitive, quick DB connect.
$anvil->Database->connect({
check_for_resync => 0,
retry => 0,
sensitive => 1,
});
### NOTE: For now, we're not going to block if the target is not UpToDate. There are times when a
### user might want to do this (i.e., sync will be done soon and the need to evacuate the node
### ASAP is high). Maybe we'll enforce this and require a '--force' switch later?
# If we were given 'migrate_to', we need to make sure the storage is UpToDate on the peer for all
# backing resources. We can't check the target's bridges, but the migration will fail if one is
# missing.
# If we're given 'migrate_from', we're pulling the server towards us, so we can check both bridges
# and storage.
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $source = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source};
my $target = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target};
my $meta_on_node = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
source => $source,
target => $target,
meta_on_node => $meta_on_node,
}});
# Make sure switches are at least defined.
$anvil->data->{switches}{migrate_to} = "" if not defined $anvil->data->{switches}{migrate_to};
$anvil->data->{switches}{migrate_from} = "" if not defined $anvil->data->{switches}{migrate_from};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::migrate_to' => $anvil->data->{switches}{migrate_to},
'switches::migrate_from' => $anvil->data->{switches}{migrate_from},
}});
# Log what we're doing.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0528", variables => {
server => $server,
target_host => $target,
}});
# If there is a '<target>.mnX' (migration network X) entry that can be resolved, we'll change the
# target to use that. This is a dedicated, usually back-to-back network used in nodes specifically
# for migration.
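# (For example, a target of 'node2.example.com' would be tested as 'node2.mn1' below;
#  the host name here is hypothetical.)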
my $test_target = $target;
$test_target =~ s/\..*$//;
$test_target .= ".mn1"; # Might want to make this a loop to support MN2+ later
my $test_ip = $anvil->Convert->host_name_to_ip({debug => 2, host_name => $test_target});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
test_target => $test_target,
test_ip => $test_ip,
}});
if ($test_ip)
{
# Can we access the peer with this?
my ($access) = $anvil->Remote->test_access({debug => 3, target => $test_ip});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }});
# Did we get access?
if ($access)
{
# Yup! Switch the target.
$target = $test_ip;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0663", variables => {
target => $test_target,
ip => $target,
}});
}
}
# Before migrating, make sure the daemons are running on the peer.
check_daemons($anvil, "start");
# The actual migration command will involve enabling dual primary, then beginning the migration. The
# virsh call will depend on if we're pushing or pulling. Once the migration completes, regardless of
# success or failure, dual primary will be disabled again.
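# As a rough, illustrative sketch of what Server->migrate_virsh() does under the hood
# (the exact calls live in that method), a live push migration is shaped like;
#   drbdadm net-options <resource> --allow-two-primaries=yes
#   virsh migrate --undefinesource --live <server> qemu+ssh://<target>/system
# with 'allow-two-primaries' reverted once virsh returns.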
my $migrated = 0;
if ($target)
{
# Can I even connect to the target?
my ($access) = $anvil->Remote->test_access({debug => 3, target => $target});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }});
if (not $access)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0429", variables => {
server => $server,
target => $target,
}});
### TODO: I wonder if this should be exit'ed with '6'?
$anvil->nice_exit({exit_code => 5});
}
# Find the server
$anvil->Server->find({debug => 3});
my $server_host = defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : "";
my $server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server_host => $server_host,
server_status => $server_status,
}});
# Is it already on the target?
if (not $server_status)
{
# Maybe...
$anvil->Server->find({debug => 3, target => $target});
$server_host = defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : "";
$server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server_host => $server_host,
server_status => $server_status,
}});
if (($server_host eq $target) && (($server_status) && ($server_status eq "running")))
{
# Already over there, we're done.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0430", variables => {
server => $server,
target => $target,
}});
$anvil->nice_exit({exit_code => 0});
}
}
if (not $server_host)
{
# The server wasn't found on this host
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0344", variables => { server => $server }});
$anvil->nice_exit({exit_code => 1});
}
# Get a view of the servers locally and on our peer.
validate_all($anvil);
# Get the DRBD status.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0387"});
$anvil->DRBD->get_status({debug => 3});
# Make sure all resource(s) are ready for the server.
my $all_up_to_date = 1;
my $host = $anvil->Get->short_host_name;
my $peer_name = $anvil->data->{drbd}{config}{$host}{peer};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
host => $host,
peer_name => $peer_name,
}});
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
{
my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'};
my $peer_node_id = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
peer_node_id => $peer_node_id,
resource => $resource,
connection_state => $connection_state,
}});
if (lc($connection_state) ne "connected")
{
# Try to bring the resource up on the peer now.
$anvil->DRBD->manage_resource({
debug => 2,
resource => $resource,
task => "up",
target => $target,
});
# We'll give it 20 seconds.
my $wait = 20;
while($wait)
{
$anvil->DRBD->get_status({debug => 3});
$connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
resource => $resource,
connection_state => $connection_state,
}});
if (lc($connection_state) eq "connected")
{
# It's up!
$wait = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'wait' => $wait }});
}
else
{
$wait--;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'wait' => $wait }});
if (not $wait)
{
# We're done waiting.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0428", variables => {
server => $server,
target => $target,
resource => $resource,
connection_state => $connection_state,
}});
### TODO: I wonder if this should be exit'ed with '6'?
$anvil->nice_exit({exit_code => 5});
}
}
}
}
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}})
{
my $peer_disk_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'peer-disk-state'};
my $percent_in_sync = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'percent-in-sync'};
my $replication_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
volume => $volume,
peer_disk_state => $peer_disk_state,
percent_in_sync => $percent_in_sync,
replication_state => $replication_state,
}});
if (lc($peer_disk_state) ne "uptodate")
{
$all_up_to_date = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_up_to_date => $all_up_to_date }});
}
}
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_up_to_date => $all_up_to_date }});
if (not $all_up_to_date)
{
### TODO: If we decide later to block migration to Inconsistent peers, here's where we'd do it.
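### A sketch of that block, were we to enforce it, would log an error and then
### call '$anvil->nice_exit({exit_code => 1});' (OCF_ERR_GENERIC). For now, we
### intentionally proceed with the migration.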
}
# If we're still alive, we're ready to migrate.
($migrated) = $anvil->Server->migrate_virsh({
debug => 2,
server => $server,
source => $source,
target => $target,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }});
}
elsif ($source)
{
### NOTE: Pacemaker doesn't seem to ever pull servers.
# Pull the server here. Start by verifying it's on the 'meta_on_node' host.
# Scan locally and on our peer
$anvil->Server->find({debug => 2});
$anvil->Server->find({debug => 2, target => $meta_on_node, refresh => 0});
my $host = defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : "";
my $short_host = ($host =~ /^(.*?)\..*$/)[0];
my $status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
host => $host,
short_host => $short_host,
status => $status,
target => $target,
}});
# Convert the host to a short name, in case the node's name is the short version.
my $server_host = defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : "";
my $server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server_host => $server_host,
server_status => $server_status,
}});
# This is called after a migration. If that is the case here, the target will be us. Just
# make sure it is running and, if so, return '0'. The 'meta_on_node' is the new host.
if (($target eq $anvil->Get->host_name) or ($target eq $anvil->Get->short_host_name) or ($target eq $meta_on_node))
{
# If it's running, we're successfully out.
if ((($host eq $target) or ($short_host eq $target)) && ($status eq "running"))
{
# Success!
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0347", variables => { server => $server }});
$anvil->nice_exit({exit_code => 0});
}
# If we're still alive, we'll proceed as if we're pulling the server to us, and maybe
# that will work.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0348", variables => { server => $server }});
}
# Validate as if we were about to boot the server.
validate_all($anvil);
# Call the pull migration.
($migrated) = $anvil->Server->migrate_virsh({
debug => 2,
server => $server,
source => $source,
target => $target
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }});
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }});
if (not $migrated)
{
# Exit
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0357"});
$anvil->nice_exit({exit_code => 1});
}
# If we made it here, we succeeded.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0360"});
$anvil->nice_exit({exit_code => 0});
}
# Validation checks that we have the definition XML and resource config, and that the needed apps are installed.
sub validate_all
{
my ($anvil) = @_;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0361"});
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $source = defined $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} ? $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} : "";
my $target = defined $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} ? $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
source => $source,
target => $target,
}});
# Log what we're doing.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0581", variables => { server => $server }});
# Read in and parse the server's XML.
$anvil->System->check_storage({debug => 3});
$anvil->Server->get_status({debug => 2, server => $server});
# Is the name in the definition file what we expect (and did we read the XML data at all)?
validate_name($anvil);
# Make sure the emulator it wants is the one we have.
validate_emulator($anvil);
# These tests are only needed if we're about to boot the server
if (($anvil->data->{switches}{start}) or ($source))
{
# Check that we have enough RAM.
validate_ram($anvil);
}
# Validate bridges
validate_bridges($anvil);
# Validate storage (Disks and optical media)
validate_storage($anvil);
return(0);
}
# This ensures that the bridges the server connects to exist on this node.
sub validate_bridges
{
my ($anvil) = @_;
# Get my bridge list
$anvil->Get->bridges({debug => 3});
# Find the network interfaces and verify that their bridges exist.
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
foreach my $mac (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{from_disk}{device}{interface}})
{
# See if we have this bridge
my $found = 0;
my $bridge = $anvil->data->{server}{$local_host}{$server}{from_disk}{device}{interface}{$mac}{bridge};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { bridge => $bridge }});
foreach my $interface_name (sort {$a cmp $b} keys %{$anvil->data->{$local_host}{network}{bridges}{bridge}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { interface_name => $interface_name }});
if ((exists $anvil->data->{$local_host}{network}{bridges}{bridge}{$interface_name}) && ($anvil->data->{$local_host}{network}{bridges}{bridge}{$interface_name}{found}))
{
$found = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { found => $found }});
last;
}
}
if ($found)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0368", variables => { bridge => $bridge }});
}
else
{
# Missing bridge.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0369", variables => { bridge => $bridge }});
$anvil->nice_exit({exit_code => 5});
}
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0366"});
return(0);
}
# This looks up the disks and optical media connected to this server.
sub validate_storage
{
my ($anvil) = @_;
# When checking on a running server, use 'from_virsh'.
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $target = defined $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} ? $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
target => $target,
}});
my $local_host = $anvil->Get->short_host_name();
my $xml_source = "from_disk";
if ($anvil->data->{server}{$local_host}{$server}{from_virsh}{host_name})
{
$xml_source = "from_virsh";
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
xml_source => $xml_source,
}});
### TODO: If we're called with a status and find an ISO file missing, eject it instead of failing.
###       For now, we just fault out.
# Do the optical discs in the drive exist? If not, we'll eject it if we're about to boot and fail if
# we're about to migrate. We skip this check if we're migrating off or shutting down the server.
if ((exists $anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}) && (not $target) && (not $anvil->data->{switches}{stop}))
{
foreach my $device_target (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}{target}})
{
if ($anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}{target}{$device_target}{path})
{
my $file = $anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}{target}{$device_target}{path};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { file => $file }});
if (not -e $file)
{
# It doesn't exist. Exit with OCF_ERR_INSTALLED (5).
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0398", variables => { file => $file }});
$anvil->nice_exit({exit_code => 5});
}
elsif (not -r $file)
{
# We can't read it. Exit with OCF_ERR_PERM (4).
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0399", variables => { file => $file }});
$anvil->nice_exit({exit_code => 4});
}
else
{
# We're OK.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0400", variables => { file => $file }});
}
}
}
}
# Verify DRBD devices now
validate_storage_drbd($anvil);
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0367"});
return(0);
}
# This makes sure that the needed backing DRBD devices are on this node and, if they are not up, they
# will be brought up. If that fails, it errors out.
sub validate_storage_drbd
{
my ($anvil) = @_;
# Now check storage.
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $xml_source = "from_disk";
my $host = $anvil->Get->short_host_name;
my $local_host = $anvil->Get->short_host_name();
# Did I find a resource for each disk?
foreach my $device_path (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{device}})
{
next if not $device_path;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"server::${local_host}::${server}::device::${device_path}::resource" => $anvil->data->{server}{$local_host}{$server}{device}{$device_path}{resource},
}});
if (not $anvil->data->{server}{$local_host}{$server}{device}{$device_path}{resource})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0414", variables => { drbd_device => $device_path }});
$anvil->nice_exit({exit_code => 5});
}
}
foreach my $device_target (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{disk}{target}})
{
my $drbd_device = $anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{disk}{target}{$device_target}{path};
my $drbd_resource = defined $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{resource} ? $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{resource} : "";
my $on_lv = defined $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{on} ? $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{on} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
host => $host,
drbd_device => $drbd_device,
drbd_resource => $drbd_resource,
on_lv => $on_lv,
}});
if (not $drbd_resource)
{
# See if we can find the resource in the 'by-res' hash.
$drbd_resource = defined $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{resource} ? $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{resource} : "";
$on_lv = defined $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{backing_lv} ? $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{backing_lv} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
drbd_resource => $drbd_resource,
on_lv => $on_lv,
}});
}
# Is the logical volume here and active?
if ((not $on_lv) or (not exists $anvil->data->{lvm}{$local_host}{lv}{$on_lv}))
{
# LV not found
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0374", variables => { drbd_device => $drbd_device, lv_path => $on_lv }});
$anvil->nice_exit({exit_code => 5});
}
elsif (not $anvil->data->{lvm}{$local_host}{lv}{$on_lv}{active})
{
# LV not active. If we're starting the server or we're the migration target, try to
# activate it.
my $active = $anvil->System->activate_lv({debug => 3, path => $on_lv});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active => $active }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, key => "log_0413", variables => { lv_path => $on_lv }});
if (not $active)
{
# Boo :(
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0375", variables => { drbd_device => $drbd_device, lv_path => $on_lv }});
$anvil->nice_exit({exit_code => 5});
}
}
# LV is good if I am still alive.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, key => "log_0376", variables => {
drbd_device => $drbd_device,
lv_path => $on_lv,
}});
}
### NOTE: Checking/managing firewall ports is an expensive operation, so DRBD ports are permanently opened
###       when a resource is created.
return(0);
}
# This verifies that the requested emulator exists and can be used.
sub validate_emulator
{
my ($anvil) = @_;
# What emulator is this using?
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $emulator = $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{emulator};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
emulator => $emulator,
"server::${local_host}::${server}::from_disk::info::emulator" => $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{emulator}
}});
if (not -e $emulator)
{
# It doesn't exist. Exit with OCF_ERR_INSTALLED (5).
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0401", variables => {
emulator => $emulator,
definition_file => $anvil->data->{server}{definition_file},
}});
$anvil->nice_exit({exit_code => 5});
}
if (not -x $emulator)
{
# We can't execute it. Exit with OCF_ERR_PERM (4).
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0402", variables => { emulator => $emulator }});
$anvil->nice_exit({exit_code => 4});
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0364"});
return(0);
}
# This makes sure the name we see in the definition file matches what we expect.
sub validate_name
{
my ($anvil) = @_;
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
"server::${local_host}::${server}::from_disk::info::name" => $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{name},
}});
# If we failed to read the XML, the server probably doesn't exist.
if (not $anvil->data->{server}{$local_host}{$server}{from_disk}{xml})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0403", variables => {
server => $server,
name => $anvil->data->{server}{definition_xml}->{name},
}});
$anvil->nice_exit({exit_code => 1});
}
# Is the name in the definition file what we expect?
if ($server ne $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{name})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0403", variables => {
server => $server,
name => $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{name},
}});
$anvil->nice_exit({exit_code => 1});
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0363"});
return(0);
}
# This checks that there is enough RAM to run this server.
sub validate_ram
{
my ($anvil) = @_;
# How much RAM does the server need and how much do we have free?
my $local_host = $anvil->Get->short_host_name();
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
my $server_ram_bytes = $anvil->data->{server}{$local_host}{$server}{from_disk}{memory};
my $available = $anvil->Get->free_memory({debug => 3});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server_ram_bytes => $anvil->Convert->add_commas({number => $server_ram_bytes})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $server_ram_bytes}).")",
available => $anvil->Convert->add_commas({number => $available})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $available}).")",
}});
if ($server_ram_bytes > $available)
{
# Not enough free memory.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0404", variables => {
name => $server,
ram => $anvil->Convert->bytes_to_human_readable({'bytes' => $server_ram_bytes}),
ram_bytes => $anvil->Convert->add_commas({number => $server_ram_bytes}),
available_ram => $anvil->Convert->bytes_to_human_readable({'bytes' => $available}),
available_ram_bytes => $anvil->Convert->add_commas({number => $available}),
}});
$anvil->nice_exit({exit_code => 1});
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0365"});
return(0);
}
### TODO: Make sure the appropriate SN ports are opened.
# This stops (drbdadm down <server>) the storage for a given server on both nodes.
sub manage_drbd_resource
{
my ($anvil, $task, $resource) = @_;
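### NOTE: This is currently a stub. A minimal sketch, reusing the DRBD->manage_resource()
###       interface called in migrate_server() above, might look like;
###
###   # Down (or up) the resource locally, then repeat against the peer via 'target'.
###   $anvil->DRBD->manage_resource({debug => 2, resource => $resource, task => $task});
###
###       This is an assumption about an eventual implementation, not the implementation itself.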
return(0);
}
# This is meant to read the XML definition data into an XML data hash; it is currently a stub.
sub read_server_definition
{
my ($anvil) = @_;
my $server = $anvil->data->{environment}{OCF_RESKEY_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server => $server,
}});
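### A direct parse sketch, assuming XML::Simple and a '$definition_file' path resolved
### by the caller (in practice, the definition is parsed via Server->get_status());
###   my $xml = XML::Simple->new->XMLin($definition_file, KeyAttr => {}, ForceArray => 1);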
return(0);
}
# This logs the details of this call.
sub show_environment
{
my ($anvil, $level) = @_;
foreach my $key (sort {$a cmp $b} keys %{$anvil->data->{switches}})
{
next if $key eq "raw";
next if $anvil->data->{switches}{$key} eq "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "switches::${key}" => $anvil->data->{switches}{$key} }});
}
foreach my $key (sort {$a cmp $b} keys %{$anvil->data->{environment}})
{
next if $anvil->data->{environment}{$key} eq "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "environment::${key}" => $anvil->data->{environment}{$key} }});
}
foreach my $key (sort {$a cmp $b} keys %ENV)
{
next if exists $anvil->data->{environment}{$key};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "ENV::${key}" => $ENV{$key} }});
}
foreach my $value (@ARGV)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "ARGV" => $value }});
}
return(0);
}
# This is meant to print a quick usage message; for now it simply exits.
sub show_usage
{
my ($anvil) = @_;
### TODO: How to use this...
$anvil->nice_exit({exit_code => 0});
}
# This prints out the metadata and exits.
sub show_metadata
{
my ($anvil) = @_;
# This is a pretty simple agent, by design. We only take a server name for now.
print '<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ocf:alteeve:server" version="0.1">
<version>1.0</version>
<longdesc lang="en">
This resource agent manages KVM+qemu virtual servers on an Anvil! m3 Intelligent Availability(tm) system.
It manages underlying components like DRBD 9 storage resources, bridge connections and so forth.
</longdesc>
<shortdesc lang="en">Anvil! m3 server resource agent</shortdesc>
<parameters>
<parameter name="name" unique="1" required="1">
<longdesc lang="en">
This is the name of the server as reported by virsh.
</longdesc>
<shortdesc lang="en">Server name</shortdesc>
<content type="string"/>
</parameter>
<parameter name="log_level" unique="0" required="0">
<longdesc lang="en">
Set the logging level, valid values are 1 to 3, with 3 being extremely loud.
</longdesc>
<shortdesc lang="en">Set log level</shortdesc>
<content type="integer"/>
</parameter>
<parameter name="log_secure" unique="0" required="0">
<longdesc lang="en">
Enable logging of potentially sensitive data, like passwords.
</longdesc>
<shortdesc lang="en">Log secure data.</shortdesc>
<content type="integer"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="60" />
<action name="stop" timeout="60" />
<action name="monitor" timeout="60" />
<action name="notify" timeout="60" />
<action name="migrate_to" timeout="600" />
<action name="migrate_from" timeout="600" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="60" />
</actions>
</resource-agent>
';
$anvil->nice_exit({exit_code => 0});
}
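### For reference, a resource using this agent is typically created with something
### like the following (a sketch; exact options vary by deployment);
###
###   pcs resource create <server> ocf:alteeve:server name="<server>" \
###       meta allow-migrate="true" op monitor interval="60"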