my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." create $definition_file");
if ($return_code)
{
# If this fails, we want to exit with OCF_ERR_CONFIGURED (6) so that pacemaker doesn't try to
# also start the server on another node, because we don't know the state of it here.
to_log($conf, {message => "All tests passed, yet the attempt to boot the server: [$server] exited with a non-zero return code: [$return_code]. The server is in an unknown state, so exiting with a fatal error. Human intervention is now required. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
# If this fails, we want to exit with OCF_ERR_CONFIGURED (6) so that pacemaker doesn't try to
# also start the server on another node, because we don't know the state of it here.
to_log($conf, {message => "It appears that the call to boot the server: [$server] worked, but the call to list running servers exited with a non-zero return code: [$return_code]. The server is in an unknown state, so exiting with a fatal error. Human intervention is now required. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
to_log($conf, {message => "The server: [$server] has started successfully.", 'line' => __LINE__, level => 2});
exit(0);
}
else
{
# The server is in a state we did not expect after booting it; log it and exit with 6.
to_log($conf, {message => "The server: [$server] should have been started, but it's state is: [$state]. Human intervention is required!", 'line' => __LINE__, level => 1, priority => "err"});
exit(6);
}
last;
}
}
# If we're still alive, then we didn't see the server in the list of running servers, which is really weird.
to_log($conf, {message => "The server: [$server] should have been started, but it wasn't found in the list of running servers.", 'line' => __LINE__, level => 1, priority => "err"});
exit(1);
}
# This shuts down the server if possible.
@ -280,6 +359,159 @@ sub stop_server
{
my ($conf) = @_;
# Stopping the server is simply a question of "is the server running?" and, if so, stop it.
my $server = $conf->{environment}{OCF_RESKEY_name};
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." list");
if ($return_code)
{
# Looks like virsh isn't running.
to_log($conf, {message => "The attempt to list the running servers returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
to_log($conf, {message => "The server: [$server] is running. We will ask it to shut down now.", 'line' => __LINE__, level => 2});
}
elsif ($state eq "paused")
{
# The server is paused. Resume it, wait a few seconds, then proceed with the shutdown.
to_log($conf, {message => "The server: [$server] is paused. Resuming it now so that it can react to the shutdown request.", 'line' => __LINE__, level => 2});
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." resume $server");
if ($return_code)
{
# The resume call failed; log the error and exit non-zero.
to_log($conf, {message => "The attempt to resume the server: [$server] returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
to_log($conf, {message => "Pausing for a moment to give the server time to resume.", 'line' => __LINE__, level => 2});
sleep 3;
}
elsif ($state eq "pmsuspended")
{
# The server is asleep (pmsuspended). Wake it, wait for it to come up, then proceed with the shutdown.
to_log($conf, {message => "The server: [$server] is asleep. Waking it now so that it can react to the shutdown request.", 'line' => __LINE__, level => 2});
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." dompmwakeup $server");
if ($return_code)
{
# The wake-up call failed; log the error and exit non-zero.
to_log($conf, {message => "The attempt to wake the server: [$server] returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
to_log($conf, {message => "Pausing for half a minute to give the server time to wake up.", 'line' => __LINE__, level => 2});
sleep 30;
}
elsif (($state eq "in shutdown") or ($state eq "shut off"))
{
# The server is already shutting down, or is already shut off; nothing to do.
to_log($conf, {message => "The server: [$server] is already shutting down.", 'line' => __LINE__, level => 2});
exit(0);
}
elsif (($state eq "idle") or ($state eq "crashed"))
{
# The server needs to be destroyed.
to_log($conf, {message => "The server: [$server] is hung. Its state is: [$state]. We will force it off now.", 'line' => __LINE__, level => 2});
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." destroy $server");
if ($return_code)
{
# The force-off (destroy) call failed; log the error and exit non-zero.
to_log($conf, {message => "The attempt to force-off the server: [$server] returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
to_log($conf, {message => "The server: [$server] is now off.", 'line' => __LINE__, level => 2});
exit(0);
}
else
{
# The server is in a state we don't know how to handle; log it and exit with 6.
to_log($conf, {message => "The server: [$server] is running, but it is in an unexpected state: [$state]. Human intervention is required!", 'line' => __LINE__, level => 1, priority => "err"});
exit(6);
}
last;
}
}
# If we didn't see it, it's off and undefined.
if (not $found)
{
to_log($conf, {message => "The server: [$server] was not listed on this node, so it is not running here.", 'line' => __LINE__, level => 2});
to_log($conf, {message => "Asking the server: [$server] to shut down now. Please be patient.", 'line' => __LINE__, level => 1});
if ($return_code)
{
# The shutdown request failed; log the error and exit non-zero.
to_log($conf, {message => "The attempt to shut down the server: [$server] returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
# Now loop until we see the server either vanish from virsh or enter "shut off" state. We wait
# forever and let pacemaker kill us if we time out.
while (1)
{
my $found = 0;
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." list");
if ($return_code)
{
# Looks like virsh isn't running.
to_log($conf, {message => "The attempt to list the running servers returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
to_log($conf, {message => "The server: [$server] is now off.", 'line' => __LINE__, level => 2});
exit(0);
}
last;
}
}
# If we didn't find the server, it's off and undefined now.
if (not $found)
{
to_log($conf, {message => "The server: [$server] is no longer listed. It is now off.", 'line' => __LINE__, level => 2});
exit(0);
}
to_log($conf, {message => "The server: [$server] is not off yet, waiting a few seconds and then we'll check again.", 'line' => __LINE__, level => 2});
sleep 5;
}
exit(0);
}
@ -362,27 +594,8 @@ sub server_status
# If there is a state, see what the state is.
if ($state)
{
### What is the state?
## States we return OCF_SUCCESS (0).
# running - The domain is currently running on a CPU
# paused - The domain has been paused, usually occurring through the administrator
# running virsh suspend. When in a paused state the domain will still consume
# allocated resources like memory, but will not be eligible for scheduling by
# the hypervisor.
# pmsuspended - The domain has been suspended by guest power management, e.g. entered into s3
# state.
# in shutdown - The domain is in the process of shutting down, i.e. the guest operating
# system has been notified and should be in the process of stopping its
# operations gracefully.
## States we'll return OCF_NOT_RUNNING (7).
# shut off - The domain is not running. Usually this indicates the domain has been shut
# down completely, or has not been started.
## States we'll return OCF_ERR_GENERIC (1).
# idle - The domain is idle, and not running or runnable. This can be caused because
# the domain is waiting on IO (a traditional wait state) or has gone to sleep
# because there was nothing else for it to do.
# crashed - The domain has crashed, which is always a violent ending. Usually this state
# can only occur if the domain has been configured not to restart on crash.
# What is the state?
# (See the comment below the 'FUNCTIONS' divider above the first function for a full list of states.)
if (($state eq "running") or ($state eq "paused") or ($state eq "pmsuspended") or ($state eq "in shutdown"))
{
to_log($conf, {message => "The server: [$server] is: [$state], which is OK.", 'line' => __LINE__, level => 1});
@ -461,7 +674,7 @@ sub validate_all
validate_storage($conf);
to_log($conf, {message => "- Storage is valid and ready.", 'line' => __LINE__, level => 2});
exit(0);
return(0);
}
# This ensures that the bridges the server connects to exist on this node.
@ -715,11 +928,13 @@ sub validate_storage_drbd
}
# Give them a few seconds to start.
to_log($conf, {message => "Pausing briefly to give the resources time to start.", 'line' => __LINE__, level => 0});
sleep 3;
# Check DRBD setup again
$return_code = undef;
$status_json = undef;
to_log($conf, {message => "Checking the DRBD status again.", 'line' => __LINE__, level => 0});
($return_code, $status_json) = shell_call($conf, $conf->{path}{exe}{drbdsetup}." status --json");
if ($return_code)
{
@ -746,7 +961,7 @@ sub validate_storage_drbd
if ($conf->{server}{disks}{$device_path} eq "check")
{
# Failed to see it, see if we can bring it up.
my $check_again = 1;
$check_again = 1;
my $resource = $conf->{device_path}{$device_path}{resource};
to_log($conf, {message => "The DRBD resource: [$resource] backing the device: [$device_path] was not seen in the 'drbdsetup' status data. Attempting to bringing it up now.", 'line' => __LINE__, level => 2});
@ -759,22 +974,28 @@ sub validate_storage_drbd
}
}
# Give the resource a few seconds to start.
sleep 3;
# Check again.
$return_code = undef;
$status_json = undef;
($return_code, $status_json) = shell_call($conf, $conf->{path}{exe}{drbdsetup}." status --json");
to_log($conf, {message => "The attempt to read the DRBD status after bringing up the resource(s) for this server returned a non-zero code: [$return_code]. The returned output (if any) was: [$status_json].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
# Give the resource a few seconds to start.
to_log($conf, {message => "Pausing briefly to give the resources time to start.", 'line' => __LINE__, level => 2});
sleep 3;
# Check again.
$return_code = undef;
$status_json = undef;
to_log($conf, {message => "Checking the DRBD status again.", 'line' => __LINE__, level => 2});
($return_code, $status_json) = shell_call($conf, $conf->{path}{exe}{drbdsetup}." status --json");
if ($return_code)
{
# Something went wrong.
to_log($conf, {message => "The attempt to read the DRBD status after bringing up the resource(s) for this server returned a non-zero code: [$return_code]. The returned output (if any) was: [$status_json].", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
# Check again.
check_drbd_status($conf, $status_json);
}
# Check again.
check_drbd_status($conf, $status_json);
}
# Do I need to check again?
@ -1362,7 +1583,7 @@ sub find_executables
{
if ( not -e $conf->{path}{exe}{$exe} )
{
to_log($conf, {message => "The program: [$exe] is not at: [".$conf->{path}{exe}{$exe}."]. Looking for it now..", 'line' => __LINE__, level => 1});
to_log($conf, {message => "The program: [$exe] is not at: [".$conf->{path}{exe}{$exe}."]. Looking for it now.", 'line' => __LINE__, level => 1});