pcs resource create srv01-c7 ocf:alteeve:server hypervisor="qemu:///system" config="/mnt/anvil/definitions/srv01-c7.xml" meta allow-migrate="true" op monitor interval="10" op stop timeout="60" on-fail="block" meta allow-migrate="true" failure-timeout="75"
pcs resource create srv01-c7 ocf:alteeve:server hypervisor="qemu:///system" config="/mnt/anvil/definitions/srv01-c7.xml" meta allow-migrate="true" op monitor interval="10" op on-fail="block" meta allow-migrate="true" failure-timeout="75"
pcs resource create srv01-c7 ocf:alteeve:server name="srv01-c7" meta allow-migrate="true" op monitor interval="10" op stop on-fail="block" meta allow-migrate="true" failure-timeout="75"
pcs resource create srv01-c7 ocf:alteeve:server name="srv01-c7" meta allow-migrate="true" op monitor interval="60" op stop on-fail="block" op migrate_to on-fail="block" op migrate_from on-fail="block" meta allow-migrate="true" failure-timeout="75"
* A resource agent receives all configuration information about the resource it manages via environment variables. The names of these environment variables are always the name of the resource parameter, prefixed with OCF_RESKEY_. For example, if the resource has an ip parameter set to 192.168.1.1, then the resource agent will have access to an environment variable OCF_RESKEY_ip holding that value.
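So a by-hand test of this agent would look something like the following (assuming the agent is installed in the usual OCF location for 'ocf:alteeve:server'; pacemaker exports these variables itself before calling the agent):

OCF_RESKEY_name="srv01-c7" \
OCF_RESKEY_hypervisor="qemu:///system" \
OCF_RESKEY_config="/mnt/anvil/definitions/srv01-c7.xml" \
/usr/lib/ocf/resource.d/alteeve/server monitor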
@@ -438,5 +453,29 @@ Migrate servers;
- Set 'migration-limit' to '1' to enforce serial live migration (http://clusterlabs.org/pacemaker/doc/en-US/Pacemaker/1.1/html-single/Pacemaker_Explained/index.html#s-cluster-options).
Migrate a single server by setting a location constraint against the node we want the VM off of.
- If anything goes wrong, the server will enter a blocked state in pacemaker.
- Recovery needs to be 'unmanage -> clean' to avoid a stop call.
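Example commands for the notes above (node names here are placeholders):

pcs property set migration-limit=1
pcs constraint location srv01-c7 avoids node1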
11:57 <@kgaillot> for your design, that sounds right. between cleanup and manage, i'd make sure there was a PE run without any pending actions blocked by the unmanaging -- you can either look at the logs on the DC, run "crm_simulate -SL", or just check the status for a bit
11:58 <@kgaillot> you can play around with it by putting a higher preference on the to-be-cleaned node, to make sure it *does* move when you re-manage. that way you can see what logs/simulate/status look like
12:07 <@kgaillot> i'm thinking if you do crm_resource --reprobe instead of cleanup in the above sequence, that should prevent anything unexpected
12:07 <@kgaillot> unmanage -> adjust preferences if needed -> reprobe resource -> wait for probe results to come back in, and if status looks good -> re-manage
12:08 <@kgaillot> the reprobe will wipe the entire resource history and fail counts for the resource, causing pacemaker to recheck the current status on all nodes. if the status then shows the resource running where you expect/want it, with no errors, then it's not going to do anything further
12:09 <@kgaillot> (in 2.0, cleanup only erases the history where the resource has failed, while reprobe erases the history regardless)
12:13 <@kgaillot> if there are no failures in the resource history, there should be no risk of a full stop. if there is no resource history at all, then after reprobe, there should be no risk of any actions (assuming you've set up location preferences and stickiness how you want them)
Recover from a failed migration;
reset location to prefer current host -> unmanage resource -> cleanup resource -> manage resource
(running on node 2, so re-add location constraint - basically, make sure location constraint favours current host)
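Rough sketch of that recovery sequence with pcs (resource and node names are placeholders; swap the cleanup step for 'crm_resource --reprobe' if following kgaillot's suggestion above):

pcs constraint location srv01-c7 prefers node2
pcs resource unmanage srv01-c7
pcs resource cleanup srv01-c7
crm_simulate -SL
pcs resource manage srv01-c7

The 'crm_simulate -SL' (or simply watching the status) is only there to confirm no actions are pending before the resource is re-managed.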
if (($conf->{switches}{monitor}) or
    ($conf->{switches}{status}) or
    ($conf->{switches}{'meta-data'}) or
    ($conf->{switches}{metadata}))
{
	show_environment($conf, 3);
}
@@ -304,6 +307,37 @@ sub start_server
my $server = $conf->{environment}{OCF_RESKEY_name};
to_log($conf, {message => "We've been asked to start the server: [$server].", 'line' => __LINE__, level => 2});
# If the server is already here, we'll do nothing else.
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." list");
if ($return_code)
{
# If this fails, we want to exit with OCF_ERR_CONFIGURED (6) so that pacemaker doesn't try to
# also start the server on another node, because we don't know the state of it here.
to_log($conf, {message => "It appears that the list the currently running servers returned a non-zero return code: [$return_code]. We will proceed as we may be able to fix this. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
to_log($conf, {message => "The server: [$server] is already on this node in the state: [$state], aborting the start request.", 'line' => __LINE__, level => 2});
my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." list");
if ($return_code)
{
to_log($conf, {message => "It appears that the call to check if the server: [$server] is on this node returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
to_log($conf, {message => "The server: [$server] state is: [$state]. A server must be 'running' in order to migrate it.", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
}
}
if (not $found)
{
to_log($conf, {message => "The server: [$server] wasn't found on this machine.", 'line' => __LINE__, level => 0, priority => "err"});
to_log($conf, {message => "Temporarily enabling dual primary for the resource: [$resource] to the node: [".$conf->{resource}{$resource}{target_name}." (".$conf->{resource}{$resource}{target_node_id}."].", 'line' => __LINE__, level => 1});
my ($return_code, $output) = shell_call($conf, $shell_call);
if ($return_code)
{
# Something went wrong.
to_log($conf, {message => "The attempt to enable dual-primary for the resource: [$resource] to the node: [".$conf->{resource}{$resource}{target_name}." (".$conf->{resource}{$resource}{target_node_id}.")] returned a non-zero return code [$return_code]. The returned output (if any) was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
# Disable migration (and any further attempts to enable dual-primary).
my ($return_code, $output) = shell_call($conf, $migration_command);
if ($return_code)
{
# Something went wrong.
to_log($conf, {message => "The attempt to migrate the server: [$server] to the node: [$target] returned a non-zero return code [$return_code]. The returned output (if any) was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
to_log($conf, {message => "Re-disabling dual primary for the resource: [$resource] to the node: [".$conf->{resource}{$resource}{target_name}." (".$conf->{resource}{$resource}{target_node_id}."].", 'line' => __LINE__, level => 1});
my ($return_code, $output) = shell_call($conf, $shell_call);
if ($return_code)
{
# Something went wrong.
to_log($conf, {message => "The attempt to re-disable dual-primary for the resource: [$resource] to the node: [".$conf->{resource}{$resource}{target_name}." (".$conf->{resource}{$resource}{target_node_id}.")] returned a non-zero return code [$return_code]. The returned output (if any) was: [$output].", 'line' => __LINE__, level => 0, priority => "err"});
}
}
# Did something go wrong during the dual-primary enable or the actual migration call?
if ((not $migrate) or (not $migrated))
{
# Exit
exit(1);
}
# If we made it here, we succeeded.
exit(0);
}
}
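For context, the commands wrapped by $shell_call and $migration_command are presumably along these lines (a sketch of the general approach only, not copied from the agent; the DRBD resource name and target node are placeholders):

drbdadm net-options --allow-two-primaries=yes srv01-c7
virsh -c qemu:///system migrate --live srv01-c7 qemu+ssh://node2/system
drbdadm net-options --allow-two-primaries=no srv01-c7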
# Validation checks that we have the definition XML, resource config and that needed apps are installed.
# These tests are only needed if we're about to boot the server (or receive it via live migration).
if (($conf->{switches}{start}) or ($conf->{switches}{migrate_from}))
{
# Check that we have enough RAM.
validate_ram($conf);
@@ -808,8 +954,11 @@ sub validate_storage
}
}
# Verify optical disks now, unless we're migrating a server off of us.
if (not $conf->{switches}{migrate_to})
{
validate_storage_optical($conf);
}
# Verify DRBD devices now
validate_storage_drbd($conf);
@@ -1054,6 +1203,15 @@ sub validate_storage_drbd
}
}
# If I am about to push a server off, we need to make sure the peer is UpToDate
if ($conf->{switches}{migrate_to})
{
to_log($conf, {message => "Checking that the peer's DRBD resources are Connected and UpToDate prior to migration.", 'line' => __LINE__, level => 2});
foreach my $device_path (sort {$a cmp $b} keys %{$conf->{server}{disks}})
{
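# (Loop body not shown in this excerpt; presumably the per-device peer disk-state
# check, i.e. the Connected / UpToDate test that check_drbd_status performs below.)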
}
}
return(0);
}
@@ -1076,29 +1234,31 @@ sub check_drbd_status
### This disk is in use by this server, check it.
to_log($conf, {message => "The local replicated disk: [$device_path] is used by this server. Checking it out now.", 'line' => __LINE__, level => 2});
# If we're booting a server or migrating it here, we need to make sure all local
# volumes are UpToDate.
if (($conf->{switches}{start}) or ($conf->{switches}{migrate_from}))
{
foreach my $device_ref (@{$resource_ref->{devices}})
{
# Are we UpToDate (or SyncSource)?
if ((lc($device_ref->{'disk-state'}) ne "uptodate") && (lc($device_ref->{'disk-state'}) ne "syncsource"))
{
to_log($conf, {message => "The DRBD resource: [$resource] volume: [".$device_ref->{volume}."] local disk state is: [".$device_ref->{'disk-state'}."]. Unsafe to boot the server unless the disk state is UpToDate.", 'line' => __LINE__, level => 0, priority => "err"});
exit(1);
}
else
{
to_log($conf, {message => "The DRBD resource: [$resource] volume: [".$device_ref->{volume}."] local disk state is: [".$device_ref->{'disk-state'}."], good.", 'line' => __LINE__, level => 2});
}
}
}
# If we're booting a server, we need to be sure that *no* peer is Primary. If we're
# migrating, we need to be sure the migration target is UpToDate.
foreach my $connection_ref (@{$resource_ref->{connections}})
{
# Is the peer's role Primary? In all cases, we abort if so.
if ((lc($volume_ref->{'peer-disk-state'}) ne "uptodate") && (lc($volume_ref->{'peer-disk-state'}) ne "syncsource"))
{
to_log($conf, {message => "The DRBD resource: [$resource] on the peer: [".$connection_ref->{name}."] is not UpToDate (or SyncSource). Refusing to migrate.", 'line' => __LINE__, level => 0, priority => "err"});