From 4e5dc9f1c2d52327a878b64cde1cd8be76cd9ed0 Mon Sep 17 00:00:00 2001 From: Digimer Date: Tue, 20 Feb 2018 02:14:59 -0500 Subject: [PATCH] * Started work on migration handling. * Fixed a bug where a stop operation on a server already in shutdown would exit immediately instead of waiting for the server to actually shut off. Signed-off-by: Digimer --- notes | 25 +++++++++- ocf/alteeve/server | 119 ++++++++++++++++++++++++++++++--------------- 2 files changed, 103 insertions(+), 41 deletions(-) diff --git a/notes b/notes index f6f96f72..c6e63e39 100644 --- a/notes +++ b/notes @@ -409,13 +409,34 @@ resource srv01-c7_0 { # Provision servers mkdir /mnt/anvil/{provision,files,archive,definitions} -pcs resource create srv01-c7 ocf:heartbeat:VirtualDomain hypervisor="qemu:///system" config="/mnt/anvil/definitions/srv01-c7.xml" meta allow-migrate="true" op monitor interval="10" +pcs resource create srv01-c7 ocf:alteeve:server hypervisor="qemu:///system" config="/mnt/anvil/definitions/srv01-c7.xml" meta allow-migrate="true" op monitor interval="10" op stop timeout="60" on-fail="block" meta allow-migrate="true" failure-timeout="75" +pcs resource create srv01-c7 ocf:alteeve:server hypervisor="qemu:///system" config="/mnt/anvil/definitions/srv01-c7.xml" meta allow-migrate="true" op monitor interval="10" op on-fail="block" meta allow-migrate="true" failure-timeout="75" + +pcs resource create srv01-c7 ocf:alteeve:server name="srv01-c7" meta allow-migrate="true" op monitor interval="10" op stop on-fail="block" meta allow-migrate="true" failure-timeout="75" == Resource Agent; https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc * A resource agent receives all configuration information about the resource it manages via environment variables. The names of these environment variables are always the name of the resource parameter, prefixed with OCF_RESKEY_. 
For example, if the resource has an ip parameter set to 192.168.1.1, then the resource agent will have access to an environment variable OCF_RESKEY_ip holding that value. -* +=== + +When stopping a server; +14:03 < lge> "on-fail: block" +14:03 < lge> is per operation type. +14:08 < lge> anyways, you can also "on-fail: retry" + +OK, set the stop timeout to 60, set 'on-fail: block' and set the failure-timeout to 60 and see how pacemaker reacts. +failure-timeout + +=== + +Migrate servers; + +- Let ScanCore set 'node-health' attribute (http://clusterlabs.org/pacemaker/doc/en-US/Pacemaker/1.1/html-single/Pacemaker_Explained/index.html#s-node-health) +- Set 'migration-limit' to '1' to enforce serial live migration (http://clusterlabs.org/pacemaker/doc/en-US/Pacemaker/1.1/html-single/Pacemaker_Explained/index.html#s-cluster-options). + +Migrate a single server by setting a location constraint against the node we want the VM off of. +- diff --git a/ocf/alteeve/server b/ocf/alteeve/server index 91385324..37d2375d 100755 --- a/ocf/alteeve/server +++ b/ocf/alteeve/server @@ -116,24 +116,28 @@ my $conf = { }, environment => { # This is the name of the server we're managing. # Example values: - OCF_RESKEY_name => defined $ENV{OCF_RESKEY_name} ? $ENV{OCF_RESKEY_name} : "", # srv01-c7 + OCF_RESKEY_name => defined $ENV{OCF_RESKEY_name} ? $ENV{OCF_RESKEY_name} : "", # srv01-c7 # This is our node name - OCF_RESKEY_CRM_meta_on_node => defined $ENV{OCF_RESKEY_CRM_meta_on_node} ? $ENV{OCF_RESKEY_CRM_meta_on_node} : "", # m3-a02n01.alteeve.com + OCF_RESKEY_CRM_meta_on_node => defined $ENV{OCF_RESKEY_CRM_meta_on_node} ? $ENV{OCF_RESKEY_CRM_meta_on_node} : "", # m3-a02n01.alteeve.com # This says "UUID", but it's the node ID. - OCF_RESKEY_CRM_meta_on_node_uuid => defined $ENV{OCF_RESKEY_CRM_meta_on_node_uuid} ?
$ENV{OCF_RESKEY_CRM_meta_on_node_uuid} : "", # 1 # This is the timeout for the called action in millisecond. - OCF_RESKEY_CRM_meta_timeout => defined $ENV{OCF_RESKEY_CRM_meta_timeout} ? $ENV{OCF_RESKEY_CRM_meta_timeout} : "", # 20000 + OCF_RESKEY_CRM_meta_timeout => defined $ENV{OCF_RESKEY_CRM_meta_timeout} ? $ENV{OCF_RESKEY_CRM_meta_timeout} : "", # 20000 # If this is set, we'll bump our log level as well. - PCMK_debug => defined $ENV{PCMK_debug} ? $ENV{PCMK_debug} : "", # 0 + PCMK_debug => defined $ENV{PCMK_debug} ? $ENV{PCMK_debug} : "", # 0 # These are other variables that are set, but we don't currently care about them - OCF_EXIT_REASON_PREFIX => defined $ENV{OCF_EXIT_REASON_PREFIX} ? $ENV{OCF_EXIT_REASON_PREFIX} : "", # ocf-exit-reason: - OCF_RA_VERSION_MAJOR => defined $ENV{OCF_RA_VERSION_MAJOR} ? $ENV{OCF_RA_VERSION_MAJOR} : "", # 1 - OCF_RA_VERSION_MINOR => defined $ENV{OCF_RA_VERSION_MINOR} ? $ENV{OCF_RA_VERSION_MINOR} : "", # 0 - OCF_RESKEY_crm_feature_set => defined $ENV{OCF_RESKEY_crm_feature_set} ? $ENV{OCF_RESKEY_crm_feature_set} : "", # 3.0.12 - OCF_RESOURCE_INSTANCE => defined $ENV{OCF_RESOURCE_INSTANCE} ? $ENV{OCF_RESOURCE_INSTANCE} : "", # srv01-c7 - OCF_RESOURCE_PROVIDER => defined $ENV{OCF_RESOURCE_PROVIDER} ? $ENV{OCF_RESOURCE_PROVIDER} : "", # alteeve - OCF_RESOURCE_TYPE => defined $ENV{OCF_RESOURCE_TYPE} ? $ENV{OCF_RESOURCE_TYPE} : "", # server - OCF_ROOT => defined $ENV{OCF_ROOT} ? $ENV{OCF_ROOT} : "", # /usr/lib/ocf + OCF_EXIT_REASON_PREFIX => defined $ENV{OCF_EXIT_REASON_PREFIX} ? $ENV{OCF_EXIT_REASON_PREFIX} : "", # ocf-exit-reason: + OCF_RA_VERSION_MAJOR => defined $ENV{OCF_RA_VERSION_MAJOR} ? $ENV{OCF_RA_VERSION_MAJOR} : "", # 1 + OCF_RA_VERSION_MINOR => defined $ENV{OCF_RA_VERSION_MINOR} ? $ENV{OCF_RA_VERSION_MINOR} : "", # 0 + OCF_RESKEY_crm_feature_set => defined $ENV{OCF_RESKEY_crm_feature_set} ? $ENV{OCF_RESKEY_crm_feature_set} : "", # 3.0.12 + OCF_RESOURCE_INSTANCE => defined $ENV{OCF_RESOURCE_INSTANCE} ? 
$ENV{OCF_RESOURCE_INSTANCE} : "", # srv01-c7 + OCF_RESOURCE_PROVIDER => defined $ENV{OCF_RESOURCE_PROVIDER} ? $ENV{OCF_RESOURCE_PROVIDER} : "", # alteeve + OCF_RESOURCE_TYPE => defined $ENV{OCF_RESOURCE_TYPE} ? $ENV{OCF_RESOURCE_TYPE} : "", # server + OCF_ROOT => defined $ENV{OCF_ROOT} ? $ENV{OCF_ROOT} : "", # /usr/lib/ocf + # These are set during a migration + OCF_RESKEY_CRM_meta_migrate_source => defined $ENV{OCF_RESKEY_CRM_meta_migrate_source} ? $ENV{OCF_RESKEY_CRM_meta_migrate_source} : "", # m3-a02n01.alteeve.com + OCF_RESKEY_CRM_meta_migrate_target => defined $ENV{OCF_RESKEY_CRM_meta_migrate_target} ? $ENV{OCF_RESKEY_CRM_meta_migrate_target} : "", # m3-a02n02.alteeve.com + OCF_RESKEY_CRM_meta_record_pending => defined $ENV{OCF_RESKEY_CRM_meta_record_pending} ? $ENV{OCF_RESKEY_CRM_meta_record_pending} : "", # true }, }; @@ -152,16 +156,25 @@ get_switches($conf); ### TEST: to be removed later if ($conf->{switches}{test}) { - $conf->{environment}{OCF_RESKEY_name} = "srv01-c7"; - $conf->{environment}{OCF_RESKEY_CRM_meta_on_node} = "m3-a02n01.alteeve.com"; - $conf->{environment}{OCF_RESKEY_CRM_meta_timeout} = 20000; + $conf->{environment}{OCF_RESKEY_name} = "srv01-c7"; + $conf->{environment}{OCF_RESKEY_CRM_meta_on_node} = "m3-a02n01.alteeve.com"; + $conf->{environment}{OCF_RESKEY_CRM_meta_timeout} = 20000; + $conf->{environment}{OCF_RESKEY_CRM_meta_migrate_source} = "m3-a02n01.alteeve.com"; + $conf->{environment}{OCF_RESKEY_CRM_meta_migrate_target} = "m3-a02n02.alteeve.com"; } # Something for the logs to_log($conf, {message => "ocf:alteeve:server invoked.", 'line' => __LINE__, level => 2}); # This is for debugging. -show_environment($conf, 2); +if (($conf->{switches}{monitor}) or ($conf->{switches}{status})) +{ + show_environment($conf, 3); +} +else +{ + show_environment($conf, 2); +} ### What are we being asked to do? # start  - Starts the resource. 
@@ -207,7 +220,7 @@ elsif ($conf->{switches}{demote}) to_log($conf, {message => "We were asked to demote: [".$conf->{environment}{OCF_RESKEY_name}."], which makes no sense and is not supported. Ignoreing.", 'line' => __LINE__, level => 0, priority => "err"}); exit(3); } -elsif (($conf->{switches}{migrate_to}) && ($conf->{switches}{migrate_from})) +elsif (($conf->{switches}{migrate_to}) or ($conf->{switches}{migrate_from})) { # We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3) migrate_server($conf); @@ -218,7 +231,7 @@ elsif ($conf->{switches}{'validate-all'}) validate_all($conf); exit(0); } -elsif (($conf->{switches}{help}) && ($conf->{switches}{usage})) +elsif (($conf->{switches}{help}) or ($conf->{switches}{usage})) { # Show the usage information show_usage($conf); @@ -369,7 +382,8 @@ sub stop_server exit(1); } - my $found = 0; + my $shutdown = 1; + my $found = 0; foreach my $line (split/\n/, $output) { $line =~ s/^\s+//; @@ -415,10 +429,16 @@ sub stop_server to_log($conf, {message => "Pausing for half a minute to give the server time to wake up.", 'line' => __LINE__, level => 2}); sleep 30; } - elsif (($state eq "in shutdown") or ($state eq "shut off")) + elsif ($state eq "in shutdown") + { + # The server is already shutting down + to_log($conf, {message => "The server: [$server] is already shutting down. We'll monitor it until it actually shuts off.", 'line' => __LINE__, level => 2}); + $shutdown = 0; + } + elsif ($state eq "shut off") { # The server is already shutting down - to_log($conf, {message => "The server: [$server] is already shutting down.", 'line' => __LINE__, level => 2}); + to_log($conf, {message => "The server: [$server] is already off.", 'line' => __LINE__, level => 2}); exit(0); } elsif (($state eq "idle") or ($state eq "crashed")) @@ -455,15 +475,16 @@ sub stop_server } # If we're alive, it is time to stop the server - $return_code = undef; - $output = undef; - ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." 
shutdown $server"); - to_log($conf, {message => "Asking the server: [$server] to shut down now. Please be patient.", 'line' => __LINE__, level => 1}); - if ($return_code) + if ($shutdown) { - # Looks like virsh isn't running. - to_log($conf, {message => "The attempt to shut down the server: [$server] returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"}); - exit(1); + my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." shutdown $server"); + to_log($conf, {message => "Asking the server: [$server] to shut down now. Please be patient.", 'line' => __LINE__, level => 1}); + if ($return_code) + { + # Looks like virsh isn't running. + to_log($conf, {message => "The attempt to shut down the server: [$server] returned a non-zero return code: [$return_code]. The output, if any, was: [$output].", 'line' => __LINE__, level => 0, priority => "err"}); + exit(1); + } } # Now loop until we see the server either vanish from virsh or enter "shut off" state. We wait @@ -538,17 +559,17 @@ sub server_status my $current_time = time; my $timeout = $current_time + int(($conf->{environment}{OCF_RESKEY_CRM_meta_timeout} /= 1000) / 2); my $waiting = 1; - to_log($conf, {message => "current_time: [$current_time], timeout: [$timeout].", 'line' => __LINE__, level => 2}); + to_log($conf, {message => "current_time: [$current_time], timeout: [$timeout].", 'line' => __LINE__, level => 3}); while($waiting) { # Make the call ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." 
list"); - to_log($conf, {message => "return_code: [$return_code].", 'line' => __LINE__, level => 2}); + to_log($conf, {message => "return_code: [$return_code].", 'line' => __LINE__, level => 3}); if (not $return_code) { $waiting = 0; - to_log($conf, {message => "waiting: [$waiting].", 'line' => __LINE__, level => 2}); + to_log($conf, {message => "waiting: [$waiting].", 'line' => __LINE__, level => 3}); } elsif (time > $timeout) { @@ -558,7 +579,7 @@ sub server_status } else { - to_log($conf, {message => "The 'virsh' call exited with the return code: [$return_code]. The 'libvirtd' service might be starting, so we will check again shortly.", 'line' => __LINE__, level => 2}); + to_log($conf, {message => "The 'virsh' call exited with the return code: [$return_code]. The 'libvirtd' service might be starting, so we will check again shortly.", 'line' => __LINE__, level => 3}); sleep 2; } } @@ -585,7 +606,7 @@ sub server_status if ($line =~ /^(\d+) $server (.*)$/) { $state = $2; - to_log($conf, {message => "server: [$server], state: [$state]", 'line' => __LINE__, level => 2}); + to_log($conf, {message => "server: [$server], state: [$state]", 'line' => __LINE__, level => 3}); last; } @@ -633,8 +654,28 @@ sub migrate_server { my ($conf) = @_; - # If we were given 'migrate_to', then just verify that the node name makes sense. If we were given - # 'migrate_from', we need to find the peer. + # If we were given 'migrate_to', we need to make sure the storage is UpToDate on the peer for all + # backing resources. We can't check the target's bridges, but the migration will fail if one is + # missing. + # If we're given 'migrate_from', we're pulling the server towards us, so we can check both bridges + # and storage.
+ my $server = $conf->{environment}{OCF_RESKEY_name}; + my $source = $conf->{environment}{OCF_RESKEY_CRM_meta_migrate_source}; + my $target = $conf->{environment}{OCF_RESKEY_CRM_meta_migrate_target}; + to_log($conf, {message => "server: [$server], source: [$source], target: [$target].", 'line' => __LINE__, level => 1}); + if ($conf->{switches}{migrate_to}) + { + to_log($conf, {message => "We're pushing the: [$server] to: [$target].", 'line' => __LINE__, level => 1}); + validate_all($conf); + } + elsif ($conf->{switches}{migrate_from}) + { + to_log($conf, {message => "We're pulling the: [$server] from: [$target].", 'line' => __LINE__, level => 1}); + } + else + { + # WTF? + } # Return failed until this is actually implemented. exit(1); @@ -1326,7 +1367,7 @@ sub show_environment foreach my $key (sort {$a cmp $b} keys %ENV) { next if exists $conf->{environment}{$key}; - to_log($conf, {message => "System Environment variable: [$key] -> [".$ENV{$key}."]", 'line' => __LINE__, level => ($level + 1)}); + to_log($conf, {message => "System Environment variable: [$key] -> [".$ENV{$key}."]", 'line' => __LINE__, level => $level}); } return(0); @@ -1368,7 +1409,7 @@ It manages underlying components like DRBD 9 storage resources, brodge connectio - +