diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm
index 95b5b3d4..120d31aa 100644
--- a/Anvil/Tools/Cluster.pm
+++ b/Anvil/Tools/Cluster.pm
@@ -27,6 +27,7 @@ my $THIS_FILE = "Cluster.pm";
# get_peers
# get_primary_host_uuid
# is_primary
+# manage_fence_delay
# migrate_server
# parse_cib
# parse_crm_mon
@@ -555,7 +556,7 @@ This is the name of the server to boot.
If set, a resource constraint is placed so that the server prefers one node over the other before it boots.
-B<< Note >>; The method relies on pacemaker to boot the node. As such, if for some reason it decides the server can not be booted on the prefered node, it may boot on the other node. As such, this parameter does not guarantee that the server will be booted on the target node!
+B<< Note >>; This method relies on pacemaker to boot the server. If, for some reason, pacemaker decides the server cannot be booted on the preferred node, it may boot on the other node. As such, this parameter does not guarantee that the server will be booted on the target node!
=head3 wait (optional, default '1')
@@ -2329,6 +2330,206 @@ sub is_primary
}
+=head2 manage_fence_delay
+
+This method checks or sets the fence delay that controls which node survives a network split. Generally, this is the node hosting servers, as ScanCore's C<< scan-cluster >> should set this based on where the servers are running.
+
+If C<< prefer >> is given an invalid host name, or if this is called on a host that is not a full cluster member, C<< !!error!! >> is returned. Otherwise, the name of the node favoured by the delay is returned. If, somehow, neither node has a delay, an empty string is returned.
+
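+For example, a check-only call looks like this (a minimal sketch; how the caller handles the result is up to it);
+
+ my $preferred_node = $anvil->Cluster->manage_fence_delay({debug => 2});
+ if ($preferred_node eq "!!error!!")
+ {
+     # This host is not a node, or it is not yet a full cluster member.
+ }
+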
+B<< Note >>: This must run on a node in a cluster.
+
+Parameters;
+
+=head3 prefer (optional)
+
+If this is set to a node name, that node will have the fence delay set to favour it. Specifically, the first fence method on this node has the C<< delay="15" >> argument added to it. If a delay is found on any other method, it is removed.
+
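+For example, to ask that the delay favour a given node (a sketch; C<< an-a01n01 >> is an illustrative node name, not one taken from a real config);
+
+ my $preferred_node = $anvil->Cluster->manage_fence_delay({prefer => "an-a01n01"});
+ if ($preferred_node eq "an-a01n01")
+ {
+     # The first fence method on 'an-a01n01' now carries 'delay="15"'.
+ }
+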
+=cut
+sub manage_fence_delay
+{
+ my $self = shift;
+ my $parameter = shift;
+ my $anvil = $self->parent;
+ my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3;
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Cluster->manage_fence_delay()" }});
+
+ my $prefer = defined $parameter->{prefer} ? $parameter->{prefer} : "";
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+ prefer => $prefer,
+ }});
+
+ # Are we a node?
+ my $host_type = $anvil->Get->host_type({debug => $debug});
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host_type => $host_type }});
+ if ($host_type ne "node")
+ {
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0123"});
+ return("!!error!!");
+ }
+
+ # Are we in the cluster?
+ my $problem = $anvil->Cluster->parse_cib({debug => $debug});
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { problem => $problem }});
+ if ($problem)
+ {
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0124"});
+ return('!!error!!');
+ }
+
+ # Are we a full member?
+ if (not $anvil->data->{cib}{parsed}{'local'}{ready})
+ {
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0125"});
+ return('!!error!!');
+ }
+
+ # Now look for stonith info.
+ foreach my $node_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}})
+ {
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { node_name => $node_name }});
+ foreach my $order (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}{$node_name}{fencing}{order}})
+ {
+ my $method = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{fencing}{order}{$order}{devices};
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+ 's1:order' => $order,
+ 's2:method' => $method,
+ }});
+
+ foreach my $this_method (split/,/, $method)
+ {
+ my $agent = $anvil->data->{cib}{parsed}{data}{stonith}{primitive_id}{$this_method}{agent};
+
+ # We ignore the fake 'fence_delay' method.
+ next if $agent eq "fence_delay";
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+ 's1:this_method' => $this_method,
+ 's2:agent' => $agent,
+ }});
+
+ my $config_line = $agent." ";
+ foreach my $stdin_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}{$node_name}{fencing}{device}{$this_method}{argument}})
+ {
+ next if $stdin_name =~ /pcmk_o\w+_action/;
+ my $value = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{fencing}{device}{$this_method}{argument}{$stdin_name}{value};
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+ 's1:stdin_name' => $stdin_name,
+ 's2:value' => $value,
+ }});
+
+ $config_line .= $stdin_name."=\"".$value."\" ";
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { config_line => $config_line }});
+ }
+ $anvil->data->{fence_method}{$node_name}{order}{$order}{method}{$this_method}{command} = $config_line;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+ "fence_method::${node_name}::order::${order}::method::${this_method}::command" => $anvil->data->{fence_method}{$node_name}{order}{$order}{method}{$this_method}{command},
+ }});
+ }
+ }
+ }
+
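+ # Walk the first fence method on each node; whichever one carries 'delay="..."' is the preferred
+ # node. If 'prefer' was passed, add or remove the delay as needed using 'pcs stonith update'.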
+ my $preferred_node = "";
+ foreach my $node_name (sort {$a cmp $b} keys %{$anvil->data->{fence_method}})
+ {
+ # Only the first fence method (order '1') carries the delay, and there's normally just one, so no need to sort.
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { node_name => $node_name }});
+ foreach my $method (keys %{$anvil->data->{fence_method}{$node_name}{order}{1}{method}})
+ {
+ my $config_line = $anvil->data->{fence_method}{$node_name}{order}{1}{method}{$method}{command};
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+ 's1:method' => $method,
+ 's2:config_line' => $config_line,
+ }});
+ if ($config_line =~ / delay="(\d+)"/)
+ {
+ # If we're being asked to set a preferred node, and this isn't it, remove it.
+ if (($prefer) && ($prefer ne $node_name))
+ {
+ # Remove it.
+ $config_line =~ s/ delay=".*?"//;
+ my $shell_call = $anvil->data->{path}{exe}{pcs}." stonith update ".$method." ".$config_line;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
+ my ($output, $return_code) = $anvil->System->call({debug => ($debug + 1), shell_call => $shell_call});
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+ output => $output,
+ return_code => $return_code,
+ }});
+
+ # Make sure we're not the preferred host anymore.
+ $preferred_node = $anvil->Cluster->manage_fence_delay({debug => $debug});
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { preferred_node => $preferred_node }});
+
+ if (($preferred_node ne "!!error!!") && ($preferred_node ne $node_name))
+ {
+ # Success! Register an alert.
+ my $variables = {
+ node => $node_name,
+ };
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0253", variables => $variables});
+ $anvil->Alert->register({alert_level => "notice", message => "message_0253", variables => $variables, set_by => $THIS_FILE});
+ }
+ else
+ {
+ # What?!
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0310", variables => {
+ node => $node_name,
+ current => $preferred_node,
+ }});
+ return("!!error!!")
+ }
+ }
+ else
+ {
+ $preferred_node = $node_name;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { preferred_node => $preferred_node }});
+ }
+ }
+ else
+ {
+ # If 'prefer' is set, and this is the node, add it.
+ if (($prefer) && ($prefer eq $node_name))
+ {
+ $config_line .= " delay=\"15\"";
+ my $shell_call = $anvil->data->{path}{exe}{pcs}." stonith update ".$method." ".$config_line;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
+ my ($output, $return_code) = $anvil->System->call({debug => ($debug + 1), shell_call => $shell_call});
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
+ output => $output,
+ return_code => $return_code,
+ }});
+
+ # Verify that this is now the preferred host.
+ $preferred_node = $anvil->Cluster->manage_fence_delay({debug => $debug});
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { preferred_node => $preferred_node }});
+
+ if ($prefer eq $preferred_node)
+ {
+ # Success! Register an alert.
+ my $variables = {
+ node => $node_name,
+ };
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0254", variables => $variables});
+ $anvil->Alert->register({alert_level => "notice", message => "message_0254", variables => $variables, set_by => $THIS_FILE});
+
+ return($prefer);
+ }
+ else
+ {
+ # What?!
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0309", variables => {
+ prefer => $prefer,
+ current => $preferred_node,
+ }});
+ return("!!error!!")
+ }
+ }
+ }
+ }
+ }
+
+ return($preferred_node);
+}
+
+
=head2 migrate_server
This manipulates pacemaker's location constraints to trigger a pacemaker-controlled migration of one or more servers.
diff --git a/Anvil/Tools/Server.pm b/Anvil/Tools/Server.pm
index 402d2b2c..a9b8de19 100644
--- a/Anvil/Tools/Server.pm
+++ b/Anvil/Tools/Server.pm
@@ -1986,7 +1986,8 @@ WHERE
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { stop_waiting => $stop_waiting }});
};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { wait_time => $wait_time }});
- until($success)
+ my $waiting = 1;
+ while ($waiting)
{
# Update
$anvil->Server->find({debug => $debug});
@@ -2007,7 +2008,11 @@ WHERE
{
# Success!
$success = 1;
- $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0426", variables => { server => $server }});
+ $waiting = 0;
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0426", variables => {
+ server => $server,
+ waiting => $waiting,
+ }});
# Mark it as stopped now. (if we have a server_uuid, we have a database connection)
if ($server_uuid)
@@ -2042,9 +2047,12 @@ WHERE
if (($stop_waiting) && (time > $stop_waiting))
{
# Give up waiting.
+ $waiting = 0;
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0426", variables => { waiting => $waiting }});
+
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0427", variables => {
- server => $server,
- wait_time => $wait_time,
+ server => $server,
+ 'wait' => $wait_time,
}});
}
else
diff --git a/scancore-agents/scan-cluster/scan-cluster b/scancore-agents/scan-cluster/scan-cluster
index 0c69420f..3aa1264a 100755
--- a/scancore-agents/scan-cluster/scan-cluster
+++ b/scancore-agents/scan-cluster/scan-cluster
@@ -15,7 +15,6 @@
# TODO:
# - When a node is lost, update the location constraints to keep the servers on the surviving node when the
# peer returns.
-# - Test that the fence delay favours the host that has all the servers.
#
use strict;
@@ -104,12 +103,145 @@ find_changes($anvil);
# Check the cluster config.
check_config($anvil);
+# Check the fence delay
+check_fence_delay($anvil);
+
$anvil->nice_exit({exit_code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
+# Check to see if we need to move the fence delay.
+sub check_fence_delay
+{
+ my ($anvil) = @_;
+
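+ ### In short, the logic in this function is;
+ ###  - If the peer is out of the cluster, make sure the fence delay favours this node.
+ ###  - If any server is migrating, no servers are running, or both nodes host servers, do nothing.
+ ###  - If this node hosts all of the running servers and isn't yet preferred, claim the delay.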
+ my $preferred_node = $anvil->Cluster->manage_fence_delay();
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_node => $preferred_node }});
+ if ($preferred_node ne "!!error!!")
+ {
+ ### NOTE: We don't make the peer the preferred node; a node can only make itself the preferred
+ ### node.
+ # How many servers are running on each node.
+ $anvil->Database->get_anvils();
+ $anvil->Database->get_servers();
+ $anvil->Cluster->get_peers();
+ my $anvil_uuid = $anvil->Cluster->get_anvil_uuid();
+ my $local_node_is = $anvil->data->{sys}{anvil}{i_am};
+ my $local_node_name = $anvil->data->{cib}{parsed}{'local'}{name};
+ my $local_host_name = $anvil->data->{sys}{anvil}{$local_node_is}{host_name};
+ my $local_host_uuid = $anvil->data->{sys}{anvil}{$local_node_is}{host_uuid};
+ my $peer_node_is = $anvil->data->{sys}{anvil}{peer_is};
+ my $peer_node_name = $anvil->data->{cib}{parsed}{peer}{name};
+ my $peer_host_name = $anvil->data->{sys}{anvil}{$peer_node_is}{host_name};
+ my $peer_host_uuid = $anvil->data->{sys}{anvil}{$peer_node_is}{host_uuid};
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+ anvil_uuid => $anvil_uuid,
+ local_node_is => $local_node_is,
+ local_node_name => $local_node_name,
+ local_host_name => $local_host_name,
+ local_host_uuid => $local_host_uuid,
+ peer_node_is => $peer_node_is,
+ peer_node_name => $peer_node_name,
+ peer_host_name => $peer_host_name,
+ peer_host_uuid => $peer_host_uuid,
+ }});
+
+ # Get the short host names, as that's usually what the node name is.
+ my $local_short_host_name = $local_host_name;
+ $local_short_host_name =~ s/\..*$//;
+ my $peer_short_host_name = $peer_host_name;
+ $peer_short_host_name =~ s/\..*$//;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+ local_short_host_name => $local_short_host_name,
+ peer_short_host_name => $peer_short_host_name,
+ }});
+
+ # If my peer isn't in the cluster, make sure I am the fence delay host.
+ if (not $anvil->data->{cib}{parsed}{peer}{ready})
+ {
+ # My peer is not ready, make sure I'm the preferred host.
+ if (($preferred_node eq $local_node_name) or ($preferred_node eq $local_host_name) or ($preferred_node eq $local_short_host_name))
+ {
+ # We're good.
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0633"});
+ }
+ else
+ {
+ # We're not, set the delay to us.
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0634"});
+ $anvil->Cluster->manage_fence_delay({prefer => $local_node_name});
+ return(0);
+ }
+ }
+
+ # How many servers are on each node?
+ my $local_server_count = 0;
+ my $peer_server_count = 0;
+ foreach my $server_uuid (keys %{$anvil->data->{servers}{server_uuid}})
+ {
+ next if $anvil_uuid ne $anvil->data->{servers}{server_uuid}{$server_uuid}{server_anvil_uuid};
+
+ my $server_name = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_name};
+ my $server_state = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state};
+ my $server_host_uuid = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_host_uuid};
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+ server_uuid => $server_uuid,
+ server_name => $server_name,
+ server_state => $server_state,
+ server_host_uuid => $server_host_uuid,
+ }});
+ next if $server_state eq "shut off";
+ if ($server_state eq "migrating")
+ {
+ # Don't do anything.
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0635", variables => { server_name => $server_name }});
+ return(0);
+ }
+ if ($server_host_uuid eq $local_host_uuid)
+ {
+ $local_server_count++;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_server_count => $local_server_count }});
+ }
+ elsif ($server_host_uuid eq $peer_host_uuid)
+ {
+ $peer_server_count++;
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_server_count => $peer_server_count }});
+ }
+ }
+
+ # Don't do anything if there are no servers running anywhere, or if both nodes are hosting at
+ # least one server.
+ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+ local_server_count => $local_server_count,
+ peer_server_count => $peer_server_count,
+ }});
+ if ((not $local_server_count) && (not $peer_server_count))
+ {
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0636"});
+ return(0);
+ }
+ elsif (($local_server_count) && ($peer_server_count))
+ {
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0637", variables => {
+ local_server_count => $local_server_count,
+ peer_server_count => $peer_server_count,
+ }});
+ return(0);
+ }
+ elsif (($local_server_count) && ($preferred_node ne $local_node_name))
+ {
+ # Make us the preferred host.
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0638"});
+ $anvil->Cluster->manage_fence_delay({prefer => $local_node_name});
+ return(0);
+ }
+ }
+
+ return(0);
+}
+
sub check_config
{
my ($anvil) = @_;
diff --git a/share/words.xml b/share/words.xml
index 1c0f7625..aa545227 100644
--- a/share/words.xml
+++ b/share/words.xml
@@ -420,6 +420,8 @@ The attempt to start the servers appears to have failed. The return code '0' was
Unable to connect to the database, unable to provision a server at this time.
Failed to perform requested task(s) because the requester is not authenticated.
,manifest_uuid=,anvil_uuid='. Either the parse failed, or the data was somehow invalid.]]>
+ I tried to change the preferred fence node to: [#!variable!prefer!#], but it doesn't appear to have worked. The preferred node is: [#!variable!current!#] ('--' means there is no preferred node)
+ I tried to remove the fence delay from the node: [#!variable!node!#], but it doesn't appear to have worked. The preferred node is: [#!variable!current!#] ('--' means there is no preferred node)
@@ -1823,6 +1825,12 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
The bond: [#!variable!bond!#] will now be brought up (even if it already is up).
Network device names have changed, rebooting to ensure they take effect. The job will restart once the network comes back up.
The bridge: [#!variable!bridge!#] is down, tryin to bring it up now.
+ Our peer is offline and we're already the preferred fence node. Nothing to do.
+ Our peer is offline and we're not the preferred fence node. Updating the fence config to prefer this node.
+ The server: [#!variable!server_name!#] is migrating. Skipping fence delay preference checks for now.
+ No servers are running on either node. Skipping fence delay preference checks for now.
+ We've got: [#!variable!local_server_count!#] servers, and the peer has: [#!variable!peer_server_count!#] servers. Skipping fence delay preference checks for now.
+ We're hosting servers, and our peer is not. Updating the fence delay to favour this node.
The host name: [#!variable!target!#] does not resolve to an IP address.
@@ -2171,6 +2179,8 @@ Are you sure that you want to delete the server: [#!variable!server_name!#]? [Ty
The daemon: [#!variable!daemon!#] was not running, starting it now.
Preparing to manage a server.
Found the server: [#!variable!server_name!#] in the database, loading details now.
+ The fence delay to prefer the node: [#!variable!node!#] has been removed.
+ The fence delay now prefers the node: [#!variable!node!#].
Saved the mail server information successfully!
@@ -2802,6 +2812,9 @@ Read UUID: .... [#!variable!read_uuid!#]
[ Warning ] - The 'admin' user was created with the user ID: [#!variable!uid!#].
[ Warning ] - Timed out waiting for the database: [#!variable!uuid!#] to become available.
[ Warning ] - The Anvil! with the UUID: [#!variable!uuid!#] was not found. Exiting, will re-run the anvil-join-anvil job again in a few moments.
+ [ Warning ] - Asked to find or set the fence delay, but this is not a node.
+ [ Warning ] - Asked to find or set the fence delay, but this node is not in a cluster.
+ [ Warning ] - Asked to find or set the fence delay, but this node is not yet fully in the cluster.