Merge pull request #142 from ClusterLabs/scancore-debugging

* Added Cluster->manage_fence_delay() that reports back and, optional…
main
Digimer 4 years ago committed by GitHub
commit e2788f438e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 203
      Anvil/Tools/Cluster.pm
  2. 14
      Anvil/Tools/Server.pm
  3. 134
      scancore-agents/scan-cluster/scan-cluster
  4. 13
      share/words.xml

@ -27,6 +27,7 @@ my $THIS_FILE = "Cluster.pm";
# get_peers
# get_primary_host_uuid
# is_primary
# manage_fence_delay
# migrate_server
# parse_cib
# parse_crm_mon
@ -555,7 +556,7 @@ This is the name of the server to boot.
If set, a resource constraint is placed so that the server prefers one node over the other before it boots.
B<< Note >>; The method relies on pacemaker to boot the node. As such, if for some reason it decides the server can not be booted on the prefered node, it may boot on the other node. As such, this parameter does not guarantee that the server will be booted on the target node!
B<< Note >>; The method relies on pacemaker to boot the node. As such, if for some reason it decides the server can not be booted on the preferred node, it may boot on the other node. As such, this parameter does not guarantee that the server will be booted on the target node!
=head3 wait (optional, default '1')
@ -2329,6 +2330,206 @@ sub is_primary
}
=head2 manage_fence_delay
This method checks or sets the fence delay that controls which node survives in a network split. Generally, this is the node hosting servers, as ScanCore's C<< scan-cluster >> should set this based on where the servers are running.
If C<< prefer >> is given an invalid host name, or if this is called on a node that is not a cluster member, C<< !!error!! >> is returned. Otherwise, the node with the delay favouring it is returned. If, somehow, neither node has a delay, then an empty string is returned.
B<< Note >>: This must run on a node in a cluster.
Parameters;
=head3 prefer (optional)
If this is set to a node name, that node will have the fence delay set to favour it. Specifically, the first fence method on this node has the C<< delay="15" >> argument added to it. If a delay is found on any other method, it is removed.
=cut
sub manage_fence_delay
{
	my $self      = shift;
	my $parameter = shift;
	my $anvil     = $self->parent;
	my $debug     = defined $parameter->{debug} ? $parameter->{debug} : 3;
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Cluster->manage_fence_delay()" }});
	
	# Name of the node that should hold the fence delay. When empty, nothing is changed and the current
	# holder is simply reported.
	my $prefer = defined $parameter->{prefer} ? $parameter->{prefer} : "";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
		prefer => $prefer,
	}});
	
	# Are we a node?
	my $host_type = $anvil->Get->host_type({debug => $debug});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host_type => $host_type }});
	if ($host_type ne "node")
	{
		# 'warning_0122' is the "this is not a node" message.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0122"});
		return("!!error!!");
	}
	
	# Are we in the cluster?
	my $problem = $anvil->Cluster->parse_cib({debug => $debug});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { problem => $problem }});
	if ($problem)
	{
		# 'warning_0123' is the "node is not in a cluster" message.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0123"});
		return('!!error!!');
	}
	
	# Are we a full member?
	if (not $anvil->data->{cib}{parsed}{'local'}{ready})
	{
		# 'warning_0124' is the "node is not fully in the cluster yet" message.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0124"});
		return('!!error!!');
	}
	
	# If we were asked to prefer a node, it has to be one the CIB knows about. Below, 'prefer' is only ever
	# compared against these node names, so an unknown name could never gain the delay; it would only
	# cause the delay to be stripped from every other node.
	if (($prefer) && (not exists $anvil->data->{cib}{parsed}{data}{node}{$prefer}))
	{
		# There is no dedicated message key for this, so we log the values directly.
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => {
			's1:invalid_prefer' => $prefer,
			's2:known_nodes'    => join(", ", sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}}),
		}});
		return("!!error!!");
	}
	
	# This method calls itself to verify changes, so drop whatever an earlier pass recorded before
	# rebuilding the list from the freshly parsed CIB.
	delete $anvil->data->{fence_method};
	
	# Now look for stonith info. For each node, fence order and device, build the string
	# '<agent> <name>="<value>" ...' and store it under 'fence_method::<node>::order::<order>::method::<device>::command'.
	foreach my $node_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}})
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { node_name => $node_name }});
		foreach my $order (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}{$node_name}{fencing}{order}})
		{
			# 'devices' is a comma-separated list of stonith primitive IDs.
			my $method = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{fencing}{order}{$order}{devices};
			   $method = "" if not defined $method;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
				's1:order'  => $order,
				's2:method' => $method,
			}});
			
			foreach my $this_method (split/,/, $method)
			{
				my $agent = $anvil->data->{cib}{parsed}{data}{stonith}{primitive_id}{$this_method}{agent};
				   $agent = "" if not defined $agent;
				
				# We ignore the fake, delay method
				next if $agent eq "fence_delay";
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
					's1:this_method' => $this_method,
					's2:agent'       => $agent,
				}});
				
				my $config_line = $agent." ";
				foreach my $stdin_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}{$node_name}{fencing}{device}{$this_method}{argument}})
				{
					# Skip arguments named like 'pcmk_o..._action'. Presumably these are
					# pacemaker-side settings rather than agent arguments - TODO confirm.
					next if $stdin_name =~ /pcmk_o\w+_action/;
					my $value = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{fencing}{device}{$this_method}{argument}{$stdin_name}{value};
					   $value = "" if not defined $value;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
						's1:stdin_name' => $stdin_name,
						's2:value'      => $value,
					}});
					
					$config_line .= $stdin_name."=\"".$value."\" ";
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { config_line => $config_line }});
				}
				
				$anvil->data->{fence_method}{$node_name}{order}{$order}{method}{$this_method}{command} = $config_line;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
					"fence_method::${node_name}::order::${order}::method::${this_method}::command" => $anvil->data->{fence_method}{$node_name}{order}{$order}{method}{$this_method}{command},
				}});
			}
		}
	}
	
	# Walk the nodes, handling the node to prefer (if any) last. This way, a delay held by another node is
	# removed before the delay is added to the preferred node. Were the delay added first, the
	# verification call would see two nodes with a delay and could report the wrong one. With no 'prefer'
	# set, this is a plain sort by name.
	my $preferred_node = "";
	my @node_names     = sort {
		(($a eq $prefer) ? 1 : 0) <=> (($b eq $prefer) ? 1 : 0) or $a cmp $b
	} keys %{$anvil->data->{fence_method}};
	foreach my $node_name (@node_names)
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { node_name => $node_name }});
		
		# Only the first fence order is examined. There's only one method there, no reason to sort.
		foreach my $method (keys %{$anvil->data->{fence_method}{$node_name}{order}{1}{method}})
		{
			my $config_line = $anvil->data->{fence_method}{$node_name}{order}{1}{method}{$method}{command};
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
				's1:method'      => $method,
				's2:config_line' => $config_line,
			}});
			
			if ($config_line =~ / delay="\d+"/)
			{
				# This node currently has the delay. If we're being asked to set a preferred
				# node, and this isn't it, remove it.
				if (($prefer) && ($prefer ne $node_name))
				{
					# Remove it.
					# NOTE(review): This relies on 'pcs stonith update' dropping an option
					#               that is left out of the command. Confirm against the pcs
					#               documentation; an explicit empty 'delay=' may be needed.
					#               The check below catches the case where it didn't work.
					$config_line =~ s/ delay=".*?"//;
					my $shell_call = $anvil->data->{path}{exe}{pcs}." stonith update ".$method." ".$config_line;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
					
					my ($output, $return_code) = $anvil->System->call({debug => ($debug + 1), shell_call => $shell_call});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
						output      => $output,
						return_code => $return_code,
					}});
					
					# Make sure this node is not the preferred host anymore. Calling ourselves
					# without 'prefer' re-reads the CIB and only reports.
					$preferred_node = $anvil->Cluster->manage_fence_delay({debug => $debug});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { preferred_node => $preferred_node }});
					if (($preferred_node ne "!!error!!") && ($preferred_node ne $node_name))
					{
						# Success! Register an alert.
						my $variables = {
							node => $node_name,
						};
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0253", variables => $variables});
						$anvil->Alert->register({alert_level => "notice", message => "message_0253", variables => $variables, set_by => $THIS_FILE});
					}
					else
					{
						# What?! The message documents '--' as meaning "no preferred node".
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0310", variables => {
							node    => $node_name,
							current => $preferred_node eq "" ? "--" : $preferred_node,
						}});
						return("!!error!!");
					}
				}
				else
				{
					# Either we're only reporting, or this node is the one to prefer and it
					# already has the delay.
					$preferred_node = $node_name;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { preferred_node => $preferred_node }});
				}
			}
			else
			{
				# No delay here. If 'prefer' is set, and this is the node, add it.
				if (($prefer) && ($prefer eq $node_name))
				{
					$config_line .= " delay=\"15\"";
					my $shell_call = $anvil->data->{path}{exe}{pcs}." stonith update ".$method." ".$config_line;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }});
					
					my ($output, $return_code) = $anvil->System->call({debug => ($debug + 1), shell_call => $shell_call});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
						output      => $output,
						return_code => $return_code,
					}});
					
					# Verify that this is now the preferred host.
					$preferred_node = $anvil->Cluster->manage_fence_delay({debug => $debug});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { preferred_node => $preferred_node }});
					if ($prefer eq $preferred_node)
					{
						# Success! Register an alert.
						my $variables = {
							node => $node_name,
						};
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "message_0254", variables => $variables});
						$anvil->Alert->register({alert_level => "notice", message => "message_0254", variables => $variables, set_by => $THIS_FILE});
						return($prefer);
					}
					else
					{
						# What?! The message documents '--' as meaning "no preferred node".
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0309", variables => {
							prefer  => $prefer,
							current => $preferred_node eq "" ? "--" : $preferred_node,
						}});
						return("!!error!!");
					}
				}
			}
		}
	}
	
	# The node holding the delay, or an empty string if no node has one.
	return($preferred_node);
}
=head2 migrate_server
This manipulates pacemaker's location constraints to trigger a pacemaker-controlled migration of one or more servers.

@ -1986,7 +1986,8 @@ WHERE
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { stop_waiting => $stop_waiting }});
};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { wait_time => $wait_time }});
until($success)
my $waiting = 1;
while ($waiting)
{
# Update
$anvil->Server->find({debug => $debug});
@ -2007,7 +2008,11 @@ WHERE
{
# Success!
$success = 1;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0426", variables => { server => $server }});
$waiting = 0;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0426", variables => {
server => $server,
waiting => $waiting,
}});
# Mark it as stopped now. (if we have a server_uuid, we have a database connection)
if ($server_uuid)
@ -2042,9 +2047,12 @@ WHERE
if (($stop_waiting) && (time > $stop_waiting))
{
# Give up waiting.
$waiting = 0;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0426", variables => { waiting => $waiting }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0427", variables => {
server => $server,
wait_time => $wait_time,
'wait' => $wait_time,
}});
}
else

@ -15,7 +15,6 @@
# TODO:
# - When a node is lost, update the location constraints to keep the servers on the surviving node when the
# peer returns.
# - Test that the fence delay favours the host that has all the servers.
#
use strict;
@ -104,12 +103,145 @@ find_changes($anvil);
# Check the cluster config.
check_config($anvil);
# Check the fence delay
check_fence_delay($anvil);
$anvil->nice_exit({exit_code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
# Check to see if we need to move the fence delay. A node only ever makes itself the preferred node; it never
# assigns the delay to its peer. This takes the '$anvil' handle and always returns '0'.
sub check_fence_delay
{
	my ($anvil) = @_;
	
	# Ask which node currently holds the fence delay. With no 'prefer' given, nothing is changed.
	my $preferred_node = $anvil->Cluster->manage_fence_delay();
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_node => $preferred_node }});
	if ($preferred_node eq "!!error!!")
	{
		# Nothing we can do, manage_fence_delay() reported a problem.
		return(0);
	}
	
	### NOTE: We don't make the peer be the preferred node, a node can only make itself the preferred
	###       node.
	# How many servers are running on each node.
	$anvil->Database->get_anvils();
	$anvil->Database->get_servers();
	$anvil->Cluster->get_peers();
	my $anvil_uuid      = $anvil->Cluster->get_anvil_uuid();
	my $local_node_is   = $anvil->data->{sys}{anvil}{i_am};
	my $local_node_name = $anvil->data->{cib}{parsed}{'local'}{name};
	my $local_host_name = $anvil->data->{sys}{anvil}{$local_node_is}{host_name};
	my $local_host_uuid = $anvil->data->{sys}{anvil}{$local_node_is}{host_uuid};
	my $peer_node_is    = $anvil->data->{sys}{anvil}{peer_is};
	my $peer_node_name  = $anvil->data->{cib}{parsed}{peer}{name};
	my $peer_host_name  = $anvil->data->{sys}{anvil}{$peer_node_is}{host_name};
	my $peer_host_uuid  = $anvil->data->{sys}{anvil}{$peer_node_is}{host_uuid};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		anvil_uuid      => $anvil_uuid,
		local_node_is   => $local_node_is,
		local_node_name => $local_node_name,
		local_host_name => $local_host_name,
		local_host_uuid => $local_host_uuid,
		peer_node_is    => $peer_node_is,
		peer_node_name  => $peer_node_name,
		peer_host_name  => $peer_host_name,
		peer_host_uuid  => $peer_host_uuid,
	}});
	
	# Get the short host names, as that's usually what the node name is. Everything from the first dot
	# onward is removed.
	my $local_short_host_name =  $local_host_name;
	   $local_short_host_name =~ s/\..*$//;
	my $peer_short_host_name  =  $peer_host_name;
	   $peer_short_host_name  =~ s/\..*$//;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		local_short_host_name => $local_short_host_name,
		peer_short_host_name  => $peer_short_host_name,
	}});
	
	# We're the preferred node if the node holding the delay matches any of our names. Empty names are
	# skipped so that "no node has the delay" (an empty string) can never be taken as a match.
	my $local_is_preferred = 0;
	foreach my $name ($local_node_name, $local_host_name, $local_short_host_name)
	{
		next if not defined $name;
		next if $name eq "";
		if ($preferred_node eq $name)
		{
			$local_is_preferred = 1;
			last;
		}
	}
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_is_preferred => $local_is_preferred }});
	
	# If my peer isn't in the cluster, make sure I am the fence delay host.
	if (not $anvil->data->{cib}{parsed}{peer}{ready})
	{
		# My peer is not ready, make sure I'm the preferred host.
		if ($local_is_preferred)
		{
			# We're good.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0633"});
		}
		else
		{
			# We're not, set the delay to us.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0634"});
			my $new_preferred_node = $anvil->Cluster->manage_fence_delay({prefer => $local_node_name});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_preferred_node => $new_preferred_node }});
			return(0);
		}
	}
	
	# How many servers are on each node?
	my $local_server_count = 0;
	my $peer_server_count  = 0;
	foreach my $server_uuid (keys %{$anvil->data->{servers}{server_uuid}})
	{
		# Only servers belonging to this Anvil! are counted.
		next if $anvil_uuid ne $anvil->data->{servers}{server_uuid}{$server_uuid}{server_anvil_uuid};
		my $server_name      = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_name};
		my $server_state     = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state};
		my $server_host_uuid = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_host_uuid};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
			server_uuid      => $server_uuid,
			server_name      => $server_name,
			server_state     => $server_state,
			server_host_uuid => $server_host_uuid,
		}});
		
		next if $server_state eq "shut off";
		if ($server_state eq "migrating")
		{
			# Don't do anything.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0635", variables => { server_name => $server_name }});
			return(0);
		}
		
		if ($server_host_uuid eq $local_host_uuid)
		{
			$local_server_count++;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_server_count => $local_server_count }});
		}
		elsif ($server_host_uuid eq $peer_host_uuid)
		{
			$peer_server_count++;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_server_count => $peer_server_count }});
		}
	}
	
	# Don't do anything if there are no servers running anywhere, or if both nodes have at least one
	# server.
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
		local_server_count => $local_server_count,
		peer_server_count  => $peer_server_count,
	}});
	if ((not $local_server_count) && (not $peer_server_count))
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0636"});
		return(0);
	}
	elsif (($local_server_count) && ($peer_server_count))
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0637", variables => {
			local_server_count => $local_server_count,
			peer_server_count  => $peer_server_count,
		}});
		return(0);
	}
	elsif (($local_server_count) && (not $local_is_preferred))
	{
		# Only we are hosting servers. Make us the preferred host.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0638"});
		my $new_preferred_node = $anvil->Cluster->manage_fence_delay({prefer => $local_node_name});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_preferred_node => $new_preferred_node }});
		return(0);
	}
	
	# Either we're already preferred, or only the peer hosts servers (in which case it is up to the peer
	# to claim the delay).
	return(0);
}
sub check_config
{
my ($anvil) = @_;

@ -420,6 +420,8 @@ The attempt to start the servers appears to have failed. The return code '0' was
<key name="error_0306">Unable to connect to the database, unable to provision a server at this time.</key>
<key name="error_0307">Failed to perform requested task(s) because the requester is not authenticated.</key>
<key name="error_0308"><![CDATA[[ Error ] - The Job: [#!variable!job-uuid!#] appears to have passed malformed data. The raw data was: [#!variable!raw!#]. Expected 'as_machine=<host_type>,manifest_uuid=<manifest_uuid>,anvil_uuid=<anvil_uuid>'. Either the parse failed, or the data was somehow invalid.]]></key>
<key name="error_0309">I tried to change the fencing preferred node to: [#!variable!prefer!#], but it doesn't appear to have worked. The preferred node is: [#!variable!current!#] ('--' means there is no preferred node)</key>
<key name="error_0310">I tried to remove the fence delay from the node: [#!variable!node!#], but it doesn't appear to have worked. The preferred node is: [#!variable!current!#] ('--' means there is no preferred node)</key>
<!-- Files templates -->
<!-- NOTE: Translating these files requires an understanding of which lines are translatable -->
@ -1823,6 +1825,12 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0630">The bond: [#!variable!bond!#] will now be brought up (even if it already is up).</key>
<key name="log_0631">Network device names have changed, rebooting to ensure they take effect. The job will restart once the network comes back up.</key>
<key name="log_0632">The bridge: [#!variable!bridge!#] is down, trying to bring it up now.</key>
<key name="log_0633">Our peer is offline and we're already the preferred fence node. Nothing to do.</key>
<key name="log_0634">Our peer is offline and we're not the preferred fence node. Updating the fence config to prefer this node.</key>
<key name="log_0635">The server: [#!variable!server_name!#] is migrating. Skipping fence delay preference checks for now.</key>
<key name="log_0636">No servers are running on either node. Skipping fence delay preference checks for now.</key>
<key name="log_0637">We've got: [#!variable!local_server_count!#] servers, and the peer has: [#!variable!peer_server_count!#] servers. Skipping fence delay preference checks for now.</key>
<key name="log_0638">We're hosting servers, and our peer is not. Making the fence delay favour this node.</key>
<!-- Messages for users (less technical than log entries), though sometimes used for logs, too. -->
<key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key>
@ -2171,6 +2179,8 @@ Are you sure that you want to delete the server: [#!variable!server_name!#]? [Ty
<key name="message_0250">The daemon: [#!variable!daemon!#] was not running, starting it now.</key>
<key name="message_0251">Preparing to manage a server.</key>
<key name="message_0252">Found the server: [#!variable!server_name!#] in the database, loading details now.</key>
<key name="message_0253">The fence delay to prefer the node: [#!variable!node!#] has been removed.</key>
<key name="message_0254">The fence delay now prefers the node: [#!variable!node!#].</key>
<!-- Success messages shown to the user -->
<key name="ok_0001">Saved the mail server information successfully!</key>
@ -2802,6 +2812,9 @@ Read UUID: .... [#!variable!read_uuid!#]
<key name="warning_0119">[ Warning ] - The 'admin' user was created with the user ID: [#!variable!uid!#].</key>
<key name="warning_0120">[ Warning ] - Timed out waiting for the database: [#!variable!uuid!#] to become available.</key>
<key name="warning_0121">[ Warning ] - The Anvil! with the UUID: [#!variable!uuid!#] was not found. Exiting, will re-run the anvil-join-anvil job again in a few moments.</key>
<key name="warning_0122">[ Warning ] - Asked to find or set the fence delay, but this is not a node.</key>
<key name="warning_0123">[ Warning ] - Asked to find or set the fence delay, but node is not in a cluster.</key>
<key name="warning_0124">[ Warning ] - Asked to find or set the fence delay, but node is not fully in the cluster yet.</key>
<!-- The entries below here are not sequential, but use a key to find the entry. -->
<!-- Run 'striker-parse-os-list to find new entries. -->

Loading…
Cancel
Save