* Updated DRBD->allow_two_primaries() to be more careful when evaluating peer-node-id.

* Updated DRBD->manage_resource() to set allow-two-primaries=no when up'ing a resource (as no migration can be in progress during an up command).
* Updated scan-drbd to look for StandAlone resources and call DRBD->manage_resource({task => 'up'}) if a connection to a peer node is StandAlone or if the local disk state is detached.

Signed-off-by: digimer <mkelly@alteeve.ca>
main
digimer 1 year ago
parent b6a249d5e7
commit dda0fbd7d5
  1. 1
      Anvil/Tools/Cluster.pm
  2. 22
      Anvil/Tools/DRBD.pm
  3. 3
      ocf/alteeve/server
  4. 129
      scancore-agents/scan-drbd/scan-drbd
  5. 2
      scancore-agents/scan-drbd/scan-drbd.xml

@ -4776,7 +4776,6 @@ sub _set_server_constraint
if ($problem)
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0145", variables => { server => $server }});
}
}

@ -193,7 +193,11 @@ sub allow_two_primaries
target => $target,
});
}
if ($anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'} =~ /^\d+$/)
if ((exists $anvil->data->{drbd}{status}{$host}) &&
(exists $anvil->data->{drbd}{status}{$host}{resource}{$resource}) &&
(exists $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}) &&
(defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'}) &&
($anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'} =~ /^\d+$/))
{
$target_node_id = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { target_node_id => $target_node_id }});
@ -2502,11 +2506,20 @@ sub manage_resource
### TODO: When taking down a resource, check to see if any machine is SyncTarget and take it/them
### down first. See anvil-rename-server -> verify_server_is_off() for the logic.
### TODO: Sanity check the resource name and task requested.
### NOTE: For an unknown reason, sometimes a resource is left with allow-two-primary enabled. This
### can block startup, so to be safe, during start, we'll call adjust
### NOTE: If a live-migration fails, one of the nodes could have their allow-two-primaries left up.
### This ensures that they're set to 'no' before connecting.
if ($task eq "up")
{
# This generally brings up the resource
my ($return) = $anvil->DRBD->allow_two_primaries({
debug => 2,
resource => $resource,
set_to => "no",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'return' => $return }});
# Now call an adjust to make sure all other config details are loaded. It also up's the
# resource.
my $shell_call = $anvil->data->{path}{exe}{drbdadm}." adjust ".$resource;
my $output = "";
my $return_code = 255;
@ -2538,6 +2551,9 @@ sub manage_resource
return_code => $return_code,
}});
}
# Sleep for a moment to make sure adjust has taken hold.
sleep 1;
}
# If we 'adjust'ed above, this will likely complain that the backing disk already exists, and that's

@ -620,11 +620,12 @@ sub start_server
local_name => $local_name,
peer_name => $peer_name,
}});
$anvil->Cluster->_set_server_constraint({
my $problem = $anvil->Cluster->_set_server_constraint({
debug => 2,
server => $server,
preferred_node => $local_name,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
# Success!
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0309", variables => { server => $server }});

@ -100,6 +100,8 @@ find_changes($anvil);
check_config($anvil);
fix_things($anvil);
# Shut down.
$anvil->ScanCore->agent_shutdown({agent => $THIS_FILE});
@ -108,6 +110,133 @@ $anvil->ScanCore->agent_shutdown({agent => $THIS_FILE});
# Functions #
#############################################################################################################
# This looks for issues that we can fix, and if any are found, we try to fix them.
# 
# It only acts when this host is a node, the CIB parses without a problem, and both the local and peer 
# nodes are marked ready in the parsed CIB. When those hold, each known resource is checked for;
# - A connection to a peer node that is StandAlone.
# - A volume whose local disk state is 'down'.
# In either case, an alert is registered and DRBD->manage_resource() is called with 'task => "up"'.
# 
# Connections to (and volumes replicated with) hosts that are not nodes (ie: DR hosts) are ignored.
# 
# Parameters;
# - $anvil: The Anvil::Tools handle.
# 
# Returns '0' in all cases.
sub fix_things
{
	my ($anvil) = @_;
	
	# If the cluster is up and both nodes are online, make sure all DRBD resources are connected.
	$anvil->Database->get_hosts();
	$anvil->DRBD->get_status();
	
	my $host_uuid       = $anvil->Get->host_uuid();
	my $host_type       = $anvil->Get->host_type();
	my $short_host_name = $anvil->Get->short_host_name();
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		host_uuid       => $host_uuid, 
		host_type       => $host_type, 
		short_host_name => $short_host_name, 
	}});
	
	# Only nodes are cluster members, so there is nothing to check on any other host type.
	if ($host_type ne "node")
	{
		return(0);
	}
	
	my $problem = $anvil->Cluster->parse_cib({debug => 3});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		problem                     => $problem, 
		"cib::parsed::local::ready" => $anvil->data->{cib}{parsed}{'local'}{ready}, 
		"cib::parsed::peer::ready"  => $anvil->data->{cib}{parsed}{peer}{ready}, 
	}});
	if (($problem) or (not $anvil->data->{cib}{parsed}{peer}{ready}) or (not $anvil->data->{cib}{parsed}{'local'}{ready}))
	{
		# The cluster isn't fully up, so don't touch anything.
		return(0);
	}
	
	# Given a host UUID, this returns the host's type, or an empty string if the UUID is empty or the 
	# host isn't known. The peer's UUID must be passed in, not our own, otherwise DR hosts would be 
	# treated as nodes.
	my $get_host_type = sub
	{
		my ($uuid) = @_;
		return("") if not $uuid;
		return("") if not exists $anvil->data->{hosts}{host_uuid}{$uuid};
		
		my $type = $anvil->data->{hosts}{host_uuid}{$uuid}{host_type};
		return(defined $type ? $type : "");
	};
	
	# Walk through all resources and make sure the peer and local disks are attached and the connection 
	# established.
	foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{new}{resource}})
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }});
		
		# Check connection states.
		foreach my $local_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}})
		{
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_name => $local_name }});
			foreach my $peer (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_name}{resource}{$resource}{connection}})
			{
				# The state could be missing, so avoid calling lc() on an undefined value.
				my $connection_state = $anvil->data->{drbd}{status}{$local_name}{resource}{$resource}{connection}{$peer}{'connection-state'};
				   $connection_state = defined $connection_state ? lc($connection_state) : "";
				
				# Find the peer's host_uuid and make sure we're talking to a node, not a DR 
				# host.
				my $peer_host_uuid = $anvil->Get->host_uuid_from_name({host_name => $peer, debug => 3});
				my $peer_host_type = $get_host_type->($peer_host_uuid);
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					's1:peer'             => $peer, 
					's2:connection_state' => $connection_state, 
					's3:peer_host_uuid'   => $peer_host_uuid, 
					's4:peer_host_type'   => $peer_host_type, 
				}});
				
				next if $peer_host_type ne "node";
				next if $connection_state ne "standalone";
				
				# If either of the nodes is still 'allow-two-primaries', it will be set to 
				# 'no' by the 'up' task (no migration is happening anyway, if we're 
				# StandAlone), and then we try reconnecting.
				my $variables = {
					resource => $resource,
					peer     => $peer, 
				};
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_drbd_message_0037", variables => $variables});
				$anvil->Alert->register({alert_level => "notice", message => "scan_drbd_message_0037", variables => $variables, set_by => $THIS_FILE, sort_position => $anvil->data->{'scan-drbd'}{alert_sort}++});
				
				$anvil->DRBD->manage_resource({
					debug    => 2, 
					resource => $resource,
					task     => "up",
				});
			}
		}
		
		# Check the local disk state of each volume.
		foreach my $volume (sort {$a <=> $b} keys %{$anvil->data->{new}{resource}{$resource}{volume}})
		{
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { volume => $volume }});
			foreach my $peer (sort {$a cmp $b} keys %{$anvil->data->{new}{resource}{$resource}{volume}{$volume}{peer}})
			{
				# Find the peer's host_uuid and make sure we're talking to a node, not a DR 
				# host.
				my $peer_host_uuid = $anvil->Get->host_uuid_from_name({host_name => $peer});
				my $peer_host_type = $get_host_type->($peer_host_uuid);
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					's1:peer'           => $peer, 
					's2:peer_host_uuid' => $peer_host_uuid, 
					's3:peer_host_type' => $peer_host_type, 
				}});
				
				next if $peer_host_type ne "node";
				
				my $peer_data        = $anvil->data->{new}{resource}{$resource}{volume}{$volume}{peer}{$peer};
				my $connection_state = $peer_data->{connection_state};
				my $local_role       = $peer_data->{local_role};
				my $peer_role        = $peer_data->{peer_role};
				my $local_disk_state = $peer_data->{local_disk_state};
				my $peer_disk_state  = $peer_data->{peer_disk_state};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					's1:connection_state' => $connection_state, 
					's2:local_role'       => $local_role, 
					's3:peer_role'        => $peer_role, 
					's4:local_disk_state' => $local_disk_state, 
					's5:peer_disk_state'  => $peer_disk_state, 
				}});
				
				next if ((not defined $local_disk_state) or ($local_disk_state ne "down"));
				
				# Bring it up.
				my $variables = {
					resource => $resource,
					volume   => $volume,
					peer     => $peer, 
				};
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_drbd_message_0036", variables => $variables});
				$anvil->Alert->register({alert_level => "notice", message => "scan_drbd_message_0036", variables => $variables, set_by => $THIS_FILE, sort_position => $anvil->data->{'scan-drbd'}{alert_sort}++});
				
				$anvil->DRBD->manage_resource({
					debug    => 2, 
					resource => $resource,
					task     => "up",
				});
			}
		}
	}
	
	return(0);
}
# In the early days, scan_drbd_peers -> scan_drbd_peer_tcp_port was type numeric. This wasn't compatible with
# drbd-proxy and had to be changed to type text to support csv port lists.
sub check_schema

@ -155,6 +155,8 @@ The global common configuration file: [#!variable!file!#] needs to be updated. T
====
</key>
<key name="scan_drbd_message_0035">The DRBD peer: [#!variable!peer_name!#] resource: [#!variable!resource_name!#] volume: [#!variable!volume_number!#] was deleted, and the corresponding LVM data no longer exists. Purging this resource's data.</key>
<key name="scan_drbd_message_0036">The cluster is up and both nodes are ready, yet the DRBD resource/volume: [#!variable!resource!#/#!variable!volume!#] is down. Bringing it up now.</key>
<key name="scan_drbd_message_0037">The cluster is up and both nodes are ready, yet the DRBD resource: [#!variable!resource!#] is StandAlone. Adjusting and reconnecting now.</key>
<!-- Units -->
<key name="scan_drbd_unit_0001">Enabled</key>

Loading…
Cancel
Save