* Finished updating ocf:alteeve:server to no longer require a database connection. To do this, and still be able to track live migration times, the Server->migrate_virsh() method now writes the server name and migration duration to a /tmp/anvil/migration-duration.<server_name>.<unix_time> file when no database connection is available. The scan-server scan agent checks for these files and, when one is found, parses it, records the migration duration in the database, and then deletes the file.

* Updated anvil-daemon to have a new function called "handle_special_cases" called during startup that does any weird bug mitigation required. For now, this is used to mitigate against rhbz#1961562, though certainly it will be used for other reasons later.

Signed-off-by: Digimer <digimer@alteeve.ca>
main
Digimer 4 years ago
parent e15c1651ed
commit 96fffb0b96
  1. 22
      Anvil/Tools/Server.pm
  2. 98
      ocf/alteeve/server
  3. 66
      scancore-agents/scan-server/scan-server
  4. 36
      tools/anvil-daemon

@ -983,8 +983,8 @@ sub migrate_virsh
}); });
} }
### NOTE: This method is called by ocf:alteeve:server, which is allowed to operate without database ### NOTE: This method is called by ocf:alteeve:server, which operates without database access. As
### access. As such, queries need to be run only if we've got one or more DB connections. ### such, queries need to be run only if we've got one or more DB connections.
# Mark this server as being in a migration state. # Mark this server as being in a migration state.
if ($anvil->data->{sys}{database}{connections}) if ($anvil->data->{sys}{database}{connections})
{ {
@ -1113,6 +1113,9 @@ WHERE
$success = 1; $success = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { success => $success }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { success => $success }});
# Update the server state, if we have a database connection.
if ($anvil->data->{sys}{database}{connections})
{
# Revert the server state and update the server host. # Revert the server state and update the server host.
my $server_host_uuid = $anvil->Get->host_uuid_from_name({debug => $debug, host_name => $target}); my $server_host_uuid = $anvil->Get->host_uuid_from_name({debug => $debug, host_name => $target});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_host_uuid => $server_host_uuid }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_host_uuid => $server_host_uuid }});
@ -1123,7 +1126,7 @@ WHERE
$server_host_uuid = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_host_uuid}; $server_host_uuid = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_host_uuid};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_host_uuid => $server_host_uuid }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_host_uuid => $server_host_uuid }});
} }
if (($server_uuid) && ($anvil->data->{sys}{database}{connections})) if ($server_uuid)
{ {
my $query = " my $query = "
UPDATE UPDATE
@ -1172,6 +1175,19 @@ WHERE
}); });
} }
} }
else
{
# There's no database, so write the migration time to a temp file.
my $body = "server_name=".$server.",migration_took=".$migration_took."\n";
my $file = "/tmp/anvil/migration-duration.".$server.".".time;
my ($failed) = $anvil->Storage->write_file({
file => $file,
body => $body,
mode => "0666",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { failed => $failed }});
}
}
# Switch off dual-primary. # Switch off dual-primary.
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$target}{$server}{resource}}) foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$target}{$server}{resource}})

@ -9,6 +9,10 @@
# cluster or on any configuration outside how the Anvil! m3 uses it. If you plan to adapt it to # cluster or on any configuration outside how the Anvil! m3 uses it. If you plan to adapt it to
# another purpose, let us know and we'll try to help. # another purpose, let us know and we'll try to help.
# #
# NOTE: This resource agent, for the sake of speed and reliability, does not connect to the Anvil! database.
# If you work on this RA, be sure that any module method it calls checks for a database connection before
# making SQL calls.
#
# Based on: https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc # Based on: https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc
# #
# Error types from pacemaker's perspective; # Error types from pacemaker's perspective;
@ -169,16 +173,6 @@ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list
"switches::monitor" => $anvil->data->{switches}{monitor}, "switches::monitor" => $anvil->data->{switches}{monitor},
}}); }});
# If we can connect to a database, we'll set/clear the 'migrating' flag during migrations. For timing reasons
# we don't let the RA do resyncs.
# $anvil->Database->connect({sensitive => 1});
# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
# if (not $anvil->data->{sys}{database}{connections})
# {
# # No databases,
# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "warning_0073"});
# }
if ($anvil->data->{switches}{stop_drbd_resources}) if ($anvil->data->{switches}{stop_drbd_resources})
{ {
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} = 1; $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} = 1;
@ -187,65 +181,19 @@ if ($anvil->data->{switches}{stop_drbd_resources})
# Something for the logs # Something for the logs
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0298"}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0298"});
=cut =cut Manual calls;
Start:
# Start a server;
environment::OCF_RESKEY_CRM_meta_name: [start] /usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --start
environment::OCF_RESKEY_CRM_meta_on_fail: [block]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n01] # Stop a server
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [1] /usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --stop
environment::OCF_RESKEY_CRM_meta_timeout: [300000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu] # Monitor a server
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --monitor
Monitor:
# Migrate (run on current host)
environment::OCF_RESKEY_CRM_meta_interval: [60000] /usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --migrate-to <target_node_name> --migrate-from <current_node_name>
environment::OCF_RESKEY_CRM_meta_name: [monitor]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n01]
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [1]
environment::OCF_RESKEY_CRM_meta_timeout: [20000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu]
Migrate from an-a02n01 to an-a02n02
environment::OCF_RESKEY_CRM_meta_migrate_source: [an-a02n01]
environment::OCF_RESKEY_CRM_meta_migrate_target: [an-a02n02]
environment::OCF_RESKEY_CRM_meta_name: [migrate_to]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n01]
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [1]
environment::OCF_RESKEY_CRM_meta_record_pending: [true]
environment::OCF_RESKEY_CRM_meta_stop_drbd_resources: [0]
environment::OCF_RESKEY_CRM_meta_timeout: [86400000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu]
# Post migration on an-a02n01; stop is called:
Post migration on an-a02n02:
environment::OCF_RESKEY_CRM_meta_migrate_source: [an-a02n01]
environment::OCF_RESKEY_CRM_meta_migrate_target: [an-a02n02]
environment::OCF_RESKEY_CRM_meta_name: [migrate_from]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n02]
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [2]
environment::OCF_RESKEY_CRM_meta_timeout: [600000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu]
Checking server state after: [srv02-c8s-fujitsu] was migrated to an-a02n02;
environment::OCF_RESKEY_CRM_meta_interval: [60000]
environment::OCF_RESKEY_CRM_meta_name: [monitor]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n02]
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [2]
environment::OCF_RESKEY_CRM_meta_timeout: [20000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu]
Stop server (on an-a02n02):
environment::OCF_RESKEY_CRM_meta_name: [stop]
environment::OCF_RESKEY_CRM_meta_on_fail: [block]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n02]
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [2]
environment::OCF_RESKEY_CRM_meta_timeout: [86400000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu]
=cut =cut
@ -1273,23 +1221,11 @@ pmsuspended - The domain has been suspended by guest power management, e.g. ente
$anvil->nice_exit({exit_code => 1}); $anvil->nice_exit({exit_code => 1});
} }
### TODO: Write the migration duration to /tmp/anvil.migration.<server>.data and have 'anvil-migrate-server' read that in to update the DB.
# Migrate the server # Migrate the server
sub migrate_server sub migrate_server
{ {
my ($anvil) = @_; my ($anvil) = @_;
### This requires a database
# If we can connect to a database, we'll set/clear the 'migrating' flag during migrations. For timing
# reasons we don't let the RA do resyncs.
# $anvil->Database->connect({sensitive => 1});
# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});
# if (not $anvil->data->{sys}{database}{connections})
# {
# # No databases, exit.
# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "error_0003"});
# return(1);
# }
### NOTE: For now, we're not going to block if the target is not UpToDate. There are times when a ### NOTE: For now, we're not going to block if the target is not UpToDate. There are times when a
### user might want to do this (ie: sync will be done soon and the need to evacuate the node ### user might want to do this (ie: sync will be done soon and the need to evacuate the node

@ -85,6 +85,9 @@ if ($host_type eq "striker")
# This is more than data collection in most agents, as it actually handles the changes on the fly # This is more than data collection in most agents, as it actually handles the changes on the fly
collect_data($anvil); collect_data($anvil);
# Look for migration times written out by ocf:alteeve:server.
record_migration_times($anvil);
# Mark that we ran. # Mark that we ran.
$anvil->Database->insert_or_update_updated({updated_by => $THIS_FILE}); $anvil->Database->insert_or_update_updated({updated_by => $THIS_FILE});
@ -94,6 +97,69 @@ $anvil->nice_exit({exit_code => 0});
# Functions # # Functions #
############################################################################################################# #############################################################################################################
# Look for migration times written out by ocf:alteeve:server.
# 
# Scans '/tmp/anvil' for files named 'migration-duration.*'. Each file is expected to hold a single line in
# the format 'server_name=<name>,migration_took=<seconds>'. When the line parses and the server's UUID can be
# found, the duration is stored as the 'server::migration_duration' variable for that server. Every matching
# file is deleted after it has been examined, whether or not it could be parsed.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
# 
# Returns '0' in all cases.
sub record_migration_times
{
	my ($anvil) = @_;
	
	my $directory = "/tmp/anvil";
	if (not -d $directory)
	{
		# Nothing has been written out, so there is nothing to record.
		return(0);
	}
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { directory => $directory }});
	
	# Read the list of files up front and close the handle before we touch anything. This avoids 
	# deleting entries out from under an active readdir() and makes sure the handle is never leaked.
	my $directory_handle;
	if (not opendir($directory_handle, $directory))
	{
		my $error = $!;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { 
			directory => $directory, 
			error     => $error, 
		}});
		return(0);
	}
	# The file name ends in the unix time of the migration, so sorting by name processes multiple files
	# for the same server oldest-first, leaving the most recent duration as the last one written.
	my @files = sort {$a cmp $b} grep { /^migration-duration\./ } readdir($directory_handle);
	closedir($directory_handle);
	
	foreach my $file (@files)
	{
		my $full_path = $directory."/".$file;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			file      => $file,
			full_path => $full_path, 
		}});
		
		my $body = $anvil->Storage->read_file({file => $full_path});
		   $body = "" if not defined $body;
		   $body =~ s/\n//;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { body => $body }});
		if ($body =~ /server_name=(.*?),migration_took=(.*?)$/)
		{
			my $server_name    = $1;
			my $migration_took = $2;
			my $anvil_uuid     = $anvil->Cluster->get_anvil_uuid;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				server_name    => $server_name,
				migration_took => $migration_took, 
				anvil_uuid     => $anvil_uuid, 
			}});
			
			my $server_uuid = $anvil->Get->server_uuid_from_name({
				server_name => $server_name, 
				anvil_uuid  => $anvil_uuid,
			});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_uuid => $server_uuid }});
			
			# Only record the time if we know which server it belongs to and have a usable value.
			if (($server_uuid) && ($migration_took))
			{
				my ($variable_uuid) = $anvil->Database->insert_or_update_variables({
					file                  => $THIS_FILE, 
					line                  => __LINE__, 
					variable_name         => "server::migration_duration", 
					variable_value        => $migration_took, 
					variable_default      => "", 
					variable_description  => "message_0236", 
					variable_section      => "servers", 
					variable_source_uuid  => $server_uuid, 
					variable_source_table => "servers", 
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
			}
		}
		
		# The file is purged regardless of whether it parsed, so a bad file is not re-read forever.
		if (not unlink($full_path))
		{
			my $error = $!;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { 
				full_path => $full_path, 
				error     => $error, 
			}});
		}
	}
	
	return(0);
}
# This reads in all the data we can find about servers running locally. This is more than data collection in # This reads in all the data we can find about servers running locally. This is more than data collection in
# most agents, as it actually handles the changes on the fly. # most agents, as it actually handles the changes on the fly.
sub collect_data sub collect_data

@ -946,6 +946,9 @@ AND
# Make sure /etc/hosts is updated. # Make sure /etc/hosts is updated.
$anvil->System->update_hosts(); $anvil->System->update_hosts();
# This handles weird bits for things like bug work-arounds.
handle_special_cases($anvil);
# Now look for jobs that have a job status of 'scancore_startup' # Now look for jobs that have a job status of 'scancore_startup'
run_jobs($anvil, 1); run_jobs($anvil, 1);
@ -963,6 +966,37 @@ AND
return(0); return(0);
} }
# Applies one-off work-arounds (bug mitigations and similar) that need to be in place on this host.
# 
# Currently this only handles RHBZ #1961562; on any host that is not a Striker, an empty 
# '/etc/qemu/firmware/50-edk2-ovmf-cc.json' file is created if nothing exists at that path.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
# 
# Returns '0' in all cases.
sub handle_special_cases
{
	my ($anvil) = @_;
	
	# RHBZ #1961562 - https://bugzilla.redhat.com/show_bug.cgi?id=1961562#c16
	my $this_host_type = $anvil->Get->host_type();
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $this_host_type }});
	
	# Strikers don't need the work-around file.
	return(0) if $this_host_type eq "striker";
	
	# We're a node or DR host. We need to touch this file, unless it already exists.
	my $placeholder_file = "/etc/qemu/firmware/50-edk2-ovmf-cc.json";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { work_around_file => $placeholder_file }});
	return(0) if -e $placeholder_file;
	
	$anvil->Storage->write_file({
		debug     => 2,
		file      => $placeholder_file, 
		body      => "",
		overwrite => 0,
		backup    => 0,
		mode      => "0644", 
		user      => "root", 
		group     => "root", 
	});
	
	return(0);
}
# Configure the local database, if needed. # Configure the local database, if needed.
sub prep_database sub prep_database
{ {
@ -1274,7 +1308,7 @@ sub run_jobs
backup => 0, backup => 0,
mode => "0644", mode => "0644",
user => "apache", user => "apache",
group => "apache" group => "apache",
}); });
return(0); return(0);

Loading…
Cancel
Save