* Finished updating ocf:alteeve:server to no longer require a database connection. To do this, and still be able to track live migration times, the Server->migrate_virsh() method now writes out the server name and migration time to a /tmp/anvil/migration-duration.<server_name>.<unix_time> file. This file is checked for by the scan-server resource agent and, when found, is parsed and the migration duration is recorded, then the file is purged.

* Updated anvil-daemon to have a new function called "handle_special_cases" called during startup that does any weird bug mitigation required. For now, this is used to mitigate against rhbz#1961562, though certainly it will be used for other reasons later.

Signed-off-by: Digimer <digimer@alteeve.ca>
main
Digimer 4 years ago
parent e15c1651ed
commit 96fffb0b96
  1. 110
      Anvil/Tools/Server.pm
  2. 90
      ocf/alteeve/server
  3. 66
      scancore-agents/scan-server/scan-server
  4. 36
      tools/anvil-daemon

@ -983,8 +983,8 @@ sub migrate_virsh
});
}
### NOTE: This method is called by ocf:alteeve:server, which is allowed to operate without database
### access. As such, queries need to be run only if we've got one or more DB connections.
### NOTE: This method is called by ocf:alteeve:server, which operates without database access. As
### such, queries need to be run only if we've got one or more DB connections.
# Mark this server as being in a migration state.
if ($anvil->data->{sys}{database}{connections})
{
@ -1113,19 +1113,22 @@ WHERE
$success = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { success => $success }});
# Revert the server state and update the server host.
my $server_host_uuid = $anvil->Get->host_uuid_from_name({debug => $debug, host_name => $target});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_host_uuid => $server_host_uuid }});
if (not $server_host_uuid)
# Update the server state, if we have a database connection.
if ($anvil->data->{sys}{database}{connections})
{
# We didn't find the target's host_uuid, so use the old one and let scan-server
# handle it.
$server_host_uuid = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_host_uuid};
# Revert the server state and update the server host.
my $server_host_uuid = $anvil->Get->host_uuid_from_name({debug => $debug, host_name => $target});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_host_uuid => $server_host_uuid }});
}
if (($server_uuid) && ($anvil->data->{sys}{database}{connections}))
{
my $query = "
if (not $server_host_uuid)
{
# We didn't find the target's host_uuid, so use the old one and let scan-server
# handle it.
$server_host_uuid = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_host_uuid};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_host_uuid => $server_host_uuid }});
}
if ($server_uuid)
{
my $query = "
UPDATE
servers
SET
@ -1135,41 +1138,54 @@ SET
WHERE
server_uuid = ".$anvil->Database->quote($server_uuid)."
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
$anvil->Database->insert_or_update_servers({
debug => $debug,
server_uuid => $server_uuid,
server_name => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_name},
server_anvil_uuid => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_anvil_uuid},
server_user_stop => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_user_stop},
server_start_after_server_uuid => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_start_after_server_uuid},
server_start_delay => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_start_delay},
server_host_uuid => $server_host_uuid,
server_state => $old_server_state,
server_live_migration => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_live_migration},
server_pre_migration_file_uuid => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_pre_migration_file_uuid},
server_pre_migration_arguments => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_pre_migration_arguments},
server_post_migration_file_uuid => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_post_migration_file_uuid},
server_post_migration_arguments => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_post_migration_arguments},
server_ram_in_use => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_ram_in_use},
server_configured_ram => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_configured_ram},
server_updated_by_user => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_updated_by_user},
server_boot_time => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_boot_time},
});
# Record the migration time.
my ($variable_uuid) = $anvil->Database->insert_or_update_variables({
file => $THIS_FILE,
line => __LINE__,
variable_name => "server::migration_duration",
variable_value => $migration_took,
variable_default => "",
variable_description => "message_0236",
variable_section => "servers",
variable_source_uuid => $server_uuid,
variable_source_table => "servers",
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
$anvil->Database->insert_or_update_servers({
debug => $debug,
server_uuid => $server_uuid,
server_name => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_name},
server_anvil_uuid => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_anvil_uuid},
server_user_stop => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_user_stop},
server_start_after_server_uuid => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_start_after_server_uuid},
server_start_delay => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_start_delay},
server_host_uuid => $server_host_uuid,
server_state => $old_server_state,
server_live_migration => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_live_migration},
server_pre_migration_file_uuid => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_pre_migration_file_uuid},
server_pre_migration_arguments => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_pre_migration_arguments},
server_post_migration_file_uuid => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_post_migration_file_uuid},
server_post_migration_arguments => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_post_migration_arguments},
server_ram_in_use => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_ram_in_use},
server_configured_ram => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_configured_ram},
server_updated_by_user => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_updated_by_user},
server_boot_time => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_boot_time},
});
# Record the migration time.
my ($variable_uuid) = $anvil->Database->insert_or_update_variables({
file => $THIS_FILE,
line => __LINE__,
variable_name => "server::migration_duration",
variable_value => $migration_took,
variable_default => "",
variable_description => "message_0236",
variable_section => "servers",
variable_source_uuid => $server_uuid,
variable_source_table => "servers",
});
}
}
else
{
# There's no database, so write the migration time to a temp file.
my $body = "server_name=".$server.",migration_took=".$migration_took."\n";
my $file = "/tmp/anvil/migration-duration.".$server.".".time;
my ($failed) = $anvil->Storage->write_file({
file => $file,
body => $body,
mode => "0666",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { failed => $failed }});
}
}

@ -8,6 +8,10 @@
# WARNING: This is a pretty purpose-specific resource agent. No effort was made to test this on an rgmanager
# cluster or on any configuration outside how the Anvil! m3 uses it. If you plan to adapt it to
# another purpose, let us know and we'll try to help.
#
# NOTE: This method, for the sake of speed and reliability, does not connect to the Anvil! database. If you
# do work on this RA, be sure that a check is made for database connections before SQL calls are made
# in module methods.
#
# Based on: https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc
#
@ -169,16 +173,6 @@ $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list
"switches::monitor" => $anvil->data->{switches}{monitor},
}});
# If we can connect to a database, we'll set/clear the 'migrating' flag during migrations. For timing reasons
# we don't let the RA do resyncs.
# $anvil->Database->connect({sensitive => 1});
# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
# if (not $anvil->data->{sys}{database}{connections})
# {
# # No databases,
# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "warning_0073"});
# }
if ($anvil->data->{switches}{stop_drbd_resources})
{
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} = 1;
@ -187,65 +181,19 @@ if ($anvil->data->{switches}{stop_drbd_resources})
# Something for the logs
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0298"});
=cut
Start:
environment::OCF_RESKEY_CRM_meta_name: [start]
environment::OCF_RESKEY_CRM_meta_on_fail: [block]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n01]
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [1]
environment::OCF_RESKEY_CRM_meta_timeout: [300000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu]
Monitor:
environment::OCF_RESKEY_CRM_meta_interval: [60000]
environment::OCF_RESKEY_CRM_meta_name: [monitor]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n01]
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [1]
environment::OCF_RESKEY_CRM_meta_timeout: [20000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu]
Migrate from an-a02n01 to an-a02n02
environment::OCF_RESKEY_CRM_meta_migrate_source: [an-a02n01]
environment::OCF_RESKEY_CRM_meta_migrate_target: [an-a02n02]
environment::OCF_RESKEY_CRM_meta_name: [migrate_to]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n01]
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [1]
environment::OCF_RESKEY_CRM_meta_record_pending: [true]
environment::OCF_RESKEY_CRM_meta_stop_drbd_resources: [0]
environment::OCF_RESKEY_CRM_meta_timeout: [86400000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu]
# Post migration on an-a02n01; stop is called:
Post migration on an-a02n02:
environment::OCF_RESKEY_CRM_meta_migrate_source: [an-a02n01]
environment::OCF_RESKEY_CRM_meta_migrate_target: [an-a02n02]
environment::OCF_RESKEY_CRM_meta_name: [migrate_from]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n02]
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [2]
environment::OCF_RESKEY_CRM_meta_timeout: [600000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu]
=cut Manual calls;
Checking server state after: [srv02-c8s-fujitsu] was migrated to an-a02n02;
# Start a server;
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --start
environment::OCF_RESKEY_CRM_meta_interval: [60000]
environment::OCF_RESKEY_CRM_meta_name: [monitor]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n02]
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [2]
environment::OCF_RESKEY_CRM_meta_timeout: [20000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu]
# Stop a server
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --stop
Stop server (on an-a02n02):
# Monitor a server
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --monitor
environment::OCF_RESKEY_CRM_meta_name: [stop]
environment::OCF_RESKEY_CRM_meta_on_fail: [block]
environment::OCF_RESKEY_CRM_meta_on_node: [an-a02n02]
environment::OCF_RESKEY_CRM_meta_on_node_uuid: [2]
environment::OCF_RESKEY_CRM_meta_timeout: [86400000]
environment::OCF_RESKEY_name: [srv02-c8s-fujitsu]
# Migrate (run on current host)
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server_name> --migrate-to <target_node_name> --migrate-from <current_node_name>
=cut
@ -1273,23 +1221,11 @@ pmsuspended - The domain has been suspended by guest power management, e.g. ente
$anvil->nice_exit({exit_code => 1});
}
### TODO: Write the migration duration to /tmp/anvil.migration.<server>.data and have 'anvil-migrate-server' read that in to update the DB.
# Migrate the server
sub migrate_server
{
my ($anvil) = @_;
### This requires a database
# If we can connect to a database, we'll set/clear the 'migrating' flag during migrations. For timing
# reasons we don't let the RA do resyncs.
# $anvil->Database->connect({sensitive => 1});
# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});
# if (not $anvil->data->{sys}{database}{connections})
# {
# # No databases, exit.
# $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "error_0003"});
# return(1);
# }
### NOTE: For now, we're not going to block if the target is not UpToDate. There are times when a
### user might want to do this (ie: sync will be done soon and the need to evacuate the node

@ -85,6 +85,9 @@ if ($host_type eq "striker")
# This is more than data collection in most agents, as it actually handles the changes on the fly
collect_data($anvil);
# Look for migration times written out by ocf:alteeve:server.
record_migration_times($anvil);
# Mark that we ran.
$anvil->Database->insert_or_update_updated({updated_by => $THIS_FILE});
@ -94,6 +97,69 @@ $anvil->nice_exit({exit_code => 0});
# Functions #
#############################################################################################################
# Look for migration times written out by ocf:alteeve:server.
sub record_migration_times
{
my ($anvil) = @_;
my $directory = "/tmp/anvil";
if (-d $directory)
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { directory => $directory }});
local(*DIRECTORY);
opendir(DIRECTORY, $directory);
while(my $file = readdir(DIRECTORY))
{
next if $file eq ".";
next if $file eq "..";
next if $file !~ /^migration-duration\./;
my $full_path = $directory."/".$file;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
file => $file,
full_path => $full_path,
}});
my $body = $anvil->Storage->read_file({file => $full_path});
$body =~ s/\n//;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { body => $body }});
if ($body =~ /server_name=(.*?),migration_took=(.*?)$/)
{
my $server_name = $1;
my $migration_took = $2;
my $anvil_uuid = $anvil->Cluster->get_anvil_uuid;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server_name => $server_name,
migration_took => $migration_took,
anvil_uuid => $anvil_uuid,
}});
my $server_uuid = $anvil->Get->server_uuid_from_name({
server_name => $server_name,
anvil_uuid => $anvil_uuid,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_uuid => $server_uuid }});
if (($server_uuid) && ($migration_took))
{
my ($variable_uuid) = $anvil->Database->insert_or_update_variables({
file => $THIS_FILE,
line => __LINE__,
variable_name => "server::migration_duration",
variable_value => $migration_took,
variable_default => "",
variable_description => "message_0236",
variable_section => "servers",
variable_source_uuid => $server_uuid,
variable_source_table => "servers",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
}
}
unlink $full_path;
}
}
return(0);
}
# This reads in all the data we can find about servers running locally. This is more than data collection in
# most agents, as it actually handles the changes on the fly.
sub collect_data

@ -946,6 +946,9 @@ AND
# Make sure /etc/hosts is updated.
$anvil->System->update_hosts();
# This handles weird bits for things like bug work-arounds.
handle_special_cases($anvil);
# Now look for jobs that have a job status of 'scancore_startup'
run_jobs($anvil, 1);
@ -963,6 +966,37 @@ AND
return(0);
}
# This handles weird bits for things like bug work-arounds.
sub handle_special_cases
{
my ($anvil) = @_;
# RHBZ #1961562 - https://bugzilla.redhat.com/show_bug.cgi?id=1961562#c16
my $host_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }});
if ($host_type ne "striker")
{
# We're a node or DR host. We need to touch this file.
my $work_around_file = "/etc/qemu/firmware/50-edk2-ovmf-cc.json";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { work_around_file => $work_around_file }});
if (not -e $work_around_file)
{
$anvil->Storage->write_file({
debug => 2,
file => $work_around_file,
body => "",
overwrite => 0,
backup => 0,
mode => "0644",
user => "root",
group => "root",
});
}
}
return(0);
}
# Configure the local database, if needed.
sub prep_database
{
@ -1274,7 +1308,7 @@ sub run_jobs
backup => 0,
mode => "0644",
user => "apache",
group => "apache"
group => "apache",
});
return(0);

Loading…
Cancel
Save