anvil/scancore-agents/scan-cluster/scan-cluster

817 lines
38 KiB
Plaintext
Raw Normal View History

#!/usr/bin/perl
#
# This scans the cluster by processing the CIB and storing information about nodes, cluster states, etc.
#
# NOTE: The data stored here is not bound to a given host. As such, node 1 will run this agent and node 2
# won't, _except_ when node 1 is offline and only node 2 is up.
#
# Examples;
#
# Exit codes;
# 0 = Normal exit.
# 1 = Startup failure (not running as root, no DB, bad file read, etc)
# 2 = Not a cluster member
#
# TODO:
# - When a node is lost, update the location constraints to keep the servers on the surviving node when the
# peer returns.
#
use strict;
use warnings;
use Anvil::Tools;
use Data::Dumper;
use Text::Diff;
# Disable buffering
$| = 1;
# Prevent a discrepency between UID/GID and EUID/EGID from throwing an error.
$< = $>;
$( = $);
my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
$running_directory =~ s/^\./$ENV{PWD}/;
}
my $anvil = Anvil::Tools->new();
# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0))
{
# Not root
print $anvil->Words->string({key => "error_0005"})."\n";
$anvil->nice_exit({exit_code => 1});
}
$anvil->data->{scancore}{'scan-cluster'}{disable} = 0;
$anvil->data->{switches}{force} = 0;
$anvil->Storage->read_config();
# Read switches
$anvil->Get->switches;
# Handle start-up tasks
my $problem = $anvil->ScanCore->agent_startup({agent => $THIS_FILE});
if ($problem)
{
$anvil->nice_exit({exit_code => 1});
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});
if ($anvil->data->{switches}{purge})
{
# This can be called when doing bulk-database purges.
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
my $schema_file = $anvil->data->{path}{directories}{scan_agents}."/".$THIS_FILE."/".$THIS_FILE.".sql";
$anvil->Database->purge_data({
debug => 2,
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
tables => $anvil->Database->get_tables_from_schema({schema_file => $schema_file}),
});
$anvil->nice_exit({exit_code => 0});
}
# Before we do anything, are we a node in a pacemaker cluster?
my $host_type = $anvil->Get->host_type;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_type => $host_type }});
if ($host_type ne "node")
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_cluster_log_0002", variables => { host_type => $host_type }});
$anvil->nice_exit({exit_code => 0});
}
# Read last scan
read_last_scan($anvil);
# Read and process in one shot.
collect_data($anvil);
# Find changes.
find_changes($anvil);
The core logic is done!!!! Still need to finish end-points for the WebUI to hook into, but the core of M3 is complete! Many, many bugs are expected, of course. :) * Created DRBD->check_if_syncsource() and ->check_if_synctarget() that return '1' if the target host is currently SyncSource or SyncTarget for any resource, respectively. * Updated DRBD->update_global_common() to return the unified-format diff if any changes were made to global-common.conf. * Created ScanCore->check_health() that returns the health score for a host. Created ->count_servers() that returns the number of servers on a host, how much RAM is used by those servers and, if available, the estimated migration time of the servers. Updated ->check_temperature() to set/clear/return the time that a host has been in a warning or critical temperature state. * Finished ScanCore->post_scan_analysis_node()!!! It certainly has bugs, and much testing is needed, but the logic is all in place! Oh what a slog that was... It should be far more intelligent than M2 though, once flushed out and tested. * Created Server->active_migrations() that returns '1' if any servers are in a migration on an Anvil! system. Updated ->migrate_virsh() to record how long a migration took in the "server::migration_duration" variable, which is averaged by ScanCore->count_servers() to estimate migration times. * Updated scan-drbd to check/update the global-common.conf file's config at the end of a scan. * Updated ScanCore itself to not scan when in maintenance mode. Also updated it to call 'anvil-safe-start' when ScanCore starts, so long as it is within ten minutes of the host booting. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-05-01 02:58:01 +00:00
# Check the cluster config.
check_config($anvil);
# Check the fence delay
check_fence_delay($anvil);
# Shut down.
$anvil->ScanCore->agent_shutdown({agent => $THIS_FILE});
#############################################################################################################
# Functions #
#############################################################################################################
# Check to see if we need to move the fence delay.
sub check_fence_delay
{
my ($anvil) = @_;
my $preferred_node = $anvil->Cluster->manage_fence_delay();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_node => $preferred_node }});
if ($preferred_node ne "!!error!!")
{
### NOTE: We don't make the peer be the preferred node, a node can only make itself the preferred
### node.
# How many servers are running on each node.
$anvil->Database->get_anvils();
$anvil->Database->get_servers();
$anvil->Cluster->get_peers();
my $anvil_uuid = $anvil->Cluster->get_anvil_uuid();
my $local_node_is = $anvil->data->{sys}{anvil}{i_am};
my $local_node_name = $anvil->data->{cib}{parsed}{'local'}{name};
my $local_host_name = $anvil->data->{sys}{anvil}{$local_node_is}{host_name};
my $local_host_uuid = $anvil->data->{sys}{anvil}{$local_node_is}{host_uuid};
my $peer_node_is = $anvil->data->{sys}{anvil}{peer_is};
my $peer_node_name = $anvil->data->{cib}{parsed}{peer}{name};;
my $peer_host_name = $anvil->data->{sys}{anvil}{$peer_node_is}{host_name};
my $peer_host_uuid = $anvil->data->{sys}{anvil}{$peer_node_is}{host_uuid};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
anvil_uuid => $anvil_uuid,
local_node_is => $local_node_is,
local_node_name => $local_node_name,
local_host_name => $local_host_name,
local_host_uuid => $local_host_uuid,
peer_node_is => $peer_node_is,
peer_node_name => $peer_node_name,
peer_host_name => $peer_host_name,
peer_host_uuid => $peer_host_uuid,
}});
# Get the short host names, as that's usually what the node name is.
my $local_short_host_name = $local_host_name;
$local_short_host_name =~ s/\..$//;
my $peer_short_host_name = $peer_host_name;
$peer_short_host_name =~ s/\..$//;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
local_short_host_name => $local_short_host_name,
peer_short_host_name => $peer_short_host_name,
}});
# If my peer isn't in the cluster, make sure I am the fence delay host.
if (not $anvil->data->{cib}{parsed}{peer}{ready})
{
# My peer is not ready, make sure I'm the preferred host.
if (($preferred_node eq $local_node_name) or ($preferred_node eq $local_host_name) && ($preferred_node eq $local_short_host_name))
{
# We're good.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0633"});
}
else
{
# We're not, set the delay to us.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0634"});
my $preferred_node = $anvil->Cluster->manage_fence_delay({prefer => $local_node_name});
return(0);
}
}
# How many servers are on each node?
my $local_server_count = 0;
my $peer_server_count = 0;
foreach my $server_uuid (keys %{$anvil->data->{servers}{server_uuid}})
{
next if $anvil_uuid ne $anvil->data->{servers}{server_uuid}{$server_uuid}{server_anvil_uuid};
my $server_name = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_name};
my $server_state = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state};
my $server_host_uuid = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_host_uuid};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server_uuid => $server_uuid,
server_name => $server_name,
server_state => $server_state,
server_host_uuid => $server_host_uuid,
}});
next if $server_state eq "shut off";
if ($server_state eq "migrating")
{
# Don't do anything.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0635", variables => { server_name => $server_name }});
return(0);
}
if ($server_host_uuid eq $local_host_uuid)
{
$local_server_count++;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_server_count => $local_server_count }});
}
elsif ($server_host_uuid eq $peer_host_uuid)
{
$peer_server_count++;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_server_count => $peer_server_count }});
}
}
# Don't do anything if there are no servers running anywhere, or if both servers have at least one
# server.
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
local_server_count => $local_server_count,
peer_server_count => $peer_server_count,
}});
if ((not $local_server_count) && (not $peer_server_count))
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0636"});
return(0);
}
elsif (($local_server_count) && ($peer_server_count))
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0637", variables => {
local_server_count => $local_server_count,
peer_server_count => $peer_server_count,
}});
return(0);
}
elsif (($local_server_count) && ($preferred_node ne $local_node_name))
{
# Make us the preferred host.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0638"});
my $preferred_node = $anvil->Cluster->manage_fence_delay({
debug => 2,
prefer => $local_node_name,
});
return(0);
}
}
return(0);
}
The core logic is done!!!! Still need to finish end-points for the WebUI to hook into, but the core of M3 is complete! Many, many bugs are expected, of course. :) * Created DRBD->check_if_syncsource() and ->check_if_synctarget() that return '1' if the target host is currently SyncSource or SyncTarget for any resource, respectively. * Updated DRBD->update_global_common() to return the unified-format diff if any changes were made to global-common.conf. * Created ScanCore->check_health() that returns the health score for a host. Created ->count_servers() that returns the number of servers on a host, how much RAM is used by those servers and, if available, the estimated migration time of the servers. Updated ->check_temperature() to set/clear/return the time that a host has been in a warning or critical temperature state. * Finished ScanCore->post_scan_analysis_node()!!! It certainly has bugs, and much testing is needed, but the logic is all in place! Oh what a slog that was... It should be far more intelligent than M2 though, once flushed out and tested. * Created Server->active_migrations() that returns '1' if any servers are in a migration on an Anvil! system. Updated ->migrate_virsh() to record how long a migration took in the "server::migration_duration" variable, which is averaged by ScanCore->count_servers() to estimate migration times. * Updated scan-drbd to check/update the global-common.conf file's config at the end of a scan. * Updated ScanCore itself to not scan when in maintenance mode. Also updated it to call 'anvil-safe-start' when ScanCore starts, so long as it is within ten minutes of the host booting. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-05-01 02:58:01 +00:00
sub check_config
{
my ($anvil) = @_;
$anvil->Database->get_manifests();
my $anvil_name = $anvil->Cluster->get_anvil_name({});
my $manifest_uuid = $anvil->data->{manifests}{manifest_name}{$anvil_name}{manifest_uuid};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
anvil_name => $anvil_name,
manifest_uuid => $manifest_uuid,
}});
if ($manifest_uuid)
{
# Check to see if the stonith config needs to be updated.
$anvil->Cluster->check_stonith_config({debug => 2});
The core logic is done!!!! Still need to finish end-points for the WebUI to hook into, but the core of M3 is complete! Many, many bugs are expected, of course. :) * Created DRBD->check_if_syncsource() and ->check_if_synctarget() that return '1' if the target host is currently SyncSource or SyncTarget for any resource, respectively. * Updated DRBD->update_global_common() to return the unified-format diff if any changes were made to global-common.conf. * Created ScanCore->check_health() that returns the health score for a host. Created ->count_servers() that returns the number of servers on a host, how much RAM is used by those servers and, if available, the estimated migration time of the servers. Updated ->check_temperature() to set/clear/return the time that a host has been in a warning or critical temperature state. * Finished ScanCore->post_scan_analysis_node()!!! It certainly has bugs, and much testing is needed, but the logic is all in place! Oh what a slog that was... It should be far more intelligent than M2 though, once flushed out and tested. * Created Server->active_migrations() that returns '1' if any servers are in a migration on an Anvil! system. Updated ->migrate_virsh() to record how long a migration took in the "server::migration_duration" variable, which is averaged by ScanCore->count_servers() to estimate migration times. * Updated scan-drbd to check/update the global-common.conf file's config at the end of a scan. * Updated ScanCore itself to not scan when in maintenance mode. Also updated it to call 'anvil-safe-start' when ScanCore starts, so long as it is within ten minutes of the host booting. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-05-01 02:58:01 +00:00
}
return(0);
}
# Looks for changes.
sub find_changes
{
my ($anvil) = @_;
# We can't track a cluster through name change, so either we're INSERTing a new one, or bust.
my $scan_cluster_anvil_uuid = $anvil->Cluster->get_anvil_uuid();
my $anvil_name = $anvil->Get->anvil_name_from_uuid({anvil_uuid => $scan_cluster_anvil_uuid});
my $scan_cluster_uuid = "";
my $cluster_name = $anvil->data->{cib}{parsed}{data}{cluster}{name};
my $stonith_enabled = $anvil->data->{cib}{parsed}{data}{stonith}{enabled};
my $stonith_max_attempts = $anvil->data->{cib}{parsed}{data}{stonith}{'max-attempts'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
cluster_name => $cluster_name,
stonith_enabled => $stonith_enabled,
stonith_max_attempts => $stonith_max_attempts,
}});
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
# If we're a full cluster member, read the CIB as well.
my $cluster_cib = "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "cib::parsed::local::ready" => $anvil->data->{cib}{parsed}{'local'}{ready} }});
if ($anvil->data->{cib}{parsed}{'local'}{ready})
{
($cluster_cib, my $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{pcs}." cluster cib", source => $THIS_FILE, line => __LINE__});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
cluster_cib => $cluster_cib,
return_code => $return_code,
}});
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
}
if (exists $anvil->data->{sql}{anvil_uuid}{$scan_cluster_anvil_uuid})
{
# Check for a name change
$scan_cluster_uuid = $anvil->data->{sql}{anvil_uuid}{$scan_cluster_anvil_uuid};
my $old_cluster_name = $anvil->data->{sql}{scan_cluster}{scan_cluster_uuid}{$scan_cluster_uuid}{scan_cluster_name};
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
my $old_cluster_cib = $anvil->data->{sql}{scan_cluster}{scan_cluster_uuid}{$scan_cluster_uuid}{scan_cluster_cib};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
scan_cluster_uuid => $scan_cluster_uuid,
old_cluster_name => $old_cluster_name,
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
old_cluster_cib => $old_cluster_cib,
}});
if ($cluster_name ne $old_cluster_name)
{
# The name of the cluster has changed.
my $query = "
UPDATE
scan_cluster
SET
scan_cluster_name = ".$anvil->Database->quote($cluster_name).",
modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)."
WHERE
scan_cluster_uuid = ".$anvil->Database->quote($scan_cluster_uuid)."
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
my $variables = {
new_cluster_name => $cluster_name,
old_cluster_name => $old_cluster_name,
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0002", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0002", variables => $variables, set_by => $THIS_FILE});
}
if (($cluster_cib) && ($cluster_cib ne $old_cluster_cib))
{
my $query = "
UPDATE
scan_cluster
SET
scan_cluster_cib = ".$anvil->Database->quote($cluster_cib).",
modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)."
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
WHERE
scan_cluster_uuid = ".$anvil->Database->quote($scan_cluster_uuid)."
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
my $difference = diff \$old_cluster_cib, \$cluster_cib, { STYLE => 'Unified' };
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { difference => $difference }});
my $variables = {
cluster_name => $cluster_name,
difference => $difference,
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0012", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0012", variables => $variables, set_by => $THIS_FILE});
}
}
else
{
# New cluster, INSERT
$scan_cluster_uuid = $anvil->Get->uuid();
my $query = "
INSERT INTO
scan_cluster
(
scan_cluster_uuid,
scan_cluster_anvil_uuid,
scan_cluster_name,
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
scan_cluster_cib,
modified_date
) VALUES (
".$anvil->Database->quote($scan_cluster_uuid).",
".$anvil->Database->quote($scan_cluster_anvil_uuid).",
".$anvil->Database->quote($cluster_name).",
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
".$anvil->Database->quote($cluster_cib).",
".$anvil->Database->quote($anvil->Database->refresh_timestamp)."
);";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
my $variables = { cluster_name => $cluster_name };
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0001", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0001", variables => $variables, set_by => $THIS_FILE});
}
$anvil->Database->get_anvils();
foreach my $scan_cluster_node_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}})
{
my $scan_cluster_node_host_uuid = $anvil->Get->host_uuid_from_name({host_name => $scan_cluster_node_name});
my $scan_cluster_node_pacemaker_id = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{pacemaker_id};
my $scan_cluster_node_in_ccm = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{in_ccm};
my $scan_cluster_node_crmd_member = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{crmd};
my $scan_cluster_node_cluster_member = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{'join'};
my $scan_cluster_node_maintenance_mode = $anvil->data->{cib}{parsed}{data}{node}{$scan_cluster_node_name}{node_state}{'maintenance-mode'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
scan_cluster_node_name => $scan_cluster_node_name,
scan_cluster_node_host_uuid => $scan_cluster_node_host_uuid,
scan_cluster_node_pacemaker_id => $scan_cluster_node_pacemaker_id,
scan_cluster_node_in_ccm => $scan_cluster_node_in_ccm,
scan_cluster_node_crmd_member => $scan_cluster_node_crmd_member,
scan_cluster_node_cluster_member => $scan_cluster_node_cluster_member,
scan_cluster_node_maintenance_mode => $scan_cluster_node_maintenance_mode,
}});
if (exists $anvil->data->{sql}{scan_cluster_node_host_uuid}{$scan_cluster_node_host_uuid})
{
# Look for changes.
my $scan_cluster_node_uuid = $anvil->data->{sql}{scan_cluster_node_host_uuid}{$scan_cluster_node_host_uuid};
my $old_scan_cluster_node_name = $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_name};
my $old_scan_cluster_node_pacemaker_id = $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_pacemaker_id};
my $old_scan_cluster_node_in_ccm = $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_in_ccm};
my $old_scan_cluster_node_crmd_member = $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_crmd_member};
my $old_scan_cluster_node_cluster_member = $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_cluster_member};
my $old_scan_cluster_node_maintenance_mode = $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_maintenance_mode};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
scan_cluster_node_uuid => $scan_cluster_node_uuid,
old_scan_cluster_node_name => $old_scan_cluster_node_name,
old_scan_cluster_node_pacemaker_id => $old_scan_cluster_node_pacemaker_id,
old_scan_cluster_node_in_ccm => $old_scan_cluster_node_in_ccm,
old_scan_cluster_node_crmd_member => $old_scan_cluster_node_crmd_member,
old_scan_cluster_node_cluster_member => $old_scan_cluster_node_cluster_member,
old_scan_cluster_node_maintenance_mode => $old_scan_cluster_node_maintenance_mode,
}});
my $update = 0;
if ($scan_cluster_node_name ne $old_scan_cluster_node_name)
{
$update = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update => $update }});
my $variables = {
new_node_name => $scan_cluster_node_name,
old_node_name => $old_scan_cluster_node_name,
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0008", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0009", variables => $variables, set_by => $THIS_FILE});
}
if ($scan_cluster_node_pacemaker_id ne $old_scan_cluster_node_pacemaker_id)
{
$update = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update => $update }});
my $variables = {
node_name => $scan_cluster_node_name,
new_pacemaker_id => $scan_cluster_node_pacemaker_id ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
old_pacemaker_id => $old_scan_cluster_node_pacemaker_id ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0004", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0004", variables => $variables, set_by => $THIS_FILE});
}
if ($scan_cluster_node_in_ccm ne $old_scan_cluster_node_in_ccm)
{
$update = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update => $update }});
my $variables = {
node_name => $scan_cluster_node_name,
new_in_ccm => $scan_cluster_node_in_ccm ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
old_in_ccm => $old_scan_cluster_node_in_ccm ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0005", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0005", variables => $variables, set_by => $THIS_FILE});
}
if ($scan_cluster_node_crmd_member ne $old_scan_cluster_node_crmd_member)
{
$update = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update => $update }});
my $variables = {
node_name => $scan_cluster_node_name,
new_crmd_member => $scan_cluster_node_crmd_member ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
old_crmd_member => $old_scan_cluster_node_crmd_member ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0006", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0006", variables => $variables, set_by => $THIS_FILE});
}
if ($scan_cluster_node_cluster_member ne $old_scan_cluster_node_cluster_member)
{
$update = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update => $update }});
my $variables = {
node_name => $scan_cluster_node_name,
new_cluster_member => $scan_cluster_node_cluster_member ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
old_cluster_member => $old_scan_cluster_node_cluster_member ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0007", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0007", variables => $variables, set_by => $THIS_FILE});
}
if ($scan_cluster_node_maintenance_mode ne $old_scan_cluster_node_maintenance_mode)
{
$update = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update => $update }});
my $variables = {
node_name => $scan_cluster_node_name,
new_maintenance_mode => $scan_cluster_node_maintenance_mode ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
old_maintenance_mode => $old_scan_cluster_node_maintenance_mode ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0008", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0008", variables => $variables, set_by => $THIS_FILE});
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update => $update }});
if ($update)
{
my $query = "
UPDATE
scan_cluster_nodes
SET
scan_cluster_node_name = ".$anvil->Database->quote($scan_cluster_node_name).",
scan_cluster_node_pacemaker_id = ".$anvil->Database->quote($scan_cluster_node_pacemaker_id).",
scan_cluster_node_in_ccm = ".$anvil->Database->quote($scan_cluster_node_in_ccm).",
scan_cluster_node_crmd_member = ".$anvil->Database->quote($scan_cluster_node_crmd_member).",
scan_cluster_node_cluster_member = ".$anvil->Database->quote($scan_cluster_node_cluster_member).",
scan_cluster_node_maintenance_mode = ".$anvil->Database->quote($scan_cluster_node_maintenance_mode).",
modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)."
WHERE
scan_cluster_node_uuid = ".$anvil->Database->quote($scan_cluster_node_uuid)."
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
}
}
else
{
# Add the node.
my $scan_cluster_node_uuid = $anvil->Get->uuid();
my $query = "
INSERT INTO
scan_cluster_nodes
(
scan_cluster_node_uuid,
scan_cluster_node_scan_cluster_uuid,
scan_cluster_node_host_uuid,
scan_cluster_node_name,
scan_cluster_node_pacemaker_id,
scan_cluster_node_in_ccm,
scan_cluster_node_crmd_member,
scan_cluster_node_cluster_member,
scan_cluster_node_maintenance_mode,
modified_date
) VALUES (
".$anvil->Database->quote($scan_cluster_node_uuid).",
".$anvil->Database->quote($scan_cluster_uuid).",
".$anvil->Database->quote($scan_cluster_node_host_uuid).",
".$anvil->Database->quote($scan_cluster_node_name).",
".$anvil->Database->quote($scan_cluster_node_pacemaker_id).",
".$anvil->Database->quote($scan_cluster_node_in_ccm).",
".$anvil->Database->quote($scan_cluster_node_crmd_member).",
".$anvil->Database->quote($scan_cluster_node_cluster_member).",
".$anvil->Database->quote($scan_cluster_node_maintenance_mode).",
".$anvil->Database->quote($anvil->Database->refresh_timestamp)."
);";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
my $host_name = $anvil->Get->host_name_from_uuid({host_uuid => $scan_cluster_node_host_uuid});
my $variables = {
cluster_name => $cluster_name,
node_name => $scan_cluster_node_name,
host_uuid => $scan_cluster_node_host_uuid,
host_name => $host_name,
pacemaker_id => $scan_cluster_node_pacemaker_id,
in_ccm => $scan_cluster_node_in_ccm ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#", # Yes or No
crmd_member => $scan_cluster_node_crmd_member ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
cluster_member => $scan_cluster_node_cluster_member ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
maintenance_mode => $scan_cluster_node_maintenance_mode ? "#!string!scan_cluster_unit_0001!#" : "#!string!scan_cluster_unit_0002!#",
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0003", variables => $variables});
$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0003", variables => $variables, set_by => $THIS_FILE});
}
}
### TODO: Check for / repair bad cluster config issues
# If we're still alive, we're either node 1, or we're node 2 and node 1 is not ready. If we're not ready,
if ($stonith_max_attempts ne "INFINITY")
{
### TODO: Call pcs to update
}
return(0);
}
# Read in existing data from the database.
sub read_last_scan
{
my ($anvil) = @_;
my $query = "
SELECT
scan_cluster_uuid,
scan_cluster_anvil_uuid,
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
scan_cluster_name,
scan_cluster_cib
FROM
scan_cluster
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
my $count = @{$results};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
results => $results,
count => $count,
}});
foreach my $row (@{$results})
{
# NOTE: There's no known way to track a cluster name change, so we can't really avoid having
# an entery per cluster name.
my $scan_cluster_uuid = $row->[0];
my $scan_cluster_anvil_uuid = $row->[1];
my $scan_cluster_name = $row->[2];
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
my $scan_cluster_cib = $row->[3];
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
"s1:scan_cluster_uuid" => $scan_cluster_uuid,
"s2:scan_cluster_anvil_uuid" => $scan_cluster_anvil_uuid,
"s3:scan_cluster_name" => $scan_cluster_name,
"s4:scan_cluster_cib" => $scan_cluster_cib,
}});
# Store the old data now.
$anvil->data->{sql}{scan_cluster}{scan_cluster_uuid}{$scan_cluster_uuid}{scan_cluster_name} = $scan_cluster_name;
$anvil->data->{sql}{scan_cluster}{scan_cluster_uuid}{$scan_cluster_uuid}{scan_cluster_anvil_uuid} = $scan_cluster_anvil_uuid;
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
$anvil->data->{sql}{scan_cluster}{scan_cluster_uuid}{$scan_cluster_uuid}{scan_cluster_cib} = $scan_cluster_cib;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"sql::scan_cluster::scan_cluster_uuid::${scan_cluster_uuid}::scan_cluster_name" => $anvil->data->{sql}{scan_cluster}{scan_cluster_uuid}{$scan_cluster_uuid}{scan_cluster_name},
"sql::scan_cluster::scan_cluster_uuid::${scan_cluster_uuid}::scan_cluster_anvil_uuid" => $anvil->data->{sql}{scan_cluster}{scan_cluster_uuid}{$scan_cluster_uuid}{scan_cluster_anvil_uuid},
* THe get_cpu endpoint was completed. * The get_mmeory endpoint was completed. * The get_replicated_storage endpoint was completed, though it requires testing and likely has issues. To prepare for the get_status endpoint work, I needed to update ScanCore and modules to track the host_status. This commit contains the work needed for this. * Updated ScanCore->post_scan_analysis_striker() to use configured fence devices (except PDUs) to check if a target host is off or on, in there is no host_ipmi interface. In all cases, if a machine can be confirmed on or off, the host_status is now updated. * To support the above fence based power checks, updated scan-cluster to store the on-disk CIB in the new scan_cluster -> scan_cluster_cib colume. * Updated ScanCore->parse_cib() to map stonith primitive IDs to fence agents. Updated ->parse_crm_mon() to not call if the executable doesn't exist to avoid unhelpful error messages in the logs when called from a Striker. * Update DRBD->gather_data() to get the size data from /sys/block/drbd<minor>/size' x '/sys/block/drbd<minor>/queue/logical_block_size so it works when a device is Secondary (and can't be promoted). * Updated Database->get_hosts_info() to record the short host name as well as the stored host name. Created ->update_host_status() as a wrapper to ->insert_or_update_hosts() that only updates the host status. * Updated anvil-join-anvil to disabled ksm and ksmtuned daemons. * Updated scancore and anvil-daemon to set the host_status to 'online' on startup. Signed-off-by: Digimer <digimer@alteeve.ca>
2021-04-10 00:51:29 +00:00
"sql::scan_cluster::scan_cluster_uuid::${scan_cluster_uuid}::scan_cluster_cib" => $anvil->data->{sql}{scan_cluster}{scan_cluster_uuid}{$scan_cluster_uuid}{scan_cluster_cib},
}});
# Make it easy to look up the cluster_uuid from the anvil_uuid.
$anvil->data->{sql}{anvil_uuid}{$scan_cluster_anvil_uuid} = $scan_cluster_uuid;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"sql::anvil_uuid::${scan_cluster_anvil_uuid}" => $anvil->data->{sql}{anvil_uuid}{$scan_cluster_anvil_uuid},
}});
}
undef $count;
undef $results;
$query = "
SELECT
scan_cluster_node_uuid,
scan_cluster_node_scan_cluster_uuid,
scan_cluster_node_host_uuid,
scan_cluster_node_name,
scan_cluster_node_pacemaker_id,
scan_cluster_node_in_ccm,
scan_cluster_node_crmd_member,
scan_cluster_node_cluster_member,
scan_cluster_node_maintenance_mode
FROM
scan_cluster_nodes
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
$results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
$count = @{$results};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
results => $results,
count => $count,
}});
foreach my $row (@{$results})
{
# We've got an entry in the 'scan_cluster_nodes' table, so now we'll look for data in the node and
# services tables.
my $scan_cluster_node_uuid = $row->[0];
my $scan_cluster_node_scan_cluster_uuid = $row->[1];
my $scan_cluster_node_host_uuid = $row->[2];
my $scan_cluster_node_name = $row->[3];
my $scan_cluster_node_pacemaker_id = $row->[4];
my $scan_cluster_node_in_ccm = $row->[5];
my $scan_cluster_node_crmd_member = $row->[6];
my $scan_cluster_node_cluster_member = $row->[7];
my $scan_cluster_node_maintenance_mode = $row->[8];
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
scan_cluster_node_uuid => $scan_cluster_node_uuid,
scan_cluster_node_scan_cluster_uuid => $scan_cluster_node_scan_cluster_uuid,
scan_cluster_node_host_uuid => $scan_cluster_node_host_uuid,
scan_cluster_node_name => $scan_cluster_node_name,
scan_cluster_node_pacemaker_id => $scan_cluster_node_pacemaker_id,
scan_cluster_node_in_ccm => $scan_cluster_node_in_ccm,
scan_cluster_node_crmd_member => $scan_cluster_node_crmd_member,
scan_cluster_node_cluster_member => $scan_cluster_node_cluster_member,
scan_cluster_node_maintenance_mode => $scan_cluster_node_maintenance_mode,
}});
# Store the old data now.
$anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid} = {
scan_cluster_node_scan_cluster_uuid => $scan_cluster_node_scan_cluster_uuid,
scan_cluster_node_host_uuid => $scan_cluster_node_host_uuid,
scan_cluster_node_name => $scan_cluster_node_name,
scan_cluster_node_pacemaker_id => $scan_cluster_node_pacemaker_id,
scan_cluster_node_in_ccm => $scan_cluster_node_in_ccm,
scan_cluster_node_crmd_member => $scan_cluster_node_crmd_member,
scan_cluster_node_cluster_member => $scan_cluster_node_cluster_member,
scan_cluster_node_maintenance_mode => $scan_cluster_node_maintenance_mode,
};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"sql::scan_cluster_node::scan_cluster_node_uuid::${scan_cluster_node_uuid}::scan_cluster_node_scan_cluster_uuid" => $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_scan_cluster_uuid},
"sql::scan_cluster_node::scan_cluster_node_uuid::${scan_cluster_node_uuid}::scan_cluster_node_host_uuid" => $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_host_uuid},
"sql::scan_cluster_node::scan_cluster_node_uuid::${scan_cluster_node_uuid}::scan_cluster_node_name" => $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_name},
"sql::scan_cluster_node::scan_cluster_node_uuid::${scan_cluster_node_uuid}::scan_cluster_node_pacemaker_id" => $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_pacemaker_id},
"sql::scan_cluster_node::scan_cluster_node_uuid::${scan_cluster_node_uuid}::scan_cluster_node_in_ccm" => $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_in_ccm},
"sql::scan_cluster_node::scan_cluster_node_uuid::${scan_cluster_node_uuid}::scan_cluster_node_crmd_member" => $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_crmd_member},
"sql::scan_cluster_node::scan_cluster_node_uuid::${scan_cluster_node_uuid}::scan_cluster_node_cluster_member" => $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_cluster_member},
"sql::scan_cluster_node::scan_cluster_node_uuid::${scan_cluster_node_uuid}::scan_cluster_node_maintenance_mode" => $anvil->data->{sql}{scan_cluster_node}{scan_cluster_node_uuid}{$scan_cluster_node_uuid}{scan_cluster_node_maintenance_mode},
}});
$anvil->data->{sql}{scan_cluster_node_host_uuid}{$scan_cluster_node_host_uuid} = $scan_cluster_node_uuid;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"sql::scan_cluster_node_host_uuid::${scan_cluster_node_host_uuid}" => $anvil->data->{sql}{scan_cluster_node_host_uuid}{$scan_cluster_node_host_uuid},
}});
}
return(0);
}
# This reads in all the data we can find on the local system
sub collect_data
{
my ($anvil) = @_;
# Pick out core cluster details.
my $problem = $anvil->Cluster->parse_cib({debug => 3});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
# If there was a problem, we're not in the cluster.
if ($problem)
{
my $changed = $anvil->Alert->check_alert_sent({
record_locator => "scan_cluster::in_cluster",
set_by => $THIS_FILE,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { changed => $changed }});
if ($changed)
{
# Register an alert.
my $variables = { host_name => $anvil->Get->host_name() };
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0010", variables => $variables});
$anvil->Alert->register({alert_level => "warning", message => "scan_cluster_alert_0010", variables => $variables, set_by => $THIS_FILE});
# See if I need to mark us as out of the cluster. Normally, our peer would do this,
# but if we went down at the same time as our peer, both of us might not update the
# membership values.
my $query = "
SELECT
scan_cluster_node_uuid,
scan_cluster_node_in_ccm,
scan_cluster_node_crmd_member,
scan_cluster_node_cluster_member
FROM
scan_cluster_nodes
WHERE
scan_cluster_node_host_uuid = ".$anvil->Database->quote($anvil->Get->host_uuid)."
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
my $results = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__});
my $count = @{$results};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
results => $results,
count => $count,
}});
foreach my $row (@{$results})
{
# We've got an entry in the 'scan_cluster_nodes' table, so now we'll look for data in the node and
# services tables.
my $scan_cluster_node_uuid = $row->[0];
my $scan_cluster_node_in_ccm = $row->[1];
my $scan_cluster_node_crmd_member = $row->[2];
my $scan_cluster_node_cluster_member = $row->[3];
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
scan_cluster_node_uuid => $scan_cluster_node_uuid,
scan_cluster_node_in_ccm => $scan_cluster_node_in_ccm,
scan_cluster_node_crmd_member => $scan_cluster_node_crmd_member,
scan_cluster_node_cluster_member => $scan_cluster_node_cluster_member,
}});
if (($scan_cluster_node_in_ccm) or ($scan_cluster_node_crmd_member) or ($scan_cluster_node_cluster_member))
{
# Update
my $query = "
UPDATE
scan_cluster_nodes
SET
scan_cluster_node_in_ccm = '0',
scan_cluster_node_crmd_member = '0',
scan_cluster_node_cluster_member = '0',
modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)."
WHERE
scan_cluster_node_uuid = ".$anvil->Database->quote($scan_cluster_node_uuid)."
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
}
}
}
# Exit now.
$anvil->nice_exit({exit_code => 2});
}
else
{
# See if we came back into the cluster
my $changed = $anvil->Alert->check_alert_sent({
clear => 1,
record_locator => "scan_cluster::in_cluster",
set_by => $THIS_FILE,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { changed => $changed }});
if ($changed)
{
# Register an alert.
my $variables = { host_name => $anvil->Get->host_name() };
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "scan_cluster_alert_0011", variables => $variables});
$anvil->Alert->register({alert_level => "warning", clear_alert => 1, message => "scan_cluster_alert_0011", variables => $variables, set_by => $THIS_FILE});
}
}
return(0);
}