The core logic is done!!!! Still need to finish the endpoints for the WebUI to hook into, but the core of M3 is complete! Many, many bugs are expected, of course. :)

* Created DRBD->check_if_syncsource() and ->check_if_synctarget() that return '1' if the local host is currently SyncSource or SyncTarget, respectively, for any resource (see the usage sketch after this list).
* Updated DRBD->update_global_common() to return the unified-format diff if any changes were made to global-common.conf.
* Created ScanCore->check_health() that returns the health score for a host. Created ->count_servers() that returns the number of servers on a host, how much RAM is used by those servers and, if available, the estimated migration time of the servers. Updated ->check_temperature() to set/clear/return the time that a host has been in a warning or critical temperature state.
* Finished ScanCore->post_scan_analysis_node()!!! It certainly has bugs, and much testing is needed, but the logic is all in place! Oh what a slog that was... It should be far more intelligent than M2 though, once fleshed out and tested.
* Created Server->active_migrations() that returns '1' if any servers are in a migration on an Anvil! system. Updated ->migrate_virsh() to record how long a migration took in the "server::migration_duration" variable, which is averaged by ScanCore->count_servers() to estimate migration times.
* Updated scan-drbd to check/update the global-common.conf file's config at the end of a scan.
* Updated ScanCore itself to not scan when in maintenance mode. Also updated it to call 'anvil-safe-start' when ScanCore starts, so long as it is within ten minutes of the host booting.
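
Roughly, the new calls are meant to compose as below in a post-scan style check. This is a minimal sketch only; active_migrations() and check_if_syncsource() take no parameters (per their PODs in the diff), while the 'host_uuid' parameter and the three-value return from count_servers() are assumptions made for illustration.

# Minimal usage sketch (not part of the commit).
# Skip post-scan decisions while any server is migrating (see message_0237).
return(0) if $anvil->Server->active_migrations();
# Never consider a shutdown while a peer is syncing from our storage (see warning_0105).
return(0) if $anvil->DRBD->check_if_syncsource();
# Gather the inputs used to weigh a preventative live migration; the parameter and
# the list return here are assumptions.
my $host_uuid    = $anvil->Get->host_uuid;
my $local_health = $anvil->ScanCore->check_health({host_uuid => $host_uuid});
my ($server_count, $ram_used, $migration_time) = $anvil->ScanCore->count_servers({host_uuid => $host_uuid});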

Signed-off-by: Digimer <digimer@alteeve.ca>
Branch: main
Author: Digimer (4 years ago)
Parent: 15dab8aab7
Commit: ca7052dd53
Changed files (changed lines in parentheses):
  1. Anvil/Tools.pm (2)
  2. Anvil/Tools/Cluster.pm (12)
  3. Anvil/Tools/DRBD.pm (102)
  4. Anvil/Tools/ScanCore.pm (1393)
  5. Anvil/Tools/Server.pm (93)
  6. anvil.conf (9)
  7. notes (38)
  8. scancore-agents/scan-apc-ups/scan-apc-ups (1)
  9. scancore-agents/scan-cluster/scan-cluster (28)
  10. scancore-agents/scan-drbd/scan-drbd (30)
  11. scancore-agents/scan-drbd/scan-drbd.xml (6)
  12. scancore-agents/scan-ipmitool/scan-ipmitool (1)
  13. share/words.xml (46)
  14. tools/anvil-daemon (4)
  15. tools/anvil-safe-stop (20)
  16. tools/scancore (44)

@@ -1106,6 +1106,8 @@ sub _set_paths
'anvil-parse-fence-agents' => "/usr/sbin/anvil-parse-fence-agents",
'anvil-provision-server' => "/usr/sbin/anvil-provision-server",
'anvil-report-memory' => "/usr/sbin/anvil-report-memory",
'anvil-safe-start' => "/usr/sbin/anvil-safe-start",
'anvil-safe-stop' => "/usr/sbin/anvil-safe-stop",
'anvil-shutdown-server' => "/usr/sbin/anvil-shutdown-server",
'anvil-sync-shared' => "/usr/sbin/anvil-sync-shared",
'anvil-update-files' => "/usr/sbin/anvil-update-files",

@@ -872,7 +872,7 @@ sub get_anvil_name
my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Cluster->get_anvil_name()" }});
my $anvil_uuid = defined $parameter->{anvil_uuid} ? $parameter->{anvil_uuid} : $anvil->Get->anvil_uuid;
my $anvil_uuid = defined $parameter->{anvil_uuid} ? $parameter->{anvil_uuid} : $anvil->Cluster->get_anvil_uuid;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
anvil_uuid => $anvil_uuid,
}});
@@ -2127,11 +2127,13 @@ sub parse_cib
else
{
# It's our peer.
$anvil->data->{cib}{parsed}{peer}{ready} = $ready;
$anvil->data->{cib}{parsed}{peer}{name} = $node_name;
$anvil->data->{cib}{parsed}{peer}{ready} = $ready;
$anvil->data->{cib}{parsed}{peer}{name} = $node_name;
$anvil->data->{cib}{parsed}{peer}{host_uuid} = $anvil->Get->host_uuid_from_name({debug => $debug, host_name => $node_name});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
"cib::parsed::peer::ready" => $anvil->data->{cib}{parsed}{peer}{ready},
"cib::parsed::peer::name" => $anvil->data->{cib}{parsed}{peer}{name},
"cib::parsed::peer::ready" => $anvil->data->{cib}{parsed}{peer}{ready},
"cib::parsed::peer::name" => $anvil->data->{cib}{parsed}{peer}{name},
"cib::parsed::peer::host_uuid" => $anvil->data->{cib}{parsed}{peer}{host_uuid},
}});
}
}

@@ -15,6 +15,8 @@ my $THIS_FILE = "DRBD.pm";
### Methods;
# allow_two_primaries
# check_if_syncsource
# check_if_synctarget
# delete_resource
# gather_data
# get_devices
@@ -237,6 +239,98 @@ sub allow_two_primaries
}
=head2 check_if_syncsource
This method checks to see if the local machine is C<< SyncSource >>. If so, this returns C<< 1 >>. Otherwise, it returns C<< 0 >>.
This method takes no parameters.
=cut
sub check_if_syncsource
{
my $self = shift;
my $parameter = shift;
my $anvil = $self->parent;
my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "DRBD->check_if_syncsource()" }});
my $short_host_name = $anvil->Get->short_host_name();
$anvil->DRBD->get_status({debug => $debug});
# Now check to see if anything is sync'ing.
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
{
foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$resource}{connection}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { peer_name => $peer_name }});
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}})
{
next if not exists $anvil->data->{drbd}{status}{$short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
my $replication_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
volume => $volume,
replication_state => $replication_state,
}});
if ($replication_state =~ /SyncSource/i)
{
# We're SyncSource
return(1);
}
}
}
}
return(0);
}
=head2 check_if_synctarget
This method checks to see if the local machine is C<< SyncTarget >>. If so, this returns C<< 1 >>. Otherwise, it returns C<< 0 >>.
This method takes no parameters.
=cut
sub check_if_synctarget
{
my $self = shift;
my $parameter = shift;
my $anvil = $self->parent;
my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "DRBD->check_if_synctarget()" }});
my $short_host_name = $anvil->Get->short_host_name();
$anvil->DRBD->get_status({debug => $debug});
# Now check to see if anything is sync'ing.
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
{
foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$resource}{connection}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { peer_name => $peer_name }});
foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}})
{
next if not exists $anvil->data->{drbd}{status}{$short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
my $replication_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'replication-state'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
volume => $volume,
replication_state => $replication_state,
}});
if ($replication_state =~ /SyncTarget/i)
{
# We're SyncTarget
return(1);
}
}
}
}
return(0);
}
=head2 delete_resource
This method deletes an entire resource. It does this by looping through the volumes configured in a resource and deleting them one after the other (even if there is only one volume).
@@ -2019,7 +2113,7 @@ sub resource_uuid
=head2 update_global_common
This configures C<< global_common.conf >> on the local host. Returns C<< !!error!! >> if there is a problem, C<< 0 >> if no update was needed and C<< 1 >> if a change was made.
This configures C<< global_common.conf >> on the local host. Returns C<< !!error!! >> if there is a problem, an empty string if no update was needed and a unified C<< diff >> of the changes made, if any.
Parameters;
@@ -2061,6 +2155,7 @@ sub update_global_common
# These values will be used to track where we are in processing the config file and what values are needed.
my $update = 0;
my $difference = "";
my $usage_count_seen = 0;
my $udev_always_use_vnr_seen = 0;
my $fence_peer_seen = 0;
@@ -2702,9 +2797,10 @@ sub update_global_common
}});
if ($update)
{
$difference = diff \$old_global_common, \$new_global_common, { STYLE => 'Unified' };
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0517", variables => {
file => $anvil->data->{path}{configs}{'global-common.conf'},
diff => diff \$old_global_common, \$new_global_common, { STYLE => 'Unified' },
diff => $difference,
}});
my $failed = $anvil->Storage->write_file({
@@ -2725,7 +2821,7 @@ sub update_global_common
}
}
return($update);
return($difference);
}
# =head3

File diff suppressed because it is too large (Anvil/Tools/ScanCore.pm, 1393 changed lines).

@@ -12,6 +12,7 @@ our $VERSION = "3.0.0";
my $THIS_FILE = "Server.pm";
### Methods;
# active_migrations
# boot_virsh
# find
# get_definition
@@ -81,6 +82,45 @@ sub parent
# Public methods #
#############################################################################################################
=head2 active_migrations
This method returns C<< 1 >> if any servers are migrating to or from the local system. It returns C<< 0 >> otherwise.
This method takes no parameters.
=cut
sub active_migrations
{
my $self = shift;
my $parameter = shift;
my $anvil = $self->parent;
my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Server->active_migrations()" }});
# Are we in an Anvil! system?
my $anvil_uuid = $anvil->Cluster->get_anvil_uuid({debug => $debug});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { anvil_uuid => $anvil_uuid }});
if (not $anvil_uuid)
{
# We're not in an Anvil.
return(0);
}
$anvil->Database->get_servers({debug => $debug});
foreach my $server_uuid (keys %{$anvil->data->{servers}{server_uuid}})
{
my $server_name = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_name};
my $server_state = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state};
if ($server_state eq "migrating")
{
return(1);
}
}
return(0);
}
=head2 boot_virsh
This takes a server name and tries to boot it (using C<< virsh create /mnt/shared/definition/<server>.xml >>). It requires that any supporting systems already be started (ie: DRBD resource is up).
@@ -950,6 +990,7 @@ sub migrate_virsh
{
$anvil->Database->get_servers({debug => 2});
}
my $migration_started = time;
my $server_uuid = "";
my $old_server_state = "";
foreach my $this_server_uuid (keys %{$anvil->data->{servers}{server_uuid}})
@@ -981,7 +1022,11 @@ WHERE
# The virsh command switches host names to IPs and needs to have both the source and target IPs in
# the known_hosts file to work.
my $live_migrate = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_live_migration} ? "--live" : "";
my $live_migrate = "";
if (($server_uuid) && ($anvil->data->{servers}{server_uuid}{$server_uuid}{server_live_migration}))
{
$live_migrate = "--live";
}
my $target_ip = $anvil->Convert->host_name_to_ip({debug => $debug, host_name => $target});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
target_ip => $target_ip,
@@ -995,6 +1040,7 @@ WHERE
target => $host,
});
}
my $migration_command = $anvil->data->{path}{exe}{virsh}." migrate --undefinesource --tunnelled --p2p ".$live_migrate." ".$server." qemu+ssh://".$target."/system";
if ($source)
{
@@ -1054,20 +1100,28 @@ WHERE
}
else
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0354"});
my $migration_took = time - $migration_started;
my $say_migration_time = $anvil->Convert->time({'time' => $migration_took});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
migration_took => $migration_took,
say_migration_time => $say_migration_time,
}});
# Log the migration.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0354", variables => { migration_time => $say_migration_time }});
$success = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { success => $success }});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { success => $success }});
# Revert the server state and update the server host.
my $server_host_uuid = $anvil->Get->host_uuid_from_name({debug => $debug, host_name => $target});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_host_uuid => $server_host_uuid }});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_host_uuid => $server_host_uuid }});
if (not $server_host_uuid)
{
# We didn't find the target's host_uuid, so use the old one and let scan-server
# handle it.
$server_host_uuid = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_host_uuid};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_host_uuid => $server_host_uuid }});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_host_uuid => $server_host_uuid }});
}
if (($server_uuid) && ($anvil->data->{sys}{database}{connections}))
{
@@ -1103,6 +1157,19 @@ WHERE
server_updated_by_user => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_updated_by_user},
server_boot_time => $anvil->data->{servers}{server_uuid}{$server_uuid}{server_boot_time},
});
# Record the migration time.
my ($variable_uuid) = $anvil->Database->insert_or_update_variables({
file => $THIS_FILE,
line => __LINE__,
variable_name => "server::migration_duration",
variable_value => $migration_took,
variable_default => "",
variable_description => "message_0236",
variable_section => "servers",
variable_source_uuid => $server_uuid,
variable_source_table => "servers",
});
}
}
@@ -1717,6 +1784,8 @@ This is the name of the server (as it appears in C<< virsh >>) to shut down.
By default, this method will wait indefinitely for the server to shut down before returning. If this is set to a non-zero number, the method will wait that number of seconds for the server to shut down. If the server is still not off by then, C<< 0 >> is returned.
Setting this to C<< 1 >> effectively disables waiting.
=cut
sub shutdown_virsh
{
@@ -1777,7 +1846,7 @@ sub shutdown_virsh
elsif ($status eq "paused")
{
# The server is paused. Resume it, wait a few, then proceed with the shutdown.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0314", variables => { server => $server }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0314", variables => { server => $server }});
my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{virsh}." resume $server"});
if ($return_code)
{
@@ -1789,13 +1858,13 @@ sub shutdown_virsh
}});
$anvil->nice_exit({exit_code => 1});
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0316"});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0316"});
sleep 3;
}
elsif ($status eq "pmsuspended")
{
# The server is suspended. Resume it, wait a few, then proceed with the shutdown.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0317", variables => { server => $server }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0317", variables => { server => $server }});
my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{virsh}." dompmwakeup $server"});
if ($return_code)
{
@@ -1807,14 +1876,14 @@ sub shutdown_virsh
}});
$anvil->nice_exit({exit_code => 1});
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0319"});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0319"});
sleep 30;
}
elsif (($status eq "idle") or ($status eq "crashed"))
{
# The server needs to be destroyed.
$task = "destroy";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0322", variables => {
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0322", variables => {
server => $server,
status => $status,
}});
@@ -1823,7 +1892,7 @@ sub shutdown_virsh
{
# The server is already shutting down
$shutdown = 0;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0320", variables => { server => $server }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0320", variables => { server => $server }});
}
elsif ($status ne "running")
{
@@ -1875,7 +1944,7 @@ WHERE
}
}
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0520", variables => { server => $server }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0520", variables => { server => $server }});
my ($output, $return_code) = $anvil->System->call({
debug => $debug,
shell_call => $anvil->data->{path}{exe}{virsh}." ".$task." ".$server,

@@ -1,9 +1,16 @@
### This is the main Anvil! configuration file.
# To help understand how the Anvil! is used, some features will "call home" to record anonymous information
# about a machine participating in an Anvil! system. If you wish to not have this happen, set this to '1'.
# about a machine participating in an Anvil! system. An example is DRBD's 'usage-count' option. If you wish
# to not have this happen, set this to '1'.
sys::privacy::strong = 0
### Features
# Normally, if one node in the Anvil! is healthier than the other, it will pull the servers from the peer
# on to it. This is a process called "preventative live migration". If you would like to disable this
# feature, set this to '1'.
feature::scancore::disable::preventative-live-migration = 0
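
This switch is expected to be read the same way as other anvil.conf keys, with '::' mapping to nested hash keys under $anvil->data (as scan-drbd does with sys::privacy::strong further down). A minimal sketch follows; everything except the key name is illustrative:

# Illustrative only; the key is the one defined above.
if ($anvil->data->{feature}{scancore}{disable}{'preventative-live-migration'})
{
	# Preventative live migration is disabled, so take no action (see message_0239).
	return(0);
}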
### Database
# Database connections;
#

notes (38 changed lines)

@@ -7,6 +7,44 @@ TODO:
============
# ScanCore post-scan logic (one branch is sketched in code after these notes);
Sole node:
1. Evaluate critical shutdown only, if hosting VMs.
2. If not hosting VMs, load-shed if over-heat / power loss for more than 120 seconds
Action options;
1. Do nothing
2. Pull servers
3. Shut down (once servers are gone)
4. Shut down (gracefully stop servers)
* Peer not available
- Thermal is critical, gracefully shut down.
- Power: the strongest UPS shows less than ten minutes of hold-up time and time on batteries is over 2 minutes, graceful shut down
* Peer available
- If one node is healthier than the other;
- If we're sicker, do nothing until we have no servers
- If we're healthier, after two minutes, pull
- If health is equal;
- Both nodes have servers;
- Decide who can be evacuated fastest, in case load shed needed.
- Both nodes on batteries or in warning temp for more than 2 minutes;
- If we're the designated survivor, pull servers.
- If we're the sacrifice, wait for the servers to be taken off of us, then shut down.
- Peer has servers, we don't
- If thermal warning or both/all UPSes on battery for two minutes+, shut down
- We have servers, peer doesn't.
- Keep running
1.1 - Our peer may pull from us.
2. - Not Hosting Servers
2.1 -
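
A rough code sketch of the "one node is healthier" branch above; every variable and helper name here is hypothetical, only the two-minute (120 second) threshold and the do-nothing / pull outcomes come from these notes:

# Hypothetical names; only the logic mirrors the outline above.
if ($peer_available)
{
	if ($we_are_sicker)
	{
		# Do nothing; our peer may pull the servers from us.
	}
	elsif (($we_are_healthier) && ($healthier_for_seconds > 120))
	{
		# Healthier for over two minutes; pull the servers to us, unless disabled.
		pull_servers_from_peer() if not $preventative_live_migration_disabled;
	}
}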
Jenkins;
Initial setup:

@@ -11,6 +11,7 @@
# 255 - The host's UUID isn't in the hosts table yet, ScanCore itself hasn't been run.
#
# TODO:
# - Set a health score if we lose one or both/all UPSes.
# - Support UPSes with extended runtime batteries
# - Pick a battery temperature where the UPS will automatically shut down if it passes over.
# - Add support for sudden temperature jumps in process_temperature().

@@ -97,12 +97,40 @@ collect_data($anvil);
# Find changes.
find_changes($anvil);
# Check the cluster config.
check_config($anvil);
$anvil->nice_exit({exit_code => 0});
#############################################################################################################
# Functions #
#############################################################################################################
sub check_config
{
my ($anvil) = @_;
$anvil->Database->get_manifests();
my $anvil_name = $anvil->Cluster->get_anvil_name({});
my $manifest_uuid = $anvil->data->{manifests}{manifest_name}{$anvil_name}{manifest_uuid};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
anvil_name => $anvil_name,
manifest_uuid => $manifest_uuid,
}});
if ($manifest_uuid)
{
my ($host_ipmi) = $anvil->System->configure_ipmi({
debug => 2,
manifest_uuid => $manifest_uuid,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_ipmi => $host_ipmi }});
}
return(0);
}
# Looks for changes.
sub find_changes
{

@@ -101,6 +101,7 @@ read_last_scan($anvil);
find_changes($anvil);
check_config($anvil);
# Update the database
$anvil->Database->insert_or_update_updated({updated_by => $THIS_FILE});
@@ -113,6 +114,34 @@ $anvil->nice_exit({exit_code => 0});
# Functions #
#############################################################################################################
# This looks at the global-common.conf file and updates it, if needed.
sub check_config
{
my ($anvil) = @_;
if (not exists $anvil->data->{sys}{privacy}{strong})
{
$anvil->data->{sys}{privacy}{strong} = 0;
}
my $updated = $anvil->DRBD->update_global_common({
usage_count => $anvil->data->{sys}{privacy}{strong} ? 0 : 1,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { updated => $updated }});
if ($updated)
{
# Send a notice-level alert
my $variables = {
file => $anvil->data->{path}{configs}{'global-common.conf'},
diff => $updated,
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_drbd_message_0034", variables => $variables});
$anvil->Alert->register({alert_level => "notice", message => "scan_drbd_message_0034", variables => $variables, set_by => $THIS_FILE, sort_position => $anvil->data->{'scan-drbd'}{alert_sort}++});
}
return(0);
}
sub process_drbd
{
my ($anvil) = @_;
@@ -159,7 +188,6 @@ sub process_drbd
};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "scan_drbd_message_0002", variables => $variables});
$anvil->Alert->register({alert_level => "notice", message => "scan_drbd_message_0002", variables => $variables, set_by => $THIS_FILE, sort_position => $anvil->data->{'scan-drbd'}{alert_sort}++});
}
if ($new_scan_drbd_flush_md ne $old_scan_drbd_flush_md)
{

@@ -180,6 +180,12 @@ The DRBD resource was not found in the database, but appears to have been in the
- Resource Name: ...... [#!variable!resource_name!#]
- Resource State: ..... [#!variable!resource_state!#]
</key>
<key name="scan_drbd_message_0034">
The global common configuration file: [#!variable!file!#] needs to be updated. The difference is:
====
#!variable!diff!#
====
</key>
<!-- Units -->
<key name="scan_drbd_unit_0001">Enabled</key>

@@ -15,6 +15,7 @@
# 255 - The host's UUID isn't in the hosts table yet, ScanCore itself hasn't been run.
#
# TODO:
# - Set a health score for over/under-temp events.
# - Don't bother scanning other hosts.... ScanCore does direct calls to decide if/when to reboot an offline
# node.
# - Decide if we should parse 'ipmitool sel list'

@@ -240,7 +240,7 @@ The error was:
<key name="error_0162">The 'anvil_uuid': [#!variable!anvil_uuid!#] in invalid.</key>
<key name="error_0163">The MIB file: [#!variable!mib!#] doesn't exist or can't be read.</key>
<key name="error_0164">The date: [#!variable!date!#] is not in either the 'mm/dd/yy' or 'mm/dd/yyyy' formats. Can't convert to 'yyyy/mm/dd'.</key>
<key name="error_0165">The temperature: [#!variable!temperature!#] does not appear to be valid..</key>
<key name="error_0165">The temperature: [#!variable!temperature!#] does not appear to be valid.</key>
<key name="error_0166">The resource: [#!variable!resource!#] in the config file: [#!variable!file!#] was found, but does not appear to be a valid UUID: [#!variable!uuid!#].</key>
<key name="error_0167">The resource: [#!variable!resource!#] in the config file: [#!variable!file!#] was found, and we were asked to replace the 'scan_drbd_resource_uuid' but the new UUID: [#!variable!uuid!#] is not a valud UUID.</key>
<key name="error_0168">The 'fence_ipmilan' command: [#!variable!command!#] does not appear to be valid.</key>
@@ -833,7 +833,7 @@ It should be provisioned in the next minute or two.</key>
<key name="job_0312">We are the SyncSource for the peer: [#!variable!peer_host!#] for the resource/volume: [#!variable!resource!#/#!variable!volume!#]. We have to wait for the peer to complete the sync or close it's connection before we can proceed with shut down.</key>
<key name="job_0313">The cluster has stopped.</key>
<key name="job_0314">Stopping all DRBD resources.</key>
<key name="job_0315">The server: [#!variable!server_name!#] is migrating. Will check again shortly to see if it is done.</key>
<key name="job_0315">The server: [#!variable!server!#] is migrating. Will check again shortly to see if it is done.</key>
<key name="job_0316">Asking the cluster to shut down the server: [#!variable!server!#] now.</key>
<key name="job_0317">The server: [#!variable!server!#] has not shut down yet. Asking 'virsh' to shut it down. If the cluster stop woke it up, this should trigger a shutdown. If not, manual shutdown will be required.</key>
<key name="job_0318">The server: [#!variable!server!#] will now be migrated to: [#!variable!node!#]. This could take some time, depending on the amount of RAM allocated to the server, the speed of the BCN and the activity on the server. Please be patient!</key>
@@ -1262,7 +1262,7 @@ Output of: [#!variable!command!#] was;
<key name="log_0351">The attempt to enable dual-primary for the resource: [#!variable!resource!#] to the node: [#!variable!target_name!# (#!variable!target_node_id!#)] returned a non-zero return code [#!variable!return_code!#]. The returned output (if any) was: [#!variable!output!#].</key>
<key name="log_0352">The migration of: [#!variable!server!#] to the node: [#!variable!target!#] will now begin.</key>
<key name="log_0353">The attempt to migrate the server: [#!variable!server!#] to the node: [#!variable!target!#] returned a non-zero return code [#!variable!return_code!#]. The returned output (if any) was: [#!variable!output!#].</key>
<key name="log_0354">It looks like the migration was successful.</key>
<key name="log_0354">The migration was successfully completed in: [#!variable!migration_time!#].</key>
<key name="log_0355">Re-disabling dual primary by restoring config file settings.</key>
<key name="log_0356">The attempt to reset DRBD to config file settings returned a non-zero return code: [#!variable!return_code!#]. The output, if any, was: [#!variable!output!#].</key>
<key name="log_0357">Failure, exiting with '1'.</key>
@@ -1558,6 +1558,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0617">We were asked to delete the file: [#!variable!file!#] on the target: [#!variable!target!#], but it doesn't exist, so nothing to do.</key>
<key name="log_0618">Successfully deleted the file: [#!variable!file!#] on the target: [#!variable!target!#].</key>
<key name="log_0619">The host: [#!variable!host_name!#] has shut down for thermal reasons: [#!variable!count!#] times. To prevent a frequent boot / thermal excursion / shutdown loop, we will wait: [#!variable!wait_for!#] before marking it's temperature as being OK again.</key>
<key name="log_0620">This host has been running for: [#!variable!uptime!#]. The cluster will not be started (uptime must be less than 10 minutes for 'anvil-safe-start' to be called automatically).</key>
<!-- Messages for users (less technical than log entries), though sometimes used for logs, too. -->
<key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key>
@@ -1889,6 +1890,10 @@ Are you sure that you want to delete the server: [#!variable!server_name!#]? [Ty
<key name="message_0233">It appears that another instance of 'anvil-safe-start' is already runing. Please wait for it to complete (or kill it manually if needed).</key>
<key name="message_0234">Preparing to rename a server.</key>
<key name="message_0235">Preparing to rename stop this node.</key>
<key name="message_0236">This records how long it took to migate a given server. The average of the last five migations is used to guess how long future migrations will take.</key>
<key name="message_0237">One or more servers are migrating. While this is the case, ScanCore post-scan checks are not performed.</key>
<key name="message_0238">Preventative live migration has completed.</key>
<key name="message_0239">Preventative live migration has been disabled. We're healthier than our peer, but we will take no action.</key>
<!-- Success messages shown to the user -->
<key name="ok_0001">Saved the mail server information successfully!</key>
@@ -2476,6 +2481,41 @@ Read UUID: .... [#!variable!read_uuid!#]
<key name="warning_0079">[ Warning ] - Failed to read the JSON formatted output of 'lsblk'. Expected the return code '0' but received: [#!variable!return_code!#]. The output, if any, was: [#!variable!output!#].</key>
<key name="warning_0080">[ Warning ] - Failed to read the XML formatted output of 'lshw'. Expected the return code '0' but received: [#!variable!return_code!#]. The output, if any, was: [#!variable!output!#].</key>
<key name="warning_0081">[ Warning ] - The temporary file: [#!variable!temp_file!#] vanished (or failed to be created) before it could be copied to: [#!variable!target!#].</key>
<key name="warning_0082">[ Warning ] - This host is not in the cluster, and all UPSes are running on batteries, and have been for at least: [#!variable!time_on_batteries!#]. Shutting down to conserve power.</key>
<key name="warning_0083">[ Warning ] - This host is not in the cluster, and the temperatures is anomalous. Shutting down to limit thermal loading.</key>
<key name="warning_0084">[ Warning ] - We are healthier than our peer: [#!variable!peer_name!#]! Scores (local/peer): [#!variable!local_health!# / #!variable!peer_health!#]. This has been the case for: [#!variable!age!# seconds]. After 120 seconds, preventative migration will be triggered.</key>
<key name="warning_0085">[ Warning ] - Initiating preventative live migration, taking the servers from our peer: [#!variable!peer_name!#]! Scores (local/peer): [#!variable!local_health!# / #!variable!peer_health!#]. This has been so for over two minutes, so we will not perform a preventative migration of server.</key>
<key name="warning_0086">[ Warning ] - We're not a cluster member, but the server: [#!variable!server_name!#] is in the status: [#!variable!status!#]. ScanCore will take no action on this node.</key>
<key name="warning_0087">[ Warning ] - We're alone in the cluster, and our temperature is now critical. Gracefully stopping servers and then shutting down.</key>
<key name="warning_0088">[ Warning ] - We're alone in the cluster, we've been running on batteries for more than 2 minutes, and the strongest UPS shows less than ten minutes hold up time left. Gracefully stopping servers and then shutting down.</key>
<key name="warning_0089">[ Warning ] - This host is not in the cluster, and all UPSes are running on batteries. The most recent UPS to lose power was roughly: [#!variable!time_on_batteries!#] seconds ago. After 120 seconds, this node will power down to conserve battery power.</key>
<key name="warning_0090">[ Warning ] - This host is not in the cluster, and the temperatures is anomalous. This has been the case for roughly: [#!variable!age!#] seconds. After 120 seconds, this node will shut down to reduce thermal loading.</key>
<key name="warning_0091">[ Warning ] - Both nodes have been running on batteries for more than two minutes, and both show the strongest UPS as having less than 10 minutes runtime left. Full power loss is highly likely, and imminent. Gracefully shutting down servers and powering off.</key>
<key name="warning_0092">[ Warning ] - Both nodes have been running on batteries for more than two minutes. To conserve battery power, load shedding will begin. A node will be selected for shutdown momentarily.</key>
<key name="warning_0093">[ Warning ] - Both nodes are running on batteries, but this has been so for less than two minutes. Will take no action yet in the hopes that this is a transient issue.</key>
<key name="warning_0094">[ Warning ] - Our peer node: [#!variable!host_name!#] has been running on batteries for more than two minutes. We've still got power, so we will pull the servers off of our peer and on to this machine.</key>
<key name="warning_0095">[ Warning ] - Our peer node: [#!variable!host_name!#] is running on batteries, but it has been less than two minutes. Not doing anything, yet.</key>
<key name="warning_0096">[ Warning ] - We're running on batteries, have been so for more than two minutes, and the strongest UPS has an estimated hold up time below ten minutes. Power loss is innevitable, so we will start a graceful shutdown now.</key>
<key name="warning_0097">[ Warning ] - We're running on batteries, and have been for more than two minutes. We'll shut down to conserve battery power now.</key>
<key name="warning_0098">[ Warning ] - We're running on batteries, but it's been less than two minutes. We'll wait to see if this is a transient event before taking any action.</key>
<key name="warning_0099">[ Warning ] - Both node's temperatures have been anomolous for more than two minutes. We'll shut down to reduce thermal loading of the room we're in.</key>
<key name="warning_0100">[ Warning ] - Both node's temperatures are anomolous, and we've been critically anomolous for more than two minutes. Hardware shutdown is very likely, so we'll gracefully shutdown now.</key>
<key name="warning_0101">[ Warning ] - Both node's temperatures are anomolous, but this has been the case for less than two minutes. We'll wait to see if the temperatures clear before taking action.</key>
<key name="warning_0102">[ Warning ] - Our peer node: [#!variable!host_name!#]'s temperature has been anomolous for more than two minutes. We're still thermally nominal, so we will pull the servers off of our peer and on to this machine.</key>
<key name="warning_0103">[ Warning ] - Our peer node: [#!variable!host_name!#]'s is anomolous, but it hasn't been so for two minutes yet. Not doing anything, yet.</key>
<key name="warning_0104">[ Warning ] - Our temperature is anomolous, and have been so for more than two minutes. We'll shut down to reduce thermal loading in the room.</key>
<key name="warning_0105">[ Warning ] - We are "SyncSource" for at least one resource, meaning that a peer is copying data from our storage in order to synchronize. As such, all shut down options are disabled until the sync ends or the peer goes offline.</key>
<key name="warning_0106">[ Warning ] - Our temperature is critically anomolous, and has been so for more than two minutes. Hardware shutdown is highly likely, so will gracefully shut down now.</key>
<key name="warning_0107">[ Warning ] - We're doing a load shed to conserve UPS power, and we're SyncSource (meaning our data is more complete than our peer's data). We will stay up and pull the servers to us.</key>
<key name="warning_0108">[ Warning ] - We're doing a load shed to reduce thermal loading, and we're SyncSource (meaning our data is more complete than our peer's data). We will stay up and pull the servers to us.</key>
<key name="warning_0109">[ Warning ] - We're doing a load shed to conserve UPS power, and we have no servers running locally. We will shut down now.</key>
<key name="warning_0110">[ Warning ] - We're doing a load shed to reduce thermal loading, and we have no servers running locally. We will shut down now.</key>
<key name="warning_0111">[ Warning ] - We're doing a load shed to conserve UPS power, and the amount of RAM allocated to servers on our peer is less than the amount of RAM allocated to servers running locally. As such, we'll pull the peer's servers to here.</key>
<key name="warning_0112">[ Warning ] - We're doing a load shed to reduce thermal loading, and the amount of RAM allocated to servers on our peer is less than the amount of RAM allocated to servers running locally. As such, we'll pull the peer's servers to here.</key>
<key name="warning_0113">[ Warning ] - We're doing a load shed to conserve UPS power, and the estimated migration time to pull the servers to us from our peer is shorter than the reverse. As such, we'll pull the peer's servers to here.</key>
<key name="warning_0114">[ Warning ] - We're doing a load shed to reduce thermal loading, and the estimated migration time to pull the servers to us from our peer is shorter than the reverse. As such, we'll pull the peer's servers to here.</key>
<key name="warning_0115">[ Warning ] - We're doing a load shed to conserve UPS power, and by all measures, the time to migrate off either node is equal. We're node 1, so we will pull the servers to us now.</key>
<key name="warning_0116">[ Warning ] - We're doing a load shed to reduce thermal loading, and by all measures, the time to migrate off either node is equal. We're node 1, so we will pull the servers to us now.</key>
<!-- The entries below here are not sequential, but use a key to find the entry. -->
<!-- Run 'striker-parse-os-list to find new entries. -->

@@ -799,10 +799,10 @@ AND
variable_default => '',
variable_description => 'striker_0279',
variable_section => 'system',
variable_source_uuid => '4c4c4544-0043-4210-8042-c3c04f523533',
variable_source_uuid => $host_uuid,
variable_source_table => 'hosts',
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { variable_uuid => $variable_uuid }});
}
# Now look for jobs that have a job status of 'scancore_startup'

@@ -179,8 +179,8 @@ sub stop_cluster
if ($problem)
{
# Cluster has stopped.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0324"});
$anvil->Job->update_progress({progress => 5, message => "job_0324"});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
$anvil->Job->update_progress({progress => 5, message => "job_0313"});
}
else
{
@@ -191,7 +191,9 @@ sub stop_cluster
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0323"});
$anvil->Job->update_progress({progress => 70, message => "job_0323"});
my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster stop";
### NOTE: '--force' is needed or else sole-running nodes can't exit
### (complains about the loss of quorum)
my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster stop --force";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
@@ -204,8 +206,8 @@ sub stop_cluster
}
else
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
$anvil->Job->update_progress({progress => 80, message => "job_0313"});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0324"});
$anvil->Job->update_progress({progress => 80, message => "job_0324"});
}
}
if ($waiting)
@@ -312,10 +314,10 @@ sub process_servers
# Use virsh
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0317", variables => { server => $server }});
$anvil->Job->update_progress({progress => 20, message => "job_0317,!!server!".$server."!!"});
$anvil->Cluster->shutdown_server({
debug => 2,
server => $server,
'wait' => 0,
$anvil->Server->shutdown_virsh({
debug => 2,
server => $server,
wait_time => 1,
});
$anvil->data->{server_shutdown}{$server}{virsh_called} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {

@@ -100,6 +100,26 @@ while(1)
my $start_time = time;
prepare_for_run($anvil);
# Set our sleep time
my $run_interval = 30;
if ((exists $anvil->data->{scancore}{timing}{run_interval}) && ($anvil->data->{scancore}{timing}{run_interval} =~ /^\d+$/))
{
$run_interval = $anvil->data->{scancore}{timing}{run_interval};
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0249", variables => {
run_interval => $run_interval,
runtime => (time - $start_time),
}});
# If we're in maintenance mode, do nothing.
my $maintenance_mode = $anvil->System->maintenance_mode({debug => $debug});
if ($maintenance_mode)
{
# Sleep and skip.
sleep($run_interval);
next;
}
# Do we have at least one database?
my $agent_runtime = 0;
if ($anvil->data->{sys}{database}{connections})
@@ -143,15 +163,6 @@ while(1)
cleanup_after_run($anvil);
# Sleep until it's time to run again.
my $run_interval = 30;
if ((exists $anvil->data->{scancore}{timing}{run_interval}) && ($anvil->data->{scancore}{timing}{run_interval} =~ /^\d+$/))
{
$run_interval = $anvil->data->{scancore}{timing}{run_interval};
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0249", variables => {
run_interval => $run_interval,
runtime => (time - $start_time),
}});
sleep($run_interval);
# In case something has changed, exit.
@@ -327,7 +338,7 @@ sub startup_tasks
variable_default => '',
variable_description => 'striker_0279',
variable_section => 'system',
variable_source_uuid => '4c4c4544-0043-4210-8042-c3c04f523533',
variable_source_uuid => $anvil->Get->host_uuid,
variable_source_table => 'hosts',
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
@@ -335,10 +346,19 @@ sub startup_tasks
# If we've been up for less than ten minutes, call anvil-safe-start as a background process. It will
# exit if it is disabled.
my $uptime = $anvil->Get->uptime;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { uptime => $uptime }});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime }});
if ($uptime < 600)
{
# Run it as a background task
my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-start'};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0210", variables => { command => $shell_call }});
$anvil->System->call({shell_call => $shell_call, background => 1});
}
else
{
# Log that we've been up too long to auto-start the cluster.
my $say_uptime = $anvil->Convert->time({'time' => $uptime});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0620", variables => { uptime => $say_uptime }});
}
return(0);
