The major change in this commit is reworking striker-update-cluster to work without expecting anvil-daemon to be running on the target machines. Likewise, the tools it calls had to work when the Striker DBs were not available. This covers the case where the Striker dashboards have already been updated and the schema has changed, preventing the not-yet-updated DR hosts and subnodes from using the database. To do this, anvil-safe-stop, anvil-update-system, and anvil-shutdown-server were updated to use the new --no-db switch, which tells them to run without the database being available.

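The gating pattern is the same in each tool; here is a condensed sketch, assembled from the tool diffs below (illustrative, not a verbatim excerpt):

    # Sketch of the '--no-db' gating added to anvil-safe-stop,
    # anvil-update-system and anvil-shutdown-server.
    if ($anvil->data->{switches}{'no-db'})
    {
        # Run without databases. Jobs can't be tracked without a DB,
        # so clear any job-uuid that was passed in.
        $anvil->data->{sys}{database}{connections} = 0;
        $anvil->data->{switches}{'job-uuid'}       = "";
    }
    else
    {
        $anvil->Database->connect();
        if (not $anvil->data->{sys}{database}{connections})
        {
            # No DBs available; sleep briefly and exit so the daemon
            # can pick the job up and retry after we exit.
            sleep 10;
            $anvil->nice_exit({exit_code => 1});
        }
    }
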
* Updated Server->shutdown_virsh() to work without a database connection.
* Updated System->reboot_needed() to store/read from a cache file when the database is not available.
* Updated anvil-safe-start to remove the old --enable/disable/status switches, now that we use the anvil-safe-start.service systemd unit.
* Reworked anvil-safe-stop to work without a database connection, and to work on DR hosts.
* Updated anvil-special-operations to add new tasks, but it's likely these new tasks aren't needed and will be removed very shortly.
* Added/updated multiple man pages.

Signed-off-by: digimer <mkelly@alteeve.ca>
branch: main
author: digimer (1 year ago)
parent: 5f40522cdb
commit: 7bd76c10dc
19 changed files (lines changed):

  Anvil/Tools.pm                  |   2
  Anvil/Tools/Server.pm           |  66
  Anvil/Tools/System.pm           | 124
  man/Makefile.am                 |   8
  man/anvil-boot-server.8         |   6
  man/anvil-safe-start.8          |  29
  man/anvil-safe-stop.8           |  45
  man/anvil-shutdown-server.8     |  45
  man/anvil-special-operations.8  |  12
  man/anvil-update-system.8       |   6
  man/striker-update-cluster.8    |   4
  share/words.xml                 |  10
  tools/anvil-safe-start          |  26
  tools/anvil-safe-stop           | 146
  tools/anvil-shutdown-server     | 148
  tools/anvil-special-operations  |  91
  tools/anvil-update-system       | 105
  tools/striker-collect-debug     |  62
  tools/striker-update-cluster    | 334

@ -1079,6 +1079,7 @@ sub _set_paths
issue => "/etc/issue",
network_cache => "/tmp/network_cache.anvil",
passwd => "/etc/passwd",
reboot_cache => "/tmp/anvil.reboot-needed",
'redhat-release' => "/etc/redhat-release",
fences_unified_metadata => "/var/www/html/fences_unified_metadata.xml",
},
@ -1231,6 +1232,7 @@ sub _set_paths
nc => "/usr/bin/nc",
nmap => "/usr/bin/nmap",
nmcli => "/bin/nmcli",
nohup => "/usr/bin/nohup",
ocf_alteeve => "/usr/lib/ocf/resource.d/alteeve/server",
openssl => "/usr/bin/openssl",
'osinfo-query' => "/usr/bin/osinfo-query",

@ -2180,27 +2180,29 @@ sub shutdown_virsh
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "sys::database::connections" => $anvil->data->{sys}{database}{connections} }});
if ($anvil->data->{sys}{database}{connections})
{
my $anvil_uuid = $anvil->Cluster->get_anvil_uuid({debug => $debug});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { anvil_uuid => $anvil_uuid }});
$server_uuid = $anvil->Get->server_uuid_from_name({
debug => $debug,
server_name => $server,
anvil_uuid => $anvil_uuid,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_uuid => $server_uuid }});
if (($server_uuid) && ($server_uuid ne "!!error!!"))
if ($anvil->data->{sys}{database}{connections})
{
$anvil->Database->get_servers({debug => $debug});
if (exists $anvil->data->{servers}{server_uuid}{$server_uuid})
my $anvil_uuid = $anvil->Cluster->get_anvil_uuid({debug => $debug});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { anvil_uuid => $anvil_uuid }});
$server_uuid = $anvil->Get->server_uuid_from_name({
debug => $debug,
server_name => $server,
anvil_uuid => $anvil_uuid,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { server_uuid => $server_uuid }});
if (($server_uuid) && ($server_uuid ne "!!error!!"))
{
my $old_state = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { old_state => $old_state }});
if ($old_state ne "in shutdown")
$anvil->Database->get_servers({debug => $debug});
if (exists $anvil->data->{servers}{server_uuid}{$server_uuid})
{
# Update it.
my $query = "
my $old_state = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { old_state => $old_state }});
if ($old_state ne "in shutdown")
{
# Update it.
my $query = "
UPDATE
servers
SET
@ -2209,8 +2211,9 @@ SET
WHERE
server_uuid = ".$anvil->Database->quote($server_uuid)."
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
}
}
}
}
@ -2273,16 +2276,18 @@ WHERE
# Mark it as stopped now, if we still have a database connection.
if ($server_uuid)
{
$anvil->Database->get_servers({debug => $debug});
if (exists $anvil->data->{servers}{server_uuid}{$server_uuid})
if ($anvil->data->{sys}{database}{connections})
{
my $old_state = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { old_state => $old_state }});
if ($old_state ne "shut off")
$anvil->Database->get_servers({debug => $debug});
if (exists $anvil->data->{servers}{server_uuid}{$server_uuid})
{
# Update it.
my $query = "
my $old_state = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { old_state => $old_state }});
if ($old_state ne "shut off")
{
# Update it.
my $query = "
UPDATE
servers
SET
@ -2293,8 +2298,9 @@ SET
WHERE
server_uuid = ".$anvil->Database->quote($server_uuid)."
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }});
$anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__});
}
}
}
}

@ -4440,7 +4440,7 @@ sub reload_daemon
This sets, clears or checks if the local system needs to be restarted.
This returns C<< 1 >> if a reset is currently needed and C<< 0 >> if not.
This returns C<< 1 >> if a reset is currently needed and C<< 0 >> if not. In most cases, this is recorded in the database (variables -> variable_name = 'reboot::needed'). If there are no available databases, then the cache file '/tmp/anvil.reboot-needed' will be used, which will contain the digit '0' or '1'.
Parameters;
@ -4460,6 +4460,8 @@ sub reboot_needed
my $set = defined $parameter->{set} ? $parameter->{set} : "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { set => $set }});
my $cache_file = $anvil->data->{paths}{data}{reboot_cache};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { cache_file => $cache_file }});
if (($set) or ($set eq "0"))
{
### TODO: stop other systems from using this database.
@ -4467,34 +4469,67 @@ sub reboot_needed
if ($set eq "1")
{
# Set
$anvil->Database->insert_or_update_variables({
debug => $debug,
file => $THIS_FILE,
line => __LINE__,
variable_name => "reboot::needed",
variable_value => "1",
variable_default => "0",
variable_description => "striker_0089",
variable_section => "system",
variable_source_uuid => $anvil->Get->host_uuid,
variable_source_table => "hosts",
});
if ($anvil->data->{sys}{database}{connections})
{
$anvil->Database->insert_or_update_variables({
debug => $debug,
file => $THIS_FILE,
line => __LINE__,
variable_name => "reboot::needed",
variable_value => "1",
variable_default => "0",
variable_description => "striker_0089",
variable_section => "system",
variable_source_uuid => $anvil->Get->host_uuid,
variable_source_table => "hosts",
});
}
else
{
# Record that a reboot is needed in a temp file.
my $failed = $anvil->Storage->write_file({
debug => $debug,
overwrite => 1,
file => $cache_file,
body => 1,
user => "root",
group => "root",
mode => "0644",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { failed => $failed }});
}
}
elsif ($set eq "0")
{
# Clear
$anvil->Database->insert_or_update_variables({
debug => $debug,
file => $THIS_FILE,
line => __LINE__,
variable_name => "reboot::needed",
variable_value => "0",
variable_default => "0",
variable_description => "striker_0089",
variable_section => "system",
variable_source_uuid => $anvil->Get->host_uuid,
variable_source_table => "hosts",
});
if ($anvil->data->{sys}{database}{connections})
{
$anvil->Database->insert_or_update_variables({
debug => $debug,
file => $THIS_FILE,
line => __LINE__,
variable_name => "reboot::needed",
variable_value => "0",
variable_default => "0",
variable_description => "striker_0089",
variable_section => "system",
variable_source_uuid => $anvil->Get->host_uuid,
variable_source_table => "hosts",
});
}
else
{
my $failed = $anvil->Storage->write_file({
debug => $debug,
overwrite => 1,
file => $cache_file,
body => 0,
user => "root",
group => "root",
mode => "0644",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { failed => $failed }});
}
}
else
{
@ -4504,19 +4539,32 @@ sub reboot_needed
}
}
my ($reboot_needed, $variable_uuid, $modified_date) = $anvil->Database->read_variable({
debug => $debug,
file => $THIS_FILE,
line => __LINE__,
variable_name => "reboot::needed",
variable_source_table => "hosts",
variable_source_uuid => $anvil->Get->host_uuid,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
reboot_needed => $reboot_needed,
variable_uuid => $variable_uuid,
modified_date => $modified_date,
}});
# Read from the cache file, if it exists.
my $reboot_needed = 0;
if (-e $cache_file)
{
$reboot_needed = $anvil->Storage->read_file({
debug => $debug,
file => $cache_file,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { reboot_needed => $reboot_needed }});
}
elsif ($anvil->data->{sys}{database}{connections})
{
($reboot_needed, my $variable_uuid, my $modified_date) = $anvil->Database->read_variable({
debug => $debug,
file => $THIS_FILE,
line => __LINE__,
variable_name => "reboot::needed",
variable_source_table => "hosts",
variable_source_uuid => $anvil->Get->host_uuid,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
reboot_needed => $reboot_needed,
variable_uuid => $variable_uuid,
modified_date => $modified_date,
}});
}
if ($reboot_needed eq "")
{
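
In short, when no database is available the flag lives in the cache file, and on read the cache file wins whenever it exists. A minimal sketch of the fallback above (method names and parameters taken from the diff; the surrounding code is illustrative):

    # Where the flag is stored when no database is available.
    my $cache_file = $anvil->data->{paths}{data}{reboot_cache};   # /tmp/anvil.reboot-needed
    my $set        = "1";                                         # "1" to set, "0" to clear

    # Set or clear: write a single digit to the cache file.
    if (not $anvil->data->{sys}{database}{connections})
    {
        $anvil->Storage->write_file({
            overwrite => 1,
            file      => $cache_file,
            body      => $set,
            user      => "root",
            group     => "root",
            mode      => "0644",
        });
    }

    # Read: the cache file, when present, overrides the DB variable.
    my $reboot_needed = (-e $cache_file)
        ? $anvil->Storage->read_file({file => $cache_file})
        : 0;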

@ -22,10 +22,18 @@ dist_man8_MANS = \
anvil-manage-server.8 \
anvil-manage-server-storage.8 \
anvil-manage-storage-groups.8 \
anvil-safe-start.8 \
anvil-safe-stop.8 \
anvil-shutdown-server.8 \
anvil-special-operations.8 \
anvil-update-system.8 \
anvil-watch-drbd.8 \
scancore.8 \
striker-check-machines.8 \
striker-collect-debug.8 \
striker-initialize-host.8 \
striker-update-cluster.8

@ -23,10 +23,10 @@ When logging, record sensitive data, like passwords.
Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data.
.SS "Commands:"
.TP
\fB\-\-job-uuid\fR <uuid>
\fB\-\-job\-uuid\fR <uuid>
This is set to the job UUID when the request to boot is coming from a database job. When set, the referenced job will be updated and marked as complete / failed when the run completes.
.TP
\fB\-\-no-wait\fR
\fB\-\-no\-wait\fR
This controls whether the request to boot the server waits for the server to actually boot up before returning. Normally, the program will check every couple of seconds to see if the server has actually booted before returning. Setting this tells the program to return as soon as the request to boot the server has been passed on to the resource manager.
.TP
\fB\-\-server\fR <all|name|uuid>
@ -34,7 +34,7 @@ This is either 'all', the name, or server UUID (as set in the definition XML) of
.TP
When set to 'all', all servers assigned to the local sub-cluster are booted. Servers on other Anvil! nodes are not started.
.TP
\fB\-\-server-uuid\fR <uuid>
\fB\-\-server\-uuid\fR <uuid>
This is the server UUID of the server to boot. Generally this isn't needed, except when two servers somehow share the same name. This should not be possible, but this option exists in case it happens anyway.
.TP
\fB\-\-wait\fR

@ -0,0 +1,29 @@
.\" Manpage for the Anvil! tool to safely start an Anvil! node's subnode.
.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions.
.TH anvil-safe-start "8" "July 22 2023" "Anvil! Intelligent Availability™ Platform"
.SH NAME
anvil-safe-start \- This program safely joins an Anvil! subnode to a node.
.SH SYNOPSIS
.B anvil-safe-start
\fI\,<command> \/\fR[\fI\,options\/\fR]
.SH DESCRIPTION
This program will safely join an Anvil! subnode to an Anvil! node. If both subnodes are starting, it will coordinate with the peer once it becomes available. This includes booting hosted servers.
.TP
NOTE: This tool runs at boot (or not) via the 'anvil-safe-start.service' systemd unit.
.TP
\-?, \-h, \fB\-\-help\fR
Show this man page.
.TP
\fB\-\-log-secure\fR
When logging, record sensitive data, like passwords.
.TP
\-v, \-vv, \-vvv
Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data.
.SS "Commands:"
.TP
NOTE: This tool takes no specific commands.
.IP
.SH AUTHOR
Written by Madison Kelly, Alteeve staff and the Anvil! project contributors.
.SH "REPORTING BUGS"
Report bugs to users@clusterlabs.org

@ -0,0 +1,45 @@
.\" Manpage for the Anvil! safely stopping Anvil! node hosts
.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions.
.TH anvil-safe-stop "8" "July 22 2023" "Anvil! Intelligent Availability™ Platform"
.SH NAME
anvil-safe-stop \- This program safely stops a subnode in an Anvil! node, or a DR host
.SH SYNOPSIS
.B anvil-safe-stop
\fI\,<command> \/\fR[\fI\,options\/\fR]
.SH DESCRIPTION
This program will safely withdraw a subnode from an Anvil! node, and safely stop DR hosts. Optionally, it can also power off the machine.
.TP
\-?, \-h, \fB\-\-help\fR
Show this man page.
.TP
\fB\-\-log-secure\fR
When logging, record sensitive data, like passwords.
.TP
\-v, \-vv, \-vvv
Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data.
.SS "Commands:"
.TP
\fB\-\-no\-db\fR
.TP
This tells this program to run without connecting to the Striker databases. This should only be used if the Strikers are not available (either they're off, or they've been updated and this host hasn't been, and can't use them until this host is also updated).
.TP
NOTE: This is generally only used by 'striker-update-cluster'.
.TP
\fB\-\-poweroff\fR, \fB\-\-power\-off\fR
.TP
By default, the host will remain powered on when this program exits. Using this switch will have the host power off once the host is safely stopped.
.TP
\fB\-\-stop\-reason\fR <user, power, thermal>
.TP
Optionally used to set the 'system::stop_reason' for this host. Valid values are 'user' (default), 'power' and 'thermal'. If set to 'user', ScanCore will not turn this host back on. If 'power', ScanCore will reboot the host once the power under the host looks safe again. If 'thermal', ScanCore will reboot the host once temperatures are back to safe levels.
.TP
\fB\-\-stop\-servers\fR
.TP
By default, on Anvil! subnodes, any servers running on this host will be migrated to the peer subnode. If the peer isn't available, this program will refuse to stop. Using this switch will instead tell the system to stop all servers running on this host.
.TP
NOTE: On DR hosts, any running servers are always stopped.
.IP
.SH AUTHOR
Written by Madison Kelly, Alteeve staff and the Anvil! project contributors.
.SH "REPORTING BUGS"
Report bugs to users@clusterlabs.org
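
For reference, when the Strikers are unavailable this commit has remote callers start the stop via 'anvil-special-operations' (see its diff further down), which backgrounds a call of roughly this shape (the switches are real; the surrounding code is illustrative):

    # Run a no-DB safe stop in the background, so a remote (eg: SSH)
    # invocation returns immediately instead of holding the connection.
    my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-stop'}." --no-db -vv --log-secure";
    my ($output, $return_code) = $anvil->System->call({
        shell_call => $shell_call,
        background => 1,
    });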

@ -0,0 +1,45 @@
.\" Manpage for the Anvil! server shutdown tool
.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions.
.TH anvil-shutdown-server "8" "July 20 2023" "Anvil! Intelligent Availability™ Platform"
.SH NAME
anvil-shutdown-server \- This program shuts down servers hosted on the Anvil! cluster.
.SH SYNOPSIS
.B anvil-shutdown-server
\fI\,<command> \/\fR[\fI\,options\/\fR]
.SH DESCRIPTION
This program shuts down a server that is running on an Anvil! node or DR host. It can optionally stop all servers.
.TP
\-?, \-h, \fB\-\-help\fR
Show this man page.
.TP
\fB\-\-log-secure\fR
When logging, record sensitive data, like passwords.
.TP
\-v, \-vv, \-vvv
Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data.
.SS "Commands:"
.TP
\fB\-\-no\-db\fR
.TP
This tells the program to run without connecting to any databases. This is used mainly when the host is being taken down as part of a cluster-wide upgrade.
.TP
\fB\-\-no\-wait\fR
.TP
This tells the program to start the shutdown, but not wait for the server to actually stop. By default, when shutting down one specific server, this program will wait for the server to be off before it returns.
.TP
\fB\-\-server\fR {<name>,all}
.TP
This is the name of the server to shut down. Optionally, this can be 'all' to shut down all servers on this host.
.TP
\fB\-\-server\-uuid\fR <uuid>
.TP
This is the server UUID of the server to shut down. NOTE: This can not be used with \fB\-\-no\-db\fR.
.TP
\fB\-\-wait\fR
.TP
This tells the program to wait for the server(s) to stop before returning. By default, when '\fB\-\-server all\fR' is used, the shutdown will NOT wait. Using this switch makes the shutdowns sequential.
.IP
.SH AUTHOR
Written by Madison Kelly, Alteeve staff and the Anvil! project contributors.
.SH "REPORTING BUGS"
Report bugs to users@clusterlabs.org

@ -25,6 +25,18 @@ Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a s
This is the task being requested. Current options are:
.IP refresh-drbd-resource
This requires \fB\-\-resource <new name>\fR, and will call 'drbdadm adjust <resource>' as a background task and then return immediately. This is required when adding a new volume to an existing resource as 'drbdadm adjust <res>' will hold until it is called on all active DRBD nodes. This blocks the caller after the first remote host call.
.TP
.IP safe-stop
This implies \fB\-\-no\-db\fR, and will call 'anvil-safe-stop' as a background task. This is designed to ensure that a subnode leaves its node's cluster, and that DR hosts shut down their servers. This is done when the host is not yet updated, and the Striker dashboards have been upgraded with a new database schema.
.TP
.IP update-system
This implies \fB\-\-no\-db\fR, and will call 'anvil-update-system' as a background task. This allows remote machines to call for the update without risk of timing out the network connection.
.TP
Note: \fB\-\-no\-reboot\fR, \fB\-\-clear\-cache\fR, and \fB\-\-reboot\fR are all available here and passed to 'anvil-update-system'. See its manpage for usage information.
.TP
\fB\-\-no\-db\fR
.TP
This tells the program to run without a database connection.
.IP
.SH AUTHOR
Written by Madison Kelly, Alteeve staff and the Anvil! project contributors.

@ -29,6 +29,12 @@ Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a s
.TP
This will force the dnf cache to be cleared before the OS update is started. This slows the update down a bit, but ensures the latest updates are installed.
.TP
\fB\-\-no\-db\fR
.TP
This tells the update tool to run without a database connection. This is needed if the Striker dashboards are already updated, and the local system may no longer be able to talk to them.
.TP
NOTE: After the OS update is complete, an attempt will be made to connect to the database(s). This allows for registering a request to reboot if needed.
.TP
\fB\-\-no\-reboot\fR
.TP
If the kernel is updated, the system will normally be rebooted. This switch prevents the reboot from occurring.

@ -54,6 +54,10 @@ See \fB\-\-reboot\fR for rebooting if anything is updated.
Normally, the system will only reboot if the kernel is updated. If this is used, and if any packages are updated, then a reboot will be performed. This is recommended in most cases.
.TP
Must be used with \fB\-\-reboot\-self\fR to reboot the local system. Otherwise, it is passed along to target machines via their anvil-update-system calls.
.TP
\fB\-\-timeout\fR <seconds>
.TP
When given, if a system update doesn't complete within this many seconds, the update errors out and is aborted. By default, updates will wait forever.
.IP
.SH AUTHOR
Written by Madison Kelly, Alteeve staff and the Anvil! project contributors.

@ -366,12 +366,12 @@ The attempt to start the cluster appears to have failed. The return code '0' was
<key name="error_0257"><![CDATA[No server specified to boot. Please use '--server <name|all>' or '--server-uuid <UUID>.]]></key>
<key name="error_0258">This host is not a node or DR, unable to boot servers.</key>
<key name="error_0259">The definition file: [#!variable!definition_file!#] doesn't exist, unable to boot the server.</key>
<key name="error_0260">This host is not in an Anvil! system, aborting.</key>
<key name="error_0260">This subnode is not in an Anvil! node yet, aborting.</key>
<key name="error_0261">The definition file: [#!variable!definition_file!#] exists, but the server: [#!variable!server!#] does not appear to be in the cluster. Unable to boot it.</key>
<key name="error_0262">The server: [#!variable!server!#] status is: [#!variable!status!#]. We can only boot servers that are off, not booting it.</key>
<key name="error_0263"><![CDATA[No server specified to shut down. Please use '--server <name|all>' or '--server-uuid <UUID>.]]></key>
<key name="error_0264">This host is not a node or DR, unable to shut down servers.</key>
<key name="error_0265">This feature isn't enabled on DR hosts yet.</key>
<key name="error_0265">Specifying a server to shutdown using a UUID is not available when there are no DB connections.</key>
<key name="error_0266">The server: [#!variable!server!#] does not appear to be in the cluster. Unable to shut it down.</key>
<key name="error_0267">The server: [#!variable!server!#] failed to boot. The reason why should be in the logs.</key>
<key name="error_0268">The server: [#!variable!server!#] failed to shut down. The reason why should be in the logs.</key>
@ -1562,7 +1562,7 @@ Note: This is a permanent action! If you protect this server again later, a full
<key name="job_0467">Update the base operating system.</key>
<key name="job_0468">This uses 'dnf' to do an OS update on the host. If this is run on a node, 'anvil-safe-stop' will be called to withdraw the subnode from the node's cluster. If the peer subnode is also offline, hosted servers will be shut down.</key>
<key name="job_0469">Update beginning. Verifying all known machines are accessible...</key>
<key name="job_0470"></key>
<key name="job_0470">This is a DR host, no migration possible.</key>
<!-- Log entries -->
<key name="log_0001">Starting: [#!variable!program!#].</key>
@ -2254,7 +2254,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0595">Updated the lvm.conf file to add the filter: [#!variable!filter!#] to prevent LVM from seeing the DRBD devices as LVM devices.</key>
<key name="log_0596">The host: [#!variable!host_name!#] last updated the database: [#!variable!difference!#] seconds ago, skipping power checks.</key>
<key name="log_0597">The host: [#!variable!host_name!#] has no entries in the 'updated' table, so ScanCore has likely never run. Skipping this host for now.</key>
<key name="log_0598">This host is not a node, this program isn't designed to run here.</key>
<key name="log_0598">This host is not an Anvil! sub node, this program isn't designed to run here.</key>
<key name="log_0599">Enabled 'anvil-safe-start' locally on this node.</key>
<key name="log_0600">Enabled 'anvil-safe-start' on both nodes in this Anvil! system.</key>
<key name="log_0601">Disabled 'anvil-safe-start' locally on this node.</key>
@ -2407,6 +2407,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0740">Running the scan-agent: [#!variable!agent!#] now to ensure that the database has an updated view of resources.</key>
<key name="log_0741">I was about to start: [#!variable!command!#] with the job UUID: [#!variable!this_job_uuid!#]. However, another job using the same command with the job UUID: [#!variable!other_job_uuid!#]. To avoid race conditions, only one process with a given command is run at the same time.</key>
<key name="log_0742">The job with the command: [#!variable!command!#] and job UUID: [#!variable!job_uuid!#] is restarting.</key>
<key name="log_0743">Will run without connecting to the databases. Some features will be unavailable.</key>
<!-- Messages for users (less technical than log entries), though sometimes used for logs, too. -->
<key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key>
@ -2920,6 +2921,7 @@ Proceed? [y/N]</key>
<key name="message_0321">Removing the old drbd-kmod RPMs now.</key>
<key name="message_0322">Installing the latest DRBD kmod RPM now.</key>
<key name="message_0323">Retrying the OS update now.</key>
<key name="message_0324">Update almost complete. Picked this job up after a '--no-db' run, and now we have database access again.</key>
<!-- Translate names (protocols, etc) -->
<key name="name_0001">Normal Password</key> <!-- none in mail-server -->

@ -39,6 +39,11 @@ $| = 1;
my $anvil = Anvil::Tools->new();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});
# Read switches
$anvil->Get->switches({list => [], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});
# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0))
@ -53,22 +58,6 @@ if (($< != 0) && ($> != 0))
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});
$anvil->data->{switches}{'job-uuid'} = "";
$anvil->data->{switches}{disable} = "";
$anvil->data->{switches}{enable} = "";
$anvil->data->{switches}{force} = "";
$anvil->data->{switches}{'local'} = "";
$anvil->data->{switches}{status} = "";
$anvil->Get->switches;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'},
'switches::disable' => $anvil->data->{switches}{disable},
'switches::enable' => $anvil->data->{switches}{enable},
'switches::force' => $anvil->data->{switches}{force},
'switches::local' => $anvil->data->{switches}{'local'},
'switches::status' => $anvil->data->{switches}{status},
}});
# If I have no databases, sleep until I do
if (not $anvil->data->{sys}{database}{connections})
{
@ -629,6 +618,8 @@ sub prerun_checks
"sys::peer_password" => $anvil->Log->is_secure($anvil->data->{sys}{peer_password}),
}});
# We don't use this anymore, it's managed by the 'anvil-safe-start.service' daemon.
=cut
# Are we being asked to enable or disable?
my $nodes = [$host_uuid];
my $set_to = 1;
@ -742,6 +733,7 @@ sub prerun_checks
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => $message});
$anvil->nice_exit({exit_code => 0});
}
=cut
# Is another instance running?
my $pids = $anvil->System->pids({
@ -758,6 +750,7 @@ sub prerun_checks
$anvil->nice_exit({exit_code => 0});
}
=cut
# Last test, enabled or forced?
if (not $local_enabled)
{
@ -775,6 +768,7 @@ sub prerun_checks
$anvil->nice_exit({exit_code => 0});
}
}
=cut
return(0);
}

@ -29,19 +29,16 @@ if (($running_directory =~ /^\./) && ($ENV{PWD}))
$| = 1;
my $anvil = Anvil::Tools->new();
$anvil->data->{switches}{'job-uuid'} = "";
$anvil->data->{switches}{'poweroff'} = "";
$anvil->data->{switches}{'power-off'} = ""; # By default, the node is withdrawn. With this switch, the node will power off as well.
$anvil->data->{switches}{'stop-reason'} = ""; # Optionally used to set 'system::stop_reason' reason for this host. Valid values are 'user', 'power' and 'thermal'.
$anvil->data->{switches}{'stop-servers'} = ""; # Default behaviour is to migrate servers to the peer, if the peer is up. This overrides that and forces hosted servers to shut down.
$anvil->Get->switches;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'},
'switches::poweroff' => $anvil->data->{switches}{'poweroff'},
'switches::power-off' => $anvil->data->{switches}{'power-off'},
'switches::stop-reason' => $anvil->data->{switches}{'stop-reason'},
'switches::stop-servers' => $anvil->data->{switches}{'stop-servers'},
}});
# Read switches.
$anvil->Get->switches({list => [
"no-db",
"poweroff",
"power-off",
"stop-reason",
"stop-servers"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});
# Let 'poweroff' work as a misspelling of 'power-off'
if (($anvil->data->{switches}{'poweroff'}) && (not $anvil->data->{switches}{'power-off'}))
@ -63,15 +60,27 @@ if (($< != 0) && ($> != 0))
$anvil->nice_exit({exit_code => 1});
}
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 3, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
if ($anvil->data->{switches}{'no-db'})
{
# No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try
# again after we exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0075"});
sleep 10;
$anvil->nice_exit({exit_code => 1});
$anvil->data->{sys}{database}{connections} = 0;
$anvil->data->{switches}{'job-uuid'} = "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'sys::database::connections' => $anvil->data->{sys}{database}{connections},
'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'},
}});
}
else
{
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 3, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try
# again after we exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0075"});
sleep 10;
$anvil->nice_exit({exit_code => 1});
}
}
# If we still don't have a job-uuid, go into interactive mode.
@ -115,12 +124,13 @@ if ($anvil->data->{switches}{'job-uuid'})
}
}
# Make sure we're in an Anvil!
$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid();
if (not $anvil->data->{sys}{anvil_uuid})
# Make sure we're a subnode or DR host
my $host_type = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_type => $host_type }});
if (($host_type ne "node") && ($host_type ne "dr"))
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0260"});
$anvil->Job->update_progress({progress => 100, message => "error_0260"});
$anvil->Job->update_progress({progress => 100, message => "error_0260"}) if $anvil->data->{switches}{'job-uuid'};
$anvil->nice_exit({exit_code => 1});
}
@ -154,7 +164,7 @@ if ($anvil->data->{switches}{'power-off'})
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0687", variables => { reason => "#!string!log_0692!#" }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0325"});
$anvil->Job->update_progress({progress => 100, message => "job_0325"});
$anvil->Job->update_progress({progress => 100, message => "job_0325"}) if $anvil->data->{switches}{'job-uuid'};
# Set the stop reason.
if ($anvil->data->{switches}{'stop-reason'})
@ -216,7 +226,7 @@ sub stop_cluster
{
# Cluster has stopped.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
$anvil->Job->update_progress({progress => 5, message => "job_0313"});
$anvil->Job->update_progress({progress => 5, message => "job_0313"}) if $anvil->data->{switches}{'job-uuid'};
}
else
{
@ -225,7 +235,7 @@ sub stop_cluster
{
# Stop pacemaker now.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0323"});
$anvil->Job->update_progress({progress => 70, message => "job_0323"});
$anvil->Job->update_progress({progress => 70, message => "job_0323"}) if $anvil->data->{switches}{'job-uuid'};
### NOTE: '--force' is needed or else sole-running nodes can't exit
### (complains about the loss of quorum)
@ -243,7 +253,7 @@ sub stop_cluster
else
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0324"});
$anvil->Job->update_progress({progress => 80, message => "job_0324"});
$anvil->Job->update_progress({progress => 80, message => "job_0324"}) if $anvil->data->{switches}{'job-uuid'};
}
}
if ($waiting)
@ -279,31 +289,32 @@ sub process_servers
my $can_migrate = 1;
if ($server_count)
{
my $problem = $anvil->Cluster->parse_cib({debug => 2});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:problem' => $problem,
's2:cib::parsed::local::ready' => $anvil->data->{cib}{parsed}{'local'}{ready},
's3:cib::parsed::peer::ready' => $anvil->data->{cib}{parsed}{peer}{ready},
}});
if ($problem)
{
# We're not in the node's cluster, we can't migrate.
$can_migrate = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }});
}
elsif ((not $anvil->data->{cib}{parsed}{'local'}{ready}) or (not $anvil->data->{cib}{parsed}{peer}{ready}))
if ($anvil->Get->host_type() eq "dr")
{
# One of the subnodes is not in the cluster, so we can't migrate.
# No pacemaker, only stop servers.
$can_migrate = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, key => "job_0470"});
}
if ((not $anvil->data->{switches}{'stop-servers'}) && (not $can_migrate))
else
{
# We would have to stop the servers, and the user didn't tell us to do that, abort.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0372"});
$anvil->Job->update_progress({progress => 100, message => "error_0372"});
$anvil->nice_exit({exit_code => 1});
my $problem = $anvil->Cluster->parse_cib({debug => 2});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:problem' => $problem,
's2:cib::parsed::local::ready' => $anvil->data->{cib}{parsed}{'local'}{ready},
's3:cib::parsed::peer::ready' => $anvil->data->{cib}{parsed}{peer}{ready},
}});
if ($problem)
{
# We're not in the node's cluster, we can't migrate.
$can_migrate = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }});
}
elsif ((not $anvil->data->{cib}{parsed}{'local'}{ready}) or (not $anvil->data->{cib}{parsed}{peer}{ready}))
{
# One of the subnodes is not in the cluster, so we can't migrate.
$can_migrate = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }});
}
}
}
@ -311,13 +322,20 @@ sub process_servers
{
# Tell the user we're about to shut down servers.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0320"});
$anvil->Job->update_progress({progress => 10, message => "job_0320"});
$anvil->Job->update_progress({progress => 10, message => "job_0320"}) if $anvil->data->{switches}{'job-uuid'};
}
else
elsif ($can_migrate)
{
# Tell the user we're about to migrate servers.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0321"});
$anvil->Job->update_progress({progress => 10, message => "job_0321"});
$anvil->Job->update_progress({progress => 10, message => "job_0321"}) if $anvil->data->{switches}{'job-uuid'};
}
else
{
# We would have to stop the servers, and the user didn't tell us to do that, abort.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0372"});
$anvil->Job->update_progress({progress => 100, message => "error_0372"}) if $anvil->data->{switches}{'job-uuid'};
$anvil->nice_exit({exit_code => 1});
}
while ($waiting)
@ -351,7 +369,7 @@ sub process_servers
{
# It's running despite the cluster being down, stop it.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "job_0419", variables => { server => $server_name }});
$anvil->Job->update_progress({progress => $progress, message => "job_0419,!!server!".$server_name."!!"});
$anvil->Job->update_progress({progress => $progress, message => "job_0419,!!server!".$server_name."!!"}) if $anvil->data->{switches}{'job-uuid'};
$anvil->Server->shutdown_virsh({
debug => 2,
server => $server_name,
@ -365,7 +383,7 @@ sub process_servers
{
# Hit the power button again.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "job_0420", variables => { server => $server_name }});
$anvil->Job->update_progress({progress => $progress, message => "job_0420,!!server!".$server_name."!!"});
$anvil->Job->update_progress({progress => $progress, message => "job_0420,!!server!".$server_name."!!"}) if $anvil->data->{switches}{'job-uuid'};
$anvil->Server->shutdown_virsh({
debug => 2,
server => $server_name,
@ -407,7 +425,7 @@ sub process_servers
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0313"});
$anvil->Job->update_progress({progress => 80, message => "job_0313"});
$anvil->Job->update_progress({progress => 80, message => "job_0313"}) if $anvil->data->{switches}{'job-uuid'};
}
else
{
@ -436,7 +454,7 @@ sub process_servers
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0315", variables => { server => $server }});
$anvil->Job->update_progress({progress => 20, message => "job_0315,!!server!".$server."!!"});
$anvil->Job->update_progress({progress => 20, message => "job_0315,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
}
elsif ($host_name eq $local_name)
{
@ -454,7 +472,7 @@ sub process_servers
{
# Use PCS.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0316", variables => { server => $server }});
$anvil->Job->update_progress({progress => 20, message => "job_0316,!!server!".$server."!!"});
$anvil->Job->update_progress({progress => 20, message => "job_0316,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
$anvil->Cluster->shutdown_server({
debug => 2,
server => $server,
@ -473,7 +491,7 @@ sub process_servers
{
# Use virsh
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0317", variables => { server => $server }});
$anvil->Job->update_progress({progress => 20, message => "job_0317,!!server!".$server."!!"});
$anvil->Job->update_progress({progress => 20, message => "job_0317,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
$anvil->Server->shutdown_virsh({
debug => 2,
server => $server,
@ -495,7 +513,7 @@ sub process_servers
server => $server,
node => $peer_name,
}});
$anvil->Job->update_progress({progress => 20, message => "job_0318,!!server!".$server."!!,!!node!".$peer_name."!!"});
$anvil->Job->update_progress({progress => 20, message => "job_0318,!!server!".$server."!!,!!node!".$peer_name."!!"}) if $anvil->data->{switches}{'job-uuid'};
$anvil->Cluster->migrate_server({
server => $server,
node => $peer_name,
@ -512,7 +530,7 @@ sub process_servers
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0319"});
$anvil->Job->update_progress({progress => 30, message => "job_0319"});
$anvil->Job->update_progress({progress => 30, message => "job_0319"}) if $anvil->data->{switches}{'job-uuid'};
return(0);
}
@ -523,7 +541,7 @@ sub wait_on_drbd
my ($anvil) = @_;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0322"});
$anvil->Job->update_progress({progress => 40, message => "job_0322"});
$anvil->Job->update_progress({progress => 40, message => "job_0322"}) if $anvil->data->{switches}{'job-uuid'};
my $short_host_name = $anvil->Get->short_host_name();
my $waiting = 1;
while ($waiting)
@ -557,7 +575,7 @@ sub wait_on_drbd
resource => $server_name,
volume => $volume,
}});
$anvil->Job->update_progress({progress => 50, message => "job_0312,!!peer_host!".$peer_name."!!,!!resource!".$server_name."!!,!!volume!".$volume."!!"});
$anvil->Job->update_progress({progress => 50, message => "job_0312,!!peer_host!".$peer_name."!!,!!resource!".$server_name."!!,!!volume!".$volume."!!"}) if $anvil->data->{switches}{'job-uuid'};
}
}
}
@ -570,7 +588,7 @@ sub wait_on_drbd
# All servers should be down now, so stop DRBD.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0314"});
$anvil->Job->update_progress({progress => 60, message => "job_0314"});
$anvil->Job->update_progress({progress => 60, message => "job_0314"}) if $anvil->data->{switches}{'job-uuid'};
my $shell_call = $anvil->data->{path}{exe}{drbdadm}." down all";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
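
One pattern repeats throughout this tool: every Job->update_progress() call is now guarded with 'if $anvil->data->{switches}{'job-uuid'}', because progress is written to the jobs table and on a '--no-db' run there is no job row (and possibly no database) to write to. A hypothetical helper showing the shape of the guard (not part of this commit):

    # Hypothetical wrapper; only record progress when a job exists.
    sub update_progress_if_job
    {
        my ($anvil, $progress, $message) = @_;
        return(0) if not $anvil->data->{switches}{'job-uuid'};
        $anvil->Job->update_progress({progress => $progress, message => $message});
        return(0);
    }

    # Usage: update_progress_if_job($anvil, 10, "job_0320");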

@ -27,30 +27,41 @@ $| = 1;
my $anvil = Anvil::Tools->new();
$anvil->data->{switches}{'job-uuid'} = "";
$anvil->data->{switches}{'no-wait'} = ""; # When set, we'll not wait when we shut down a single server
$anvil->data->{switches}{'server'} = "";
$anvil->data->{switches}{'server-uuid'} = "";
$anvil->data->{switches}{'wait'} = ""; # When set, we'll wait for each server to shut down when using '--all'
$anvil->Get->switches;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'},
'switches::no-wait' => $anvil->data->{switches}{'no-wait'},
'switches::server' => $anvil->data->{switches}{'server'},
'switches::server-uuid' => $anvil->data->{switches}{'server-uuid'},
'switches::wait' => $anvil->data->{switches}{'wait'},
}});
# Read switches.
$anvil->Get->switches({list => [
"no-db",
"no-wait",
"server",
"server-uuid",
"wait"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
# Connect to DBs.
if ($anvil->data->{switches}{'no-db'})
{
# No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try
# again after we exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0075"});
sleep 10;
$anvil->nice_exit({exit_code => 1});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "log_0743"});
# If there was a job-uuid, clear it.
$anvil->data->{sys}{database}{connections} = 0;
$anvil->data->{switches}{'job-uuid'} = "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'sys::database::connections' => $anvil->data->{sys}{database}{connections},
'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'},
}});
}
else
{
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try
# again after we exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0075"});
sleep 10;
$anvil->nice_exit({exit_code => 1});
}
}
if ($anvil->data->{switches}{'job-uuid'})
@ -88,6 +99,14 @@ if ($anvil->data->{switches}{'job-uuid'})
# Now check that we have a server. If it's a server_uuid, read the server name.
if ($anvil->data->{switches}{'server-uuid'})
{
# Do we have DB connection(s)?
if (not $anvil->data->{sys}{database}{connections})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0265"});
$anvil->Job->update_progress({progress => 100, message => "error_0265"});
$anvil->nice_exit({exit_code => 1});
}
# Convert the server_uuid to a server_name.
my $query = "SELECT server_name FROM servers WHERE server_uuid = ".$anvil->Database->quote($anvil->data->{switches}{'server-uuid'}).";";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
@ -125,6 +144,9 @@ if (not $anvil->data->{switches}{'server'})
# Are we a node or DR host?
$anvil->data->{sys}{host_type} = $anvil->Get->host_type();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'sys::host_type' => $anvil->data->{sys}{host_type},
}});
if (($anvil->data->{sys}{host_type} ne "node") && ($anvil->data->{sys}{host_type} ne "dr"))
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0264"});
@ -132,26 +154,28 @@ if (($anvil->data->{sys}{host_type} ne "node") && ($anvil->data->{sys}{host_type
$anvil->nice_exit({exit_code => 1});
}
### TODO: Add DR support. For now, this only works on Nodes in a cluster
if ($anvil->data->{sys}{host_type} eq "dr")
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0265"});
$anvil->Job->update_progress({progress => 100, message => "error_0265"});
$anvil->nice_exit({exit_code => 1});
}
# Make sure that we're in an Anvil! system.
$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid();
if (not $anvil->data->{sys}{anvil_uuid})
$anvil->data->{sys}{anvil_uuid} = "";
if (($anvil->data->{sys}{host_type} eq "node") && ($anvil->data->{sys}{database}{connections}))
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0260"});
$anvil->Job->update_progress({progress => 100, message => "error_0260"});
$anvil->nice_exit({exit_code => 1});
$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'sys::anvil_uuid' => $anvil->data->{sys}{anvil_uuid},
}});
if (not $anvil->data->{sys}{anvil_uuid})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0260"});
$anvil->Job->update_progress({progress => 100, message => "error_0260"});
$anvil->nice_exit({exit_code => 1});
}
}
# This is copied from anvil-boot-server, but it works here as well. We can't use 'pcs' without pacemaker
# being up.
wait_for_pacemaker($anvil);
if ($anvil->data->{sys}{host_type} eq "node")
{
wait_for_pacemaker($anvil);
}
# If 'server' is 'all', shut down all servers.
if (lc($anvil->data->{switches}{'server'}) eq "all")
@ -165,7 +189,7 @@ else
}
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0281"});
$anvil->Job->update_progress({progress => 100, message => "job_0281"});
$anvil->Job->update_progress({progress => 100, message => "job_0281"}) if $anvil->data->{switches}{'job-uuid'};
$anvil->nice_exit({exit_code => 0});
@ -195,20 +219,20 @@ sub wait_for_pacemaker
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0279"});
$anvil->Job->update_progress({progress => 15, message => "job_0279"});
$anvil->Job->update_progress({progress => 15, message => "job_0279"}) if $anvil->data->{switches}{'job-uuid'};
}
else
{
# Node isn't ready yet.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0278"});
$anvil->Job->update_progress({progress => 10, message => "job_0278"});
$anvil->Job->update_progress({progress => 10, message => "job_0278"}) if $anvil->data->{switches}{'job-uuid'};
}
}
else
{
# Cluster hasn't started.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0277"});
$anvil->Job->update_progress({progress => 5, message => "job_0277"});
$anvil->Job->update_progress({progress => 5, message => "job_0277"}) if $anvil->data->{switches}{'job-uuid'};
}
if ($waiting)
{
@ -233,7 +257,7 @@ sub shutdown_server
{
# Nope.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "log_0548", variables => { server => $server }});
$anvil->Job->update_progress({progress => 100, message => "log_0548,!!server!".$server."!!"});
$anvil->Job->update_progress({progress => 100, message => "log_0548,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
$anvil->nice_exit({exit_code => 1});
}
@ -243,24 +267,42 @@ sub shutdown_server
{
# It's off already
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0284", variables => { server => $server }});
$anvil->Job->update_progress({progress => $progress, message => "job_0284,!!server!".$server."!!"});
$anvil->Job->update_progress({progress => $progress, message => "job_0284,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
return(0);
}
# Now shut down.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0289", variables => { server => $server }});
$anvil->Job->update_progress({progress => $progress, message => "job_0289,!!server!".$server."!!"});
my $problem = $anvil->Cluster->shutdown_server({
debug => 2,
server => $server,
'wait' => $wait,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
$anvil->Job->update_progress({progress => $progress, message => "job_0289,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
my $problem = 0;
if ($anvil->Get->host_type eq "dr")
{
# Shut down using virsh. Invert the return.
my $success = $anvil->Server->shutdown_virsh({
debug => 2,
server => $server,
wait_time => $wait ? 0 : 1,
});
$problem = $success ? 0 : 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
success => $success,
problem => $problem,
}});
}
else
{
$problem = $anvil->Cluster->shutdown_server({
debug => 2,
server => $server,
'wait' => $wait,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
}
if ($problem)
{
# Failed, abort.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0268", variables => { server => $server }});
$anvil->Job->update_progress({progress => 100, message => "error_0268,!!server!".$server."!!"});
$anvil->Job->update_progress({progress => 100, message => "error_0268,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
$anvil->nice_exit({exit_code => 1});
}
else
@ -269,13 +311,13 @@ sub shutdown_server
{
# Stopped!
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0285", variables => { server => $server }});
$anvil->Job->update_progress({progress => $progress, message => "job_0285,!!server!".$server."!!"});
$anvil->Job->update_progress({progress => $progress, message => "job_0285,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
}
else
{
# Stop requested.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0286", variables => { server => $server }});
$anvil->Job->update_progress({progress => $progress, message => "job_0286,!!server!".$server."!!"});
$anvil->Job->update_progress({progress => $progress, message => "job_0286,!!server!".$server."!!"}) if $anvil->data->{switches}{'job-uuid'};
}
}
@ -289,7 +331,7 @@ sub shutdown_all_servers
### TODO: Manage the stop order here, inverse of boot order.
# We top out at 90, bottom is 20.
my $server_count = keys %{$anvil->data->{cib}{parsed}{data}{server}};
my $increment = int(70 / $server_count);
my $increment = $server_count ? int(70 / $server_count) : 70;
my $percent = 15;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
server_count => $server_count,

@ -29,21 +29,27 @@ my $anvil = Anvil::Tools->new();
# Read switches.
$anvil->Get->switches({list => [
"task",
"clear-cache",
"no-db",
"no-reboot",
"reboot",
"resource",
], man => $THIS_FILE});
"task"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
if ((not $anvil->data->{switches}{'no-db'}) && ($anvil->data->{switches}{task} eq "stop-target"))
{
# No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try
# again after we exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0306"});
sleep 10;
$anvil->nice_exit({exit_code => 1});
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try
# again after we exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0306"});
sleep 10;
$anvil->nice_exit({exit_code => 1});
}
}
if ($anvil->data->{switches}{'job-uuid'})
@ -57,10 +63,19 @@ if ($anvil->data->{switches}{'job-uuid'})
message => "message_0311",
});
}
if ($anvil->data->{switches}{task} eq "refresh-drbd-resource")
{
refresh_drbd_resource($anvil);
}
elsif ($anvil->data->{switches}{task} eq "safe-stop")
{
stop_target($anvil);
}
elsif ($anvil->data->{switches}{task} eq "update-system")
{
update_target($anvil);
}
$anvil->nice_exit({exit_code => 0});
@ -117,4 +132,58 @@ sub refresh_drbd_resource
$anvil->nice_exit({exit_code => 0});
return(0);
}
}
# This calls 'anvil-safe-stop' on the local host as a background process. This is designed to allow remote
# machines to withdraw a subnode from its node's cluster, or stop servers on a DR host, when the Strikers may
# have already updated and could prevent the local system from connecting to any databases.
sub stop_target
{
my ($anvil) = @_;
my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-stop'}." --no-db -vv --log-secure";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({
shell_call => $shell_call,
background => 1,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
return(0);
}
# This calls 'anvil-update-system' on the local host as a background process. This is designed to allow
# remote machines to update the local OS when the Strikers may have already updated and could prevent
# the local system from connecting to any databases.
sub update_target
{
my ($anvil) = @_;
my $shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}." --no-db -vv --log-secure";
if ($anvil->data->{switches}{'clear-cache'})
{
$shell_call .= " --clear-cache";
}
if ($anvil->data->{switches}{'no-reboot'})
{
$shell_call .= " --no-reboot";
}
if ($anvil->data->{switches}{'reboot'})
{
$shell_call .= " --reboot";
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $return_code) = $anvil->System->call({
shell_call => $shell_call,
background => 1,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
return(0);
}
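For context, the two helpers above are reached through the --task switch; a caller might queue them like the sketch below (hypothetical install path and invocation, mirroring the backgrounded System->call pattern used in the subs):

#!/usr/bin/perl
# Hypothetical sketch: queue the 'safe-stop' task as a detached background
# process, so a dropped SSH session can't kill the withdrawal part-way.
use strict;
use warnings;

my $program    = "/usr/sbin/anvil-special-operations";   # hypothetical path
my $shell_call = $program." --task safe-stop --no-db >/dev/null 2>&1 &";
print "Would call: [".$shell_call."]\n";
# system($shell_call);   # uncomment to actually background the task
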

@ -40,6 +40,7 @@ my $anvil = Anvil::Tools->new();
# Read switches (target ([user@]host[:port]) and the file with the target's password.
$anvil->Get->switches({list => [
"clear-cache",
"no-db",
"no-reboot",
"reboot"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
@ -49,13 +50,28 @@ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }});
# Connect to DBs.
$anvil->Database->connect;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
if ($anvil->data->{switches}{'no-db'})
{
# No databases, exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "error_0003"});
$anvil->nice_exit({exit_code => 1});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "log_0743"});
# If there was a job-uuid, clear it.
$anvil->data->{sys}{database}{connections} = 0;
$anvil->data->{switches}{'job-uuid'} = "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'sys::database::connections' => $anvil->data->{sys}{database}{connections},
'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'},
}});
}
else
{
$anvil->Database->connect;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
# No databases, exit.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, secure => 0, key => "error_0003"});
$anvil->nice_exit({exit_code => 1});
}
}
if ($anvil->data->{switches}{'job-uuid'})
@ -70,7 +86,7 @@ if ($anvil->data->{switches}{'job-uuid'})
}
}
# Clea any old runs.
# Clear any old runs.
update_progress($anvil, 0, "clear");
# We'll keep a count of lines and packages to show the user.
@ -84,11 +100,54 @@ update_progress($anvil, 1, "message_0058,!!downloaded!0!!,!!installed!0!!,!!veri
update_progress($anvil, 2, "message_0033");
# Make sure maintenance mode is enabled.
$anvil->System->maintenance_mode({set => 1});
$anvil->System->maintenance_mode({set => 1}) if $anvil->data->{sys}{database}{connections};
# Run the update
run_os_update($anvil, 1, 3);
# If we had no database, try to reconnect now that the update is complete.
if (not $anvil->data->{sys}{database}{connections})
{
# Start the anvil-daemon; the caller likely ran us without a DB because we're being updated by
# striker-update-cluster, so there will be a job waiting for us.
$anvil->System->enable_daemon({now => 1, daemon => "anvil-daemon"});
$anvil->Database->connect;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0132"});
if ($anvil->data->{sys}{database}{connections})
{
# If there's a job for us waiting, mark it as almost done.
my $query = "
SELECT
job_uuid
FROM
jobs
WHERE
job_command LIKE '%".$THIS_FILE."%'
AND
job_host_uuid = ".$anvil->Database->quote($anvil->Get->host_uuid)."
AND
job_progress = 0
;";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }});
my $job_uuid = $anvil->Database->query({query => $query, source => $THIS_FILE, line => __LINE__})->[0]->[0];
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
if ($job_uuid)
{
$anvil->data->{switches}{'job-uuid'} = $job_uuid;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'},
}});
update_progress($anvil, 0, "clear");
update_progress($anvil, 90, "message_0324");
}
}
}
# We're done updating
my $reboot_needed = $anvil->System->reboot_needed({debug => 2});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }});
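The reboot_needed() check above has to give a sane answer even when no database is available; a minimal sketch of a cache-file fallback such a check could use (hypothetical standalone helper and path, not the project's implementation):

#!/usr/bin/perl
# Hypothetical sketch: persist a reboot-needed flag to a cache file so later
# runs can read it back without a database. /tmp clears on reboot, which also
# clears the flag once the reboot actually happens.
use strict;
use warnings;

my $cache_file = "/tmp/anvil.reboot-needed";   # hypothetical path

sub reboot_needed_cached
{
	my ($set) = @_;
	if (defined $set)
	{
		open(my $fh, ">", $cache_file) or die "Failed to write: [".$cache_file."]; error: [".$!."]\n";
		print $fh $set."\n";
		close $fh;
		return($set);
	}
	return(0) if (not -e $cache_file);
	open(my $fh, "<", $cache_file) or die "Failed to read: [".$cache_file."]; error: [".$!."]\n";
	my $flag = <$fh>;
	close $fh;
	chomp $flag if defined $flag;
	return($flag ? 1 : 0);
}

reboot_needed_cached(1);
print "reboot needed: [".reboot_needed_cached()."]\n";   # prints: reboot needed: [1]
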
@ -97,7 +156,7 @@ if ($reboot_needed)
if (not $anvil->data->{switches}{'no-reboot'})
{
# Clear maintenance mode.
$anvil->System->maintenance_mode({set => 0});
$anvil->System->maintenance_mode({set => 0}) if $anvil->data->{sys}{database}{connections};
# Record that we're rebooting so that 'striker-update-cluster' knows to wait for a reboot.
if ($anvil->data->{switches}{'job-uuid'})
@ -117,17 +176,21 @@ WHERE
# Register a job to reboot.
update_progress($anvil, 98, "message_0318");
my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
file => $THIS_FILE,
line => __LINE__,
job_command => $anvil->data->{path}{exe}{'anvil-manage-power'}." --reboot -y".$anvil->Log->switches,
job_data => "",
job_name => "reboot::system",
job_title => "job_0009",
job_description => "job_0006",
job_progress => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
if ($anvil->data->{sys}{database}{connections})
{
my ($job_uuid) = $anvil->Database->insert_or_update_jobs({
file => $THIS_FILE,
line => __LINE__,
job_command => $anvil->data->{path}{exe}{'anvil-manage-power'}." --reboot -y".$anvil->Log->switches,
job_data => "",
job_name => "reboot::system",
job_title => "job_0009",
job_description => "job_0006",
job_progress => 0,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
}
# Record that we're going to reboot now.
update_progress($anvil, 100, "message_0317");
@ -144,7 +207,7 @@ else
}
# Clear maintenance mode.
$anvil->System->maintenance_mode({set => 0});
$anvil->System->maintenance_mode({set => 0}) if $anvil->data->{sys}{database}{connections};
$anvil->nice_exit({exit_code => 0});

@ -373,10 +373,14 @@ sub collect_remote_data
$shell_call = $anvil->data->{path}{exe}{journalctl}." -b 0 > /tmp/journalctl-current-boot.log";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
($output, $error, $return_code) = $anvil->Remote->call({
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
error => $error,
return_code => $return_code,
}});
# Copying the file
@ -403,6 +407,46 @@ sub collect_remote_data
});
}
# Grab cloud-init data, if it exists.
$shell_call = "if [ -e /var/log/cloud-init.log ]; then echo 1; else echo 0; fi";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
($output, $error, $return_code) = $anvil->Remote->call({
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
if ($output eq "1")
{
print "- Grabbing cloud-init logs... ";
$anvil->Storage->rsync({
source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/var/log/cloud-init*",
destination => $target_directory."/",
});
$test_file = $target_directory."/cloud-init.log";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }});
if (-e $test_file)
{
print "Done.\n";
}
else
{
print "Failed!\n";
print "- For some reason, these files were not collected.\n";
$anvil->Storage->write_file({
file => $test_file,
body => $failed_body,
overwrite => 1,
backup => 0,
});
}
}
# If we're a striker, dump the database also.
if ($this_host_type eq "striker")
{
@ -636,6 +680,20 @@ sub collect_local_data
}});
print "Done!\n";
if (-e "/var/log/cloud-init.log")
{
print "- Grabbing cloud-init logs... ";
$shell_call = $anvil->data->{path}{exe}{cp}." /var/log/cloud-init* ".$target_directory."/";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
($output, $return_code) = $anvil->System->call({shell_call => $shell_call});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
return_code => $return_code,
}});
print "Done!\n";
}
# If this is a node, grab the shared files.
if ($this_host_type eq "node")
{

@ -46,6 +46,7 @@ $anvil->Get->switches({list => [
"no-reboot",
"reboot",
"reboot-self",
"timeout",
"y",
"yes"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
@ -145,6 +146,8 @@ update_strikers_and_dr($anvil);
# Update DR Host
update_nodes($anvil);
manage_daemons($anvil, "start");
print "Updates complete!\n";
my $host_uuid = $anvil->Get->host_uuid;
@ -310,26 +313,12 @@ sub update_nodes
print "- [ Note ] - If the node has servers that need to be migrated off, or if the node is SyncSource for storage,\n";
print "- [ Note ] - this could take some time to complete.\n";
# Register an anvil-safe-stop job and then wait.
my $job_uuid = $anvil->Database->insert_or_update_jobs({
debug => 2,
job_command => $anvil->data->{path}{exe}{'anvil-safe-stop'},
job_host_uuid => $host_uuid,
job_description => "job_0339",
job_name => "cgi-bin::set_membership::leave",
job_progress => 0,
job_title => "job_0338"
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n";
# Log into the target machine and make sure anvil-daemon is running.
print "- Making sure anvil-daemon is running... ";
my $shell_call = $anvil->data->{path}{exe}{systemctl}." enable --now anvil-daemon.service";
# Make sure VMs are off, DRBD is down and the subnode is out of the cluster. Call this
# with nohup so it doesn't get killed by the loss of the SSH connection.
my $shell_call = $anvil->data->{path}{exe}{nohup}." ".$anvil->data->{path}{exe}{'anvil-safe-stop'}." --no-db".$anvil->Log->switches()." >/dev/null 2>&1 &";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $error, $return_code) = $anvil->Remote->call({
'close' => 1,
no_cache => 1,
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
@ -338,84 +327,106 @@ sub update_nodes
error => $error,
return_code => $return_code,
}});
if (not $return_code)
{
print " running.\n";
}
else
{
print " not running!\n";
}
# Verify that the node is no longer in the cluster.
my $waiting = 1;
my $next_log = time + 60;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
# Now wait for DRBD resources to stop (which requires VMs be off).
print "- Waiting for all DRBD resource (and the servers using them) to stop before proceeding.\n";
my $wait_until = $anvil->data->{switches}{timeout} ? $anvil->data->{switches}{timeout} : 3600;
$wait_until += time;
my $next_log = time + 60;
my $waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
wait_until => $wait_until,
next_log => $next_log,
waiting => $waiting,
}});
while ($waiting)
{
my $drbd_up = 0;
my $pacemaker_up = 0;
$anvil->DRBD->get_status({
host => $short_host_name,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
# How many resources are up?
my $resource_count = keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_count => $resource_count }});
if ($resource_count)
{
# DRBD is still up.
$drbd_up = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { drbd_up => $drbd_up }});
}
# Is pacemaker down?
my $problem = $anvil->Cluster->parse_cib({target => $anvil->data->{peer}{$short_host_name}{access}{ip}});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
if ($problem)
if (not $problem)
{
# Node is still in the cluster.
$pacemaker_up = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pacemaker_up => $pacemaker_up }});
}
if ((not $pacemaker_up) && (not $drbd_up))
{
# This is good, it didn't parse so it's out of the cluster.
print "- The subnode is out of the node cluster. Proceeding.\n";
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
else
if ($waiting)
{
# Log which resources are still up
if (time > $next_log)
{
$anvil->Job->get_job_details({job_uuid => $job_uuid});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"jobs::job_progress" => $anvil->data->{jobs}{job_progress},
"jobs::job_data" => $anvil->data->{jobs}{job_data},
}});
if ($anvil->data->{jobs}{job_progress} == 0)
my $say_time = $anvil->Get->date_and_time({time_only => 1});
if ($pacemaker_up)
{
print "[ Warning ] - The job has not been picked up yet. Is 'anvil-daemon' running on: [".$short_host_name."]?\n";
print "[ Note ] - [".$say_time."] - The subnode is still in the cluster.\n";
}
else
{
print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n";
print "[ Note ] - [".$say_time."] - The subnode is no longer in the cluster, good.\n";
}
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
{
print "[ Note ] - [".$say_time."] - The resource: [".$resource."] is still up.\n";
}
$next_log = time + 60;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
print "- Will check again shortly\n";
}
sleep 5;
if (time > $wait_until)
{
# Timeout.
print "[ Error ] - Timed out while waiting for the subnode: [".$short_host_name."] to stop all DRBD resources nad leave the cluster. Aborting the update.\n";
$anvil->nice_exit({exit_code => 1});
}
sleep 10;
}
}
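The loop above is a poll-until-timeout pattern that recurs through this file: check a condition, log periodically, give up after a deadline. A minimal standalone sketch (hypothetical condition callback, not the project's API):

#!/usr/bin/perl
# Hypothetical sketch of the poll-until-timeout loop used above.
use strict;
use warnings;

sub wait_for
{
	my ($check_done, $timeout, $interval) = @_;
	my $wait_until = time + $timeout;
	while (1)
	{
		return(1) if $check_done->();          # condition met
		return(0) if (time > $wait_until);     # timed out
		sleep $interval;
	}
}

# Example: wait up to 5 seconds for a countdown to hit zero, polling each second.
my $count = 3;
my $ok    = wait_for(sub { return(--$count <= 0); }, 5, 1);
print "ok: [".$ok."]\n";   # prints: ok: [1]
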
# Record the start time so that we can be sure the subnode has rebooted (uptime is
# less than the current time minus this start time), if the host reboots as part of
# the update.
my $reboot_time = time;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
reboot_time => $reboot_time,
short_host_name => $short_host_name,
}});
# Do the OS update.
print "- Beginning OS update of: [".$short_host_name."]\n";
my $rebooted = 0;
$shell_call = $anvil->data->{path}{exe}{'anvil-update-system'};
my $update_switches = "";
if ($anvil->data->{switches}{'no-reboot'})
{
$shell_call .= " --no-reboot";
}
if ($anvil->data->{switches}{'clear-cache'})
{
$shell_call .= " --clear-cache";
$update_switches .= " --no-reboot";
}
if ($anvil->data->{switches}{reboot})
{
$shell_call .= " --reboot";
$update_switches .= " --reboot";
}
$shell_call .= $anvil->Log->switches();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update_switches => $update_switches }});
# We register a job, even though anvil-daemon isn't running. This will get picked up
# by 'anvil-update-system --no-db' towards the end of its run.
print "- Registering a job to update the subnode, which we can track to confirm when the update is done.\n";
$shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}.$update_switches.$anvil->Log->switches();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
$job_uuid = "";
$job_uuid = $anvil->Database->insert_or_update_jobs({
my $job_uuid = $anvil->Database->insert_or_update_jobs({
debug => 2,
job_command => $shell_call,
job_description => "job_0468",
@ -427,6 +438,39 @@ sub update_nodes
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n";
# Now call anvil-update-system with --no-db and background it so we can close
# the DB connection without killing the process.
print "- Calling the no-database update of: [".$short_host_name."]\n";
$shell_call = $anvil->data->{path}{exe}{nohup}." ".$anvil->data->{path}{exe}{'anvil-update-system'}." --no-db".$update_switches;
if ($anvil->data->{switches}{'clear-cache'})
{
# We'll only call clear-cache on this one.
$shell_call .= " --clear-cache";
}
$shell_call .= $anvil->Log->switches()." >/dev/null 2>&1 &";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
($output, $error, $return_code) = $anvil->Remote->call({
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
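A note on the 'nohup ... >/dev/null 2>&1 &' wrapper built above: with every file handle redirected and the process backgrounded, sshd can tear the session down without SIGHUP killing the update mid-run. A small sketch of the command construction (hypothetical paths):

#!/usr/bin/perl
# Hypothetical sketch: build the fire-and-forget command used above.
use strict;
use warnings;

my $nohup      = "/usr/bin/nohup";
my $program    = "/usr/sbin/anvil-update-system --no-db";   # hypothetical path
my $shell_call = $nohup." ".$program." >/dev/null 2>&1 &";
print $shell_call."\n";
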
# Record the start time so that we can be sure the subnode has rebooted (uptime is
# less than the current time minus this start time), if the host reboots as part of
# the update.
my $rebooted = 0;
my $reboot_time = time;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
rebooted => $rebooted,
reboot_time => $reboot_time,
short_host_name => $short_host_name,
}});
# Now wait for the update job on the subnode to complete.
$waiting = 1;
$next_log = time + 60;
@ -440,7 +484,7 @@ sub update_nodes
}});
if ($anvil->data->{jobs}{job_progress} == 100)
{
print "- Done! The host: [".$short_host_name."] has been updated\n";
print "- Done! The subnode: [".$short_host_name."] has been updated\n";
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
@ -461,15 +505,13 @@ sub update_nodes
}
else
{
my $say_date = $anvil->Get->date_and_time({time_only => 1});
if (time > $next_log)
{
if ($anvil->data->{jobs}{job_progress} == 0)
{
print "[ Warning ] - The job has not been picked up yet. Is 'anvil-daemon' running on: [".$short_host_name."]?\n";
}
else
print "[ Note ] - [".$say_date."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n";
if ($anvil->data->{jobs}{job_progress} eq "0")
{
print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n";
print "[ Note ] - [".$say_date."] - It is expected for the job to stay at '0' for a while.\n";
}
$next_log = time + 60;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
@ -488,7 +530,6 @@ sub update_nodes
else
{
print "- Reboot not needed, kernel appears to be up to date.\n";
$run_anvil_safe_start = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { run_anvil_safe_start => $run_anvil_safe_start }});
}
@ -512,7 +553,7 @@ sub update_nodes
{
print "- Calling 'anvil-safe-start' to rejoin the subnode to the node.\n";
$start_called = 1;
my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-start'};
my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-start'}.$anvil->Log->switches()." >/dev/null 2>&1 &";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
start_called => $start_called,
shell_call => $shell_call,
@ -707,13 +748,19 @@ sub update_strikers_and_dr
{
my ($anvil) = @_;
foreach my $host_type ("striker", "dr")
# Make sure the timeout, if set, is valid.
if ($anvil->data->{switches}{timeout})
{
if ($host_type eq "dr")
if ($anvil->data->{switches}{timeout} =~ /\D/)
{
# Restart daemons.
manage_daemons($anvil, "start");
# Invalid, error out.
print "The --timeout switch was used: [".$anvil->data->{switches}{timeout}."], but the value isn't a number of seconds.\n";
$anvil->nice_exit({exit_code => 1});
}
}
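The /\D/ test above accepts only unsigned whole numbers of seconds; a quick standalone sketch of what passes and fails:

#!/usr/bin/perl
# Sketch of the --timeout validation above: any non-digit character rejects
# the value, so only unsigned whole numbers of seconds are accepted.
use strict;
use warnings;

foreach my $value ("3600", "1h", "-300", "10.5")
{
	my $valid = ($value =~ /\D/) ? 0 : 1;
	print "timeout: [".$value."] valid: [".$valid."]\n";
}
# prints: 3600 -> 1, 1h -> 0, -300 -> 0, 10.5 -> 0
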
foreach my $host_type ("striker", "dr")
{
foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
{
my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name};
@ -859,20 +906,93 @@ sub update_strikers_and_dr
{
# Call anvil-update-system and then wait.
print "- Beginning OS update of: [".$short_host_name."]\n";
my $shell_call = $anvil->data->{path}{exe}{'anvil-update-system'};
if ($anvil->data->{switches}{'no-reboot'})
if ($host_type eq "dr")
{
$shell_call .= " --no-reboot";
# Make sure VMs are off and DRBD is down. Call this with nohup so it
# doesn't get killed by the loss of the SSH connection.
my $shell_call = $anvil->data->{path}{exe}{nohup}." ".$anvil->data->{path}{exe}{'anvil-safe-stop'}." --no-db".$anvil->Log->switches()." >/dev/null 2>&1 &";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $error, $return_code) = $anvil->Remote->call({
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
output => $output,
error => $error,
return_code => $return_code,
}});
# Now wait for DRBD resources to stop (which requires VMs be off).
print "- Waiting for all DRBD resource (and the servers using them) to stop before proceeding.\n";
my $wait_until = $anvil->data->{switches}{timeout} ? $anvil->data->{switches}{timeout} : 3600;
$wait_until += time;
my $next_log = time + 60;
my $waiting = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
wait_until => $wait_until,
next_log => $next_log,
waiting => $waiting,
}});
while ($waiting)
{
my $drbd_up = 0;
$anvil->DRBD->get_status({
host => $short_host_name,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
# How many resources are up?
my $resource_count = keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_count => $resource_count }});
if (not $resource_count)
{
# Done!
$waiting = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
}
if ($waiting)
{
# Log which resources are still up
if (time > $next_log)
{
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}})
{
print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The resource: [".$resource."] is still up.\n";
}
$next_log = time + 60;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
print "- Will check again shortly\n";
}
if (time > $wait_until)
{
# Timeout.
print "[ Error ] - Timed out while waiting for the DR host: [".$short_host_name."] to stop all DRBD resources. Aborting the update.\n";
$anvil->nice_exit({exit_code => 1});
}
sleep 10;
}
}
}
if ($anvil->data->{switches}{'clear-cache'})
my $update_switches = "";
if ($anvil->data->{switches}{'no-reboot'})
{
$shell_call .= " --clear-cache";
$update_switches .= " --no-reboot";
}
if ($anvil->data->{switches}{reboot})
{
$shell_call .= " --reboot";
$update_switches .= " --reboot";
}
$shell_call .= $anvil->Log->switches();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update_switches => $update_switches }});
# We register a job, even though anvil-daemon isn't running. This will get
# picked up by 'anvil-update-system --no-db' towards the end of its run.
print "- Registering a job to update the system, which we can track to confirm when the update is done.\n";
my $shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}.$update_switches.$anvil->Log->switches();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my $job_uuid = $anvil->Database->insert_or_update_jobs({
debug => 2,
@ -886,13 +1006,19 @@ sub update_strikers_and_dr
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n";
# Log into the target machine and make sure anvil-daemon is running.
print "- Making sure anvil-daemon is running... ";
$shell_call = $anvil->data->{path}{exe}{systemctl}." start anvil-daemon.service";
# Now call anvil-update-system with --no-db and background it so we can close
# the DB connection without killing the process.
print "- Calling the no-database update of: [".$short_host_name."]\n";
$shell_call = $anvil->data->{path}{exe}{nohup}." ".$anvil->data->{path}{exe}{'anvil-update-system'}." --no-db".$update_switches;
if ($anvil->data->{switches}{'clear-cache'})
{
# We'll only call clear-cache on this one.
$shell_call .= " --clear-cache";
}
$shell_call .= $anvil->Log->switches()." >/dev/null 2>&1 &";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
my ($output, $error, $return_code) = $anvil->Remote->call({
'close' => 1,
no_cache => 1,
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
@ -901,16 +1027,10 @@ sub update_strikers_and_dr
error => $error,
return_code => $return_code,
}});
if (not $return_code)
{
print " running.\n";
}
else
{
print " not running!\n";
}
# Verify that the node is no longer in the cluster.
# Verify / wait until the update is done.
my $wait_until = $anvil->data->{switches}{timeout} ? $anvil->data->{switches}{timeout} : 3600;
$wait_until += time;
my $waiting = 1;
my $next_log = time + 60;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
@ -946,17 +1066,21 @@ sub update_strikers_and_dr
{
if (time > $next_log)
{
my $say_date = $anvil->Get->date_and_time({time_only => 1});
print "[ Note ] - [".$say_date."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n";
if ($anvil->data->{jobs}{job_progress} == 0)
{
print "[ Warning ] - The job has not been picked up yet. Is 'anvil-daemon' running on: [".$short_host_name."]?\n";
}
else
{
print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n";
print "[ Note ] - [".$say_date."] - It is normal for the job to show '0' progress until the database access is restored.\n";
}
$next_log = time + 60;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }});
}
if (time > $wait_until)
{
# Timeout.
print "[ Error ] - Timed out while waiting for the machine: [".$short_host_name."] to update the OS. Aborting the update.\n";
$anvil->nice_exit({exit_code => 1});
}
sleep 5;
}
}
@ -991,6 +1115,8 @@ sub update_strikers_and_dr
else
{
($output, $error, $return_code) = $anvil->Remote->call({
'close' => 1,
no_cache => 1,
shell_call => $shell_call,
target => $anvil->data->{peer}{$short_host_name}{access}{ip},
});
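On the 'close' and 'no_cache' options added to the Remote->call above: my reading, from the names alone, is that they skip any cached SSH session and close the connection once the call returns, which matters when the target may have just rebooted and a cached session would be stale. The same call, annotated with those assumed semantics (not confirmed from the library):

($output, $error, $return_code) = $anvil->Remote->call({
	'close'    => 1,   # assumed: close the SSH connection after this call
	no_cache   => 1,   # assumed: don't reuse a cached (possibly stale) session
	shell_call => $shell_call,
	target     => $anvil->data->{peer}{$short_host_name}{access}{ip},
});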
