diff --git a/Anvil/Tools.pm b/Anvil/Tools.pm index f415db42..9601da25 100644 --- a/Anvil/Tools.pm +++ b/Anvil/Tools.pm @@ -851,9 +851,6 @@ sub _set_defaults }, }; $anvil->data->{sys} = { - apache => { - user => "admin", - }, daemon => { dhcpd => "dhcpd.service", firewalld => "firewalld.service", @@ -1150,6 +1147,7 @@ sub _set_paths 'anvil-safe-start' => "/usr/sbin/anvil-safe-start", 'anvil-safe-stop' => "/usr/sbin/anvil-safe-stop", 'anvil-shutdown-server' => "/usr/sbin/anvil-shutdown-server", + 'anvil-special-operations' => "/usr/sbin/anvil-special-operations", 'anvil-sync-shared' => "/usr/sbin/anvil-sync-shared", 'anvil-update-files' => "/usr/sbin/anvil-update-files", 'anvil-update-states' => "/usr/sbin/anvil-update-states", @@ -1261,6 +1259,7 @@ sub _set_paths 'shutdown' => "/usr/sbin/shutdown", snmpget => "/usr/bin/snmpget", snmpset => "/usr/bin/snmpset", + 'sort' => "/usr/bin/sort", 'ssh-keygen' => "/usr/bin/ssh-keygen", 'ssh-keyscan' => "/usr/bin/ssh-keyscan", 'stat' => "/usr/bin/stat", @@ -1282,6 +1281,8 @@ sub _set_paths swapon => "/usr/sbin/swapon", sysctl => "/usr/sbin/sysctl", systemctl => "/usr/bin/systemctl", + tail => "/usr/bin/tail", + tar => "/usr/bin/tar", timeout => "/usr/bin/timeout", touch => "/usr/bin/touch", tput => "/usr/bin/tput", diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index 831d7c45..a50a68ab 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -2441,7 +2441,7 @@ sub get_peers =head2 get_primary_host_uuid -This takes an Anvil! UUID and returns with node is currently the "primary" node. That is to say, which node has the most servers running on it, by allocated RAM. For example, if node 1 has two servers, each with 8 GiB of RAN and node 2 has one VM with 32 GiB of RAM, node 2 will be considered primary as it would take longest to migrate servers off. +This takes an Anvil! UUID and returns with the node's host UUID that is currently the "primary" node. 
That is to say, which node has the most servers running on it, by allocated RAM. For example, if node 1 has two servers, each with 8 GiB of RAM and node 2 has one VM with 32 GiB of RAM, node 2 will be considered primary as it would take longest to migrate servers off. If all is equal, node 1 is considered primary. If only one node is a cluster member, it is considered primary. If neither node is up, an empty string is returned. @@ -2478,8 +2478,11 @@ sub get_primary_host_uuid return(""); } - # Get the two node UUIDs. - $anvil->Database->get_anvils({debug => $debug}); + # Get the two node UUIDs, if not already loaded + if (not exists $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}) + { + $anvil->Database->get_anvils({debug => $debug}); + } if (not exists $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}) { @@ -2600,6 +2603,7 @@ sub get_primary_host_uuid my $node2_ram_in_use_by_servers = 0; # Loop through servers. + $anvil->Database->get_servers({debug => $debug}); foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}}) { my $server_uuid = $anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}{$server_name}{server_uuid}; diff --git a/Anvil/Tools/DRBD.pm b/Anvil/Tools/DRBD.pm index 94dc1086..fc4aa327 100644 --- a/Anvil/Tools/DRBD.pm +++ b/Anvil/Tools/DRBD.pm @@ -2146,6 +2146,10 @@ If any data for the host was stored in a previous call, it will be deleted befor Parameters; +=head3 host (optional) + +By default, the hash key C<< host_name >> listed above is either the local system's short host name, or the C<< target >>. If you'd like to use a specific host name in the hash key, you can use this parameter to set it. + =head3 password (optional) This is the password to use when connecting to a remote machine. If not set, but C<< target >> is, an attempt to connect without a password will be made. @@ -2172,22 +2176,42 @@ sub get_status my $debug = defined $parameter->{debug} ? 
$parameter->{debug} : 3; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "DRBD->get_status()" }}); + my $host = defined $parameter->{host} ? $parameter->{host} : ""; my $password = defined $parameter->{password} ? $parameter->{password} : ""; my $port = defined $parameter->{port} ? $parameter->{port} : ""; my $remote_user = defined $parameter->{remote_user} ? $parameter->{remote_user} : "root"; my $target = defined $parameter->{target} ? $parameter->{target} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + host => $host, password => $anvil->Log->is_secure($password), port => $port, remote_user => $remote_user, target => $target, }}); + # If we weren't passed a host, use this machine's short host name. + my $is_local = $anvil->Network->is_local({host => $target}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { is_local => $is_local }}); + if (not $host) + { + # Host not set, set one. + if ($is_local) + { + $host = $anvil->Get->short_host_name(); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host => $host }}); + } + else + { + # Remote, using the target as the host. + $host = $target; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host => $host }}); + } + } + # Is this a local call or a remote call? my $shell_call = $anvil->data->{path}{exe}{drbdsetup}." status --json"; my $output = ""; - my $host = $anvil->Get->short_host_name(); - my $is_local = $anvil->Network->is_local({host => $target}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }}); if ($is_local) { # Local. @@ -2200,7 +2224,6 @@ sub get_status else { # Remote call. 
- $host = $target; ($output, my $error, $anvil->data->{drbd}{status}{$host}{return_code}) = $anvil->Remote->call({ debug => $debug, shell_call => $shell_call, @@ -2291,12 +2314,14 @@ sub get_status $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{congested} = $hash_ref->{connections}->[$i]->{congested}; $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'} = $hash_ref->{connections}->[$i]->{'connection-state'}; $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'} = $hash_ref->{connections}->[$i]->{'peer-node-id'}; + $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-role'} = $hash_ref->{connections}->[$i]->{'peer-role'}; $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'rs-in-flight'} = $hash_ref->{connections}->[$i]->{'rs-in-flight'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "drbd::status::${host}::resource::${resource}::connection::${peer_name}::ap-in-flight" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'ap-in-flight'}, "drbd::status::${host}::resource::${resource}::connection::${peer_name}::congested" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{congested}, "drbd::status::${host}::resource::${resource}::connection::${peer_name}::connection-state" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'}, "drbd::status::${host}::resource::${resource}::connection::${peer_name}::peer-node-id" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'}, + "drbd::status::${host}::resource::${resource}::connection::${peer_name}::peer-role" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-role'}, 
"drbd::status::${host}::resource::${resource}::connection::${peer_name}::rs-in-flight" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'rs-in-flight'}, }}); diff --git a/Anvil/Tools/Database.pm b/Anvil/Tools/Database.pm index f6c0ebe6..aedab183 100644 --- a/Anvil/Tools/Database.pm +++ b/Anvil/Tools/Database.pm @@ -4491,6 +4491,7 @@ WHERE } +### TODO: Delete this and convert over to Jobs->get_job_details() =head2 get_job_details This gets the details for a given job. If the job is found, a hash reference is returned containing the tables that were read in. diff --git a/Anvil/Tools/Get.pm b/Anvil/Tools/Get.pm index d0101457..d9d590c0 100644 --- a/Anvil/Tools/Get.pm +++ b/Anvil/Tools/Get.pm @@ -161,10 +161,10 @@ sub anvil_from_switch "switches::anvil_uuid" => $anvil->data->{switches}{anvil_uuid}, }}); } - elsif (exists $anvil->data->{anvils}{anvil_uuid}{$anvil_string}) + elsif (exists $anvil->data->{anvils}{anvil_name}{$anvil_string}) { $anvil->data->{switches}{anvil_name} = $anvil_string; - $anvil->data->{switches}{anvil_uuid} = $anvil->data->{anvils}{anvil_uuid}{$anvil_string}{anvil_uuid}; + $anvil->data->{switches}{anvil_uuid} = $anvil->data->{anvils}{anvil_name}{$anvil_string}{anvil_uuid}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "switches::anvil_name" => $anvil->data->{switches}{anvil_name}, "switches::anvil_uuid" => $anvil->data->{switches}{anvil_uuid}, @@ -326,7 +326,7 @@ sub anvil_version schema_cache_file => $schema_cache_file, user => $user, }}); - if ($user eq "apache") + if (($user eq "apache") or ($user eq "striker-ui-api")) { # Try to read the local cached version. 
if (-e $anvil_cache_file) @@ -1867,8 +1867,8 @@ sub host_uuid debug => $debug, file => $anvil->data->{path}{data}{host_uuid}, body => $uuid, - user => "apache", - group => "apache", + user => "striker-ui-api", + group => "striker-ui-api", mode => "0666", overwrite => 0, }); @@ -2529,7 +2529,7 @@ sub switches $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { found => $found }}); if (not $found) { - print "Switch '--".$set_switch." not recognized.\n"; + print "Switch '--".$set_switch."' is not recognized.\n"; $problem = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { problem => $problem }}); } diff --git a/Anvil/Tools/Job.pm b/Anvil/Tools/Job.pm index b31ec09d..460ca803 100644 --- a/Anvil/Tools/Job.pm +++ b/Anvil/Tools/Job.pm @@ -756,29 +756,6 @@ WHERE $job_status =~ s/message_0058,!!downloaded!.*?!!,!!installed!.*?!!,!!verified!.*?!!,!!lines!.*?!!/message_0058,!!downloaded!$downloaded!!,!!installed!$installed!!,!!verified!$verified!!,!!lines!$lines!!/sm; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "<< job_status" => $job_status }}); } - # This is used by 'anvil-download-file' - if ($job_status =~ /message_0142/gs) - { - ### NOTE: Is this needed anymore? -# my $downloaded = $anvil->data->{counts}{downloaded} ? $anvil->Convert->add_commas({number => $anvil->data->{counts}{downloaded}}) : 0; -# my $installed = $anvil->data->{counts}{installed} ? $anvil->Convert->add_commas({number => $anvil->data->{counts}{installed}}) : 0; -# my $verified = $anvil->data->{counts}{verified} ? $anvil->Convert->add_commas({number => $anvil->data->{counts}{verified}}) : 0; -# my $lines = $anvil->data->{counts}{lines} ? 
$anvil->Convert->add_commas({number => $anvil->data->{counts}{lines}}) : 0; -# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { -# "s1:counts::downloaded" => $anvil->data->{counts}{downloaded}, -# "s2:downloaded" => $downloaded, -# "s3:counts::installed" => $anvil->data->{counts}{installed}, -# "s4:installed" => $installed, -# "s5:counts::verified" => $anvil->data->{counts}{verified}, -# "s6:verified" => $verified, -# "s7:counts::lines" => $anvil->data->{counts}{lines}, -# "s8:lines" => $lines, -# }}); -# -# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { ">> job_status" => $job_status }}); -# $job_status =~ s/message_0142,!!downloaded!.*?!!,!!installed!.*?!!,!!verified!.*?!!,!!lines!.*?!!/message_0058,!!downloaded!$downloaded!!,!!installed!$installed!!,!!verified!$verified!!,!!lines!$lines!!/sm; -# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "<< job_status" => $job_status }}); - } $job_uuid = $anvil->Database->insert_or_update_jobs({ file => $THIS_FILE, diff --git a/Anvil/Tools/Network.pm b/Anvil/Tools/Network.pm index 3df3192a..b5534734 100644 --- a/Anvil/Tools/Network.pm +++ b/Anvil/Tools/Network.pm @@ -1212,7 +1212,7 @@ Paramters; =head3 target (required) -This is the host we're looking for connection options with. +This is the host (name or UUID) we're looking for connection options with. 
=cut sub find_access @@ -1662,6 +1662,12 @@ sub get_company_from_mac $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { company => $company }}); } + if ((not $company) && ($mac =~ /^52:54:00/)) + { + $company = "KVM/qemu"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { company => $company }}); + } + return($company); } diff --git a/Anvil/Tools/Remote.pm b/Anvil/Tools/Remote.pm index aa9582b0..937db22c 100644 --- a/Anvil/Tools/Remote.pm +++ b/Anvil/Tools/Remote.pm @@ -302,7 +302,7 @@ sub call # Now pick up the rest of the variables. my $close = defined $parameter->{'close'} ? $parameter->{'close'} : 0; my $no_cache = defined $parameter->{no_cache} ? $parameter->{no_cache} : 0; - my $password = defined $parameter->{password} ? $parameter->{password} : $anvil->data->{sys}{root_password}; + my $password = defined $parameter->{password} ? $parameter->{password} : ""; my $secure = defined $parameter->{secure} ? $parameter->{secure} : 0; my $shell_call = defined $parameter->{shell_call} ? $parameter->{shell_call} : ""; my $timeout = defined $parameter->{timeout} ? $parameter->{timeout} : 10; @@ -311,16 +311,26 @@ sub call # NOTE: The shell call might contain sensitive data, so we show '--' if 'secure' is set and $anvil->Log->secure is not. $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { 'close' => $close, + no_cache => $no_cache, password => $anvil->Log->is_secure($password), secure => $secure, shell_call => (not $secure) ? 
$shell_call : $anvil->Log->is_secure($shell_call), ssh_fh => $ssh_fh, start_time => $start_time, + timeout => $timeout, port => $port, target => $target, ssh_fh_key => $ssh_fh_key, }}); + if ((not $password) && (defined $anvil->data->{sys}{root_password})) + { + $password = $anvil->data->{sys}{root_password}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + password => $anvil->Log->is_secure($password), + }}); + } + # In case 'target' is our short host name, change it to ''. if ($target eq $anvil->Get->short_host_name()) { @@ -625,6 +635,19 @@ sub call { $error = $anvil->Words->string({key => $message_key, variables => $variables}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => $message_key, variables => $variables}); + + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { + 'close' => $close, + password => $anvil->Log->is_secure($password), + secure => $secure, + shell_call => (not $secure) ? $shell_call : $anvil->Log->is_secure($shell_call), + ssh_fh => $ssh_fh, + start_time => $start_time, + timeout => $timeout, + port => $port, + target => $target, + ssh_fh_key => $ssh_fh_key, + }}); } } @@ -667,6 +690,10 @@ sub call error => $ssh_fh->error, }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, secure => $secure, list => { error => $error }}); + + # Close the connection. + $close = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, secure => $secure, list => { 'close' => $close }}); } # Take the last new line off. @@ -914,6 +941,10 @@ This attempts to log into the target to verify that the target is up and reachab Parameters; +=head3 close (optional, default '1') + +If set, the SSH connection used to test the access to the remote host will be closed. This can be useful if there might be a delay between when the connection is tested and when it is used again. 
+ =head3 password (optional) This is the password used to connect to the remote target as the given user. @@ -941,12 +972,14 @@ sub test_access my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Remote->test_access()" }}); + my $close = defined $parameter->{'close'} ? $parameter->{'close'} : 1; my $password = defined $parameter->{password} ? $parameter->{password} : ""; my $port = defined $parameter->{port} ? $parameter->{port} : 22; my $target = defined $parameter->{target} ? $parameter->{target} : ""; my $user = defined $parameter->{user} ? $parameter->{user} : getpwuid($<); my $access = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, secure => 0, list => { + 'close' => $close, password => $anvil->Log->is_secure($password), port => $port, target => $target, @@ -960,7 +993,7 @@ sub test_access shell_call => $anvil->data->{path}{exe}{echo}." 1", target => $target, remote_user => $user, - 'close' => 1, + 'close' => $close, no_cache => 1, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { diff --git a/Anvil/Tools/Storage.pm b/Anvil/Tools/Storage.pm index 1588a847..1815f95e 100644 --- a/Anvil/Tools/Storage.pm +++ b/Anvil/Tools/Storage.pm @@ -452,7 +452,7 @@ sub change_mode This changes the owner and/or group of a file or directory. - $anvil->Storage->change_owner({path => "/tmp/foo", user => "apache", group => "apache" }); + $anvil->Storage->change_owner({path => "/tmp/foo", user => "striker-ui-api", group => "striker-ui-api" }); If it fails to write the file, an alert will be logged and 'C<< 1 >>' will be returned. Otherwise, 'C<< 0 >>' will be returned. 
@@ -4972,11 +4972,11 @@ sub update_config body => $new_file, debug => $debug, file => $anvil->data->{path}{configs}{'anvil.conf'}, - group => "apache", + group => "striker-ui-api", mode => "0640", overwrite => 1, secure => 1, - user => "apache", + user => "striker-ui-api", password => $password, port => $port, target => $target, diff --git a/man/Makefile.am b/man/Makefile.am index 0150315a..5464bfeb 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -18,10 +18,14 @@ dist_man8_MANS = \ anvil-manage-dr.8 \ anvil-manage-files.8 \ anvil-manage-keys.1 \ + anvil-manage-power.8 \ anvil-manage-server.8 \ anvil-manage-server-storage.8 \ anvil-manage-storage-groups.8 \ + anvil-special-operations.8 \ anvil-watch-drbd.8 \ scancore.8 \ striker-check-machines.8 \ - striker-initialize-host.8 + striker-collect-debug.8 \ + striker-initialize-host.8 \ + striker-update-cluster.8 diff --git a/man/anvil-boot-server.8 b/man/anvil-boot-server.8 index 783faecb..27ed8c3c 100644 --- a/man/anvil-boot-server.8 +++ b/man/anvil-boot-server.8 @@ -40,7 +40,7 @@ This is the server UUID of the server to boot. Generally this isn't needed, exce \fB\-\-wait\fR When using '\fB\-\-server\fR all', the request to boot each server will normally not wait for the server to boot. When this is set, this behaviour is changed and the boot will wait before moving on to boot the next server. .TP -Be away that when this is used, if a server fails to boot, no further servers will be started. +Be aware that when this is used, if a server fails to boot, no further servers will be started. .IP .SH AUTHOR Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. diff --git a/man/anvil-manage-power.8 b/man/anvil-manage-power.8 new file mode 100644 index 00000000..a35c367f --- /dev/null +++ b/man/anvil-manage-power.8 @@ -0,0 +1,45 @@ +.\" Manpage for the Anvil! power management tool +.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. 
+.TH anvil-manage-power "8" "July 11 2023" "Anvil! Intelligent Availability™ Platform" +.SH NAME +anvil-manage-power \- This program can power off, reboot, or set a flag indicating one of these actions is required. +.SH SYNOPSIS +.B anvil-manage-power +\fI\, \/\fR[\fI\,options\/\fR] +.SH DESCRIPTION +This program can mark a machine as needing to be powered off or rebooted, or perform those actions directly or as a job. +.TP +\-?, \-h, \fB\-\-help\fR +Show this man page. +.TP +\fB\-\-log-secure\fR +When logging, record sensitive data, like passwords. +.TP +\-v, \-vv, \-vvv +Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. +.SS "Commands:" +.TP +\fB\-\-no-wait\fR +.TP +Normally, this program will not reboot a machine until the uptime is over five minutes. This is done to provide a chance for someone to log in and disable anvil-daemon in the case of a reboot loop. This switch prevents waiting for that 5 minute delay. +.TP +\fB\-\-poweroff\fR, \fB\-\-power\-off\fR +.TP +This powers off the host. +.TP +\fB\-\-reboot\fR +.TP +This reboots the host. +.TP +\fB\-\-reboot\-needed\fR [0,1] +.TP +This sets (1) or clears (0) the 'reboot needed' flag for the host system. +.TP +\fB\-\-y\fR, \fB\-\-yes\fR +.TP +If passed, requests to reboot or power off won't ask for confirmation. +.IP +.SH AUTHOR +Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. +.SH "REPORTING BUGS" +Report bugs to users@clusterlabs.org diff --git a/man/anvil-special-operations.8 b/man/anvil-special-operations.8 new file mode 100644 index 00000000..8ef90225 --- /dev/null +++ b/man/anvil-special-operations.8 @@ -0,0 +1,32 @@ +.\" Manpage for the Anvil! special operations tool +.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. 
Intelligent Availability™ Platform" +.SH NAME +anvil-special-operations \- This program is generally meant to be used by other programs. +.SH SYNOPSIS +.B anvil-special-operations +\fI\, \/\fR[\fI\,options\/\fR] +.SH DESCRIPTION +This tool is used, generally by other parts of the Anvil!, the accomplish tasks that generally can't be accomplished by direct system calls. It's a general purpose tool meant to solve specific corner cases. +.TP +.SH OPTIONS +.TP +\-?, \-h, \fB\-\-help\fR +Show this man page. +.TP +\fB\-\-log-secure\fR +When logging, record sensitive data, like passwords. +.TP +\-v, \-vv, \-vvv +Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. +.SS "Commands:" +.TP +\fB\-\-task\fR +This is the task being requested. Current optiopns are: +.IP refresh-drbd-resource +This requires \fB\-\-resource \fR, and will call 'drbdadm adjust ' as a background task and then return immediately. This is required when adding a new volume to an existing resource as 'drbdadm adjust ' will hold until it is called on all active DRBD nodes. This blocks the caller after the first remote host call. +.IP +.SH AUTHOR +Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. +.SH "REPORTING BUGS" +Report bugs to users@clusterlabs.org diff --git a/man/anvil-update-system.8 b/man/anvil-update-system.8 new file mode 100644 index 00000000..7b36ba1d --- /dev/null +++ b/man/anvil-update-system.8 @@ -0,0 +1,39 @@ +.\" Manpage for the Anvil! cluster update tool. +.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. +.TH anvil-update-system "8" "July 14 2023" "Anvil! Intelligent Availability™ Platform" +.SH NAME +anvil-update-system \- This program updates the local operting system +.SH SYNOPSIS +.B anvil-update-system +\fI\, \/\fR[\fI\,options\/\fR] +.SH DESCRIPTION +This program updates the local operating system. If the kernel is updated, a reboot will be performed. 
+.TP +.B Note: +.TP +If the host is an Anvil! subnode, the subnode will be removed from the Anvil! node (and servers migrated off, or, shut down if the peer subnode is offline). +.TP +.SH OPTIONS +.TP +\-?, \-h, \fB\-\-help\fR +Show this man page. +.TP +\fB\-\-log-secure\fR +When logging, record sensitive data, like passwords. +.TP +\-v, \-vv, \-vvv +Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. +.SS "Commands:" +.TP +\fB\-\-clear\-cache\fR +.TP +This will force the dnf cache to be cleared before the OS update is started. This slows the update down a bit, but ensures the latest updates are installed. +.TP +\fB\-\-no\-reboot\fR +.TP +If the kernel is updated, the system will normally be rebooted. This switch prevents the reboot from occurring. +.IP +.SH AUTHOR +Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. +.SH "REPORTING BUGS" +Report bugs to users@clusterlabs.org diff --git a/man/striker-check-machines.8 b/man/striker-check-machines.8 index 9e9c41d4..fff74b73 100644 --- a/man/striker-check-machines.8 +++ b/man/striker-check-machines.8 @@ -22,7 +22,7 @@ Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a s .SS "Commands:" .TP This program takes no commands. -.TP +.IP .SH AUTHOR Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. .SH "REPORTING BUGS" diff --git a/man/striker-collect-debug.8 b/man/striker-collect-debug.8 new file mode 100644 index 00000000..1853dc98 --- /dev/null +++ b/man/striker-collect-debug.8 @@ -0,0 +1,45 @@ +.\" Manpage for the Anvil! debug data collection tool. +.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. +.TH striker-collect-debug "8" "July 04 2023" "Anvil! Intelligent Availability™ Platform" +.SH NAME +striker-collect-debug \- This program collects data needed to help diagnose problems with an Anvil! system. 
+.SH SYNOPSIS +.B striker-collect-debug +\fI\, \/\fR[\fI\,options\/\fR] +.SH DESCRIPTION +This program collects database data, logs, config files and other information needed to help diagnose problems with the Anvil! platform. By default, this collects all data from all accessible machines. +.TP +.B Note: +.TP +This program collects potentially secure information, like passwords. Be careful who you share the collected data with! +.TP +The data from Striker dashboards are always collected. +.TP +.SH OPTIONS +.TP +\-?, \-h, \fB\-\-help\fR +Show this man page. +.TP +\fB\-\-log-secure\fR +When logging, record sensitive data, like passwords. +.TP +\-v, \-vv, \-vvv +Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. +.SS "Commands:" +.TP +\fB\-\-anvil\fR +.TP +This restricts the data to be collected to the Striker dashboards and the specific Anvil! node pair. +.TP +\fB\-\-hosts\fR +.TP +This can be used to specify which specific hosts data is collected from. Note that this can be used in conjunction with \fB\-\-anvil\fR to add additional hosts to collect data from, like DR hosts. +.TP +\fB\-\-output\-file\fR +.TP +This allows you to specify the output compressed tarball that the files will be saved in. By default, the output file is \fB/root/anvil-debug_.tar.bz2\fR. If this is a directory (ending in \fB/\fR), the normal file name is created, just in a different directory. If the path ends in a file that doesn't have the \fB.tar.bz2\fR suffix, that suffix will be added automatically. The output file will always be a bzip2-compressed tarball. +.IP +.SH AUTHOR +Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. +.SH "REPORTING BUGS" +Report bugs to users@clusterlabs.org diff --git a/man/striker-update-cluster.8 b/man/striker-update-cluster.8 new file mode 100644 index 00000000..0c15566e --- /dev/null +++ b/man/striker-update-cluster.8 @@ -0,0 +1,53 @@ +.\" Manpage for the Anvil! cluster update tool. 
+.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. +.TH striker-update-cluster "8" "July 11 2023" "Anvil! Intelligent Availability™ Platform" +.SH NAME +striker-update-cluster \- This program updates all physical machines in an Anvil! cluster +.SH SYNOPSIS +.B striker-update-cluster +\fI\, \/\fR[\fI\,options\/\fR] +.SH DESCRIPTION +This program sequentially updates Striker dashboards, DR hosts and Anvil! nodes (the paired sub-nodes). It does this without needing to take hosted servers offline. +.TP +.B Note: +.TP +This program requires all machines be online, and Anvil! nodes being paired and sync'ed. When nodes are updated, the inactive subnode will be removed from the node, updated, rebooted if necessary, and then rejoined to the node. Then hosted servers will migrate to the now-updated subnode, and the process repeated for the other subnode. Anvil! nodes are updated sequentially, so the process can take some time to complete, but should not require a maintenance window. +.TP +The upgrade process will live-migrate all hosted servers! If any hosted server is either under heavy load, or the replication link (the BCN or MN) is relatively lower bandwidth, this could cause performance concerns. As such, it's ideal to run the upgrades at a time less sensitive to performance impacts. +.TP +.SH OPTIONS +.TP +\-?, \-h, \fB\-\-help\fR +Show this man page. +.TP +\fB\-\-log-secure\fR +When logging, record sensitive data, like passwords. +.TP +\-v, \-vv, \-vvv +Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. +.SS "Commands:" +.TP +\fB\-\-clear\-cache\fR +.TP +This will force the dnf cache to be cleared before the OS update is started. This slows the update down a bit, but ensures the latest updates are installed. 
+.TP +\fB\-\-force\fR +.TP +If any Striker dashboards or DR hosts are unavailable, or if an entire node (paired subnodes) is offline, this switch will allow you to force the upgrade attempt. +.TP +\fB\-y\fR, \fB\-\-yes\fR +.TP +Automatically continue with the upgrade without prompting for confirmation. +.TP +\fB\-\-no\-reboot\fR +.TP +If the kernel is updated on a remote system, the system will normally be rebooted. This switch prevents the reboot from occurring. +.TP +\fB\-\-reboot\-self\fR +.TP +By default, if the local system needs to be updated, a message is printed but the local system is NOT rebooted. This switch will instead cause this host to reboot at the end of the cluster update. +.IP +.SH AUTHOR +Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. +.SH "REPORTING BUGS" +Report bugs to users@clusterlabs.org diff --git a/share/words.xml b/share/words.xml index e3e96bc7..22b825e7 100644 --- a/share/words.xml +++ b/share/words.xml @@ -602,6 +602,11 @@ The error was: There was a problem with finding a common storage network between: [#!variable!node1_name!#] and: [#!variable!node2_name!#]. Found node 1 to have the IP: [#!variable!node1_ip!#] and node 2: [#!variable!node2_ip!#]. Is there a problem with '/etc/hosts'? Failed to find a network to use for storage replication. Is there a problem with '/etc/hosts'? + '.]]> + Failed to withdraw the subnode from the node's cluster. Expected the 'anvil-safe-stop' call to return '0', but got: [#!variable!return_code!#]. The output, if anything, was: +======== +#!variable!output!# +======== @@ -1552,6 +1557,12 @@ Note: This is a permanent action! If you protect this server again later, a full Enabling the enable-safe-start daemon. Calling select ScanCore scan agents to ensure the database is updated. + Reload (adjust) a DRBD resource + This job is to reload (adjust) a DRBD resource. It's run as a job as it blocks until the adjust is run on all nodes. + Update the base operating system. 
+ This uses 'dnf' to do an OS update on the host. If this is run on a node, 'anvil-safe-stop' will be called to withdraw the subnode from the node's cluster. If the peer subnode is also offline, hosted servers will be shut down. + Update beginning. Verifying all known machines are accessible... + Starting: [#!variable!program!#]. @@ -2405,7 +2416,10 @@ The file: [#!variable!file!#] needs to be updated. The difference is: There was an unknown error while connecting as: [#!variable!user!#] to: [#!variable!remote_user!#@#!variable!target!#]. The error was: [#!variable!error!#] We were unable to log in to: [#!variable!connection!#]. Please check that the password is correct or that passwordless SSH is configured properly. An SSH session was successfully opened to: [#!variable!target!#]. - The remote shell call: [#!variable!shell_call!#] to: [#!variable!connection!#] failed with the error: [#!variable!error!#]. + The remote shell call: [#!variable!shell_call!#] to: [#!variable!connection!#] failed with the error: +==== +#!variable!error!# +==== The SSH session to: [#!variable!target!#] was successfully closed. The SSH session to: [#!variable!target!#] was closed because 'no_cache' was set and there was an open SSH connection. Wrote the system UUID to the file: [#!variable!file!#] to enable the web based tools to read this system's UUID. @@ -2893,6 +2907,15 @@ Proceed? [y/N] The DRBD config file was not found. A protect job needs to be run from the Anvil! node hosting the server to be protected. Waiting a bit to make sure the file: [#!variable!file!#] is done uploading... Upload complete. + Picked up the special operation job. + Reloading (adjusting) the DRBD resource: [#!variable!resource!#]. This will not complete until all peers have also reloaded this resource. + DRBD resource: [#!variable!resource!#] has been reloaded. + Checking if the subnode is out of the node's cluster before updating the OS. + The subnode is in the node's cluster, asking it to withdraw. 
This could take some time if servers need to be migrated. + Cleared 'dnf' cache. + The kernel was updated, so a reboot is required. Rebooting now. + Registering a job to reboot this host. + Preparing to update the entire Anvil! cluster. Normal Password @@ -3590,7 +3613,7 @@ We will sleep a bit and try again. [ Warning ] - Failed to connect to the host: [#!variable!host!#]! Unable to up the resource, so the server may not start. If the peer can't be recovered, manually forcing the local resource(s) to UpToDate may be required. [ Warning ] - Timed out waiting for the connections to the peers, and the local resource(s) is not in 'UpToDate' state. Booting the server will likely fail. [ Warning ] - Timed out waiting for the connections to the peers. - [ Warning ] - We're using: [#!variable!ram_used!#] (#!variable!ram_used_bytes!# Bytes). but there is a job: [#!variable!job_command!#] is runnng, which might be why the RAM is high. NOT exiting while this program is running. + [ Warning ] - We're using: [#!variable!ram_used!#] (#!variable!ram_used_bytes!# Bytes). but there is a job: [#!variable!job_command!#] is running, (progress is: [#!variable!job_progress!#]), which might be why the RAM is high. NOT exiting while this program is running. [ Warning ] - A no-longer active PID: [#!variable!pid!#] (used by: [#!variable!caller!#] had marked the database: [#!variable!db!#] as "in_use", but the PID is gone now. Reaping the flag. [ Warning ] - We waited for: [#!variable!wait_time!#] seconds for all users of the local database to exit. Giving up waiting and taking the database down now. [ Warning ] - The command: [#!variable!command!#] is still using our database.
diff --git a/tools/Makefile.am b/tools/Makefile.am index cd2b412f..153f39e6 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -37,6 +37,7 @@ dist_sbin_SCRIPTS = \ anvil-scan-network \ anvil-show-local-ips \ anvil-shutdown-server \ + anvil-special-operations \ anvil-sync-shared \ anvil-test-alerts \ anvil-update-definition \ @@ -51,6 +52,7 @@ dist_sbin_SCRIPTS = \ striker-auto-initialize-all \ striker-boot-machine \ striker-check-machines \ + striker-collect-debug \ striker-db-report \ striker-db-status \ striker-file-manager \ @@ -65,7 +67,8 @@ dist_sbin_SCRIPTS = \ striker-prep-database \ striker-purge-target \ striker-scan-network \ - striker-show-db-counts + striker-show-db-counts \ + striker-update-cluster fencedir = ${FASEXECPREFIX}/sbin diff --git a/tools/anvil-daemon b/tools/anvil-daemon index 4643c72c..67cd9a4e 100755 --- a/tools/anvil-daemon +++ b/tools/anvil-daemon @@ -252,8 +252,8 @@ sub check_ram }}); if ($problem) { - # See if an 'anvil-sync-shared' job is running and, if so, don't exit. The file copy is - # counted and not an actual problem. + # See if any jobs are running, and if so, hold because those jobs might be doing things (like + # OS updates or file syncs) that could make anvil-daemon appear to be using more memory. $anvil->Database->get_jobs({debug => 2}); foreach my $job_uuid (keys %{$anvil->data->{jobs}{running}}) { @@ -264,11 +264,12 @@ sub check_ram job_progress => $job_progress, }}); - if (($job_progress != 100) && ($job_command =~ /anvil-sync-shared/)) + if (($job_progress != 100) && ($job_progress != 0)) { # Don't abort. 
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0139", variables => { job_command => $job_command, + job_progress => $job_progress, ram_used => $anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}), ram_used_bytes => $anvil->Convert->add_commas({number => $ram_used}), }}); diff --git a/tools/anvil-manage-dr b/tools/anvil-manage-dr index 184814d8..88f176c6 100755 --- a/tools/anvil-manage-dr +++ b/tools/anvil-manage-dr @@ -382,7 +382,8 @@ sub sanity_check if (($anvil->data->{switches}{'connect'}) or ($anvil->data->{switches}{'disconnect'})) { # Is this server configured to be protected? - my $config_file = $anvil->data->{path}{directories}{drbd_resources}."/".$server_name.".res"; + my $config_file = $anvil->data->{path}{directories}{drbd_resources}."/".$server_name.".res"; + $config_file =~ s/\/\//\//g; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { config_file => $config_file }}); if (not -e $config_file) { @@ -398,8 +399,8 @@ sub sanity_check variables => $variables, job_status => "failed", }); + $anvil->nice_exit({exit_code => 1}); } - $anvil->nice_exit({exit_code => 1}); } # If we're doing a --protect or --remove, make sure we're a node, the cluster is up, and both nodes @@ -533,7 +534,6 @@ sub sanity_check # Get the Anvil! details. $anvil->Database->get_hosts(); - $anvil->Database->get_anvils(); $anvil->Database->get_storage_group_data({debug => 2}); $anvil->Database->get_dr_links({debug => 2}); @@ -559,7 +559,9 @@ sub sanity_check } } - # If I don't have a dr_host_uuid yet, see which are available. If only one, use it. If two or more, tell the user they need to specify which. + # If I don't have a dr_host_uuid yet, see which are available. If only one, use it. If two or more, + # and if the server is already being protected, determine which to use from it's config. Otherwise, + # tell the user they need to specify which. 
if (not $dr_host_uuid) { my $dr_count = keys %{$anvil->data->{dr_links}{by_anvil_uuid}{$anvil_uuid}{dr_link_host_name}}; @@ -587,6 +589,26 @@ sub sanity_check $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { dr_host_uuid => $dr_host_uuid }}); } } + else + { + # Two or more. Is the server already protected? If so, try to find which DR it's + # using. + if (($anvil->data->{switches}{'connect'}) or ($anvil->data->{switches}{'disconnect'})) + { + # Read the config. + my $config_file = $anvil->data->{path}{directories}{drbd_resources}."/".$server_name.".res"; + $config_file =~ s/\/\//\//g; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { config_file => $config_file }}); + + my $resource_config = $anvil->Storage->read_file({file => $config_file}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_config => $resource_config }}); + foreach my $line (split/\n/, $resource_config) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { lineg => $line }}); + + } + } + } } # If I still don't have a DR host, fail out. 
diff --git a/tools/anvil-manage-power b/tools/anvil-manage-power index 1bf093e1..7b8eb093 100755 --- a/tools/anvil-manage-power +++ b/tools/anvil-manage-power @@ -34,17 +34,15 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) my $anvil = Anvil::Tools->new(); -# Read switches -$anvil->data->{switches}{'poweroff'} = ""; -$anvil->data->{switches}{'power-off'} = ""; -$anvil->data->{switches}{'reboot'} = ""; -$anvil->data->{switches}{'y'} = ""; -$anvil->data->{switches}{'yes'} = ""; -$anvil->data->{switches}{'reboot-needed'} = ""; -$anvil->data->{switches}{'job-uuid'} = ""; -$anvil->data->{switches}{'no-delay'} = ""; -$anvil->Get->switches; -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); +# Read switches (target ([user@]host[:port]) and the file with the target's password. +$anvil->Get->switches({list => [ + "no-wait", + "power-off", + "poweroff", + "reboot", + "reboot-needed"], man => $THIS_FILE}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); if ($anvil->data->{switches}{'power-off'}) { @@ -188,10 +186,13 @@ sub do_poweroff my ($anvil, $task) = @_; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { task => $task }}); + # In case we're being called by another job, we'll sleep for a few second to let those close out. + sleep 3; + # We'll wait until the system has at least 5 minutes of uptime, unless '--no-wait' was given. my $uptime = $anvil->data->{switches}{'no-wait'} ? 
0 : $anvil->Get->uptime; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - "switches::no-wait" => $anvil->data->{switches}{'no-delay'}, + "switches::no-wait" => $anvil->data->{switches}{'no-wait'}, uptime => $uptime, }}); diff --git a/tools/anvil-manage-server-storage b/tools/anvil-manage-server-storage index 99dbe8f2..e57c9168 100755 --- a/tools/anvil-manage-server-storage +++ b/tools/anvil-manage-server-storage @@ -24,6 +24,7 @@ use warnings; use Anvil::Tools; require POSIX; use Term::Cap; +use Text::Diff; use Data::Dumper; my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0]; @@ -217,22 +218,27 @@ sub manage_disk foreach my $volume_number (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}}) { my $device_path = $anvil->data->{drbd_resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$volume_number}{device_path}; + next if $device_path eq "DELETED"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:volume_number' => $volume_number, + 's2:device_path' => $device_path, + }}); + my $device_minor = $anvil->data->{drbd_resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$volume_number}{device_minor}; my $volume_size = $anvil->data->{drbd_resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$volume_number}{volume_size}; my $backing_disk = $anvil->data->{new}{resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$volume_number}{backing_disk}; my $meta_disk = $anvil->data->{new}{resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$volume_number}{meta_disk}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 's1:volume_number' => $volume_number, - 's2:device_path' => $device_path, - 's3:device_minor' => $device_minor, - 's4:volume_size' => $volume_size, - 's5:backing_disk' => $backing_disk, - 's6:meta_disk' => $meta_disk, + 's1:device_minor' => 
$device_minor, + 's2:volume_size' => $volume_size, + 's3:backing_disk' => $backing_disk, + 's4:meta_disk' => $meta_disk, }}); # Which volume group is the backing device in? foreach my $this_scan_lvm_lv_name (sort {$a cmp $b} keys %{$anvil->data->{lvm}{host_name}{$short_host_name}{lv}}) { + next if not $this_scan_lvm_lv_name; my $this_scan_lvm_lv_path = $anvil->data->{lvm}{host_name}{$short_host_name}{lv}{$this_scan_lvm_lv_name}{scan_lvm_lv_path}; my $this_scan_lvm_lv_on_vg = $anvil->data->{lvm}{host_name}{$short_host_name}{lv}{$this_scan_lvm_lv_name}{scan_lvm_lv_on_vg}; my $this_scan_lvm_lv_uuid = $anvil->data->{lvm}{host_name}{$short_host_name}{lv}{$this_scan_lvm_lv_name}{scan_lvm_lv_uuid}; @@ -311,149 +317,1085 @@ sub manage_disk my $drbd_volume = $anvil->data->{lvm}{host_name}{$short_host_name}{lv_path}{$on_lv}{drbd}{volume}; my $max_free_space = $anvil->data->{server_name}{$server_name}{drbd_resource}{$drbd_resource}{volume}{$drbd_volume}{free_space}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 's01:device_target' => $device_target, - 's02:alias' => $alias, - 's03:boot_order' => $boot_order, - 's04:say_boot' => $say_boot, - 's05:type' => $type, - 's06:address_type' => $address_type, - 's07:address_bus' => $address_bus, - 's08:driver_name' => $driver_name, - 's09:device_bus' => $device_bus, - 's10:driver_type' => $driver_type, - 's11:address_domain' => $address_domain, - 's12:address_slot' => $address_slot, - 's13:address_function' => $address_function, - 's14:device_path' => $device_path, - 's15:driver_io' => $driver_io, - 's16:driver_cache' => $driver_cache, - 's17:on_lv' => $on_lv, - 's18:drbd_volume' => $drbd_volume, - 's19:max_free_space' => $max_free_space." 
(".$anvil->Convert->bytes_to_human_readable({'bytes' => $max_free_space}).")", + 's01:device_target' => $device_target, + 's02:alias' => $alias, + 's03:boot_order' => $boot_order, + 's04:say_boot' => $say_boot, + 's05:type' => $type, + 's06:address_type' => $address_type, + 's07:address_bus' => $address_bus, + 's08:driver_name' => $driver_name, + 's09:device_bus' => $device_bus, + 's10:driver_type' => $driver_type, + 's11:address_domain' => $address_domain, + 's12:address_slot' => $address_slot, + 's13:address_function' => $address_function, + 's14:device_path' => $device_path, + 's15:driver_io' => $driver_io, + 's16:driver_cache' => $driver_cache, + 's17:on_lv' => $on_lv, + 's18:drbd_volume' => $drbd_volume, + 's19:max_free_space' => $max_free_space." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $max_free_space}).")", + }}); + #print "- Target: [".$device_target."], boot: [".$say_boot."], path: [".$device_path."], cache: [".$driver_cache."], driver type: [".$driver_type."]\n"; + print "- Target: [".$device_target."], boot: [".$say_boot."], path: [".$device_path."], Available space: [".$anvil->Convert->bytes_to_human_readable({'bytes' => $max_free_space})."]\n"; + + # What are we doing? + if ($anvil->data->{switches}{grow}) + { + manage_disk_grow($anvil, $drbd_resource, $drbd_volume, $max_free_space); + } + } + elsif ($anvil->data->{switches}{add}) + { + manage_disk_add($anvil, $drbd_resource); + } + + return(0); +} + +sub manage_disk_add +{ + my ($anvil, $drbd_resource) = @_; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { drbd_resource => $drbd_resource }}); + + my $anvil_uuid = defined $anvil->data->{switches}{anvil_uuid} ? 
$anvil->data->{switches}{anvil_uuid} : $anvil->Cluster->get_anvil_uuid(); + my $short_host_name = $anvil->Get->short_host_name; + my $server_name = $anvil->data->{switches}{server_name}; + my $from_source = get_definition_source($anvil); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + anvil_uuid => $anvil_uuid, + short_host_name => $short_host_name, + server_name => $server_name, + from_source => $from_source, + }}); + + # Are they asking for an available amount of space? + my $error_note = q| +[ Note ] - The size can be in percent, ie: '50%' or '100%', a number in bytes, or a human-readable size. + - Human readable sizes must NOT have a space between the number and letter suffix. Also, base2 + - vs base10 notation! Ie: '1GiB' = 1,073,741,824 bytes', '1GB' == '1,000,000,000 bytes'. A single + - letter used to denote size will be interpreted as base2. ie: '1G == 1GiB'. +|; + + # Do we have a storage group? + if (not $anvil->data->{switches}{'storage-group'}) + { + print "Please specify a storage group to use to add the new drive to.\n"; + short_storage_groups($anvil, $anvil_uuid); + $anvil->nice_exit({exit_code => 1}); + } + + # Make sure that the passed + my $storage_group_switch = $anvil->data->{switches}{'storage-group'}; + my $storage_group_uuid = ""; + my $storage_group_name = ""; + if (exists $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_name}{$storage_group_switch}) + { + $storage_group_uuid = $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_name}{$storage_group_switch}{storage_group_uuid}; + $storage_group_name = $storage_group_switch; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + storage_group_uuid => $storage_group_uuid, + storage_group_name => $storage_group_name, + }}); + } + elsif (exists $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_uuid}{$storage_group_switch}) + { + $storage_group_uuid = 
$storage_group_switch; + $storage_group_name = $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_uuid}{$storage_group_switch}{group_name}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + storage_group_uuid => $storage_group_uuid, + storage_group_name => $storage_group_name, + }}); + } + + # Did we get a valid disk size? + my $free_space = $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_uuid}{$storage_group_uuid}{free_space}; + my $add_size = $anvil->data->{switches}{add}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { add_size => $add_size }}); + if ($add_size =~ /^(\d+)%$/) + { + # This is valid + my $percent = ".".$1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { percent => $percent }}); + + $add_size = int($free_space * $percent); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { add_size => $add_size }}); + } + elsif ($add_size !~ /\d/) + { + # No digits, probably didn't set a value at all. + print "\n[ Error ] - Please specify the size you would like to grow this disk by. The maximum size is: [".$anvil->Convert->bytes_to_human_readable({'bytes' => $free_space})."].\n"; + print $error_note."\n"; + $anvil->nice_exit({exit_code => 1}); + } + elsif ($add_size !~ /^\d+$/) + { + # Size is not in bytes, try to convert it. + my $bytes = $anvil->Convert->human_readable_to_bytes({ + debug => 2, + size => $add_size, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'bytes' => $bytes }}); + if ($bytes =~ /^\d+$/) + { + $add_size = $bytes; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { add_size => $add_size }}); + } + else + { + # Not a valid size. 
+ print "\n[ Error ] - The requested size: [".$add_size."] could not be interpreted.\n"; + print $error_note."\n"; + $anvil->nice_exit({exit_code => 1}); + } + } + + # Still here? We're good to go. + my $lv_command_size = 0; + my $hr_size = $anvil->Convert->bytes_to_human_readable({'bytes' => $add_size}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { hr_size => $hr_size }}); + if ($add_size eq "100%") + { + # This is valid + $add_size = "-l +100\%FREE"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { add_size => $add_size }}); + } + else + { + $hr_size =~ s/\s+//g; + $add_size = "-L +".$hr_size; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { add_size => $add_size }}); + } + + # What's the next free drive in the system, and what's the next free volume number? + my $new_device_target = ""; + my $target_prefix = ""; + my $disk_device_bus = ""; + my $disk_cache = ""; + my $disk_io_policy = ""; + my $drive_letter = "a"; + foreach my $device_target (sort {$a cmp $b} keys %{$anvil->data->{server}{$short_host_name}{$server_name}{$from_source}{device}{disk}{target}}) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { device_target => $device_target }}); + if (not $disk_device_bus) + { + $target_prefix = ($device_target =~ /^(\w+)\w$/)[0]; + $disk_device_bus = $anvil->data->{server}{$short_host_name}{$server_name}{$from_source}{device}{disk}{target}{$device_target}{device_bus}; + $disk_io_policy = $anvil->data->{server}{$short_host_name}{$server_name}{$from_source}{device}{disk}{target}{$device_target}{driver}{io}; + $disk_cache = $anvil->data->{server}{$short_host_name}{$server_name}{$from_source}{device}{disk}{target}{$device_target}{driver}{cache}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + target_prefix => $target_prefix, + disk_device_bus => $disk_device_bus, + 
disk_io_policy => $disk_io_policy, + disk_cache => $disk_cache, + }}); + last; + } + } + for (0..25) + { + my $test_device = $target_prefix.$drive_letter; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_device => $test_device }}); + if (not exists $anvil->data->{server}{$short_host_name}{$server_name}{$from_source}{device}{disk}{target}{$test_device}) + { + # Found a free one. + $new_device_target = $test_device; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_device_target => $new_device_target }}); + last; + } + $drive_letter++; + } + + if (not $new_device_target) + { + print "\n[ Error ] - Failed to find a new target device name.\n"; + $anvil->nice_exit({exit_code => 1}); + } + + my $next_drbd_volume = ""; + foreach my $this_host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{drbd_node}}) + { + my $host_uuid = $anvil->Get->host_uuid_from_name({host_name => $this_host}); + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:this_host' => $this_host, + 's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + }}); + + if ($next_drbd_volume eq "") + { + my $test_drbd_volume = 0; + for (0..100) + { + if (not $anvil->data->{new}{resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$test_drbd_volume}{device_path}) + { + # This is free. + $next_drbd_volume = $test_drbd_volume; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_drbd_volume => $next_drbd_volume }}); + last; + } + $test_drbd_volume++; + next; + } + } + + if ($next_drbd_volume eq "") + { + print "\n[ Error ] - Failed to find a new DRBD volume to use.\n"; + $anvil->nice_exit({exit_code => 1}); + } + } + + ### TODO: Make this work without the peer node being online. 
+ # The server is allowed to be running, but both nodes and any DR hosts this is replicating to + # needs to be online. + my $all_online = check_drbd_peer_access($anvil); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_online => $all_online }}); + + if (not $all_online) + { + print "\n[ Error ] - Adding a new disk requires all peers to be online.\n"; + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{peer}}) + { + my $say_access = $anvil->data->{peer}{$short_host_name}{access_ip} ? "up." : "down!"; + print " - Peer: [".$short_host_name."] is ".$say_access."\n"; + } + $anvil->nice_exit({exit_code => 1}); + } + + # Still alive? Ask the user to confirm. + print "- New drive target: [".$new_device_target."], size: [".$hr_size."], bus: [".$disk_device_bus."], cache: [".$disk_cache."], IO policy: [".$disk_io_policy."]\n"; + print "- Preparing to add a the drive: [".$drbd_resource."/".$next_drbd_volume."] using the storage group: [".$storage_group_name."]...\n"; + if (not $anvil->data->{switches}{confirm}) + { + print $anvil->Words->string({key => "message_0021"})." "; + my $answer = ; + chomp($answer); + if ($answer !~ /^y/i) + { + print "Aborting.\n"; + $anvil->nice_exit({exit_code => 0}); + } + + # Test that we've lost access while waiting for the answer. + my $all_online = check_drbd_peer_access($anvil); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_online => $all_online }}); + + if (not $all_online) + { + print "\n[ Error ] - It would appear that we've lost access to a peer while waiting for the answer.\n"; + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{peer}}) + { + my $say_access = $anvil->data->{peer}{$short_host_name}{access_ip} ? "up." 
: "down!"; + print " - Peer: [".$short_host_name."] is ".$say_access."\n"; + } + $anvil->nice_exit({exit_code => 1}); + } + } + + # Get the next free minor number + my ($free_minor, undef) = $anvil->DRBD->get_next_resource({ + debug => 2, + minor_only => 1, + anvil_uuid => $anvil_uuid, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { free_minor => $free_minor }}); + + # Create the new LVs + foreach my $host_type ("node", "dr") + { + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) + { + my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; + my $vg_name = $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_uuid}{$storage_group_uuid}{host_uuid}{$host_uuid}{vg_name}; + my $vg_internal_uuid = $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_uuid}{$storage_group_uuid}{host_uuid}{$host_uuid}{vg_internal_uuid}; + my $new_lv_name = $server_name."_".$next_drbd_volume; + my $backing_disk = "/dev/".$vg_name."/".$new_lv_name; + my $shell_call = "if [ -e '".$backing_disk."' ]; then echo 'LV: [".$backing_disk."] already exists.'; else ".$anvil->data->{path}{exe}{lvcreate}." ".$add_size." -n ".$new_lv_name." ".$vg_name."; fi;"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:short_host_name' => $short_host_name, + 's2:host_uuid' => $host_uuid, + 's3:vg_name' => $vg_name, + 's4:vg_internal_uuid' => $vg_internal_uuid, + 's5:new_lv_name' => $new_lv_name, + 's6:backing_disk' => $backing_disk, + 's7:shell_call' => $shell_call, + }}); + + # Record this for updating the DRBD resource. 
+ $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{minor} = $free_minor; + $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{backing_disk} = $backing_disk; + $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{seen} = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "new_drbd::${short_host_name}::resource::${drbd_resource}::volume::${next_drbd_volume}::minor" => $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{minor}, + "new_drbd::${short_host_name}::resource::${drbd_resource}::volume::${next_drbd_volume}::backing_disk" => $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{backing_disk}, + "new_drbd::${short_host_name}::resource::${drbd_resource}::volume::${next_drbd_volume}::seen" => $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{seen}, + }}); + + # This lets us insert the new volume as needed. + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + print " - Creating the new local LV: [".$backing_disk."]..."; + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + print " Error!\n"; + print "[ FAILED ] - When trying to create the new local logical volume: [".$backing_disk."]\n"; + print "[ FAILED ] - using the command: [".$shell_call."]\n"; + print "[ FAILED ] - The return code: [".$return_code."] was received, expected '0'. 
Output, if any:\n"; + print "==========\n"; + print $output."\n"; + print "==========\n"; + print "The creation of the new replicatedd disk is incomplete, manual intervention is required!!\n"; + $anvil->nice_exit({exit_code => 1}); + } + else + { + print " Done!\n"; + } + } + else + { + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; + print " - Creating the new LV on the peer: [".$short_host_name.":".$backing_disk."], via: [".$use_ip." (".$use_network.")]"; + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + if ($return_code) + { + print " Error!\n"; + print "[ FAILED ] - When trying to create the peer: [".$short_host_name."]'s logical volume: [".$backing_disk."]\n"; + print "[ FAILED ] - using the command: [".$shell_call."]\n"; + print "[ FAILED ] - The return code: [".$return_code."] was received, expected '0'. Output, if any:\n"; + print "==] STDOUT [========\n"; + print $output."\n"; + print "==] STDERR [========\n"; + print $error."\n"; + print "====================\n"; + print "The creation of the new replicated disk is incomplete, manual intervention is required!!\n"; + $anvil->nice_exit({exit_code => 1}); + } + else + { + print " Done!\n"; + } + } + } + } + + # Update the DRBD config file. 
+ my $new_res_file = ""; + my $drbd_res_file = $anvil->data->{path}{directories}{drbd_resources}."/".$drbd_resource.".res"; + my $drbd_res_body = $anvil->Storage->read_file({file => $drbd_res_file}); + my $in_on_host = ""; + my $in_volume = ""; + foreach my $line (split /\n/, $drbd_res_body) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); + if ($line =~ /on\s+(.*?)\s/) + { + $in_on_host = $1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); + + $new_res_file .= $line."\n"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_res_file => $new_res_file }}); + next; + } + + if (($in_on_host) && ($line =~ /volume\s+(\d+)\s/)) + { + $in_volume = $1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { in_volume => $in_volume }}); + + $new_res_file .= $line."\n"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_res_file => $new_res_file }}); + + $anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$in_volume}{seen} = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "new_drbd::${in_on_host}::resource::${drbd_resource}::volume::${in_volume}::seen" => $anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$in_volume}{seen}, + }}); + next; + } + + if ($line =~ /}/) + { + if ($in_volume) + { + $in_volume = ""; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { in_volume => $in_volume }}); + } + elsif ($in_on_host) + { + # This is where we insert the new volume, if we've not seen it yet. + if (not $anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{seen}) + { + # Insert the line. + $new_res_file .= $line." + + volume ".$next_drbd_volume." 
{ + device /dev/drbd_".$drbd_resource."_".$next_drbd_volume." minor ".$anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{minor}."; + disk ".$anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{backing_disk}."; + meta-disk internal; + } +"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_res_file => $new_res_file }}); + + $anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{seen} = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "new_drbd::${in_on_host}::resource::${drbd_resource}::volume::${in_volume}::seen" => $anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$in_volume}{seen}, + }}); + next; + } + } + } + + $new_res_file .= $line."\n"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_res_file => $new_res_file }}); + } + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_res_file => $new_res_file }}); + + my $difference = diff \$drbd_res_body, \$new_res_file, { STYLE => 'Unified' }; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { difference => $difference }}); + + # Write the file to a test file and verify it's sane, + my $test_file = $anvil->data->{path}{directories}{temp}."/test-".$drbd_resource.".res"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + my ($problem) = $anvil->Storage->write_file({ + debug => 2, + backup => 0, + overwrite => 1, + file => $test_file, + body => $new_res_file, + user => "root", + group => "root", + mode => "0644", + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + + # Validate. 
+ print "- Testing the updated DRBD resource config file to ensure the new volumes are cromulent..."; + my $shell_call = $anvil->data->{path}{exe}{drbdadm}." --config-to-test ".$test_file." --config-to-exclude ".$drbd_res_file." sh-nop"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + # Something went wrong. + print " Failed! +[ Error ] - The new DRBD resource config appears to be invalid, which is likely a program error. The new +[ Error ] - config was written to the test file: [".$test_file."]. +[ Error ] - The test to confirm it was valid exited with the return code: [".$return_code."], expected '0'. +[ Error ] - The output, if anything, was: +==== +".$output." +==== +"; + $anvil->nice_exit({exit_code => 1}); + } + print " Success!\n"; + + # Remove the test file. + unlink $test_file; + + # Backup the res file so we can tell the user where the current config was backed up to in + # case they need to restore it. + print "- Writing out the updated DRBD config file.\n"; + my ($backup_file) = $anvil->Storage->backup({file => $drbd_res_file}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { backup_file => $backup_file }}); + + # Write out the new file. + ($problem) = $anvil->Storage->write_file({ + debug => 2, + backup => 0, + overwrite => 1, + file => $drbd_res_file, + body => $new_res_file, + user => "root", + group => "root", + mode => "0644", + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { backup_file => $backup_file }}); +# + # Copy this to our peers. 
+ print "- Copying the new resource file to out peers.\n"; + foreach my $host_type ("node", "dr") + { + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) + { + my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; + next if $host_uuid eq $anvil->Get->host_uuid; + + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + my $destination = "root\@".$use_ip.":".$anvil->data->{path}{directories}{drbd_resources}."/"; + $destination =~ s/\/\//\//g; + print " - Copying: [".$drbd_res_file."] to: [".$short_host_name.":".$destination."] via: [".$use_ip."]\n"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + use_ip => $use_ip, + destination => $destination, + }}); + + my $failed = $anvil->Storage->rsync({ + debug => 2, + destination => $destination, + source => $drbd_res_file, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }}); + + if ($failed) + { + print " +[ Error ] - There was a problem copying the new config file! Unable to proceed. +[ Error ] - Manual intervention to complete the update is required! +"; + $anvil->nice_exit({exit_code => 1}); + } + } + } + + # Create the metadata. + print "- Creating the replicated storage metadata on the new backing devices now.\n"; + foreach my $host_type ("node", "dr") + { + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) + { + my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; + my $shell_call = $anvil->data->{path}{exe}{drbdadm}." 
--force create-md --max-peers=3 ".$drbd_resource."/".$next_drbd_volume; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:short_host_name' => $short_host_name, + 's2:host_uuid' => $host_uuid, + 's3:shell_call' => $shell_call, + }}); + + # Create the metadata, but don't exit on failure in case the metadata was created in + # a previous pass. + if ($host_uuid eq $anvil->Get->host_uuid) + { + print " - Creating the meta-data on the new local volume: [".$next_drbd_volume."]..."; + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + ### Return codes + # 0 == Success + # 1 == ? + # 3 == Configuration not found. + if ($return_code) + { + print " Warning!\n"; + print "[ Warning ] - When trying to create the local meta-data on: [".$drbd_resource."/".$next_drbd_volume."]\n"; + print "[ Warning ] - using the command: [".$shell_call."]\n"; + print "[ Warning ] - The return code: [".$return_code."] was received, expected '0'. Output, if any:\n"; + print "==========\n"; + print $output."\n"; + print "==========\n"; + print "We will try to proceed anyway.\n"; + } + else + { + print " Done!\n"; + } + } + else + { + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; + print " - Creating the meta-data on the peer: [".$short_host_name.":".$drbd_resource."/".$next_drbd_volume."], via: [".$use_ip." 
(".$use_network.")]"; + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + if ($return_code) + { + print " Warning!\n"; + print "[ Warning ] - When trying to create the peer: [".$short_host_name."]'s meta-data on: [".$drbd_resource."/".$next_drbd_volume."]\n"; + print "[ Warning ] - using the command: [".$shell_call."]\n"; + print "[ Warning ] - The return code: [".$return_code."] was received, expected '0'. Output, if any:\n"; + print "==] STDOUT [========\n"; + print $output."\n"; + print "==] STDERR [========\n"; + print $error."\n"; + print "====================\n"; + print "We will try to proceed anyway.\n"; + } + else + { + print " Done!\n"; + } + } + } + } + + ### NOTE: The call to 'drbdadm adjust ' hangs, hard, until the same command is run on the peers. + ### To deal with this, we register jobs to run 'anvil-special-operations' on the peers, then we + ### call adjust here. + # Adjust to start/connect. + foreach my $host_type ("node", "dr") + { + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) + { + my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; + my $shell_call = $anvil->data->{path}{exe}{'anvil-special-operations'}." 
--task refresh-drbd-resource --resource ".$drbd_resource.$anvil->Log->switches; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:short_host_name' => $short_host_name, + 's2:host_uuid' => $host_uuid, + 's3:shell_call' => $shell_call, + }}); + next if $host_uuid eq $anvil->Get->host_uuid; + + my ($job_uuid) = $anvil->Database->insert_or_update_jobs({ + debug => 2, + job_command => $shell_call, + job_data => "adjust=".$drbd_resource, + job_name => "server::add_disk::rescan", + job_title => "job_0465", + job_description => "job_0466", + job_progress => 0, + job_host_uuid => $host_uuid, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); + + print "- Registered a job with job UUID: [".$job_uuid."] to reload the resource config on the host: [".$short_host_name."].\n"; + } + } + + print "- Adjusting the local resource: [".$drbd_resource."] to pick up the new config.\n"; + print "[ NOTE ] - If this hangs, make sure 'anvil-daemon' is running on the peers.\n"; + $shell_call = $anvil->data->{path}{exe}{drbdadm}." adjust ".$drbd_resource; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + ($output, $return_code) = $anvil->System->call({ + background => 1, + shell_call => $shell_call, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + # Find which node is currently Primary and use that host to force primary to start sync. If none, + # force here. 
+ print "- Waiting for all peers to connect the new volume..."; + my $waiting = 1; + my $wait_until = time + 300; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { wait_until => $wait_until }}); + while ($waiting) + { + $anvil->DRBD->get_status({debug => 2}); + my $peers_connected = 1; + my $disks_ready = 0; + my $drbd_peer_count = keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { drbd_peer_count => $drbd_peer_count }}); + if (not $drbd_peer_count) + { + $peers_connected = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); + } + foreach my $this_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}}) + { + my $host_uuid = $anvil->Get->host_uuid_from_name({host_name => $this_host_name}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:this_host_name' => $this_host_name, + 's2:host_uuid' => $host_uuid, + }}); + next if $host_uuid eq $anvil->Get->host_uuid; + + my $connection_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$this_host_name}{'connection-state'}; + my $node_id = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$this_host_name}{'peer-node-id'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:connection_state' => $connection_state, + 's2:node_id' => $node_id, + }}); + + if (lc($connection_state) ne "connected") + { + $peers_connected = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); + } + } + + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
peers_connected => $peers_connected }}); + if ($peers_connected) + { + # Make sure all disks are attached. + $disks_ready = 1; + $anvil->data->{peers}{$short_host_name}{disk_state} = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{devices}{volume}{$next_drbd_volume}{'disk-state'}; + $anvil->data->{peers}{$short_host_name}{role} = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{role}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "peers::${short_host_name}::disk_state" => $anvil->data->{peers}{$short_host_name}{disk_state}, + "peers::${short_host_name}::role" => $anvil->data->{peers}{$short_host_name}{role}, + }}); + foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}}) + { + my $peer_disk_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$peer_name}{volume}{$next_drbd_volume}{'peer-disk-state'}; + my $replication_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$peer_name}{volume}{$next_drbd_volume}{'replication-state'}; + my $role = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$peer_name}{'peer-role'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:peer_name' => $peer_name, + 's2:peer_disk_state' => $peer_disk_state, + 's3:replication_state' => $replication_state, + 's4:role' => $role, + }}); + if (lc($replication_state) ne "established") + { + $disks_ready = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disks_ready => $disks_ready }}); + } + if ((not $peer_disk_state) or (lc($peer_disk_state) eq "diskless")) + { + $disks_ready = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disks_ready => $disks_ready }}); + } + + 
$anvil->data->{peers}{$peer_name}{disk_state} = $peer_disk_state; + $anvil->data->{peers}{$peer_name}{role} = $role; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "peers::${peer_name}::disk_state" => $anvil->data->{peers}{$peer_name}{disk_state}, + }}); + } + } + + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disks_ready => $disks_ready }}); + if ($disks_ready) + { + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + else + { + if (time > $wait_until) + { + print " Failed!\n[ Error ] - The peers did not connect in the expected period of time.\n"; + $anvil->nice_exit({exit_code => 1}); + } + sleep 2; + } + } + print " Done!\n"; + + print "- Peers are connected! Checking if the new volume requires initial sync.\n"; + my $all_inconsistent = 1; + my $primary_on_host = ""; + foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{peers}}) + { + my $disk_state = $anvil->data->{peers}{$peer_name}{disk_state}; + my $role = $anvil->data->{peers}{$peer_name}{role}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + peer_name => $peer_name, + disk_state => $disk_state, + role => $role, }}); - #print "- Target: [".$device_target."], boot: [".$say_boot."], path: [".$device_path."], cache: [".$driver_cache."], driver type: [".$driver_type."]\n"; - print "- Target: [".$device_target."], boot: [".$say_boot."], path: [".$device_path."], Available space: [".$anvil->Convert->bytes_to_human_readable({'bytes' => $max_free_space})."]\n"; + if (lc($disk_state) ne "inconsistent") + { + $all_inconsistent = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_inconsistent => $all_inconsistent }}); + } + if (lc($role) eq "primary") + { + $primary_on_host = $peer_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 
2, list => { primary_on_host => $primary_on_host }}); + } + } + + if ($all_inconsistent) + { + print "- Initial sync required!\n"; + my $shell_call = $anvil->data->{path}{exe}{drbdadm}." primary ".$drbd_resource." --force"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - # What are we doing? - if ($anvil->data->{switches}{grow}) + print Dumper %{$anvil->data->{peers}}; + die; + + # Which node should be forced primary? + my $already_primary = 1; + if (not $primary_on_host) { - manage_disk_grow($anvil, $drbd_resource, $drbd_volume, $max_free_space); + # We'll make it primary. + $primary_on_host = $short_host_name; + $already_primary = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + primary_on_host => $primary_on_host, + already_primary => $already_primary, + }}); + } + + my $primary_on_host_uuid = $anvil->Get->host_uuid_from_name({host_name => $primary_on_host}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { primary_on_host_uuid => $primary_on_host_uuid }}); + if ($primary_on_host_uuid eq $anvil->Get->host_uuid) + { + print "- Forcing primary locally... "; + my ($output, $return_code) = $anvil->System->call({debug => 2, shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + # Return code of '0' is success. + if ($return_code) + { + print "Failed! +[ Error ] - There was a problem trying to force the new volume: [".$drbd_resource."/".$next_drbd_volume."] to Primary. +[ Error ] - Attempted this using the shell call: [".$shell_call."]. +[ Error ] - Expected the return code '0' but got: [".$return_code."]. The output, if any, was: +========== +".$output." +========== +[ Error ] - Once corrected, please manually add the new volume to the server. 
+"; + $anvil->nice_exit({exit_code => 1}); + } + + # Now demote it again. + $shell_call = $anvil->data->{path}{exe}{drbdadm}." secondary ".$drbd_resource; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Success!\n"; + } + else + { + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + use_ip => $use_ip, + use_network => $use_network, + }}); + print " - The resource is primary on the peer: [".$short_host_name."], forcing primary there via: [".$use_ip." (".$use_network.")]"; + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + if ($return_code) + { + print "Failed! +[ Error ] - There was a problem trying to force the new volume: [".$drbd_resource."/".$next_drbd_volume."] to Primary. +[ Error ] - Attempted this using the shell call: [".$shell_call."]. +[ Error ] - Expected the return code '0' but got: [".$return_code."]. The output, if any, was: +========== +".$output." +========== +[ Error ] - Once corrected, please manually add the new volume to the server. +"; + $anvil->nice_exit({exit_code => 1}); + } + + # Now demote it again. + $shell_call = $anvil->data->{path}{exe}{drbdadm}." 
secondary ".$drbd_resource; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + print "Success!\n"; } } - elsif ($anvil->data->{switches}{add}) + else { - manage_disk_add($anvil, $drbd_resource); + print "Initial sync does not appear to be required.\n"; } - return(0); -} - -sub manage_disk_add -{ - my ($anvil, $drbd_resource) = @_; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { drbd_resource => $drbd_resource }}); - - my $anvil_uuid = defined $anvil->data->{switches}{anvil_uuid} ? $anvil->data->{switches}{anvil_uuid} : $anvil->Cluster->get_anvil_uuid(); - my $short_host_name = $anvil->Get->short_host_name; - my $server_name = $anvil->data->{switches}{server_name}; - my $from_source = get_definition_source($anvil); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - anvil_uuid => $anvil_uuid, - short_host_name => $short_host_name, - server_name => $server_name, - from_source => $from_source, - }}); - - # Are they asking for an available amount of space? - my $error_note = q| -[ Note ] - The size can be in percent, ie: '50%' or '100%', a number in bytes, or a human-readable size. - - Human readable sizes must NOT have a space between the number and letter suffix. Also, base2 - - vs base10 notation! Ie: '1GiB' = 1,073,741,824 bytes', '1GB' == '1,000,000,000 bytes'. A single - - letter used to denote size will be interpreted as base2. ie: '1G == 1GiB'. -|; - - # Do we have a storage group? - if (not $anvil->data->{switches}{'storage-group'}) + # Is the server running? If so, where. + print "- Ready to add the new disk. 
Checking if the server is running...\n"; + my $server_host = ""; + foreach my $host_type ("node", "dr") { - print "Please specify a storage group to use to add the new drive to.\n"; - short_storage_groups($anvil, $anvil_uuid); - $anvil->nice_exit({exit_code => 1}); + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) + { + my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; + my $shell_call = $anvil->data->{path}{exe}{'anvil-special-operations'}." --task refresh-drbd-resource --resource ".$drbd_resource.$anvil->Log->switches; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:short_host_name' => $short_host_name, + 's2:host_uuid' => $host_uuid, + 's3:shell_call' => $shell_call, + }}); + + if ($host_uuid eq $anvil->Get->host_uuid) + { + $anvil->Server->find(); + } + else + { + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + $anvil->Server->find({target => $use_ip }); + } + + if ((exists $anvil->data->{server}{location}{$server_name}) && + ($anvil->data->{server}{location}{$server_name}{host_name})) + { + my $this_host = defined $anvil->data->{server}{location}{$server_name}{host_name} ? $anvil->data->{server}{location}{$server_name}{host_name} : ""; + my $server_status = defined $anvil->data->{server}{location}{$server_name}{status} ? $anvil->data->{server}{location}{$server_name}{status} : ""; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + this_host => $this_host, + server_status => $server_status, + }}); + if ($server_status eq "running") + { + # Found it. 
+ $server_host = $this_host; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_host => $server_host }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + print "- The server is running on this host, we'll attach the disk here.\n"; + } + else + { + print "- The server is running on: [".$server_host."], we'll attach the disk there.\n"; + } + last; + } + } + } } - # Make sure that the passed - my $storage_group_switch = $anvil->data->{switches}{'storage-group'}; - my $storage_group_uuid = ""; - my $storage_group_name = ""; - if (exists $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_name}{$storage_group_switch}) + my $offline = 0; + if (not $server_host) { - $storage_group_uuid = $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_name}{$storage_group_switch}{storage_group_uuid}; - $storage_group_name = $storage_group_switch; + print "- The server isn't running anywhere, we'll attach the disk offline on this host.\n"; + $server_host = $short_host_name; + $offline = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - storage_group_uuid => $storage_group_uuid, - storage_group_name => $storage_group_name, + server_host => $server_host, + offline => $offline, }}); } - elsif (exists $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_uuid}{$storage_group_switch}) + + $shell_call = $anvil->data->{path}{exe}{setsid}." --wait ".$anvil->data->{path}{exe}{virsh}." attach-disk ".$server_name." "; + $shell_call .= "/dev/drbd/by-res/".$drbd_resource."/".$next_drbd_volume." ".$new_device_target." "; + $shell_call .= "--persistent --targetbus ".$disk_device_bus." "; + $shell_call .= "--cache ".$disk_cache." 
"; + $shell_call .= "--io ".$disk_io_policy; + $shell_call .= "--sourcetype block --subdriver raw"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + if ($offline) { - $storage_group_uuid = $storage_group_switch; - $storage_group_name = $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_uuid}{$storage_group_switch}{group_name}; + # Define the VM, if needed, then add the drive, dump the config and push it out. + print "- Defining the server: [".$server_name."] to prepare for 'virsh' modification of the server.\n"; + update_definition($anvil, "define", ""); + + # Update the definition. + print "- Adding the drive to the definition now.\n"; + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - storage_group_uuid => $storage_group_uuid, - storage_group_name => $storage_group_name, + output => $output, + return_code => $return_code, }}); - } - - # Did we get a valid disk size? - my $free_space = $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_uuid}{$storage_group_uuid}{free_space}; - my $add_size = $anvil->data->{switches}{add}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { add_size => $add_size }}); - if ($add_size =~ /^(\d+)%$/) - { - # This is valid - my $percent = ".".$1; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { percent => $percent }}); - $add_size = int($free_space * $percent); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { add_size => $add_size }}); - } - elsif ($add_size !~ /\d/) - { - # No digits, probably didn't set a value at all. - print "\n[ Error ] - Please specify the size you would like to grow this disk by. 
The maximum size is: [".$anvil->Convert->bytes_to_human_readable({'bytes' => $free_space})."].\n"; - print $error_note."\n"; - $anvil->nice_exit({exit_code => 1}); + print "- Updating the stored definition and undefining the server now...\n"; + update_definition($anvil, "undefine", ""); + print "Done!\n"; + $anvil->nice_exit({exit_code => 0}); } - elsif ($add_size !~ /^\d+$/) + else { - # Size is not in bytes, try to convert it. - my $bytes = $anvil->Convert->human_readable_to_bytes({ - debug => 2, - size => $add_size, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'bytes' => $bytes }}); - if ($bytes =~ /^\d+$/) + # Add the drive live, dump the new definition and push it out. + my $host_uuid = $anvil->Get->host_uuid_from_name({host_name => $server_host}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_uuid => $host_uuid }}); + if ($host_uuid eq $anvil->Get->host_uuid) { - $add_size = $bytes; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { add_size => $add_size }}); + # Do the add here. + print "- Adding the drive to the server directly...\n"; + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + print "- Updating the stored definition and undefining the server now...\n"; + update_definition($anvil, "undefine", ""); + print "Done!\n"; + $anvil->nice_exit({exit_code => 0}); } else { - # Not a valid size. - print "\n[ Error ] - The requested size: [".$add_size."] could not be interpreted.\n"; - print $error_note."\n"; - $anvil->nice_exit({exit_code => 1}); + # Do the add on the target. 
+ my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_ip => $use_ip }}); + print " - Adding the drivve to the server using its host: [".$server_host."] via: [".$use_ip."]...\n"; + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + + # Get the updated definition file. + my $shell_call = $anvil->data->{path}{exe}{setsid}." --wait ".$anvil->data->{path}{exe}{virsh}." dumpxml --inactive ".$server_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + (my $virsh_definition, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + virsh_definition => $virsh_definition, + error => $error, + return_code => $return_code, + }}); + + # Make sure the $output is valid XML. + my $problem = $anvil->Server->parse_definition({ + server => $server_name, + source => "from_virsh", + definition => $virsh_definition, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + + if ($problem) + { + # Failed?! + print " +[ Error ] - The definition file read from the remote host appears to be invalid after trying to attach the +[ Error ] - disk! It is unsafe to update the on disk and in DB definition. It's likely the attach has failed. +[ Error ] - Manual update to the server is likely required now. 
+"; + $anvil->nice_exit({exit_code => 1}); + } + + print "- Updating the stored definition and undefining the server now...\n"; + update_definition($anvil, "undefine", $virsh_definition); + print "Done!\n"; + $anvil->nice_exit({exit_code => 0}); } } - # Get the next free minor number - my ($free_minor, undef) = $anvil->DRBD->get_next_resource({ - debug => 2, - minor_only => 1, - anvil_uuid => $anvil_uuid, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { free_minor => $free_minor }}); - - return(0); } @@ -544,7 +1486,7 @@ sub manage_disk_grow ### TODO: Make this work without the peer node being online. # The server is allowed to be running, but both nodes and any DR hosts this is replicating to # needs to be online. - my $all_online = check_drbd_peer_access($anvil, $from_source, $drbd_volume); + my $all_online = check_drbd_peer_access($anvil); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_online => $all_online }}); if (not $all_online) @@ -588,7 +1530,7 @@ sub manage_disk_grow } # Test that we've lost access while waiting for the answer. - my $all_online = check_drbd_peer_access($anvil, $from_source, $drbd_volume); + my $all_online = check_drbd_peer_access($anvil); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_online => $all_online }}); if (not $all_online) @@ -860,7 +1802,7 @@ sub manage_optical if (not -f $iso) { print "[ Error ] - The target: [".$iso."] doesn't exist, can't insert it into the optical drive.\n"; - update_definition($anvil, "undefine"); + update_definition($anvil, "undefine", ""); $anvil->nice_exit({exit_code => 1}); } else @@ -872,7 +1814,7 @@ sub manage_optical # If the server is running, update the on-disk and in-DB definition. 
print "Defining the server: [".$server_name."] to prepare for 'virsh' modification of the server.\n"; - update_definition($anvil, "define"); + update_definition($anvil, "define", ""); # Now we can modify the server using virsh. if ($anvil->data->{switches}{insert}) @@ -907,7 +1849,7 @@ sub manage_optical print "'virsh' Output: [".$output."]\n"; print "Updating the stored definition and undefining the server now:\n"; - update_definition($anvil, "undefine"); + update_definition($anvil, "undefine", ""); print "Done!\n"; return(0); @@ -1089,16 +2031,12 @@ sub show_server_details sub check_drbd_peer_access { - my ($anvil, $drbd_resource, $drbd_volume) = @_; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 's01:drbd_resource' => $drbd_resource, - 's02:drbd_volume' => $drbd_volume, - }}); + my ($anvil) = @_; my $all_online = 1; foreach my $this_host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{drbd_node}}) { - my $host_uuid = $anvil->Get->host_uuid_from_name({debug => 2, host_name => $this_host}); + my $host_uuid = $anvil->Get->host_uuid_from_name({host_name => $this_host}); my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:this_host' => $this_host, @@ -1172,9 +2110,9 @@ sub get_max_free_space my $drbd_path = $anvil->data->{drbd}{drbd_node}{$this_host}{config}{resource}{$drbd_resource}{volume}{$drbd_volume}{drbd_path}; my $drbd_path_by_res = $anvil->data->{drbd}{drbd_node}{$this_host}{config}{resource}{$drbd_resource}{volume}{$drbd_volume}{drbd_path_by_res}; my $backing_lv = $anvil->data->{drbd}{drbd_node}{$this_host}{config}{resource}{$drbd_resource}{volume}{$drbd_volume}{backing_lv}; - my $lv_name = $anvil->data->{lvm}{host_name}{$this_host}{lv_path}{$backing_lv}{scan_lvm_lv_name};; - my $on_vg = $anvil->data->{lvm}{host_name}{$this_host}{lv}{$lv_name}{scan_lvm_lv_on_vg}; - my $vg_free_space = 
$anvil->data->{lvm}{host_name}{$this_host}{vg}{$on_vg}{scan_lvm_vg_free}; + my $lv_name = $anvil->data->{lvm}{host_name}{$this_host}{lv_path}{$backing_lv}{scan_lvm_lv_name} ? $anvil->data->{lvm}{host_name}{$this_host}{lv_path}{$backing_lv}{scan_lvm_lv_name} : ""; + my $on_vg = $anvil->data->{lvm}{host_name}{$this_host}{lv}{$lv_name}{scan_lvm_lv_on_vg} ? $anvil->data->{lvm}{host_name}{$this_host}{lv}{$lv_name}{scan_lvm_lv_on_vg} : ""; + my $vg_free_space = $anvil->data->{lvm}{host_name}{$this_host}{vg}{$on_vg}{scan_lvm_vg_free} ? $anvil->data->{lvm}{host_name}{$this_host}{vg}{$on_vg}{scan_lvm_vg_free} : 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's01:this_host' => $this_host, 's02:drbd_path' => $drbd_path, @@ -1453,8 +2391,11 @@ sub validate_server # Update the definition on disk and in the DB, and define or undefine if requested. sub update_definition { - my ($anvil, $task) = @_; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { task => $task }}); + my ($anvil, $task, $definition) = @_; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + task => $task, + definition => $definition, + }}); my $short_host_name = $anvil->Get->short_host_name; my $server_name = $anvil->data->{switches}{server_name}; @@ -1499,39 +2440,55 @@ sub update_definition my $disk_definition = $anvil->data->{server}{$short_host_name}{$server_name}{from_disk}{xml} ? $anvil->data->{server}{$short_host_name}{$server_name}{from_disk}{xml} : ""; my $virsh_definition = $anvil->data->{server}{$short_host_name}{$server_name}{from_virsh}{xml} ? 
$anvil->data->{server}{$short_host_name}{$server_name}{from_virsh}{xml} : ""; - my $use_definition = ""; - if (($server_running_here) or (($server_state eq "running") && ($virsh_definition))) + my $use_definition = $virsh_definition; + if (not $use_definition) { - # Get the live definition - if ($server_running_here) + if (($server_running_here) or (($server_state eq "running") && ($virsh_definition))) { - my $shell_call = $anvil->data->{path}{exe}{setsid}." --wait ".$anvil->data->{path}{exe}{virsh}." dumpxml --inactive ".$server_name; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - - my ($live_virsh_definition, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - live_virsh_definition => $live_virsh_definition, - return_code => $return_code, - }}); - - my ($problem) = $anvil->Server->parse_definition({ - server => $server_name, - source => "from_virsh", - definition => $live_virsh_definition, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); - if (not $problem) + # Get the live definition + if ($server_running_here) { - $use_definition = $live_virsh_definition; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_definition => $use_definition }}); + my $shell_call = $anvil->data->{path}{exe}{setsid}." --wait ".$anvil->data->{path}{exe}{virsh}." 
dumpxml --inactive ".$server_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - $anvil->Server->parse_definition({ - debug => 3, - host => $short_host_name, - server => $server_name, - source => "from_virsh", + my ($live_virsh_definition, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + live_virsh_definition => $live_virsh_definition, + return_code => $return_code, + }}); + + my ($problem) = $anvil->Server->parse_definition({ + server => $server_name, + source => "from_virsh", definition => $live_virsh_definition, }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + if (not $problem) + { + $use_definition = $live_virsh_definition; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_definition => $use_definition }}); + + $anvil->Server->parse_definition({ + debug => 3, + host => $short_host_name, + server => $server_name, + source => "from_virsh", + definition => $live_virsh_definition, + }); + } + else + { + $use_definition = $virsh_definition; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_definition => $use_definition }}); + + $anvil->Server->parse_definition({ + debug => 3, + host => $short_host_name, + server => $server_name, + source => "from_virsh", + definition => $virsh_definition, + }); + } } else { @@ -1549,31 +2506,18 @@ sub update_definition } else { - $use_definition = $virsh_definition; + $use_definition = $disk_definition; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_definition => $use_definition }}); $anvil->Server->parse_definition({ debug => 3, host => $short_host_name, server => $server_name, - source => "from_virsh", + source => 
"from_disk", definition => $virsh_definition, }); } } - else - { - $use_definition = $disk_definition; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_definition => $use_definition }}); - - $anvil->Server->parse_definition({ - debug => 3, - host => $short_host_name, - server => $server_name, - source => "from_disk", - definition => $virsh_definition, - }); - } if (not $use_definition) { diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index 8bb55cfa..16a3d5a4 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -846,10 +846,11 @@ sub startup_resource # Is the current resource up locally already? If it is, we're done. my $server = $anvil->data->{job}{server_name}; my $short_host_name = $anvil->data->{job}{short_host_name}; - my $role = defined $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server}{role} ? $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server}{role} : ""; + my $role = defined $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server}{role} ? $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server}{role} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 'job::server' => $anvil->data->{job}{server_name}, - role => $role, + server => $server, + short_host_name => $short_host_name, + role => $role, }}); if ((lc($role) ne "secondary") && (lc($role) ne "primary")) diff --git a/tools/anvil-safe-stop b/tools/anvil-safe-stop index 6c6b99cf..0ea2962e 100755 --- a/tools/anvil-safe-stop +++ b/tools/anvil-safe-stop @@ -274,9 +274,9 @@ sub process_servers 's2:progress_steps' => $progress_steps, }}); - # If we have one or more local servers, we need to know if both of us are in the cluster. If we're - # not, or the peer isn't, we can't migrate. - my $can_migrate = 0; + # If we have one or more local servers, we need to know if both subnodes are in the node's cluster. 
+ # If we're not, or the peer isn't, we can't migrate. + my $can_migrate = 1; if ($server_count) { my $problem = $anvil->Cluster->parse_cib({debug => 2}); @@ -287,18 +287,20 @@ sub process_servers }}); if ($problem) { + # We're not in the node's cluster, we can't migrate. $can_migrate = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }}); } elsif ((not $anvil->data->{cib}{parsed}{'local'}{ready}) or (not $anvil->data->{cib}{parsed}{peer}{ready})) { + # One of the subnodes is not in the cluster, so we can't migrate. $can_migrate = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }}); } if ((not $anvil->data->{switches}{'stop-servers'}) && (not $can_migrate)) { - # Abort. + # We would have to stop the servers, and the user didn't tell us to do that, abort. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0372"}); $anvil->Job->update_progress({progress => 100, message => "error_0372"}); $anvil->nice_exit({exit_code => 1}); diff --git a/tools/anvil-special-operations b/tools/anvil-special-operations new file mode 100755 index 00000000..d434d196 --- /dev/null +++ b/tools/anvil-special-operations @@ -0,0 +1,120 @@ +#!/usr/bin/perl +# +# This program has no specific purpose. It's a general program for performing certain special tasks that +# can't be done otherwise in a reliable or efficient way. +# +# Exit codes; +# 0 = Normal exit. +# 1 = No database connection. + + +use strict; +use warnings; +use Anvil::Tools; +require POSIX; +use Text::Diff; +use Data::Dumper; + +my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0]; +my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0]; +if (($running_directory =~ /^\./) && ($ENV{PWD})) +{ + $running_directory =~ s/^\./$ENV{PWD}/; +} + +# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete. 
+$| = 1; + +my $anvil = Anvil::Tools->new(); + +# Read switches (target ([user@]host[:port]) and the file with the target's password. +$anvil->Get->switches({list => [ + "task", + "resource", + ], man => $THIS_FILE}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); + +$anvil->Database->connect(); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"}); +if (not $anvil->data->{sys}{database}{connections}) +{ + # No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try + # again after we exit. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0306"}); + sleep 10; + $anvil->nice_exit({exit_code => 1}); +} + +if ($anvil->data->{switches}{'job-uuid'}) +{ + $anvil->Job->clear(); + $anvil->Job->get_job_details({debug => 2}); + $anvil->Job->update_progress({ + progress => 1, + job_picked_up_by => $$, + job_picked_up_at => time, + message => "message_0311", + }); +} +if ($anvil->data->{switches}{task} eq "refresh-drbd-resource") +{ + refresh_drbd_resource($anvil); +} + +$anvil->nice_exit({exit_code => 0}); + + +############################################################################################################# +# Functions # +############################################################################################################# + +# This function is needed to call 'drbdadm adjust ' in a background call from a remote host. This is +# needed for adding new volumes to an existing resource, as the call from 'drbdadm adjust ' won't return +# until the call is run on all hosts. 
+sub refresh_drbd_resource
+{
+	my ($anvil) = @_;
+	
+	my $resource = $anvil->data->{switches}{resource};
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }});
+	
+	if (not $resource)
+	{
+		# No resource was given, so there is nothing to adjust. Record the failure and exit with '1'.
+		$anvil->Job->update_progress({
+			progress   => 100,
+			message    => "error_0419",
+			job_status => "failed",
+		});
+		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0419"});
+		$anvil->nice_exit({exit_code => 1});
+	}
+	
+	$anvil->Job->update_progress({
+		progress => 10,
+		message  => "message_0312,!!resource!".$resource."!!",
+	});
+	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "message_0312", variables => { resource => $resource }});
+	# The resource name comes straight from the command line; single-quote it (escaping embedded quotes) before it goes into the shell string.
+	my $shell_call = $anvil->data->{path}{exe}{drbdadm}." adjust '".join("'\\''", split(/'/, $resource, -1))."'";
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
+	my ($output, $return_code) = $anvil->System->call({
+		shell_call => $shell_call,
+		background => 1,
+	});
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+		output      => $output,
+		return_code => $return_code,
+	}});
+	
+	$anvil->Job->update_progress({
+		progress => 100,
+		message  => "message_0313,!!resource!".$resource."!!",
+	});
+	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "message_0313", variables => { resource => $resource }});
+	
+	$anvil->nice_exit({exit_code => 0});
+	# NOTE(review): nice_exit() presumably ends the program, which would make the return below unreachable - confirm.
+	return(0);
+}
\ No newline at end of file
diff --git a/tools/anvil-update-system b/tools/anvil-update-system
index f73c0d44..07c1926d 100755
--- a/tools/anvil-update-system
+++ b/tools/anvil-update-system
@@ -15,6 +15,7 @@
 # 1 = No database connections available.
 # 2 = The job UUID was passed, but it wasn't valid.
 # 3 = It looks like the update failed, reset progress to '0'.
+# 4 = Failed to withdraw the node from the cluster. # # TODO: # - Rebuild this to be 'striker-update-system' and have it update local strikers and all nodes. @@ -36,10 +37,12 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) my $anvil = Anvil::Tools->new(); -# Read switches -$anvil->data->{switches}{'job-uuid'} = ""; -$anvil->Get->switches; -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); +# Read switches (target ([user@]host[:port]) and the file with the target's password. +$anvil->Get->switches({list => [ + "clear-cache", + "no-reboot"], man => $THIS_FILE}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); # Log that we've started. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); @@ -90,7 +93,49 @@ my $reboot_needed = $anvil->System->reboot_needed({debug => 2}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }}); if ($reboot_needed) { - update_progress($anvil, 100, "message_0039"); + if (not $anvil->data->{switches}{'no-reboot'}) + { + # Clear maintenance mode. + $anvil->System->maintenance_mode({set => 0}); + + # Record that we're rebooting so that 'striker-update-cluster' knows to wait for a reboot. + if ($anvil->data->{switches}{'job-uuid'}) + { + my $query = " +UPDATE + jobs +SET + job_data = 'rebooted', + modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)." +WHERE + job_uuid = ".$anvil->Database->quote($anvil->data->{switches}{'job-uuid'})." 
+;"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }}); + $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__}); + } + + # Register a job to reboot. + update_progress($anvil, 98, "message_0318"); + my ($job_uuid) = $anvil->Database->insert_or_update_jobs({ + file => $THIS_FILE, + line => __LINE__, + job_command => $anvil->data->{path}{exe}{'anvil-manage-power'}." --reboot -y".$anvil->Log->switches, + job_data => "", + job_name => "reboot::system", + job_title => "job_0009", + job_description => "job_0006", + job_progress => 0, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); + + # Record that we're going to reboot now. + update_progress($anvil, 100, "message_0317"); + } + else + { + # Record that a reboot is needed. + update_progress($anvil, 100, "message_0039"); + } } else { @@ -140,17 +185,82 @@ sub run_os_update # This needs to be set to avoid warnings when called without a job-uuid. $anvil->data->{sys}{last_update} = 0; + # Make sure that, if we're a node, we're out of the cluster. 
+ my $host_type = $anvil->Get->host_type(); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_type => $host_type }}); + if ($host_type eq "node") + { + # Call anvil-safe-stop + update_progress($anvil, 3, "message_0314"); + + my $problem = $anvil->Cluster->parse_cib({debug => 3}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { problem => $problem }}); + if (not $problem) + { + # Call anvil-safe-stop + update_progress($anvil, 4, "message_0315"); + + my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-stop'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + if ($return_code) + { + # Something went wrong, abort. + update_progress($anvil, 100, "error_0420,!!return_code!".$return_code."!!,!!output!".$output."!!"); + + # Set the job_data to 'failed' so that striker-update-cluster' knows to abort. + if ($anvil->data->{switches}{'job-uuid'}) + { + my $query = " +UPDATE + jobs +SET + job_data = 'failed', + modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)." +WHERE + job_uuid = ".$anvil->Database->quote($anvil->data->{switches}{'job-uuid'})." +;"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }}); + $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__}); + } + + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, key => "error_0035", variables => { output => $output } }); + $anvil->nice_exit({exit_code => 4}); + } + } + } + + # Should we clear the cache? + if ($anvil->data->{switches}{'clear-cache'}) + { + # Yes. 
+ my $shell_call = $anvil->data->{path}{exe}{dnf}." clean all"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + update_progress($anvil, 5, "message_0316"); + } + # NOTE: We run this directly to better monitor progress and update the progress. my $transaction_shown = 0; my $success = 0; my $to_update = 0; my $percent_step = 0; - my $progress = 5; + my $progress = 6; my $counted_lines = 0; my $next_step = 0; my $verifying = 0; my $output = ""; - my $shell_call = $anvil->data->{path}{exe}{dnf}." clean expire-cache && ".$anvil->data->{path}{exe}{dnf}." -y update --best --allowerasing; ".$anvil->data->{path}{exe}{echo}." return_code:\$?"; + my $shell_call = $anvil->data->{path}{exe}{dnf}." -y update; ".$anvil->data->{path}{exe}{echo}." return_code:\$?"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }}); open (my $file_handle, $shell_call." 2>&1 |") or $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, priority => "err", key => "log_0014", variables => { shell_call => $shell_call, error => $! }}); while(<$file_handle>) @@ -162,14 +272,6 @@ sub run_os_update $anvil->data->{counts}{lines}++; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "counts::lines" => $anvil->data->{counts}{lines}, line => $line }}); - if ($line =~ /^kernel /) - { - # Reboot will be needed. 
- $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0687", variables => { reason => "#!string!log_0690!#" }}); - my $reboot_needed = $anvil->System->reboot_needed({set => 1}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { reboot_needed => $reboot_needed }}); - } - # If there were no updates, let the user know. if ($line =~ /^Nothing to do/i) { @@ -286,6 +388,37 @@ sub run_os_update my ($systemctl_output, $return_code) = $anvil->System->call({debug => 3, shell_call => $anvil->data->{path}{exe}{systemctl}." daemon-reload", source => $THIS_FILE, line => __LINE__}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { systemctl_output => $systemctl_output, return_code => $return_code }}); + ### See if the kernel has been updated. + # Get the newest installed kernel + $shell_call = $anvil->data->{path}{exe}{rpm}." -q kernel | ".$anvil->data->{path}{exe}{'sort'}." | ".$anvil->data->{path}{exe}{tail}." -n 1"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + (my $installed_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + installed_kernel => $installed_kernel, + return_code => $return_code, + }}); + $installed_kernel =~ s/^kernel-(\d+.\d+\.\d+-\d+)\..*$/$1/; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { installed_kernel => $installed_kernel }}); + + # Get the running kernel + $shell_call = $anvil->data->{path}{exe}{uname}." 
-r"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + (my $active_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + active_kernel => $active_kernel, + return_code => $return_code, + }}); + $active_kernel =~ s/(\d+.\d+\.\d+-\d+)\..*$/$1/; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active_kernel => $active_kernel }}); + + if ($installed_kernel ne $active_kernel) + { + # Reboot needed + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0687", variables => { reason => "#!string!log_0690!#" }}); + my $reboot_needed = $anvil->System->reboot_needed({set => 1}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }}); + } + # Did it work? if (not $success) { diff --git a/tools/striker-collect-debug b/tools/striker-collect-debug new file mode 100755 index 00000000..8710eb10 --- /dev/null +++ b/tools/striker-collect-debug @@ -0,0 +1,797 @@ +#!/usr/bin/perl +# +# This program will collect data from all accessible machines and compile it into a common tarball. This is +# designed to make it easier to diagnose faults. +# +# Exit codes; +# 0 = Normal exit. +# 1 = No database connection. +# +# TODO: +# +# USAGE: +# + +use strict; +use warnings; +use Anvil::Tools; +require POSIX; +use Term::Cap; +use Text::Diff; +use Data::Dumper; + +my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0]; +my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0]; +if (($running_directory =~ /^\./) && ($ENV{PWD})) +{ + $running_directory =~ s/^\./$ENV{PWD}/; +} + +# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete. 
+$| = 1; + +my $anvil = Anvil::Tools->new(); + +# Read switches (target ([user@]host[:port]) and the file with the target's password. +$anvil->Get->switches({list => [ + "anvil", + "hosts", + "output-file"], man => $THIS_FILE}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); + +# Connect to the database(s). If we have no connections, we'll proceed anyway as one of the 'run_once' tasks +# is to setup the database server. +$anvil->Database->connect(); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"}); +if (not $anvil->data->{sys}{database}{connections}) +{ + # No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try + # again after we exit. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0305"}); + sleep 10; + $anvil->nice_exit({exit_code => 1}); +} + +# Make sure we're running as 'root' +# $< == real UID, $> == effective UID +if (($< != 0) && ($> != 0)) +{ + # Not root + print $anvil->Words->string({key => "error_0005"})."\n"; + $anvil->nice_exit({exit_code => 1}); +} + +# Make sure we're a striker. +if ($anvil->Get->host_type ne "striker") +{ + print "This has to be run on a Striker dashboard.\n"; + $anvil->nice_exit({exit_code => 1}); +} + +# Make sure the collection directory exists. 
+$anvil->data->{sys}{date_and_time} = $anvil->Get->date_and_time({file_name => 1}); +$anvil->data->{sys}{compile_directory} = "/tmp/anvil-debug_".$anvil->data->{sys}{date_and_time}; +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "sys::date_and_time" => $anvil->data->{sys}{date_and_time}, + "sys::compile_directory" => $anvil->data->{sys}{compile_directory}, +}}); + +print "Data collection has begun.\n"; +print "Depending on how many systems we're collecting from, this could take a while.\n"; + +# Get the directory portion of the output path and make sure it exists. +my $tarball = process_output($anvil); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { tarball => $tarball }}); + +process_switches($anvil); + +collect_data($anvil); + +# Create the tarball now. +print "Data collection complete, creating the tarball now... "; +my $shell_call = $anvil->data->{path}{exe}{tar}." -cvjf ".$tarball." ".$anvil->data->{sys}{compile_directory}; +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:tarball' => $tarball, + 's2:shell_call' => $shell_call, +}}); + +my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, +}}); +print "Done!\n"; + +print "\n[ Complete ] - The debug data is here: [".$tarball."]\n"; +print "[ Warning ] - The collected logs likely include sensitive information! 
Share is carefully!\n";
+
+
+
+$anvil->nice_exit({exit_code => 0});
+
+
+#############################################################################################################
+# Functions                                                                                                 #
+#############################################################################################################
+# Works out the tarball's full path (default under /root, or from --output-file), creates its directory if needed, and returns the path.
+sub process_output
+{
+	my ($anvil) = @_;
+	
+	my $tarball = "/root/anvil-debug_".$anvil->data->{sys}{date_and_time}.".tar.bz2";
+	if ($anvil->data->{switches}{'output-file'})
+	{
+		my $new_directory = $anvil->data->{switches}{'output-file'};
+		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_directory => $new_directory }});
+		if ($new_directory !~ /^\//)
+		{
+			print "[ Error ] - The output path needs to be a path.\n";
+			$anvil->nice_exit({exit_code => 1});
+		}
+		else
+		{
+			# Use the user's path as given; the '.tar.bz2' suffix is added further down if it is missing.
+			$tarball = $new_directory;
+			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { tarball => $tarball }});
+		}
+	}
+	
+	# Split the path into directory and file. The file name is quoted (\Q..\E) so that regex metacharacters in it are matched literally.
+	my $output_file      = ($tarball =~ /^.*\/(.*)$/)[0];
+	my $output_directory = ($tarball =~ /^(.*?)\/\Q$output_file\E$/)[0];
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+		output_file      => $output_file,
+		output_directory => $output_directory,
+	}});
+	# If the path ended in '/', no file name was given, so use the default name. Otherwise, make sure the name ends in '.tar.bz2'.
+	if (not $output_file)
+	{
+		$output_file = "anvil-debug_".$anvil->data->{sys}{date_and_time}.".tar.bz2";
+		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output_file => $output_file }});
+	}
+	elsif ($output_file !~ /\.tar\.bz2$/)
+	{
+		$output_file .= ".tar.bz2";
+		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output_file => $output_file }});
+	}
+	# An empty directory means the file sits directly under '/', so there is nothing to create in that case either.
+	if (($output_directory) && ($output_directory ne "/"))
+	{
+		print "- Creating the output directory: [".$output_directory."]... ";
+		my $failed = $anvil->Storage->make_directory({directory => $output_directory});
+		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }});
+		if ($failed)
+		{
+			print "Failed!\nUnable to create the directory: [".$output_directory."]. The error should be logged.\n";
+			$anvil->nice_exit({exit_code => 1});
+		}
+	}
+	
+	$tarball = $output_directory."/".$output_file;
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { tarball => $tarball }});
+	
+	return($tarball);
+}
+# Creates the compile directory, then walks the known hosts (strikers, then nodes, then DR hosts) and collects data from each one.
+sub collect_data
+{
+	my ($anvil) = @_;
+	
+	my $failed = $anvil->Storage->make_directory({directory => $anvil->data->{sys}{compile_directory}});
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }});
+	if ($failed)
+	{
+		print "Failed to create the directory: [".$anvil->data->{sys}{compile_directory}."]. The error should be logged.\n";
+		$anvil->nice_exit({exit_code => 1});
+	}
+	# NOTE(review): 'collect_from' is set outside this function (not visible here). If it is unset, treat that as "no subset" instead of dying on the dereference.
+	my $hosts = ref($anvil->data->{collect_from}) eq "ARRAY" ? @{$anvil->data->{collect_from}} : 0;
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { hosts => $hosts }});
+	foreach my $host_type ("striker", "node", "dr")
+	{
+		foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
+		{
+			my $host_uuid       = $anvil->data->{sys}{hosts}{by_name}{$host_name};
+			my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
+			my $this_host_type  = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
+			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+				's1:host_name'       => $host_name,
+				's2:host_uuid'       => $host_uuid,
+				's3:short_host_name' => $short_host_name,
+				's4:this_host_type'  => $this_host_type,
+			}});
+			next if $host_type ne $this_host_type;
+			
+			# Are we collecting from a subset only?
+			if ($hosts)
+			{
+				# Yes, is this host one of them?
+				my $found = 0;
+				foreach my $this_host_uuid (@{$anvil->data->{collect_from}})
+				{
+					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+						host_uuid      => $host_uuid,
+						this_host_uuid => $this_host_uuid,
+					}});
+					if ($this_host_uuid eq $host_uuid)
+					{
+						$found = 1;
+						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { found => $found }});
+						last;
+					}
+				}
+				next if not $found;
+			}
+			
+			# Make sure there's a directory for this host.
+			my $target_directory = $anvil->data->{sys}{compile_directory}."/".$short_host_name;
+			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { target_directory => $target_directory }});
+			if (not -d $target_directory)
+			{
+				my $failed = $anvil->Storage->make_directory({
+					directory => $target_directory,
+					mode      => "777",	# NOTE(review): looks world-accessible, yet the collected logs are sensitive - consider a tighter mode.
+				});
+				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }});
+				if ($failed)
+				{
+					print "Failed to create the directory: [".$target_directory."]. The error should be logged.\n";
+					$anvil->nice_exit({exit_code => 1});
+				}
+			}
+			
+			# Is this the local host or a remote one?
+			if ($host_uuid eq $anvil->Get->host_uuid)
+			{
+				### Collecting local data.
+				collect_local_data($anvil, $target_directory);
+			}
+			else
+			{
+				# Collecting data from a remote machine
+				my $problem = collect_remote_data($anvil, $host_uuid, $target_directory);
+				if ($problem)
+				{
+					# Create a file saying we couldn't access this machine.
+					my $body = "No access to: [".$host_name."] found.\n";
+					my $file = $target_directory."/no_access.txt";
+					$anvil->Storage->write_file({
+						file      => $file,
+						body      => $body,
+						overwrite => 1,
+						backup    => 0,
+					});
+				}
+			}
+		}
+	}
+	
+	return(0);
+}
+
+sub collect_remote_data
+{
+	my ($anvil, $host_uuid, $target_directory) = @_;
+	
+	my $host_name       = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_name};
+	my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
+	my $this_host_type  = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
+	my $failed_body     = "File not copied from: [".$host_name."].\n";
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
+		's1:host_name'        => $host_name,
+		's2:host_uuid'        => $host_uuid,
+		's3:short_host_name'  => $short_host_name,
+		's4:this_host_type'   => $this_host_type,
+		's5:target_directory' => $target_directory,
+	}});
+	
+	# Dump the previous boot logs to a file.
+	print "\nGrabbing logs and data from the remote system: [".$short_host_name."].\n";
+	print "- Testing access...\n";
+	my $matches = $anvil->Network->find_access({
+		debug  => 2,
+		target => $host_name,
+	});
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }});
+	$anvil->data->{peer}{$short_host_name}{access}{ip}      = "";
+	$anvil->data->{peer}{$short_host_name}{access}{network} = "";
+	foreach my $preferred_network ("bcn", "mn", "ifn", "sn")
+	{
+		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }});
+		foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}})
+		{
+			next if $network_name !~ /^$preferred_network/;
+			my $target_ip   = $anvil->data->{network_access}{$network_name}{target_ip_address};
+			my $test_access = $anvil->Remote->test_access({target => $target_ip});
+			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
's1:network_name' => $network_name, + 's2:target_ip' => $target_ip, + 's3:test_access' => $test_access, + }}); + + if ($test_access) + { + # We're good. + print "- Found access over the network: [".$network_name."] using the target IP: [".$target_ip."]\n"; + $anvil->data->{peer}{$short_host_name}{access}{ip} = $target_ip; + $anvil->data->{peer}{$short_host_name}{access}{network} = $network_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "s1:peer::${short_host_name}::access::ip" => $anvil->data->{peer}{$short_host_name}{access}{ip}, + "s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network}, + }}); + } + } + } + + if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + print "No access!!\n"; + print "- Not able to collect data from this host, skipping.\n"; + return(1); + } + + print "- Writing out system logs from the previous boot... "; + my $shell_call = $anvil->data->{path}{exe}{journalctl}." -b -1 > /tmp/journalctl-previous-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + + # Copying the file + print "Done! Copying to here... 
"; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/tmp/journalctl-previous-boot.log", + destination => $target_directory."/", + }); + my $test_file = $target_directory."/tmp/journalctl-previous-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + if (not -e $test_file) + { + print "Done.\n"; + } + else + { + print "Failed!\n"; + print "- For some reason, this file was not collected.\n"; + $anvil->Storage->write_file({ + file => $test_file, + body => $failed_body, + overwrite => 1, + backup => 0, + }); + } + + # Dump the current boot logs + print "- Grabbing system logs from this boot... "; + $shell_call = $anvil->data->{path}{exe}{journalctl}." -b 0 > /tmp/journalctl-current-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + # Copying the file + print "Done! Copying to here... "; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/tmp/journalctl-current-boot.log", + destination => $target_directory."/", + }); + $test_file = $target_directory."/journalctl-current-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + if (not -e $test_file) + { + print "Done.\n"; + } + else + { + print "Failed!\n"; + print "- For some reason, this file was not collected.\n"; + $anvil->Storage->write_file({ + file => $test_file, + body => $failed_body, + overwrite => 1, + backup => 0, + }); + } + + # If we're a striker, dump the database also. + if ($this_host_type eq "striker") + { + # What's the password and address? 
+ if (not exists $anvil->data->{database}{$host_uuid}) + { + # The remote striker isn't known + print "- The host is a Striker, but we don't have database access info, skipping DB dump.\n"; + } + else + { + print "- Dumping and compressing remote database data, PLEASE BE PATIENT!... "; + my $pg_file = "/root/.pgpass"; + my $pg_body = "*:*:*:admin:".$anvil->data->{database}{$host_uuid}{password}; + $anvil->Storage->write_file({ + file => $pg_file, + body => $pg_body, + mode => "600", + overwrite => 0, + backup => 0, + }); + my $shell_call = $anvil->data->{path}{exe}{pg_dump}." -h ".$anvil->data->{peer}{$short_host_name}{access}{ip}." -U admin anvil 2>/dev/null | ".$anvil->data->{path}{exe}{bzip2}." --stdout > ".$target_directory."/anvil.out.bz2"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + # Failed + print "Failed!\n"; + print "Expected the return code '0', but got: [".$return_code."]. The error, if any, was:\n"; + print "========\n"; + print $output."\n"; + print "========\n"; + $anvil->nice_exit({exit_code => 1}); + } + unlink $pg_file; + print "Done!\n"; + } + } + + print "- Grabbing hosts file... 
"; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/etc/hosts", + destination => $target_directory."/", + }); + $test_file = $target_directory."/hosts"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + if (not -e $test_file) + { + print "Done.\n"; + } + else + { + print "Failed!\n"; + print "- For some reason, this file was not collected.\n"; + $anvil->Storage->write_file({ + file => $test_file, + body => $failed_body, + overwrite => 1, + backup => 0, + }); + } + + print "- Grabbing Anvil! log... "; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/var/log/anvil.log", + destination => $target_directory."/", + }); + $test_file = $target_directory."/anvil.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + if (not -e $test_file) + { + print "Done.\n"; + } + else + { + print "Failed!\n"; + print "- For some reason, this file was not collected.\n"; + $anvil->Storage->write_file({ + file => $test_file, + body => $failed_body, + overwrite => 1, + backup => 0, + }); + } + + # If this is a node, grab the shared files. + if ($this_host_type eq "node") + { + print "- Collecting the cluster information base (CIB)... "; + $shell_call = $anvil->data->{path}{exe}{pcs}." cluster cib > /tmp/cib.xml"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + # Copying the file + print "Done! Copying to here... 
"; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/tmp/cib.xml", + destination => $target_directory."/", + }); + my $test_file = $target_directory."/cib.xml"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + if (not -e $test_file) + { + print "Done.\n"; + } + else + { + print "Failed!\n"; + print "- For some reason, this file was not collected.\n"; + $anvil->Storage->write_file({ + file => $test_file, + body => $failed_body, + overwrite => 1, + backup => 0, + }); + } + } + + # If this is not a striker, collect definition files. + if ($this_host_type ne "striker") + { + print "- Collecting server definitions... "; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/mnt/shared/definitions", + destination => $target_directory."/", + }); + print "Done!\n"; + + print "- Collecting replicated storage config... "; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/etc/drbd.d", + destination => $target_directory."/", + }); + print "Done!\n"; + } + + return(0); +} + +sub collect_local_data +{ + my ($anvil, $target_directory) = @_; + + my $host_uuid = $anvil->Get->host_uuid(); + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:target_directory' => $target_directory, + 's2:host_uuid' => $host_uuid, + 's3:this_host_type' => $this_host_type, + }}); + + # Dump the previous boot logs to a file. + print "\nGrabbing logs and data from the local system.\n"; + print "- Grabbing system logs from the previous boot... "; + my $shell_call = $anvil->data->{path}{exe}{journalctl}." 
-b -1 > ".$target_directory."/journalctl-previous-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + + # Dump the current boot logs + print "- Grabbing system logs from this boot... "; + $shell_call = $anvil->data->{path}{exe}{journalctl}." -b 0 > ".$target_directory."/journalctl-current-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + + # If we're a striker, dump the database also. + if ($this_host_type eq "striker") + { + print "- Dumping and compressing database data, PLEASE BE PATIENT!... "; + my $shell_call = $anvil->data->{path}{exe}{su}." postgres -c \"".$anvil->data->{path}{exe}{pg_dump}." anvil\" 2>/dev/null | ".$anvil->data->{path}{exe}{bzip2}." --stdout > ".$target_directory."/anvil.out.bz2"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + # Failed + print "Failed!\n"; + print "Expected the return code '0', but got: [".$return_code."]. 
The error, if any, was:\n"; + print "========\n"; + print $output."\n"; + print "========\n"; + $anvil->nice_exit({exit_code => 1}); + } + print "Done!\n"; + } + + print "- Grabbing hosts file... "; + $shell_call = $anvil->data->{path}{exe}{cp}." /etc/hosts ".$target_directory."/"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + + print "- Grabbing Anvil! log... "; + $shell_call = $anvil->data->{path}{exe}{cp}." /var/log/anvil.log ".$target_directory."/"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + + # If this is a node, grab the shared files. + if ($this_host_type eq "node") + { + print "- Collecting the cluster information base (CIB)... "; + $shell_call = $anvil->data->{path}{exe}{pcs}." cluster cib > ".$target_directory."/cib.xml"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + } + + # If this is not a striker, collect definition files. + if ($this_host_type ne "striker") + { + print "- Collecting server definitions... "; + $shell_call = $anvil->data->{path}{exe}{rsync}." 
-av /mnt/shared/definitions ".$target_directory."/"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + } + + return(0); +} +

# Work out which hosts to collect data from, based on the '--anvil' and '--hosts' switches.
#
# The selected host UUIDs are stored in the array reference 'collect_from'. If either switch was used,
# every Striker not already in the list is appended. A switch given without a value lists the available
# choices and exits with '0'. A value that can't be resolved prints an error and exits with '1'. In all
# other cases, '0' is returned.
sub process_switches
{
	my ($anvil) = @_;
	
	$anvil->data->{collect_from} = [];
	$anvil->Database->get_hosts();
	
	my $anvil_switch = $anvil->data->{switches}{anvil};
	my $hosts_switch = $anvil->data->{switches}{hosts};
	
	if ($anvil_switch)
	{
		if ($anvil_switch eq "#!SET!#")
		{
			# No value given, so show a list of Anvil! systems and exit.
			print "Available Anvil! systems. Use '--anvil ' to collect data from a specific Anvil! node.\n";
			foreach my $listed_name (sort {$a cmp $b} keys %{$anvil->data->{anvils}{anvil_name}})
			{
				print "- Name: [".$listed_name."], UUID: [".$anvil->data->{anvils}{anvil_name}{$listed_name}{anvil_uuid}."]\n";
			}
			$anvil->nice_exit({exit_code => 0});
		}
		
		# Translate the switch into a name and UUID; an empty name means it wasn't valid.
		my ($anvil_name, $anvil_uuid) = $anvil->Get->anvil_from_switch({
			debug => 2,
			anvil => $anvil_switch,
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			's1:anvil_name' => $anvil_name,
			's2:anvil_uuid' => $anvil_uuid,
		}});
		if (not $anvil_name)
		{
			print "[ Error ] - Unable to get the Anvil! name and UUID from the string: [".$anvil_switch."]\n";
			$anvil->nice_exit({exit_code => 1});
		}
		
		# Both sub-nodes of the Anvil! are collected from.
		foreach my $node_key ("anvil_node1_host_uuid", "anvil_node2_host_uuid")
		{
			push @{$anvil->data->{collect_from}}, $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{$node_key};
		}
	}
	
	if ($hosts_switch)
	{
		if ($hosts_switch eq "#!SET!#")
		{
			# No value given, so show all machines grouped by type and exit.
			print "Available Anvil! cluster systems. Use '--host ' to collect data from specific hosts.\n";
			my @sections = (
				["striker", "- Striker Dashboards:\n"],
				["node",    "\n- Anvil! sub-nodes:\n"],
				["dr",      "\n- Disaster recovery hosts:\n"],
			);
			foreach my $section (@sections)
			{
				my ($wanted_type, $heading) = @{$section};
				print $heading;
				foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
				{
					my $host_uuid      = $anvil->data->{sys}{hosts}{by_name}{$host_name};
					my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						's1:host_name'      => $host_name,
						's2:host_uuid'      => $host_uuid,
						's3:this_host_type' => $this_host_type,
					}});
					next if $wanted_type ne $this_host_type;
					
					print "  - Host: [".$host_name."], UUID: [".$host_uuid."]\n";
				}
			}
			$anvil->nice_exit({exit_code => 0});
		}
		
		# Each comma-separated entry must resolve to a host UUID.
		foreach my $host (split/,/, $hosts_switch)
		{
			my ($host_uuid) = $anvil->Database->get_host_uuid_from_string({string => $host});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:host'      => $host,
				's2:host_uuid' => $host_uuid,
			}});
			if (not $host_uuid)
			{
				print "[ Error ] - Unable to get the host UUID from the host string: [".$host."]\n";
				$anvil->nice_exit({exit_code => 1});
			}
			push @{$anvil->data->{collect_from}}, $host_uuid;
		}
	}
	
	# Nothing more to do unless collection was restricted to an Anvil! or to specific hosts.
	return(0) if ((not $anvil_switch) && (not $hosts_switch));
	
	# Collection was restricted, so make sure the Strikers are in the list as well.
	foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}})
	{
		my $host_uuid      = $anvil->data->{sys}{hosts}{by_name}{$host_name};
		my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			's1:host_name'      => $host_name,
			's2:host_uuid'      => $host_uuid,
			's3:this_host_type' => $this_host_type,
		}});
		next if $this_host_type ne "striker";
		
		my $seen = 0;
		foreach my $this_host_uuid (@{$anvil->data->{collect_from}})
		{
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				's1:this_host_uuid' => $this_host_uuid,
				's2:host_uuid'      => $host_uuid,
			}});
			next if $this_host_uuid ne $host_uuid;
			
			$seen = 1;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { seen => $seen }});
			last;
		}
		push @{$anvil->data->{collect_from}}, $host_uuid if not $seen;
	}
	
	return(0);
}
diff --git a/tools/striker-update-cluster b/tools/striker-update-cluster new file mode 100755 index 00000000..23a41944 --- /dev/null +++ b/tools/striker-update-cluster @@ -0,0 +1,1254 @@ +#!/usr/bin/perl +# +# This program will disable our daemons on all machines, then update each striker. It then walks through all +# DR hosts and Anvil! nodes. With nodes, it migrates servers to the peer, takes the node out of the cluster, +# updates it, reboots if the kernel was updated, and then rejoins the cluster, migrates the VMs and the does +# the same process on the peer sub-node. +# +# Exit codes; +# 0 = Normal exit. +# 1 = No database connection.
#
# TODO: 
# 
# USAGE:
# 

use strict;
use warnings;
use Anvil::Tools;
require POSIX;
use Term::Cap;
use Text::Diff;
use Data::Dumper;

# The program name and the directory it was run from, both taken from '$0'.
my $THIS_FILE         = ($0 =~ /^.*\/(.*)$/)[0];
my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0];
if (($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

my $anvil = Anvil::Tools->new();

### TODO: Remove this before final release
$anvil->Log->level({set => 2});
$anvil->Log->secure({set => 1});
##########################################

# Read the switches this program accepts.
$anvil->Get->switches({list => [
	"clear-cache", 
	"force", 
	"no-reboot", 
	"reboot-self", 
	"y", 
	"yes"], man => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }});

# Connect to the database(s). Without at least one connection we can't look up hosts or register jobs, so
# we log the problem, wait a bit and exit.
$anvil->Database->connect();
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"});
if (not $anvil->data->{sys}{database}{connections})
{
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0305"});
	sleep 10;
	$anvil->nice_exit({exit_code => 1});
}

# Make sure we're running as 'root'
# $< == real UID, $> == effective UID
if (($< != 0) && ($> != 0))
{
	# Not root
	print $anvil->Words->string({key => "error_0005"})."\n";
	$anvil->nice_exit({exit_code => 1});
}

# Make sure we're a striker.
if ($anvil->Get->host_type ne "striker")
{
	print "This has to be run on a Striker dashboard.\n";
	$anvil->nice_exit({exit_code => 1});
}

# If we were given a job UUID, load the job data and mark the job as picked up.
$anvil->data->{sys}{progress} = 0;
if ($anvil->data->{switches}{'job-uuid'})
{
	$anvil->Job->clear();
	$anvil->Job->get_job_details({debug => 2});
	$anvil->Job->update_progress({
		progress         => $anvil->data->{sys}{progress}++,
		job_picked_up_by => $$, 
		job_picked_up_at => time, 
		'print'          => 1, 
		message          => "message_0319", 
	});
}

# Update beginning. Verifying all known machines are accessible...
$anvil->Job->update_progress({
	'print'  => 1, 
	progress => $anvil->data->{sys}{progress}++,
	message  => "job_0469", 
});
my $all_access = verify_access($anvil);
if ((not $all_access) && (not $anvil->data->{switches}{force}))
{
	print "[ Error ] - Not all systems are accessible. Update aborted!\n";
	$anvil->nice_exit({exit_code => 1});
}
print "Success!\n";

# Ask the user to confirm, unless '-y' or '--yes' was used.
if (($anvil->data->{switches}{y}) or ($anvil->data->{switches}{yes}))
{
	print "[ Note ] - Proceeding without confirmation, '-y' or '--yes' used.\n";
}
else
{
	print "[ Note ] - All nodes need to be up and running for the update to run on nodes.\n";
	print "[ Note ] - Any out-of-sync storage needs to complete before a node can be updated.\n";
	print "[ Warning ] - Servers will be migrated between subnodes, which can cause reduced performance during\n";
	print "[ Warning ] - the these migrations. If a sub-node is not active, it will be activated as part of the\n";
	print "[ Warning ] - upgrade process.\n";
	print "\n".$anvil->Words->string({key => "message_0021"})."\n";
	
	# Read the answer from the terminal; anything not starting with 'y' aborts.
	my $answer = <STDIN>;
	chomp $answer;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { answer => $answer }});
	
	if ($answer =~ /^y/i)
	{
		print $anvil->Words->string({key => "message_0175"})."\n";
	}
	else
	{
		print $anvil->Words->string({key => "message_0022"})."\n";
		$anvil->nice_exit({exit_code => 0});
	}
}

manage_daemons($anvil, "stop");

# Update the Striker dashboards and DR hosts.
update_strikers_and_dr($anvil);

# Update the Anvil! nodes.
update_nodes($anvil);

print "Updates complete!\n";

my $host_uuid       = $anvil->Get->host_uuid;
my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
	's1:host_uuid'       => $host_uuid,
	's2:short_host_name' => $short_host_name, 
}});
if ($anvil->data->{sys}{reboot_needed})
{
	if ($anvil->data->{switches}{'reboot-self'})
	{
		# Count down in five second steps so the user has a chance to abort.
		print "[ Note ] - The local system needs to be rebooted, and '--reboot-self' was used. Rebooting in 60 seconds! Use ctrl+c to abort!\n";
		my $waiting = 60;
		while ($waiting)
		{
			print $waiting.", ";
			sleep 5;
			$waiting -= 5;
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }});
		}
		print "\nRebooting now!\n";
		
		my $shell_call = $anvil->data->{path}{exe}{systemctl}." reboot";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }});
		
		my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code }});
		
		print "Reboot requested, exiting.\n";
	}
	else
	{
		print "[ Note ] - This host needs to be rebooted to activate the new kernel. Please reboot as soon as you can.\n";
	}
}

$anvil->nice_exit({exit_code => 0});


#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################

+sub update_nodes +{ + my ($anvil) = @_; + + # Here, we loop through anvil systems, and find which sub nodes will be updated first, and which will + # be updated second. + foreach my $anvil_name (sort {$a cmp $b} keys %{$anvil->data->{anvils}{anvil_name}}) + { + my $anvil_uuid = $anvil->data->{anvils}{anvil_name}{$anvil_name}{anvil_uuid}; + my $anvil_description = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_description}; + my $anvil_node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; + my $anvil_node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid}; + my $primary_host_uuid = $anvil->Cluster->get_primary_host_uuid({anvil_uuid => $anvil_uuid}); + my $secondary_host_uuid = $primary_host_uuid eq $anvil_node1_host_uuid ?
$anvil_node2_host_uuid : $anvil_node1_host_uuid; + my $node1_short_host_name = $anvil->data->{hosts}{host_uuid}{$anvil_node1_host_uuid}{short_host_name}; + my $node2_short_host_name = $anvil->data->{hosts}{host_uuid}{$anvil_node2_host_uuid}{short_host_name}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:anvil_name' => $anvil_name, + 's2:anvil_uuid' => $anvil_uuid, + 's3:anvil_description' => $anvil_description, + 's4:anvil_node1_host_uuid' => $anvil_node1_host_uuid, + 's5:anvil_node2_host_uuid' => $anvil_node2_host_uuid, + 's6:primary_host_uuid' => $primary_host_uuid, + 's7:secondary_host_uuid' => $secondary_host_uuid, + 's8:node1_short_host_name' => $node1_short_host_name, + 's9:node2_short_host_name' => $node2_short_host_name, + }}); + + # Before we proceed, are both nodes online? If so, great. If not, are both offline? If only + # one is online, abort. Check now in case things have changed since our first scan + print "Preparing to update the Anvil! node: [".$anvil_name."]. 
Verifying subnode access:\n"; + foreach my $host_uuid ($secondary_host_uuid, $primary_host_uuid) + { + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_uuid' => $host_uuid, + 's2:short_host_name' => $short_host_name, + }}); + print "- Verifying access to subnode: [".$short_host_name."]\n"; + my $matches = $anvil->Network->find_access({ + debug => 2, + target => $host_uuid, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }}); + $anvil->data->{peer}{$short_host_name}{access}{ip} = ""; + $anvil->data->{peer}{$short_host_name}{access}{network} = ""; + foreach my $preferred_network ("bcn", "mn", "ifn", "sn") + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }}); + foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}}) + { + next if $network_name !~ /^$preferred_network/; + my $target_ip = $anvil->data->{network_access}{$network_name}{target_ip_address}; + my $test_access = $anvil->Remote->test_access({target => $target_ip}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:network_name' => $network_name, + 's2:target_ip' => $target_ip, + 's3:test_access' => $test_access, + }}); + + if ($test_access) + { + # We're good. 
+ $anvil->data->{peer}{$short_host_name}{access}{ip} = $target_ip; + $anvil->data->{peer}{$short_host_name}{access}{network} = $network_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "s1:peer::${short_host_name}::access::ip" => $anvil->data->{peer}{$short_host_name}{access}{ip}, + "s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network}, + }}); + print "- Access found uver the: [".$network_name."] networking using the IP: [".$target_ip."]\n"; + last; + } + } + } + if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + print "[ Warning ] - Access not found!\n"; + } + } + + if ((($anvil->data->{peer}{$node1_short_host_name}{access}{ip}) && (not $anvil->data->{peer}{$node2_short_host_name}{access}{ip})) or + ((not $anvil->data->{peer}{$node1_short_host_name}{access}{ip}) && ($anvil->data->{peer}{$node2_short_host_name}{access}{ip}))) + { + # Only one node online, skip this Anvil node. + if ($anvil->data->{switches}{force}) + { + # Skip this Anvil! system + print "[ Warning ] - '--force' used, skipping this node.\n"; + print "[ NOTE ] - This node may not be able to communicate with the Striker dashboards until updated manually!\n"; + next; + } + else + { + print "[ Error ] - Exiting update! Please bring the missing subnode back online and try again!\n"; + $anvil->nice_exit({exit_code => 1}); + } + } + + # Update the secondary first, as it should have no VMs on it. + foreach my $host_uuid ($secondary_host_uuid, $primary_host_uuid) + { + # Withdraw the node from the cluster. + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $peer_host_uuid = $host_uuid eq $primary_host_uuid ? 
$secondary_host_uuid : $primary_host_uuid; + my $peer_short_host_name = $anvil->data->{hosts}{host_uuid}{$peer_host_uuid}{short_host_name}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_uuid' => $host_uuid, + 's2:short_host_name' => $short_host_name, + 's3:peer_host_uuid' => $peer_host_uuid, + 's4:peer_short_host_name' => $peer_short_host_name, + }}); + + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_uuid' => $host_uuid, + 's2:short_host_name' => $short_host_name, + }}); + + print "Preparing to update: [".$short_host_name."]. Withdrawing the subnode from the Anvil! node.\n"; + print "- [ Note ] - If the node has servers that need to be migrated off, or if the node is SyncSource for storage,\n"; + print "- [ Note ] - this could take some time to complete.\n"; + + # Register an anvil-safe-stop job and then wait. + my $job_uuid = $anvil->Database->insert_or_update_jobs({ + debug => 2, + job_command => $anvil->data->{path}{exe}{'anvil-safe-stop'}, + job_host_uuid => $host_uuid, + job_description => "job_0339", + job_name => "cgi-bin::set_membership::leave", + job_progress => 0, + job_title => "job_0338" + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); + print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n"; + + # Log into the target machine and make sure anvil-daemon is running. + print "- Making sure anvil-daemon is running... "; + my $shell_call = $anvil->data->{path}{exe}{systemctl}." 
enable --now anvil-daemon.service"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $error, $return_code) = $anvil->Remote->call({ + 'close' => 1, + no_cache => 1, + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + if (not $return_code) + { + print " running.\n"; + } + else + { + print " not running!\n"; + } + + # Verify that the node is no longer in the cluster. + my $waiting = 1; + my $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + while ($waiting) + { + my $problem = $anvil->Cluster->parse_cib({target => $anvil->data->{peer}{$short_host_name}{access}{ip}}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + + if ($problem) + { + # This is good, it didn't parse so it's out of the cluster. + print "- The subnode is out of the node cluster. Proceeding.\n"; + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + else + { + if (time > $next_log) + { + $anvil->Job->get_job_details({job_uuid => $job_uuid}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "jobs::job_progress" => $anvil->data->{jobs}{job_progress}, + "jobs::job_data" => $anvil->data->{jobs}{job_data}, + }}); + if ($anvil->data->{jobs}{job_progress} == 0) + { + print "[ Warning ] - The job has not been picked up yet. 
Is 'anvil-daemon' running on: [".$short_host_name."]?\n"; + } + else + { + print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n"; + } + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + } + sleep 5; + } + } + + # Record the start time so that we can be sure the subnode has rebooted (uptime is + # less than the current time minus this start time), if the host reboots as part of + # the update. + my $reboot_time = time; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + reboot_time => $reboot_time, + short_host_name => $short_host_name, + }}); + + # Do the OS update. + print "- Beginning OS update of: [".$short_host_name."]\n"; + my $rebooted = 0; + $shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}; + if ($anvil->data->{switches}{'no-reboot'}) + { + $shell_call .= " --no-reboot"; + } + if ($anvil->data->{switches}{'clear-cache'}) + { + $shell_call .= " --clear-cache"; + } + $shell_call .= $anvil->Log->switches(); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + $job_uuid = ""; + $job_uuid = $anvil->Database->insert_or_update_jobs({ + debug => 2, + job_command => $shell_call, + job_description => "job_0468", + job_host_uuid => $host_uuid, + job_name => "system::update-system", + job_progress => 0, + job_title => "job_0467" + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); + print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n"; + + # Verify that the node is no longer in the cluster. 
+ $waiting = 1; + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + while ($waiting) + { + $anvil->Job->get_job_details({job_uuid => $job_uuid}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "jobs::job_progress" => $anvil->data->{jobs}{job_progress}, + "jobs::job_data" => $anvil->data->{jobs}{job_data}, + }}); + if ($anvil->data->{jobs}{job_progress} == 100) + { + print "- Done! The host: [".$short_host_name."] has been updated\n"; + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + + # Did it reboot? + if ($anvil->data->{jobs}{job_data} eq "rebooted") + { + $rebooted = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }}); + } + + # Did it fail? + if ($anvil->data->{jobs}{job_data} eq "failed") + { + # Abort! + print "[ Error ] - There was a problem updating the subnode! Anvil! cluster update aborted.\n"; + $anvil->nice_exit({exit_code => 1}); + } + } + else + { + if (time > $next_log) + { + if ($anvil->data->{jobs}{job_progress} == 0) + { + print "[ Warning ] - The job has not been picked up yet. Is 'anvil-daemon' running on: [".$short_host_name."]?\n"; + } + else + { + print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n"; + } + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + } + sleep 5; + } + } + + print "- Update completed successfully! Checking if a reboot is needed.\n"; + my $run_anvil_safe_start = 0; + if ($rebooted) + { + print "- Rebooted! 
Will wait for it to come back up.\n"; + wait_for_reboot($anvil, $host_uuid, $reboot_time); + } + else + { + print "- Reboot not needed, kernel appears to be up to date.\n"; + + $run_anvil_safe_start = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { run_anvil_safe_start => $run_anvil_safe_start }}); + } + + # Wait for the node to rejoin the cluster. As before, this is a time + # unrestricted wait loop. + print "- Waiting for the subnode to rejoin the node.\n"; + $waiting = 1; + my $start_called = 0; + $next_log = time + 60; + my $manual_start = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + next_log => $next_log, + manual_start => $manual_start, + }}); + + while($waiting) + { + # Should we call a start to the cluster? + if ((not $start_called) && ($run_anvil_safe_start)) + { + print "- Calling 'anvil-safe-start' to rejoin the subnode to the node.\n"; + $start_called = 1; + my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-start'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + start_called => $start_called, + shell_call => $shell_call, + }}); + + my ($output, $error, $return_code) = $anvil->Remote->call({ + debug => 2, + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } + + # Pull the CIB and make sure both nodes are ready, and that DRBD resources + # are all UpToDate if this is the reboot from the first node. + my ($problem) = $anvil->Cluster->parse_cib({target => $anvil->data->{peer}{$short_host_name}{access}{ip}}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + + # Are both nodes ready? 
+ if (not $problem) + { + # Both nodes are in the cluster, but are they full members yet? + my $both_ready = 1; + my $node_count = 0; + foreach my $node_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}}) + { + my $ready = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{ready}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + node_name => $node_name, + ready => $ready, + }}); + if (not $ready) + { + $both_ready = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { both_ready => $both_ready }}); + } + $node_count++; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { node_count => $node_count }}); + } + + # Did we see two nodes and are both ready? + if (($node_count == 2) && ($both_ready)) + { + # Yes! If this is the first subnode, we need to wait for DRBD + # to be UpToDate. If it's the second, we just wait for the + # connections to be up. + # NOTE: We call the peer to get the DRBD data as it's got a + # better view of the storage + print "- Both subnodes are online, will now check replicated storage.\n"; + $anvil->DRBD->get_status({ + host => $peer_short_host_name, + target => $anvil->data->{peer}{$peer_short_host_name}{access}{ip}, + }); + + if ($host_uuid eq $primary_host_uuid) + { + ### NOTE: Should we wait for all connections + ### to be up? + # This is the second node, we don't have to wait. + print "- This is the second node, no need to wait for replication to complete.\n"; + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + else + { + # This is the first node. Wait for all volumes to be + # UpToDate. 
+ if (time > $next_log) + { + print "- Waiting for all volumes to be UpToDate before updating the other subnode.\n"; + } + my $all_uptodate = 1; + my $resources = 0; + foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}}) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }}); + foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}}) + { + # We don't care about DR hosts for this upgrade + my $peer_uuid = $anvil->Get->host_uuid_from_name({host_name => $peer_name}); + my $peer_type = $anvil->data->{hosts}{host_uuid}{$peer_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:peer_name' => $peer_name, + 's2:peer_uuid' => $peer_uuid, + 's3:peer_type' => $peer_type, + }}); + next if $peer_type ne "node"; + foreach my $volume (sort {$a <=> $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}}) + { + # This is this subnode's disk state, + # as the DRBD data was collected + # from the peer. 
+ my $disk_state = $anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'peer-disk-state'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:volume' => $volume, + 's2:disk_state' => $disk_state, + }}); + + if (lc($disk_state) ne "uptodate") + { + $all_uptodate = 0; + my $eta_in_seconds = $anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'estimated-seconds-to-finish'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + all_uptodate => $all_uptodate, + eta_in_seconds => $eta_in_seconds, + }}); + if (time > $next_log) + { + if ($eta_in_seconds) + { + print "- The resource: [".$resource."/".$volume."] is not synced yet, ETA is: [".$eta_in_seconds."] to complete resync.\n"; + } + else + { + print "- The resource: [".$resource."/".$volume."] is not yet UpToDate.\n"; + } + } + } + } # End foreach volume + } # End foreach peer + } # End foreach resource + + if ($all_uptodate) + { + print "- All resources appear to be ready,\n"; + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + } # End if host is first or second subnode + } # End if both ready + elsif (time > $next_log) + { + print "- Both subnodes are not online yet, still waiting.\n"; + } + } # End if CIB was parsed + elsif (time > $next_log) + { + print "- Unable to parse the node's cluster information base, will try again soon.\n"; + } + + if (time > $next_log) + { + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + } + + if ($waiting) + { + sleep 5; + } + } # End while waiting for subnode to return + + # Run anvil-version-change + print "- Running 'anvil-version-changes'.\n"; + $output = ""; + $error = ""; + $return_code = ""; + $shell_call = 
$anvil->data->{path}{exe}{'anvil-version-changes'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + else + { + ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } + print "- Done!\n"; + } + } + + return(0); +} + +sub update_strikers_and_dr +{ + my ($anvil) = @_; + + foreach my $host_type ("striker", "dr") + { + if ($host_type eq "dr") + { + # Restart daemons. + manage_daemons($anvil, "start"); + } + foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) + { + my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_name' => $host_name, + 's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + 's4:this_host_type' => $this_host_type, + }}); + next if $this_host_type ne $host_type; + + if ($host_type eq "striker") + { + print "Starting the update of the Striker dashboard: [".$short_host_name."].\n"; + } + else + { + print "Starting the update of the DR host: [".$short_host_name."].\n"; + } + + # If this is the local system, set the variable to track if we need to reboot. + # Otherwise, see if we have access to the peer. 
+ if ($host_uuid eq $anvil->Get->host_uuid) + { + $anvil->data->{sys}{reboot_needed} = 0; + } + elsif(not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + if ($host_type eq "striker") + { + print "- No access to the Striker dashboard: [".$short_host_name."], skipping.\n"; + } + else + { + print "- No access to the DR host: [".$short_host_name."], skipping.\n"; + } + next; + } + + # Record the start time so that we can be sure the subnode has rebooted (uptime is + # less than the current time minus this start time), if the host reboots as part of + # the update. + my $reboot_time = time; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_time => $reboot_time }}); + + print "- Beginning OS update of: [".$short_host_name."]\n"; + my $rebooted = 0; + my $output = ""; + my $error = ""; + my $return_code = ""; + if ($anvil->data->{switches}{'clear-cache'}) + { + my $shell_call = $anvil->data->{path}{exe}{dnf}." clean all"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + else + { + ($output, $error, $return_code) = $anvil->Remote->call({ + timeout => 0, + no_cache => 1, + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } + print "- Cache cleared.\n"; + } + print "- Calling update now.\n"; + print "- NOTE: This can seem like it's hung! You can watch the progress using 'journalctl -f' on another terminal to\n"; + print "- watch the progress via the system logs. 
You can also check wiht 'ps aux | grep dnf'.\n"; + if ($host_uuid eq $anvil->Get->host_uuid) + { + my $shell_call = $anvil->data->{path}{exe}{dnf}." -y update"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + print "[ Error ] - There was a problem updating the system! Expected a return code of '0', but got: [".$return_code."]\n"; + print "[ Error [ - The output, if any, was\n"; + print "==] Output [==\n"; + print $output."\n"; + print "==============\n"; + } + + # Get the newest installed kernel + $shell_call = $anvil->data->{path}{exe}{rpm}." -q kernel | ".$anvil->data->{path}{exe}{'sort'}." | ".$anvil->data->{path}{exe}{tail}." -n 1"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + (my $installed_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + installed_kernel => $installed_kernel, + return_code => $return_code, + }}); + $installed_kernel =~ s/^kernel-(\d+.\d+\.\d+-\d+)\..*$/$1/; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { installed_kernel => $installed_kernel }}); + + # Get the running kernel + $shell_call = $anvil->data->{path}{exe}{uname}." 
-r"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + (my $active_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + active_kernel => $active_kernel, + return_code => $return_code, + }}); + $active_kernel =~ s/(\d+.\d+\.\d+-\d+)\..*$/$1/; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active_kernel => $active_kernel }}); + + if ($installed_kernel eq $active_kernel) + { + print "- The kernel has not been updated.\n"; + } + else + { + print "- The kernel appears to have been upgraded, reboot needed!\n"; + $anvil->data->{sys}{reboot_needed} = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "sys::reboot_needed" => $anvil->data->{sys}{reboot_needed}, + }}); + } + } + else + { + # Call anvil-update-system and then wait. + print "- Beginning OS update of: [".$short_host_name."]\n"; + my $shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}; + if ($anvil->data->{switches}{'no-reboot'}) + { + $shell_call .= " --no-reboot"; + } + if ($anvil->data->{switches}{'clear-cache'}) + { + $shell_call .= " --clear-cache"; + } + $shell_call .= $anvil->Log->switches(); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my $job_uuid = $anvil->Database->insert_or_update_jobs({ + debug => 2, + job_command => $shell_call, + job_description => "job_0468", + job_host_uuid => $host_uuid, + job_name => "system::update-system", + job_progress => 0, + job_title => "job_0467" + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); + print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n"; + + # Log into the target machine and make sure anvil-daemon is 
running. + print "- Making sure anvil-daemon is running... "; + $shell_call = $anvil->data->{path}{exe}{systemctl}." start anvil-daemon.service"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $error, $return_code) = $anvil->Remote->call({ + 'close' => 1, + no_cache => 1, + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + if (not $return_code) + { + print " running.\n"; + } + else + { + print " not running!\n"; + } + + # Verify that the node is no longer in the cluster. + my $waiting = 1; + my $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + while ($waiting) + { + $anvil->Job->get_job_details({job_uuid => $job_uuid}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "jobs::job_progress" => $anvil->data->{jobs}{job_progress}, + "jobs::job_data" => $anvil->data->{jobs}{job_data}, + }}); + if ($anvil->data->{jobs}{job_progress} == 100) + { + print "- Done! The host: [".$short_host_name."] has been updated\n"; + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + + # Did it reboot? + if ($anvil->data->{jobs}{job_data} eq "rebooted") + { + $rebooted = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }}); + } + + # Did it fail? + if ($anvil->data->{jobs}{job_data} eq "failed") + { + # Abort! + print "[ Error ] - There was a problem updating the system! Anvil! 
cluster update aborted.\n"; + $anvil->nice_exit({exit_code => 1}); + } + } + else + { + if (time > $next_log) + { + if ($anvil->data->{jobs}{job_progress} == 0) + { + print "[ Warning ] - The job has not been picked up yet. Is 'anvil-daemon' running on: [".$short_host_name."]?\n"; + } + else + { + print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n"; + } + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + } + sleep 5; + } + } + + } + + if ($rebooted) + { + print "- Rebooted! Will wait for it to come back up.\n"; + wait_for_reboot($anvil, $host_uuid, $reboot_time); + } + else + { + print "- Reboot not needed, kernel appears to be up to date.\n"; + } + + # Run anvil-version-change + print "- Running 'anvil-version-changes' now.\n"; + $output = ""; + $error = ""; + $return_code = ""; + my $shell_call = $anvil->data->{path}{exe}{'anvil-version-changes'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + else + { + ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } + } + } + + return(0); +} + +sub wait_for_reboot +{ + my ($anvil, $host_uuid, $reboot_time) = @_; + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + 
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_uuid' => $host_uuid, + 's2:short_host_name' => $short_host_name, + }}); + + my $matches = $anvil->Network->find_access({ + debug => 2, + target => $host_uuid, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }}); + + # Wait until the node comes back up. + print "- The target has been rebooted. We'll wait for the target to come back online.\n"; + + # This is an infinite loop, there is no timeout for this. + my $waiting = 1; + my $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + while($waiting) + { + # Test access + my $target = $anvil->data->{peer}{$short_host_name}{access}{ip}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + target => $target, + short_host_name => $short_host_name, + }}); + my $test_access = $anvil->Remote->test_access({target => $target}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_access => $test_access }}); + + if ($test_access) + { + # What's the machine's uptime? + my $uptime = $anvil->Get->uptime({debug => 2, target => $anvil->data->{peer}{$short_host_name}{access}{ip}}); + my $time_since_reboot = time - $reboot_time; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + uptime => $uptime, + time_since_reboot => $time_since_reboot, + short_host_name => $short_host_name, + }}); + + if (($uptime) && ($uptime < $time_since_reboot)) + { + # Rebooted! + print "- Rebooted! Subnode is back up.\n"; + + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + } + + if ($waiting) + { + if (time > $next_log) + { + # Tell the user we're still waiting. 
+ print "- [".$anvil->Get->date_and_time({time_only => 1})."] - We're still waiting for the subnode: [".$short_host_name."] to reboot.\n"; + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + } + + sleep 5; + } + } + + return(0); +} + +sub manage_daemons +{ + my ($anvil, $task) = @_; + + $task = "start" if not $task; + + my $do_task = $task eq "start" ? "enable --now" : "stop"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { do_task => $do_task }}); + + if ($task eq "stop") + { + print "Disabling Anvil! daemons on all hosts...\n"; + } + else + { + print "Enabling Anvil! daemons on all hosts...\n"; + } + my $daemons = ["anvil-daemon", "scancore"]; + foreach my $host_type ("dr", "node", "striker") + { + foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) + { + my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_name' => $host_name, + 's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + 's4:this_host_type' => $this_host_type, + }}); + next if $host_type ne $this_host_type; + + if ($task eq "stop") + { + print "- Disabling dameons on: [".$short_host_name."]... "; + } + else + { + print "- Enabling dameons on: [".$short_host_name."]... "; + } + if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + print "Offline! Skipping.\n"; + next; + } + + # Local + foreach my $daemon (@{$daemons}) + { + my $shell_call = $anvil->data->{path}{exe}{systemctl}." ".$do_task." 
".$daemon; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my $output = ""; + my $error = ""; + my $return_code = 999; + if ($host_uuid eq $anvil->Get->host_uuid) + { + # Local + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + else + { + # Remote, it'll be a while before we hit some clients, so close this + # connection so later access to the machines don't fail with ssh + # connection timeouts. + ($output, $error, $return_code) = $anvil->Remote->call({ + 'close' => 1, + no_cache => 1, + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } + if (not $return_code) + { + if ($task eq "stop") + { + print $daemon." stopped... "; + } + else + { + print $daemon." started... "; + } + } + else + { + if ($task eq "stop") + { + print $daemon." didn't stop!... "; + } + else + { + print $daemon." didn't start!... "; + } + } + } + print "Done!\n"; + } + } + + return(0); +} + +sub verify_access +{ + my ($anvil) = @_; + + # Load host and Anvil! data. + $anvil->Database->get_hosts(); + + # Make sure all are available before we start. 
+ my $all_access = 1; + foreach my $host_type ("dr", "node", "striker") + { + foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) + { + my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_name' => $host_name, + 's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + 's4:this_host_type' => $this_host_type, + }}); + next if $host_type ne $this_host_type; + + print "- Verifying access to: [".$short_host_name."]... "; + my $matches = $anvil->Network->find_access({ + debug => 2, + target => $host_uuid, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }}); + + $anvil->data->{peer}{$short_host_name}{access}{ip} = ""; + $anvil->data->{peer}{$short_host_name}{access}{network} = ""; + foreach my $preferred_network ("bcn", "mn", "ifn", "sn") + { + next if $anvil->data->{peer}{$short_host_name}{access}{ip}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }}); + foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}}) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { network_name => $network_name }}); + next if $network_name !~ /^$preferred_network/; + + my $target_ip = $anvil->data->{network_access}{$network_name}{target_ip_address}; + my $test_access = $anvil->Remote->test_access({ + 'close' => 1, + target => $target_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's2:target_ip' => $target_ip, + 's3:test_access' => $test_access, + }}); + + if ($test_access) + { + # We're good. 
+ print "Connected on: [".$target_ip."] via: [".$network_name."]\n"; + $anvil->data->{peer}{$short_host_name}{access}{ip} = $target_ip; + $anvil->data->{peer}{$short_host_name}{access}{network} = $network_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "s1:peer::${short_host_name}::access::ip" => $anvil->data->{peer}{$short_host_name}{access}{ip}, + "s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network}, + }}); + } + } + } + + if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + print "No access! Skipping.\n"; + $all_access = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_access => $all_access }}); + } + } + } + + return($all_access); +} \ No newline at end of file