From 7fbed10864b44f465fe6714dd76ce0a1635c760a Mon Sep 17 00:00:00 2001 From: digimer Date: Thu, 29 Jun 2023 22:17:58 -0400 Subject: [PATCH 01/14] * Updated Remote->call() to take the new 'background' parameter. * Continues work on adding new disks (DRBD volumes) to anvil-manage-server-storage. * Updated DRBD->get_status() to record the peer-role. Signed-off-by: digimer --- Anvil/Tools/DRBD.pm | 2 + Anvil/Tools/Remote.pm | 33 +- tools/anvil-manage-server-storage | 870 +++++++++++++++++++++++++++++- tools/anvil-provision-server | 7 +- 4 files changed, 888 insertions(+), 24 deletions(-) diff --git a/Anvil/Tools/DRBD.pm b/Anvil/Tools/DRBD.pm index 94dc1086..990b7f68 100644 --- a/Anvil/Tools/DRBD.pm +++ b/Anvil/Tools/DRBD.pm @@ -2291,12 +2291,14 @@ sub get_status $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{congested} = $hash_ref->{connections}->[$i]->{congested}; $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'} = $hash_ref->{connections}->[$i]->{'connection-state'}; $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'} = $hash_ref->{connections}->[$i]->{'peer-node-id'}; + $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-role'} = $hash_ref->{connections}->[$i]->{'peer-role'}; $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'rs-in-flight'} = $hash_ref->{connections}->[$i]->{'rs-in-flight'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "drbd::status::${host}::resource::${resource}::connection::${peer_name}::ap-in-flight" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'ap-in-flight'}, "drbd::status::${host}::resource::${resource}::connection::${peer_name}::congested" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{congested}, 
"drbd::status::${host}::resource::${resource}::connection::${peer_name}::connection-state" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'}, "drbd::status::${host}::resource::${resource}::connection::${peer_name}::peer-node-id" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'}, + "drbd::status::${host}::resource::${resource}::connection::${peer_name}::peer-role" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-role'}, "drbd::status::${host}::resource::${resource}::connection::${peer_name}::rs-in-flight" => $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'rs-in-flight'}, }}); diff --git a/Anvil/Tools/Remote.pm b/Anvil/Tools/Remote.pm index aa9582b0..044be424 100644 --- a/Anvil/Tools/Remote.pm +++ b/Anvil/Tools/Remote.pm @@ -224,9 +224,13 @@ B: By default, a connection to a target will be held open and cached to in Parameters; +=head3 background (optional, default '0') + +If set to C<< 1 >>, the command is run in the background. In this case, the PID of the SSH process is returned. The caller should use C<< waitpid >> to ensure the PID has been reaped. + =head3 close (optional, default '0') -If set, the connection to the target will be closed at the end of the call. +If set to C<< 1 >>, the connection to the target will be closed at the end of the call. =head3 log_level (optional, default C<< 3 >>) @@ -300,9 +304,10 @@ sub call $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "cache::ssh_fh::${ssh_fh_key}" => $anvil->data->{cache}{ssh_fh}{$ssh_fh_key} }}); # Now pick up the rest of the variables. + my $background = defined $parameter->{background} ? $parameter->{background} : 0; my $close = defined $parameter->{'close'} ? $parameter->{'close'} : 0; my $no_cache = defined $parameter->{no_cache} ? 
$parameter->{no_cache} : 0; - my $password = defined $parameter->{password} ? $parameter->{password} : $anvil->data->{sys}{root_password}; + my $password = defined $parameter->{password} ? $parameter->{password} : ""; my $secure = defined $parameter->{secure} ? $parameter->{secure} : 0; my $shell_call = defined $parameter->{shell_call} ? $parameter->{shell_call} : ""; my $timeout = defined $parameter->{timeout} ? $parameter->{timeout} : 10; @@ -310,17 +315,27 @@ sub call my $ssh_fh = $anvil->data->{cache}{ssh_fh}{$ssh_fh_key}; # NOTE: The shell call might contain sensitive data, so we show '--' if 'secure' is set and $anvil->Log->secure is not. $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + background => $background, 'close' => $close, password => $anvil->Log->is_secure($password), secure => $secure, - shell_call => (not $secure) ? $shell_call : $anvil->Log->is_secure($shell_call), - ssh_fh => $ssh_fh, + shell_call => (not $secure) ? $shell_call : $anvil->Log->is_secure($shell_call), + ssh_fh => $ssh_fh, start_time => $start_time, + timeout => $timeout, port => $port, - target => $target, + target => $target, ssh_fh_key => $ssh_fh_key, }}); + if ((not $password) && (defined $anvil->data->{sys}{root_password})) + { + $password = $anvil->data->{sys}{root_password}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + password => $anvil->Log->is_secure($password), + }}); + } + # In case 'target' is our short host name, change it to ''. if ($target eq $anvil->Get->short_host_name()) { @@ -634,6 +649,14 @@ sub call $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, secure => $secure, list => { ssh_fh => $ssh_fh }}); if ($ssh_fh =~ /^Net::OpenSSH/) { + # Are we doing a background call? 
+ if ($background) + { + my $pid = $ssh_fh->spawn($shell_call); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, secure => $secure, list => { pid => $pid }}); + return($pid); + } + # The shell_call can't end is a newline. Conveniently, we want the return code. By adding # this, we ensure it doesn't end in a new-line (and we can't blindly strip off the last # new-line because of 'EOF' type cat's). diff --git a/tools/anvil-manage-server-storage b/tools/anvil-manage-server-storage index 99dbe8f2..95504d1b 100755 --- a/tools/anvil-manage-server-storage +++ b/tools/anvil-manage-server-storage @@ -24,6 +24,7 @@ use warnings; use Anvil::Tools; require POSIX; use Term::Cap; +use Text::Diff; use Data::Dumper; my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0]; @@ -217,22 +218,27 @@ sub manage_disk foreach my $volume_number (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}}) { my $device_path = $anvil->data->{drbd_resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$volume_number}{device_path}; + next if $device_path eq "DELETED"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:volume_number' => $volume_number, + 's2:device_path' => $device_path, + }}); + my $device_minor = $anvil->data->{drbd_resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$volume_number}{device_minor}; my $volume_size = $anvil->data->{drbd_resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$volume_number}{volume_size}; my $backing_disk = $anvil->data->{new}{resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$volume_number}{backing_disk}; my $meta_disk = $anvil->data->{new}{resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$volume_number}{meta_disk}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 's1:volume_number' => $volume_number, - 's2:device_path' => $device_path, - 
's3:device_minor' => $device_minor, - 's4:volume_size' => $volume_size, - 's5:backing_disk' => $backing_disk, - 's6:meta_disk' => $meta_disk, + 's1:device_minor' => $device_minor, + 's2:volume_size' => $volume_size, + 's3:backing_disk' => $backing_disk, + 's4:meta_disk' => $meta_disk, }}); # Which volume group is the backing device in? foreach my $this_scan_lvm_lv_name (sort {$a cmp $b} keys %{$anvil->data->{lvm}{host_name}{$short_host_name}{lv}}) { + next if not $this_scan_lvm_lv_name; my $this_scan_lvm_lv_path = $anvil->data->{lvm}{host_name}{$short_host_name}{lv}{$this_scan_lvm_lv_name}{scan_lvm_lv_path}; my $this_scan_lvm_lv_on_vg = $anvil->data->{lvm}{host_name}{$short_host_name}{lv}{$this_scan_lvm_lv_name}{scan_lvm_lv_on_vg}; my $this_scan_lvm_lv_uuid = $anvil->data->{lvm}{host_name}{$short_host_name}{lv}{$this_scan_lvm_lv_name}{scan_lvm_lv_uuid}; @@ -445,6 +451,150 @@ sub manage_disk_add } } + # Still here? We're good to go. + my $lv_command_size = 0; + my $hr_size = $anvil->Convert->bytes_to_human_readable({'bytes' => $add_size}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { hr_size => $hr_size }}); + if ($add_size eq "100%") + { + # This is valid + $add_size = "-l +100\%FREE"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { add_size => $add_size }}); + } + else + { + $hr_size =~ s/\s+//g; + $add_size = "-L +".$hr_size; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { add_size => $add_size }}); + } + + # What's the next free drive in the system, and what's the next free volume number? 
+ my $new_device_target = ""; + my $target_prefix = ""; + my $disk_device_bus = ""; + my $disk_cache = ""; + my $disk_io_policy = ""; + my $drive_letter = "a"; + foreach my $device_target (sort {$a cmp $b} keys %{$anvil->data->{server}{$short_host_name}{$server_name}{$from_source}{device}{disk}{target}}) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { device_target => $device_target }}); + if (not $disk_device_bus) + { + $target_prefix = ($device_target =~ /^(\w+)\w$/)[0]; + $disk_device_bus = $anvil->data->{server}{$short_host_name}{$server_name}{$from_source}{device}{disk}{target}{$device_target}{device_bus}; + $disk_io_policy = $anvil->data->{server}{$short_host_name}{$server_name}{$from_source}{device}{disk}{target}{$device_target}{driver}{io}; + $disk_cache = $anvil->data->{server}{$short_host_name}{$server_name}{$from_source}{device}{disk}{target}{$device_target}{driver}{cache}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + target_prefix => $target_prefix, + disk_device_bus => $disk_device_bus, + disk_io_policy => $disk_io_policy, + disk_cache => $disk_cache, + }}); + last; + } + } + for (0..25) + { + my $test_device = $target_prefix.$drive_letter; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_device => $test_device }}); + if (not exists $anvil->data->{server}{$short_host_name}{$server_name}{$from_source}{device}{disk}{target}{$test_device}) + { + # Found a free one. 
+ $new_device_target = $test_device; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_device_target => $new_device_target }}); + last; + } + $drive_letter++; + } + + if (not $new_device_target) + { + print "\n[ Error ] - Failed to find a new target device name.\n"; + $anvil->nice_exit({exit_code => 1}); + } + + my $next_drbd_volume = ""; + foreach my $this_host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{drbd_node}}) + { + my $host_uuid = $anvil->Get->host_uuid_from_name({debug => 2, host_name => $this_host}); + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:this_host' => $this_host, + 's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + }}); + + if ($next_drbd_volume eq "") + { + my $test_drbd_volume = 0; + for (0..100) + { + if (not $anvil->data->{new}{resource}{$drbd_resource}{host_uuid}{$host_uuid}{volume_number}{$test_drbd_volume}{device_path}) + { + # This is free. + $next_drbd_volume = $test_drbd_volume; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_drbd_volume => $next_drbd_volume }}); + last; + } + $test_drbd_volume++; + next; + } + } + + if ($next_drbd_volume eq "") + { + print "\n[ Error ] - Failed to find a new DRBD volume to use.\n"; + $anvil->nice_exit({exit_code => 1}); + } + } + + ### TODO: Make this work without the peer node being online. + # The server is allowed to be running, but both nodes and any DR hosts this is replicating to + # needs to be online. 
+ my $all_online = check_drbd_peer_access($anvil); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_online => $all_online }}); + + if (not $all_online) + { + print "\n[ Error ] - Adding a new disk requires all peers to be online.\n"; + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{peer}}) + { + my $say_access = $anvil->data->{peer}{$short_host_name}{access_ip} ? "up." : "down!"; + print " - Peer: [".$short_host_name."] is ".$say_access."\n"; + } + $anvil->nice_exit({exit_code => 1}); + } + + # Still alive? Ask the user to confirm. + print "- New drive target: [".$new_device_target."], size: [".$hr_size."], bus: [".$disk_device_bus."], cache: [".$disk_cache."], IO policy: [".$disk_io_policy."]\n"; + print "- Preparing to add a the drive: [".$drbd_resource."/".$next_drbd_volume."] using the storage group: [".$storage_group_name."]...\n"; + if (not $anvil->data->{switches}{confirm}) + { + print $anvil->Words->string({key => "message_0021"})." "; + my $answer = <STDIN>; + chomp($answer); + if ($answer !~ /^y/i) + { + print "Aborting.\n"; + $anvil->nice_exit({exit_code => 0}); + } + + # Test that we've lost access while waiting for the answer. + my $all_online = check_drbd_peer_access($anvil); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_online => $all_online }}); + + if (not $all_online) + { + print "\n[ Error ] - It would appear that we've lost access to a peer while waiting for the answer.\n"; + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{peer}}) + { + my $say_access = $anvil->data->{peer}{$short_host_name}{access_ip} ? "up." 
: "down!"; + print " - Peer: [".$short_host_name."] is ".$say_access."\n"; + } + $anvil->nice_exit({exit_code => 1}); + } + } + # Get the next free minor number my ($free_minor, undef) = $anvil->DRBD->get_next_resource({ debug => 2, @@ -453,6 +603,698 @@ sub manage_disk_add }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { free_minor => $free_minor }}); + # Create the new LVs + foreach my $host_type ("node", "dr") + { + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) + { + my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; + my $vg_name = $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_uuid}{$storage_group_uuid}{host_uuid}{$host_uuid}{vg_name}; + my $vg_internal_uuid = $anvil->data->{storage_groups}{anvil_uuid}{$anvil_uuid}{storage_group_uuid}{$storage_group_uuid}{host_uuid}{$host_uuid}{vg_internal_uuid}; + my $new_lv_name = $server_name."_".$next_drbd_volume; + my $backing_disk = "/dev/".$vg_name."/".$new_lv_name; + my $shell_call = "if [ -e '".$backing_disk."' ]; then echo 'LV: [".$backing_disk."] already exists.'; else ".$anvil->data->{path}{exe}{lvcreate}." ".$add_size." -n ".$new_lv_name." ".$vg_name."; fi;"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:short_host_name' => $short_host_name, + 's2:host_uuid' => $host_uuid, + 's3:vg_name' => $vg_name, + 's4:vg_internal_uuid' => $vg_internal_uuid, + 's5:new_lv_name' => $new_lv_name, + 's6:backing_disk' => $backing_disk, + 's7:shell_call' => $shell_call, + }}); + + # Record this for updating the DRBD resource. 
+ $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{minor} = $free_minor; + $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{backing_disk} = $backing_disk; + $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{seen} = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "new_drbd::${short_host_name}::resource::${drbd_resource}::volume::${next_drbd_volume}::minor" => $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{minor}, + "new_drbd::${short_host_name}::resource::${drbd_resource}::volume::${next_drbd_volume}::backing_disk" => $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{backing_disk}, + "new_drbd::${short_host_name}::resource::${drbd_resource}::volume::${next_drbd_volume}::seen" => $anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{seen}, + }}); + + # This lets us insert the new volume as needed. + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + print " - Creating the new local LV: [".$backing_disk."]..."; + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + print " Error!\n"; + print "[ FAILED ] - When trying to create the new local logical volume: [".$backing_disk."]\n"; + print "[ FAILED ] - using the command: [".$shell_call."]\n"; + print "[ FAILED ] - The return code: [".$return_code."] was received, expected '0'. 
Output, if any:\n"; + print "==========\n"; + print $output."\n"; + print "==========\n"; + print "The creation of the new replicatedd disk is incomplete, manual intervention is required!!\n"; + $anvil->nice_exit({exit_code => 1}); + } + else + { + print " Done!\n"; + } + } + else + { + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; + print " - Creating the new LV on the peer: [".$short_host_name.":".$backing_disk."], via: [".$use_ip." (".$use_network.")]"; + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + if ($return_code) + { + print " Error!\n"; + print "[ FAILED ] - When trying to create the peer: [".$short_host_name."]'s logical volume: [".$backing_disk."]\n"; + print "[ FAILED ] - using the command: [".$shell_call."]\n"; + print "[ FAILED ] - The return code: [".$return_code."] was received, expected '0'. Output, if any:\n"; + print "==] STDOUT [========\n"; + print $output."\n"; + print "==] STDERR [========\n"; + print $error."\n"; + print "====================\n"; + print "The creation of the new replicated disk is incomplete, manual intervention is required!!\n"; + $anvil->nice_exit({exit_code => 1}); + } + else + { + print " Done!\n"; + } + } + } + } + + # Update the DRBD config file. 
+ my $new_res_file = ""; + my $drbd_res_file = $anvil->data->{path}{directories}{drbd_resources}."/".$drbd_resource.".res"; + my $drbd_res_body = $anvil->Storage->read_file({file => $drbd_res_file}); + my $in_on_host = ""; + my $in_volume = ""; + foreach my $line (split /\n/, $drbd_res_body) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); + if ($line =~ /on\s+(.*?)\s/) + { + $in_on_host = $1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); + + $new_res_file .= $line."\n"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_res_file => $new_res_file }}); + next; + } + + if (($in_on_host) && ($line =~ /volume\s+(\d+)\s/)) + { + $in_volume = $1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { in_volume => $in_volume }}); + + $new_res_file .= $line."\n"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_res_file => $new_res_file }}); + + $anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$in_volume}{seen} = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "new_drbd::${in_on_host}::resource::${drbd_resource}::volume::${in_volume}::seen" => $anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$in_volume}{seen}, + }}); + next; + } + + if ($line =~ /}/) + { + if ($in_volume) + { + $in_volume = ""; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { in_volume => $in_volume }}); + } + elsif ($in_on_host) + { + # This is where we insert the new volume, if we've not seen it yet. + if (not $anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{seen}) + { + # Insert the line. + $new_res_file .= $line." + + volume ".$next_drbd_volume." 
{ + device /dev/drbd_".$drbd_resource."_".$next_drbd_volume." minor ".$anvil->data->{new_drbd}{$short_host_name}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{minor}."; + disk ".$anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{backing_disk}."; + meta-disk internal; + } +"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_res_file => $new_res_file }}); + + $anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$next_drbd_volume}{seen} = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "new_drbd::${in_on_host}::resource::${drbd_resource}::volume::${in_volume}::seen" => $anvil->data->{new_drbd}{$in_on_host}{resource}{$drbd_resource}{volume}{$in_volume}{seen}, + }}); + next; + } + } + } + + $new_res_file .= $line."\n"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { new_res_file => $new_res_file }}); + } + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_res_file => $new_res_file }}); + + my $difference = diff \$drbd_res_body, \$new_res_file, { STYLE => 'Unified' }; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { difference => $difference }}); + + # Write the file to a test file and verify it's sane, + my $test_file = $anvil->data->{path}{directories}{temp}."/test-".$drbd_resource.".res"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + my ($problem) = $anvil->Storage->write_file({ + debug => 2, + backup => 0, + overwrite => 1, + file => $test_file, + body => $new_res_file, + user => "root", + group => "root", + mode => "0644", + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + + # Validate. 
+ print "- Testing the updated DRBD resource config file to ensure the new volumes are cromulent..."; + my $shell_call = $anvil->data->{path}{exe}{drbdadm}." --config-to-test ".$test_file." --config-to-exclude ".$drbd_res_file." sh-nop"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + # Something went wrong. + print " Failed! +[ Error ] - The new DRBD resource config appears to be invalid, which is likely a program error. The new +[ Error ] - config was written to the test file: [".$test_file."]. +[ Error ] - The test to confirm it was valid exited with the return code: [".$return_code."], expected '0'. +[ Error ] - The output, if anything, was: +==== +".$output." +==== +"; + $anvil->nice_exit({exit_code => 1}); + } + print " Success!\n"; + + # Remove the test file. + unlink $test_file; + + # Backup the res file so we can tell the user where the current config was backed up to in + # case they need to restore it. + print "- Writing out the updated DRBD config file.\n"; + my ($backup_file) = $anvil->Storage->backup({file => $drbd_res_file}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { backup_file => $backup_file }}); + + # Write out the new file. + ($problem) = $anvil->Storage->write_file({ + debug => 2, + backup => 0, + overwrite => 1, + file => $drbd_res_file, + body => $new_res_file, + user => "root", + group => "root", + mode => "0644", + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { backup_file => $backup_file }}); +# + # Copy this to our peers. 
+ print "- Copying the new resource file to out peers.\n"; + foreach my $host_type ("node", "dr") + { + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) + { + my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; + next if $host_uuid eq $anvil->Get->host_uuid; + + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + my $destination = "root\@".$use_ip.":".$anvil->data->{path}{directories}{drbd_resources}."/"; + $destination =~ s/\/\//\//g; + print " - Copying: [".$drbd_res_file."] to: [".$short_host_name.":".$destination."] via: [".$use_ip."]\n"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + use_ip => $use_ip, + destination => $destination, + }}); + + my $failed = $anvil->Storage->rsync({ + debug => 2, + destination => $destination, + source => $drbd_res_file, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }}); + + if ($failed) + { + print " +[ Error ] - There was a problem copying the new config file! Unable to proceed. +[ Error ] - Manual intervention to complete the update is required! +"; + $anvil->nice_exit({exit_code => 1}); + } + } + } + + # Create the metadata. + print "- Creating the replicated storage metadata on the new backing devices now.\n"; + foreach my $host_type ("node", "dr") + { + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) + { + my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; + my $shell_call = $anvil->data->{path}{exe}{drbdadm}." 
--force create-md --max-peers=3 ".$drbd_resource."/".$next_drbd_volume; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:short_host_name' => $short_host_name, + 's2:host_uuid' => $host_uuid, + 's3:shell_call' => $shell_call, + }}); + + # Create the metadata, but don't exit on failure in case the metadata was created in + # a previous pass. + if ($host_uuid eq $anvil->Get->host_uuid) + { + print " - Creating the meta-data on the new local volume: [".$next_drbd_volume."]..."; + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + ### Return codes + # 0 == Success + # 1 == ? + # 3 == Configuration not found. + if ($return_code) + { + print " Warning!\n"; + print "[ Warning ] - When trying to create the local meta-data on: [".$drbd_resource."/".$next_drbd_volume."]\n"; + print "[ Warning ] - using the command: [".$shell_call."]\n"; + print "[ Warning ] - The return code: [".$return_code."] was received, expected '0'. Output, if any:\n"; + print "==========\n"; + print $output."\n"; + print "==========\n"; + print "We will try to proceed anyway.\n"; + } + else + { + print " Done!\n"; + } + } + else + { + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; + print " - Creating the meta-data on the peer: [".$short_host_name.":".$drbd_resource."/".$next_drbd_volume."], via: [".$use_ip." 
(".$use_network.")]"; + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + if ($return_code) + { + print " Warning!\n"; + print "[ Warning ] - When trying to create the peer: [".$short_host_name."]'s meta-data on: [".$drbd_resource."/".$next_drbd_volume."]\n"; + print "[ Warning ] - using the command: [".$shell_call."]\n"; + print "[ Warning ] - The return code: [".$return_code."] was received, expected '0'. Output, if any:\n"; + print "==] STDOUT [========\n"; + print $output."\n"; + print "==] STDERR [========\n"; + print $error."\n"; + print "====================\n"; + print "We will try to proceed anyway.\n"; + } + else + { + print " Done!\n"; + } + } + } + } + + # Adjust to start/connect. + my @pids; + foreach my $host_type ("node", "dr") + { + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) + { + my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; + my $shell_call = $anvil->data->{path}{exe}{drbdadm}." adjust ".$drbd_resource; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:short_host_name' => $short_host_name, + 's2:host_uuid' => $host_uuid, + 's3:shell_call' => $shell_call, + }}); + + ### NOTE: The 'adjust' call doesn't return until it's adjusted on all machines, so we + ### make these calls as background calls. + # Start the adjust on each host without exiting on failure; peer connection state is + # checked in the wait loop that follows. 
+ if ($host_uuid eq $anvil->Get->host_uuid) + { + print "- Adjusting the local resource: [".$drbd_resource."] to pick up the new config.\n"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + ($output, $return_code) = $anvil->System->call({ + shell_call => $shell_call, + background => 1, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + else + { + # We'll use this in a minute to confirm connections. + $anvil->data->{peers}{$short_host_name}{host_uuid} = $host_uuid; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "peers::${short_host_name}::host_uuid" => $anvil->data->{peers}{$short_host_name}{host_uuid}, + }}); + + ### NOTE: This is expected to timeout when DR is used. + print "- Adjusting the peer: [".$short_host_name."]'s resource: [".$drbd_resource."] to pick up the new config.\n"; + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; + my $shell_call = $anvil->data->{path}{exe}{drbdadm}." adjust ".$drbd_resource; + my ($pid) = $anvil->Remote->call({ + debug => 2, + background => 1, + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pid => $pid }}); + push @pids, $pid; + } + } + } + + # Wait for the remote PID(s) to be reaped. +# foreach my $pid (@pids) +# { +# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pid => $pid }}); +# next if not $pid; +# waitpid($pid, 0); +# } + + # Find which node is currently Primary and use that host to force primary to start sync. If none, + # force here. 
+ print "- Waiting for all peers to connect the new volume..."; + my $waiting = 1; + my $wait_until = time + 60; + while ($waiting) + { + $anvil->DRBD->get_status({debug => 2}); + my $peers_connected = 1; + my $disks_ready = 0; + foreach my $this_host_name (sort {$a cmp $b} keys %{$anvil->data->{peers}}) + { + my $host_uuid = $anvil->data->{peers}{$this_host_name}{host_uuid}; + my $connection_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$this_host_name}{'connection-state'}; + my $node_id = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$this_host_name}{'peer-node-id'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:this_host_name' => $this_host_name, + 's2:host_uuid' => $host_uuid, + 's3:connection_state' => $connection_state, + 's4:node_id' => $node_id, + }}); + if (lc($connection_state) ne "connected") + { + $peers_connected = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); + } + } + + if ($peers_connected) + { + # Make sure all disks are attached. 
+ $disks_ready = 1; + $anvil->data->{peers}{$short_host_name}{disk_state} = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{devices}{volume}{$next_drbd_volume}{'disk-state'}; + $anvil->data->{peers}{$short_host_name}{role} = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{role}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "peers::${short_host_name}::disk_state" => $anvil->data->{peers}{$short_host_name}{disk_state}, + }}); + foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}}) + { + my $peer_disk_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$peer_name}{volume}{$next_drbd_volume}{'peer-disk-state'}; + my $replication_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$peer_name}{volume}{$next_drbd_volume}{'replication-state'}; + my $role = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$peer_name}{'peer-role'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:peer_name' => $peer_name, + 's2:peer_disk_state' => $peer_disk_state, + 's3:replication_state' => $replication_state, + 's4:role' => $role, + }}); + if (lc($replication_state) ne "established") + { + $disks_ready = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); + } + + $anvil->data->{peers}{$peer_name}{disk_state} = $peer_disk_state; + $anvil->data->{peers}{$peer_name}{role} = $role; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "peers::${peer_name}::disk_state" => $anvil->data->{peers}{$peer_name}{disk_state}, + }}); + } + } + + if ($disks_ready) + { + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 
2, list => { waiting => $waiting }}); + } + else + { + if (time > $wait_until) + { + print " Failed!\n[ Error ] - The peers did not connect in the expected period of time.\n"; + $anvil->nice_exit({exit_code => 1}); + } + sleep 2; + } + } + print " Done!\n"; + + print "- Peers are connected! Checking if the new volume requires initial sync.\n"; + my $all_inconsistent = 1; + my $primary_on_host = ""; + foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{peers}}) + { + my $disk_state = $anvil->data->{peers}{$peer_name}{disk_state}; + my $role = $anvil->data->{peers}{$peer_name}{role}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + peer_name => $peer_name, + disk_state => $disk_state, + role => $role, + }}); + if (lc($disk_state) ne "inconsistent") + { + $all_inconsistent = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_inconsistent => $all_inconsistent }}); + } + if (lc($role) eq "primary") + { + $primary_on_host = $peer_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { primary_on_host => $primary_on_host }}); + } + } + + if ($all_inconsistent) + { + print "- Initial sync required!\n"; + my $shell_call = $anvil->data->{path}{exe}{drbdadm}." primary ".$drbd_resource." --force"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + # Which node should be forced primary? + if (not $primary_on_host) + { + # We'll make it primary. 
+ $primary_on_host = $short_host_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { primary_on_host => $primary_on_host }}); + } + + my $primary_on_host_uuid = $anvil->Get->host_uuid_from_name({debug => 2, host_name => $primary_on_host}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { primary_on_host_uuid => $primary_on_host_uuid }}); + if ($primary_on_host_uuid eq $anvil->Get->host_uuid) + { + print "- Forcing primary locally... "; + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + # Return code of '0' is success. + if ($return_code) + { + print "Failed! +[ Error ] - There was a problem trying to force the new volume: [".$drbd_resource."/".$next_drbd_volume."] to Primary. +[ Error ] - Attempted this using the shell call: [".$shell_call."]. +[ Error ] - Expected the return code '0' but got: [".$return_code."]. The output, if any, was: +========== +".$output." +========== +[ Error ] - Once corrected, please manually add the new volume to the server. +"; + $anvil->nice_exit({exit_code => 1}); + } + + # Now demote it again. + $shell_call = $anvil->data->{path}{exe}{drbdadm}." 
secondary ".$drbd_resource; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Success!\n"; + } + else + { + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; + print " - The resource is primary onthe peer: [".$short_host_name."], forcing primary there via: [".$use_ip." (".$use_network.")]"; + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + if ($return_code) + { + print "Failed! +[ Error ] - There was a problem trying to force the new volume: [".$drbd_resource."/".$next_drbd_volume."] to Primary. +[ Error ] - Attempted this using the shell call: [".$shell_call."]. +[ Error ] - Expected the return code '0' but got: [".$return_code."]. The output, if any, was: +========== +".$output." +========== +[ Error ] - Once corrected, please manually add the new volume to the server. +"; + $anvil->nice_exit({exit_code => 1}); + } + + # Now demote it again. + $shell_call = $anvil->data->{path}{exe}{drbdadm}." 
secondary ".$drbd_resource; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + print "Success!\n"; + } + } + else + { + print "Initial sync does not appear to be required.\n"; + } + +# my $startup_needed = 1; +# my $local_role = defined $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{role} ? $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{role} : ""; +# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_role => $local_role }}); + + + +=cut + # Create the DRBD metadata. For this, we don't fail. + foreach my $host_type ("node", "dr") + { + foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) + { + my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; + my $shell_call = $anvil->data->{path}{exe}{drbdadm}." 
--force create-md --max-peers=3 ".$drbd_resource."/".$next_drbd_volume; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:short_host_name' => $short_host_name, + 's2:host_uuid' => $host_uuid, + 's7:shell_call' => $shell_call, + }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + print " - Creating the new local LV: [".$backing_disk."]..."; + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + print " Error!\n"; + print "[ FAILED ] - When trying to create the new local logical volume: [".$backing_disk."]\n"; + print "[ FAILED ] - using the command: [".$shell_call."]\n"; + print "[ FAILED ] - The return code: [".$return_code."] was received, expected '0'. Output, if any:\n"; + print "==========\n"; + print $output."\n"; + print "==========\n"; + print "The creation of the new replicatedd disk is incomplete, manual intervention is required!!\n"; + $anvil->nice_exit({exit_code => 1}); + } + else + { + print " Done!\n"; + } + } + else + { + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; + print " - Creating the new LV on the peer: [".$short_host_name.":".$backing_disk."], via: [".$use_ip." 
(".$use_network.")]"; + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + if ($return_code) + { + print " Error!\n"; + print "[ FAILED ] - When trying to create the peer's logical volume: [".$backing_disk."]\n"; + print "[ FAILED ] - using the command: [".$shell_call."]\n"; + print "[ FAILED ] - The return code: [".$return_code."] was received, expected '0'. Output, if any:\n"; + print "==] STDOUT [========\n"; + print $output."\n"; + print "==] STDERR [========\n"; + print $error."\n"; + print "====================\n"; + print "The creation of the new replicated disk is incomplete, manual intervention is required!!\n"; + $anvil->nice_exit({exit_code => 1}); + } + else + { + print " Done!\n"; + } + } + } + } +=cut return(0); } @@ -544,7 +1386,7 @@ sub manage_disk_grow ### TODO: Make this work without the peer node being online. # The server is allowed to be running, but both nodes and any DR hosts this is replicating to # needs to be online. - my $all_online = check_drbd_peer_access($anvil, $from_source, $drbd_volume); + my $all_online = check_drbd_peer_access($anvil); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_online => $all_online }}); if (not $all_online) @@ -588,7 +1430,7 @@ sub manage_disk_grow } # Test that we've lost access while waiting for the answer. 
- my $all_online = check_drbd_peer_access($anvil, $from_source, $drbd_volume); + my $all_online = check_drbd_peer_access($anvil); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_online => $all_online }}); if (not $all_online) @@ -1089,11 +1931,7 @@ sub show_server_details sub check_drbd_peer_access { - my ($anvil, $drbd_resource, $drbd_volume) = @_; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 's01:drbd_resource' => $drbd_resource, - 's02:drbd_volume' => $drbd_volume, - }}); + my ($anvil) = @_; my $all_online = 1; foreach my $this_host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{drbd_node}}) @@ -1172,9 +2010,9 @@ sub get_max_free_space my $drbd_path = $anvil->data->{drbd}{drbd_node}{$this_host}{config}{resource}{$drbd_resource}{volume}{$drbd_volume}{drbd_path}; my $drbd_path_by_res = $anvil->data->{drbd}{drbd_node}{$this_host}{config}{resource}{$drbd_resource}{volume}{$drbd_volume}{drbd_path_by_res}; my $backing_lv = $anvil->data->{drbd}{drbd_node}{$this_host}{config}{resource}{$drbd_resource}{volume}{$drbd_volume}{backing_lv}; - my $lv_name = $anvil->data->{lvm}{host_name}{$this_host}{lv_path}{$backing_lv}{scan_lvm_lv_name};; - my $on_vg = $anvil->data->{lvm}{host_name}{$this_host}{lv}{$lv_name}{scan_lvm_lv_on_vg}; - my $vg_free_space = $anvil->data->{lvm}{host_name}{$this_host}{vg}{$on_vg}{scan_lvm_vg_free}; + my $lv_name = $anvil->data->{lvm}{host_name}{$this_host}{lv_path}{$backing_lv}{scan_lvm_lv_name} ? $anvil->data->{lvm}{host_name}{$this_host}{lv_path}{$backing_lv}{scan_lvm_lv_name} : ""; + my $on_vg = $anvil->data->{lvm}{host_name}{$this_host}{lv}{$lv_name}{scan_lvm_lv_on_vg} ? $anvil->data->{lvm}{host_name}{$this_host}{lv}{$lv_name}{scan_lvm_lv_on_vg} : ""; + my $vg_free_space = $anvil->data->{lvm}{host_name}{$this_host}{vg}{$on_vg}{scan_lvm_vg_free} ? 
$anvil->data->{lvm}{host_name}{$this_host}{vg}{$on_vg}{scan_lvm_vg_free} : 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's01:this_host' => $this_host, 's02:drbd_path' => $drbd_path, diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index 8bb55cfa..16a3d5a4 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -846,10 +846,11 @@ sub startup_resource # Is the current resource up locally already? If it is, we're done. my $server = $anvil->data->{job}{server_name}; my $short_host_name = $anvil->data->{job}{short_host_name}; - my $role = defined $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server}{role} ? $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server}{role} : ""; + my $role = defined $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server}{role} ? $anvil->data->{drbd}{status}{$short_host_name}{resource}{$server}{role} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 'job::server' => $anvil->data->{job}{server_name}, - role => $role, + server => $server, + short_host_name => $short_host_name, + role => $role, }}); if ((lc($role) ne "secondary") && (lc($role) ne "primary")) From 1b8b0bc493fa70a315a2ba59ac1858989cca77fb Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 30 Jun 2023 21:02:30 -0400 Subject: [PATCH 02/14] * Created the new 'anvil-special-operations' with the first role of reloading a DRBD resource. * Updated Remote->call() to remove the 'background' parameter as it wasn't working. * Updated anvil-manage-server-storage to use 'anvil-special-operations' to adjust resources in a way that doesn't block.
Signed-off-by: digimer --- Anvil/Tools.pm | 1 + Anvil/Tools/Remote.pm | 22 +----- man/Makefile.am | 1 + man/anvil-special-operations.8 | 32 ++++++++ share/words.xml | 6 ++ tools/Makefile.am | 1 + tools/anvil-manage-server-storage | 93 +++++++++-------------- tools/anvil-special-operations | 120 ++++++++++++++++++++++++++++++ 8 files changed, 202 insertions(+), 74 deletions(-) create mode 100644 man/anvil-special-operations.8 create mode 100755 tools/anvil-special-operations diff --git a/Anvil/Tools.pm b/Anvil/Tools.pm index 8fab1091..f50e413a 100644 --- a/Anvil/Tools.pm +++ b/Anvil/Tools.pm @@ -1150,6 +1150,7 @@ sub _set_paths 'anvil-safe-start' => "/usr/sbin/anvil-safe-start", 'anvil-safe-stop' => "/usr/sbin/anvil-safe-stop", 'anvil-shutdown-server' => "/usr/sbin/anvil-shutdown-server", + 'anvil-special-operations' => "/usr/sbin/anvil-special-operations", 'anvil-sync-shared' => "/usr/sbin/anvil-sync-shared", 'anvil-update-files' => "/usr/sbin/anvil-update-files", 'anvil-update-states' => "/usr/sbin/anvil-update-states", diff --git a/Anvil/Tools/Remote.pm b/Anvil/Tools/Remote.pm index 044be424..8bcea088 100644 --- a/Anvil/Tools/Remote.pm +++ b/Anvil/Tools/Remote.pm @@ -224,13 +224,9 @@ B: By default, a connection to a target will be held open and cached to in Parameters; -=head3 background (optional, default '0') - -If set to C<< 1 >>, the command is run in the background. In this case, the PID of the SSH process is returned. The called should use C<< waitpid >> to ensure the PID has been reaped. - =head3 close (optional, default '0') -If set to C<< 1 >>, the connection to the target will be closed at the end of the call. +If set, the connection to the target will be closed at the end of the call. 
=head3 log_level (optional, default C<< 3 >>) @@ -304,7 +300,6 @@ sub call $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "cache::ssh_fh::${ssh_fh_key}" => $anvil->data->{cache}{ssh_fh}{$ssh_fh_key} }}); # Now pick up the rest of the variables. - my $background = defined $parameter->{background} ? $parameter->{background} : 0; my $close = defined $parameter->{'close'} ? $parameter->{'close'} : 0; my $no_cache = defined $parameter->{no_cache} ? $parameter->{no_cache} : 0; my $password = defined $parameter->{password} ? $parameter->{password} : ""; @@ -315,16 +310,15 @@ sub call my $ssh_fh = $anvil->data->{cache}{ssh_fh}{$ssh_fh_key}; # NOTE: The shell call might contain sensitive data, so we show '--' if 'secure' is set and $anvil->Log->secure is not. $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - background => $background, 'close' => $close, password => $anvil->Log->is_secure($password), secure => $secure, - shell_call => (not $secure) ? $shell_call : $anvil->Log->is_secure($shell_call), - ssh_fh => $ssh_fh, + shell_call => (not $secure) ? $shell_call : $anvil->Log->is_secure($shell_call), + ssh_fh => $ssh_fh, start_time => $start_time, timeout => $timeout, port => $port, - target => $target, + target => $target, ssh_fh_key => $ssh_fh_key, }}); @@ -649,14 +643,6 @@ sub call $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, secure => $secure, list => { ssh_fh => $ssh_fh }}); if ($ssh_fh =~ /^Net::OpenSSH/) { - # Are we doing a background call? - if ($background) - { - my $pid = $ssh_fh->spawn($shell_call); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, secure => $secure, list => { pid => $pid }}); - return($pid); - } - # The shell_call can't end is a newline. Conveniently, we want the return code. 
By adding # this, we ensure it doesn't end in a new-line (and we can't blindly strip off the last # new-line because of 'EOF' type cat's). diff --git a/man/Makefile.am b/man/Makefile.am index 0150315a..cce903dc 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -21,6 +21,7 @@ dist_man8_MANS = \ anvil-manage-server.8 \ anvil-manage-server-storage.8 \ anvil-manage-storage-groups.8 \ + anvil-special-operations.8 \ anvil-watch-drbd.8 \ scancore.8 \ striker-check-machines.8 \ diff --git a/man/anvil-special-operations.8 b/man/anvil-special-operations.8 new file mode 100644 index 00000000..8ef90225 --- /dev/null +++ b/man/anvil-special-operations.8 @@ -0,0 +1,32 @@ +.\" Manpage for the Anvil! special operations tool +.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. +.TH anvil-special-operations "8" "Jun 30 2023" "Anvil! Intelligent Availability™ Platform" +.SH NAME +anvil-special-operations \- This program is generally meant to be used by other programs. +.SH SYNOPSIS +.B anvil-special-operations +\fI\, \/\fR[\fI\,options\/\fR] +.SH DESCRIPTION +This tool is used, generally by other parts of the Anvil!, to accomplish tasks that generally can't be accomplished by direct system calls. It's a general purpose tool meant to solve specific corner cases. +.TP +.SH OPTIONS +.TP +\-?, \-h, \fB\-\-help\fR +Show this man page. +.TP +\fB\-\-log-secure\fR +When logging, record sensitive data, like passwords. +.TP +\-v, \-vv, \-vvv +Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. +.SS "Commands:" +.TP +\fB\-\-task\fR +This is the task being requested. Current options are: +.IP refresh-drbd-resource +This requires \fB\-\-resource <name>\fR, and will call 'drbdadm adjust <name>' as a background task and then return immediately. This is required when adding a new volume to an existing resource as 'drbdadm adjust <name>' will hold until it is called on all active DRBD nodes.
This blocks the caller after the first remote host call. +.IP +.SH AUTHOR +Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. +.SH "REPORTING BUGS" +Report bugs to users@clusterlabs.org diff --git a/share/words.xml b/share/words.xml index e3e96bc7..d5f0ec0c 100644 --- a/share/words.xml +++ b/share/words.xml @@ -602,6 +602,7 @@ The error was: There was a problem with finding a common storage network between: [#!variable!node1_name!#] and: [#!variable!node2_name!#]. Found node 1 to have the IP: [#!variable!node1_ip!#] and node 2: [#!variable!node2_ip!#]. Is there a problem with '/etc/hosts'? Failed to find a network to use for storage replication. Is there a problem with '/etc/hosts'? + '.]]> @@ -1552,6 +1553,8 @@ Note: This is a permanent action! If you protect this server again later, a full Enabling the enable-safe-start daemon. Calling select ScanCore scan agents to ensure the database is updated. + Reload (adjust) a DRBD resource + This job is to reload (adjust) a DRBD resource. It's run as a job as it blocks until the adjust is run on all nodes. Starting: [#!variable!program!#]. @@ -2893,6 +2896,9 @@ Proceed? [y/N] The DRBD config file was not found. A protect job needs to be run from the Anvil! node hosting the server to be protected. Waiting a bit to make sure the file: [#!variable!file!#] is done uploading... Upload complete. + Picked up the special operation job. + Reloading (adjusting) the DRBD resource: [#!variable!resource!#]. This will not complete until all peers have also reloaded this resource. + DRBD resource: [#!variable!resource!#] has been reloaded. 
Normal Password diff --git a/tools/Makefile.am b/tools/Makefile.am index cd2b412f..b782df7a 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -37,6 +37,7 @@ dist_sbin_SCRIPTS = \ anvil-scan-network \ anvil-show-local-ips \ anvil-shutdown-server \ + anvil-special-operations \ anvil-sync-shared \ anvil-test-alerts \ anvil-update-definition \ diff --git a/tools/anvil-manage-server-storage b/tools/anvil-manage-server-storage index 95504d1b..1b0302b1 100755 --- a/tools/anvil-manage-server-storage +++ b/tools/anvil-manage-server-storage @@ -950,69 +950,52 @@ sub manage_disk_add } } + ### NOTE: The call to 'drbdadm adjust ' hangs, hard, until the same command is run on the peers. + ### To deal with this, we register jobs to run 'anvil-special-operations' on the peers, then we + ### call adjust here. # Adjust to start/connect. - my @pids; foreach my $host_type ("node", "dr") { foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) { my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; - my $shell_call = $anvil->data->{path}{exe}{drbdadm}." adjust ".$drbd_resource; + my $shell_call = $anvil->data->{path}{exe}{'anvil-special-operations'}." --task refresh-drbd-resource --resource ".$drbd_resource.$anvil->Log->switches; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:short_host_name' => $short_host_name, 's2:host_uuid' => $host_uuid, 's3:shell_call' => $shell_call, }}); + next if $host_uuid eq $anvil->Get->host_uuid; - ### NOTE: The 'adjust' call doesn't return until it's adjusted on all machines, so we - ### make these calls as background calls. - # Create the metadata, but don't exit on failure in case the metadata was created in - # a previous pass. 
- if ($host_uuid eq $anvil->Get->host_uuid) - { - print "- Adjusting the local resource: [".$drbd_resource."] to pick up the new config.\n"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - ($output, $return_code) = $anvil->System->call({ - shell_call => $shell_call, - background => 1, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - return_code => $return_code, - }}); - } - else - { - # We'll use this in a minute to confirm connections. - $anvil->data->{peers}{$short_host_name}{host_uuid} = $host_uuid; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - "peers::${short_host_name}::host_uuid" => $anvil->data->{peers}{$short_host_name}{host_uuid}, - }}); - - ### NOTE: This is expected to timeout when DR is used. - print "- Adjusting the peer: [".$short_host_name."]'s resource: [".$drbd_resource."] to pick up the new config.\n"; - my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; - my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; - my $shell_call = $anvil->data->{path}{exe}{drbdadm}." 
adjust ".$drbd_resource; - my ($pid) = $anvil->Remote->call({ - debug => 2, - background => 1, - shell_call => $shell_call, - target => $use_ip, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pid => $pid }}); - push @pids, $pid; - } + my ($job_uuid) = $anvil->Database->insert_or_update_jobs({ + debug => 2, + job_command => $shell_call, + job_data => "adjust=".$drbd_resource, + job_name => "server::add_disk::rescan", + job_title => "job_0465", + job_description => "job_0466", + job_progress => 0, + job_host_uuid => $host_uuid, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); + + print "- Registered a job with job UUID: [".$job_uuid."] to reload the resource config on the host: [".$short_host_name."].\n"; } } - # Wait for the remote PID(s) to be reaped. -# foreach my $pid (@pids) -# { -# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { pid => $pid }}); -# next if not $pid; -# waitpid($pid, 0); -# } + print "- Adjusting the local resource: [".$drbd_resource."] to pick up the new config.\n"; + print "[ NOTE ] - If this hangs, make sure 'anvil-daemon' is running on the peers.\n"; + $shell_call = $anvil->data->{path}{exe}{drbdadm}." adjust ".$drbd_resource; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + ($output, $return_code) = $anvil->System->call({ + debug => 2, + background => 1, + shell_call => $shell_call, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); # Find which node is currently Primary and use that host to force primary to start sync. If none, # force here. 
@@ -1171,7 +1154,7 @@ sub manage_disk_add { my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; - print " - The resource is primary onthe peer: [".$short_host_name."], forcing primary there via: [".$use_ip." (".$use_network.")]"; + print " - The resource is primary on the peer: [".$short_host_name."], forcing primary there via: [".$use_ip." (".$use_network.")]"; my ($output, $error, $return_code) = $anvil->Remote->call({ shell_call => $shell_call, target => $use_ip, @@ -1214,14 +1197,12 @@ sub manage_disk_add { print "Initial sync does not appear to be required.\n"; } - -# my $startup_needed = 1; -# my $local_role = defined $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{role} ? $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{role} : ""; -# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_role => $local_role }}); - - - + =cut + my $startup_needed = 1; + my $local_role = defined $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{role} ? $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{role} : ""; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_role => $local_role }}); + # Create the DRBD metadata. For this, we don't fail. foreach my $host_type ("node", "dr") { diff --git a/tools/anvil-special-operations b/tools/anvil-special-operations new file mode 100755 index 00000000..d434d196 --- /dev/null +++ b/tools/anvil-special-operations @@ -0,0 +1,120 @@ +#!/usr/bin/perl +# +# This program has no specific purpose. It's a general program for performing certain special tasks that +# can't be done otherwise in a reliable or efficient way. +# +# Exit codes; +# 0 = Normal exit. +# 1 = No database connection. 
+ + +use strict; +use warnings; +use Anvil::Tools; +require POSIX; +use Text::Diff; +use Data::Dumper; + +my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0]; +my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0]; +if (($running_directory =~ /^\./) && ($ENV{PWD})) +{ + $running_directory =~ s/^\./$ENV{PWD}/; +} + +# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete. +$| = 1; + +my $anvil = Anvil::Tools->new(); + +# Read switches (target ([user@]host[:port]) and the file with the target's password. +$anvil->Get->switches({list => [ + "task", + "resource", + ], man => $THIS_FILE}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); + +$anvil->Database->connect(); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"}); +if (not $anvil->data->{sys}{database}{connections}) +{ + # No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try + # again after we exit. 
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0306"}); + sleep 10; + $anvil->nice_exit({exit_code => 1}); +} + +if ($anvil->data->{switches}{'job-uuid'}) +{ + $anvil->Job->clear(); + $anvil->Job->get_job_details({debug => 2}); + $anvil->Job->update_progress({ + progress => 1, + job_picked_up_by => $$, + job_picked_up_at => time, + message => "message_0311", + }); +} +if ($anvil->data->{switches}{task} eq "refresh-drbd-resource") +{ + refresh_drbd_resource($anvil); +} + +$anvil->nice_exit({exit_code => 0}); + + +############################################################################################################# +# Functions # +############################################################################################################# + +# This function is needed to call 'drbdadm adjust ' in a background call from a remote host. This is +# needed for adding new volumes to an existing resource, as the call from 'drbdadm adjust ' won't return +# until the call is run on all hosts. +sub refresh_drbd_resource +{ + my ($anvil) = @_; + + my $resource = $anvil->data->{switches}{resource}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }}); + + if (not $resource) + { + # No resource. + $anvil->Job->update_progress({ + progress => 100, + message => "error_0419", + job_status => "failed", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => 'err', key => "error_0419"}); + $anvil->nice_exit({exit_code => 1}); + } + + $anvil->Job->update_progress({ + progress => 10, + message => "message_0312,!!resource!".$resource."!!", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "message_0312", variables => { resource => $resource }}); + + my $shell_call = $anvil->data->{path}{exe}{drbdadm}." 
adjust ".$resource; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $return_code) = $anvil->System->call({ + shell_call => $shell_call, + background => 1, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + $anvil->Job->update_progress({ + progress => 100, + message => "message_0313,!!resource!".$resource."!!", + }); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "message_0313", variables => { resource => $resource }}); + + $anvil->nice_exit({exit_code => 0}); + + return(0); +} \ No newline at end of file From bf1ccc8bee83f14dee313a7cd2ba42fbd81dfd04 Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 30 Jun 2023 22:36:27 -0400 Subject: [PATCH 03/14] * Finally got the creation of new DRBD volumes under existing resources work! Signed-off-by: digimer --- tools/anvil-manage-server-storage | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tools/anvil-manage-server-storage b/tools/anvil-manage-server-storage index 1b0302b1..c25f902f 100755 --- a/tools/anvil-manage-server-storage +++ b/tools/anvil-manage-server-storage @@ -1001,7 +1001,7 @@ sub manage_disk_add # force here. 
print "- Waiting for all peers to connect the new volume..."; my $waiting = 1; - my $wait_until = time + 60; + my $wait_until = time + 300; while ($waiting) { $anvil->DRBD->get_status({debug => 2}); @@ -1009,15 +1009,20 @@ sub manage_disk_add my $disks_ready = 0; foreach my $this_host_name (sort {$a cmp $b} keys %{$anvil->data->{peers}}) { - my $host_uuid = $anvil->data->{peers}{$this_host_name}{host_uuid}; + my $host_uuid = $anvil->Get->host_uuid_from_name({host_name => $this_host_name}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:this_host_name' => $this_host_name, + 's2:host_uuid' => $host_uuid, + }}); + next if $host_uuid eq $anvil->Get->host_uuid; + my $connection_state = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$this_host_name}{'connection-state'}; my $node_id = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}{$this_host_name}{'peer-node-id'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 's1:this_host_name' => $this_host_name, - 's2:host_uuid' => $host_uuid, - 's3:connection_state' => $connection_state, - 's4:node_id' => $node_id, + 's1:connection_state' => $connection_state, + 's2:node_id' => $node_id, }}); + if (lc($connection_state) ne "connected") { $peers_connected = 0; @@ -1025,6 +1030,7 @@ sub manage_disk_add } } + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); if ($peers_connected) { # Make sure all disks are attached. 
@@ -1050,6 +1056,11 @@ sub manage_disk_add $disks_ready = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); } + if ((not $peer_disk_state) or (lc($peer_disk_state) eq "diskless")) + { + $disks_ready = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); + } $anvil->data->{peers}{$peer_name}{disk_state} = $peer_disk_state; $anvil->data->{peers}{$peer_name}{role} = $role; @@ -1059,6 +1070,7 @@ sub manage_disk_add } } + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); if ($disks_ready) { $waiting = 0; From a7ebe45f768f578c231f9c3bf032c15620d850c0 Mon Sep 17 00:00:00 2001 From: digimer Date: Wed, 5 Jul 2023 21:04:05 -0400 Subject: [PATCH 04/14] This adds the new 'striker-collect-debug' tool that collects all potentially useful debug info into a single tarball. * Fixed a bug in Get->anvil_from_switch() to work when the Anvil! name is passed. 
Signed-off-by: digimer --- Anvil/Tools.pm | 1 + Anvil/Tools/Get.pm | 4 +- man/Makefile.am | 1 + man/striker-check-machines.8 | 2 +- man/striker-collect-debug.8 | 41 ++ tools/Makefile.am | 1 + tools/anvil-manage-dr | 30 +- tools/anvil-manage-server-storage | 341 +++++++++----- tools/striker-collect-debug | 737 ++++++++++++++++++++++++++++++ 9 files changed, 1037 insertions(+), 121 deletions(-) create mode 100644 man/striker-collect-debug.8 create mode 100755 tools/striker-collect-debug diff --git a/Anvil/Tools.pm b/Anvil/Tools.pm index f50e413a..a7efaf0c 100644 --- a/Anvil/Tools.pm +++ b/Anvil/Tools.pm @@ -1282,6 +1282,7 @@ sub _set_paths swapon => "/usr/sbin/swapon", sysctl => "/usr/sbin/sysctl", systemctl => "/usr/bin/systemctl", + tar => "/usr/bin/tar", timeout => "/usr/bin/timeout", touch => "/usr/bin/touch", tput => "/usr/bin/tput", diff --git a/Anvil/Tools/Get.pm b/Anvil/Tools/Get.pm index d0101457..71d15693 100644 --- a/Anvil/Tools/Get.pm +++ b/Anvil/Tools/Get.pm @@ -161,10 +161,10 @@ sub anvil_from_switch "switches::anvil_uuid" => $anvil->data->{switches}{anvil_uuid}, }}); } - elsif (exists $anvil->data->{anvils}{anvil_uuid}{$anvil_string}) + elsif (exists $anvil->data->{anvils}{anvil_name}{$anvil_string}) { $anvil->data->{switches}{anvil_name} = $anvil_string; - $anvil->data->{switches}{anvil_uuid} = $anvil->data->{anvils}{anvil_uuid}{$anvil_string}{anvil_uuid}; + $anvil->data->{switches}{anvil_uuid} = $anvil->data->{anvils}{anvil_name}{$anvil_string}{anvil_uuid}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "switches::anvil_name" => $anvil->data->{switches}{anvil_name}, "switches::anvil_uuid" => $anvil->data->{switches}{anvil_uuid}, diff --git a/man/Makefile.am b/man/Makefile.am index cce903dc..f5125e8a 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -25,4 +25,5 @@ dist_man8_MANS = \ anvil-watch-drbd.8 \ scancore.8 \ striker-check-machines.8 \ + striker-collect-debug.8 \ striker-initialize-host.8 diff 
--git a/man/striker-check-machines.8 b/man/striker-check-machines.8 index 9e9c41d4..fff74b73 100644 --- a/man/striker-check-machines.8 +++ b/man/striker-check-machines.8 @@ -22,7 +22,7 @@ Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a s .SS "Commands:" .TP This program takes no commands. -.TP +.IP .SH AUTHOR Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. .SH "REPORTING BUGS" diff --git a/man/striker-collect-debug.8 b/man/striker-collect-debug.8 new file mode 100644 index 00000000..e3c01804 --- /dev/null +++ b/man/striker-collect-debug.8 @@ -0,0 +1,41 @@ +.\" Manpage for the Anvil! debug data collection tool. +.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. +.TH striker-collect-debug "8" "July 04 2023" "Anvil! Intelligent Availability™ Platform" +.SH NAME +striker-collect-debug \- This program collects data needed to help diagnose problems with an Anvil! system. +.SH SYNOPSIS +.B striker-collect-debug +\fI\, \/\fR[\fI\,options\/\fR] +.SH DESCRIPTION +This program collects database data, logs, config files and other information needed to help diagnose problems with the Anvil! platform. By default, this collects all data from all accessible machines. +.TP +.B Note: +.TP +This program collects potentially sensitive information, like passwords. Be careful who you share the collected data with! +.TP +The data from Striker dashboards are always collected. +.TP +.SH OPTIONS +.TP +\-?, \-h, \fB\-\-help\fR +Show this man page. +.TP +\fB\-\-log-secure\fR +When logging, record sensitive data, like passwords. +.TP +\-v, \-vv, \-vvv +Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. +.SS "Commands:" +.TP +\fB\-\-anvil\fR +.TP +This restricts the data to be collected to the Striker dashboards and the specific Anvil! node pair. +.TP +\fB\-\-hosts\fR +.TP +This can be used to specify which specific hosts data is collected from.
Note that this can be used in conjunction with \fB\-\-anvil\fR to add additional hosts to collect data from, like DR hosts. +.IP +.SH AUTHOR +Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. +.SH "REPORTING BUGS" +Report bugs to users@clusterlabs.org diff --git a/tools/Makefile.am b/tools/Makefile.am index b782df7a..615c2026 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -52,6 +52,7 @@ dist_sbin_SCRIPTS = \ striker-auto-initialize-all \ striker-boot-machine \ striker-check-machines \ + striker-collect-debug \ striker-db-report \ striker-db-status \ striker-file-manager \ diff --git a/tools/anvil-manage-dr b/tools/anvil-manage-dr index 184814d8..88f176c6 100755 --- a/tools/anvil-manage-dr +++ b/tools/anvil-manage-dr @@ -382,7 +382,8 @@ sub sanity_check if (($anvil->data->{switches}{'connect'}) or ($anvil->data->{switches}{'disconnect'})) { # Is this server configured to be protected? - my $config_file = $anvil->data->{path}{directories}{drbd_resources}."/".$server_name.".res"; + my $config_file = $anvil->data->{path}{directories}{drbd_resources}."/".$server_name.".res"; + $config_file =~ s/\/\//\//g; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { config_file => $config_file }}); if (not -e $config_file) { @@ -398,8 +399,8 @@ sub sanity_check variables => $variables, job_status => "failed", }); + $anvil->nice_exit({exit_code => 1}); } - $anvil->nice_exit({exit_code => 1}); } # If we're doing a --protect or --remove, make sure we're a node, the cluster is up, and both nodes @@ -533,7 +534,6 @@ sub sanity_check # Get the Anvil! details. $anvil->Database->get_hosts(); - $anvil->Database->get_anvils(); $anvil->Database->get_storage_group_data({debug => 2}); $anvil->Database->get_dr_links({debug => 2}); @@ -559,7 +559,9 @@ sub sanity_check } } - # If I don't have a dr_host_uuid yet, see which are available. If only one, use it. If two or more, tell the user they need to specify which.
+ # If I don't have a dr_host_uuid yet, see which are available. If only one, use it. If two or more, + # and if the server is already being protected, determine which to use from it's config. Otherwise, + # tell the user they need to specify which. if (not $dr_host_uuid) { my $dr_count = keys %{$anvil->data->{dr_links}{by_anvil_uuid}{$anvil_uuid}{dr_link_host_name}}; @@ -587,6 +589,26 @@ sub sanity_check $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { dr_host_uuid => $dr_host_uuid }}); } } + else + { + # Two or more. Is the server already protected? If so, try to find which DR it's + # using. + if (($anvil->data->{switches}{'connect'}) or ($anvil->data->{switches}{'disconnect'})) + { + # Read the config. + my $config_file = $anvil->data->{path}{directories}{drbd_resources}."/".$server_name.".res"; + $config_file =~ s/\/\//\//g; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { config_file => $config_file }}); + + my $resource_config = $anvil->Storage->read_file({file => $config_file}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_config => $resource_config }}); + foreach my $line (split/\n/, $resource_config) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { lineg => $line }}); + + } + } + } } # If I still don't have a DR host, fail out. 
diff --git a/tools/anvil-manage-server-storage b/tools/anvil-manage-server-storage index c25f902f..e57c9168 100755 --- a/tools/anvil-manage-server-storage +++ b/tools/anvil-manage-server-storage @@ -516,7 +516,7 @@ sub manage_disk_add my $next_drbd_volume = ""; foreach my $this_host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{drbd_node}}) { - my $host_uuid = $anvil->Get->host_uuid_from_name({debug => 2, host_name => $this_host}); + my $host_uuid = $anvil->Get->host_uuid_from_name({host_name => $this_host}); my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:this_host' => $this_host, @@ -988,7 +988,6 @@ sub manage_disk_add $shell_call = $anvil->data->{path}{exe}{drbdadm}." adjust ".$drbd_resource; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); ($output, $return_code) = $anvil->System->call({ - debug => 2, background => 1, shell_call => $shell_call, }); @@ -1002,12 +1001,20 @@ sub manage_disk_add print "- Waiting for all peers to connect the new volume..."; my $waiting = 1; my $wait_until = time + 300; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { wait_until => $wait_until }}); while ($waiting) { $anvil->DRBD->get_status({debug => 2}); my $peers_connected = 1; my $disks_ready = 0; - foreach my $this_host_name (sort {$a cmp $b} keys %{$anvil->data->{peers}}) + my $drbd_peer_count = keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { drbd_peer_count => $drbd_peer_count }}); + if (not $drbd_peer_count) + { + $peers_connected = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); + } + foreach my $this_host_name (sort {$a cmp $b} 
keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}}) { my $host_uuid = $anvil->Get->host_uuid_from_name({host_name => $this_host_name}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { @@ -1039,6 +1046,7 @@ sub manage_disk_add $anvil->data->{peers}{$short_host_name}{role} = $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{role}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "peers::${short_host_name}::disk_state" => $anvil->data->{peers}{$short_host_name}{disk_state}, + "peers::${short_host_name}::role" => $anvil->data->{peers}{$short_host_name}{role}, }}); foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{connection}}) { @@ -1054,12 +1062,12 @@ sub manage_disk_add if (lc($replication_state) ne "established") { $disks_ready = 0; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disks_ready => $disks_ready }}); } if ((not $peer_disk_state) or (lc($peer_disk_state) eq "diskless")) { $disks_ready = 0; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disks_ready => $disks_ready }}); } $anvil->data->{peers}{$peer_name}{disk_state} = $peer_disk_state; @@ -1070,7 +1078,7 @@ sub manage_disk_add } } - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peers_connected => $peers_connected }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disks_ready => $disks_ready }}); if ($disks_ready) { $waiting = 0; @@ -1118,20 +1126,28 @@ sub manage_disk_add my $shell_call 
= $anvil->data->{path}{exe}{drbdadm}." primary ".$drbd_resource." --force"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + print Dumper %{$anvil->data->{peers}}; + die; + # Which node should be forced primary? + my $already_primary = 1; if (not $primary_on_host) { # We'll make it primary. $primary_on_host = $short_host_name; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { primary_on_host => $primary_on_host }}); + $already_primary = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + primary_on_host => $primary_on_host, + already_primary => $already_primary, + }}); } - my $primary_on_host_uuid = $anvil->Get->host_uuid_from_name({debug => 2, host_name => $primary_on_host}); + my $primary_on_host_uuid = $anvil->Get->host_uuid_from_name({host_name => $primary_on_host}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { primary_on_host_uuid => $primary_on_host_uuid }}); if ($primary_on_host_uuid eq $anvil->Get->host_uuid) { print "- Forcing primary locally... "; - my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + my ($output, $return_code) = $anvil->System->call({debug => 2, shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code, @@ -1166,6 +1182,10 @@ sub manage_disk_add { my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + use_ip => $use_ip, + use_network => $use_network, + }}); print " - The resource is primary on the peer: [".$short_host_name."], forcing primary there via: [".$use_ip." 
(".$use_network.")]"; my ($output, $error, $return_code) = $anvil->Remote->call({ shell_call => $shell_call, @@ -1209,85 +1229,172 @@ sub manage_disk_add { print "Initial sync does not appear to be required.\n"; } - -=cut - my $startup_needed = 1; - my $local_role = defined $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{role} ? $anvil->data->{drbd}{status}{$short_host_name}{resource}{$drbd_resource}{role} : ""; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_role => $local_role }}); - # Create the DRBD metadata. For this, we don't fail. + # Is the server running? If so, where. + print "- Ready to add the new disk. Checking if the server is running...\n"; + my $server_host = ""; foreach my $host_type ("node", "dr") { foreach my $short_host_name (sort {$a cmp $b} keys %{$anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}}) { my $host_uuid = $anvil->data->{drbd_resource}{$drbd_resource}{host_type}{$host_type}{short_host_name}{$short_host_name}{host_uuid}; - my $shell_call = $anvil->data->{path}{exe}{drbdadm}." --force create-md --max-peers=3 ".$drbd_resource."/".$next_drbd_volume; + my $shell_call = $anvil->data->{path}{exe}{'anvil-special-operations'}." 
--task refresh-drbd-resource --resource ".$drbd_resource.$anvil->Log->switches; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:short_host_name' => $short_host_name, 's2:host_uuid' => $host_uuid, - 's7:shell_call' => $shell_call, + 's3:shell_call' => $shell_call, }}); + if ($host_uuid eq $anvil->Get->host_uuid) { - print " - Creating the new local LV: [".$backing_disk."]..."; - my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - return_code => $return_code, - }}); - if ($return_code) - { - print " Error!\n"; - print "[ FAILED ] - When trying to create the new local logical volume: [".$backing_disk."]\n"; - print "[ FAILED ] - using the command: [".$shell_call."]\n"; - print "[ FAILED ] - The return code: [".$return_code."] was received, expected '0'. Output, if any:\n"; - print "==========\n"; - print $output."\n"; - print "==========\n"; - print "The creation of the new replicatedd disk is incomplete, manual intervention is required!!\n"; - $anvil->nice_exit({exit_code => 1}); - } - else - { - print " Done!\n"; - } + $anvil->Server->find(); } else { - my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; - my $use_network = $anvil->data->{peer}{$short_host_name}{access}{network}; - print " - Creating the new LV on the peer: [".$short_host_name.":".$backing_disk."], via: [".$use_ip." 
(".$use_network.")]"; - my ($output, $error, $return_code) = $anvil->Remote->call({ - shell_call => $shell_call, - target => $use_ip, - }); + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + $anvil->Server->find({target => $use_ip }); + } + + if ((exists $anvil->data->{server}{location}{$server_name}) && + ($anvil->data->{server}{location}{$server_name}{host_name})) + { + my $this_host = defined $anvil->data->{server}{location}{$server_name}{host_name} ? $anvil->data->{server}{location}{$server_name}{host_name} : ""; + my $server_status = defined $anvil->data->{server}{location}{$server_name}{status} ? $anvil->data->{server}{location}{$server_name}{status} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - error => $error, - return_code => $return_code, + this_host => $this_host, + server_status => $server_status, }}); - if ($return_code) - { - print " Error!\n"; - print "[ FAILED ] - When trying to create the peer's logical volume: [".$backing_disk."]\n"; - print "[ FAILED ] - using the command: [".$shell_call."]\n"; - print "[ FAILED ] - The return code: [".$return_code."] was received, expected '0'. Output, if any:\n"; - print "==] STDOUT [========\n"; - print $output."\n"; - print "==] STDERR [========\n"; - print $error."\n"; - print "====================\n"; - print "The creation of the new replicated disk is incomplete, manual intervention is required!!\n"; - $anvil->nice_exit({exit_code => 1}); - } - else + if ($server_status eq "running") { - print " Done!\n"; + # Found it. 
+ $server_host = $this_host; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_host => $server_host }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + print "- The server is running on this host, we'll attach the disk here.\n"; + } + else + { + print "- The server is running on: [".$server_host."], we'll attach the disk there.\n"; + } + last; } } } } -=cut + + my $offline = 0; + if (not $server_host) + { + print "- The server isn't running anywhere, we'll attach the disk offline on this host.\n"; + $server_host = $short_host_name; + $offline = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + server_host => $server_host, + offline => $offline, + }}); + } + + $shell_call = $anvil->data->{path}{exe}{setsid}." --wait ".$anvil->data->{path}{exe}{virsh}." attach-disk ".$server_name." "; + $shell_call .= "/dev/drbd/by-res/".$drbd_resource."/".$next_drbd_volume." ".$new_device_target." "; + $shell_call .= "--persistent --targetbus ".$disk_device_bus." "; + $shell_call .= "--cache ".$disk_cache." "; + $shell_call .= "--io ".$disk_io_policy; + $shell_call .= "--sourcetype block --subdriver raw"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + if ($offline) + { + # Define the VM, if needed, then add the drive, dump the config and push it out. + print "- Defining the server: [".$server_name."] to prepare for 'virsh' modification of the server.\n"; + update_definition($anvil, "define", ""); + + # Update the definition. 
+ print "- Adding the drive to the definition now.\n"; + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + print "- Updating the stored definition and undefining the server now...\n"; + update_definition($anvil, "undefine", ""); + print "Done!\n"; + $anvil->nice_exit({exit_code => 0}); + } + else + { + # Add the drive live, dump the new definition and push it out. + my $host_uuid = $anvil->Get->host_uuid_from_name({host_name => $server_host}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host_uuid => $host_uuid }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + # Do the add here. + print "- Adding the drive to the server directly...\n"; + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + print "- Updating the stored definition and undefining the server now...\n"; + update_definition($anvil, "undefine", ""); + print "Done!\n"; + $anvil->nice_exit({exit_code => 0}); + } + else + { + # Do the add on the target. + my $use_ip = $anvil->data->{peer}{$short_host_name}{access}{ip}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_ip => $use_ip }}); + print " - Adding the drivve to the server using its host: [".$server_host."] via: [".$use_ip."]...\n"; + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + + # Get the updated definition file. + my $shell_call = $anvil->data->{path}{exe}{setsid}." 
--wait ".$anvil->data->{path}{exe}{virsh}." dumpxml --inactive ".$server_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + (my $virsh_definition, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $use_ip, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + virsh_definition => $virsh_definition, + error => $error, + return_code => $return_code, + }}); + + # Make sure the $output is valid XML. + my $problem = $anvil->Server->parse_definition({ + server => $server_name, + source => "from_virsh", + definition => $virsh_definition, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + + if ($problem) + { + # Failed?! + print " +[ Error ] - The definition file read from the remote host appears to be invalid after trying to attach the +[ Error ] - disk! It is unsafe to update the on disk and in DB definition. It's likely the attach has failed. +[ Error ] - Manual update to the server is likely required now. +"; + $anvil->nice_exit({exit_code => 1}); + } + + print "- Updating the stored definition and undefining the server now...\n"; + update_definition($anvil, "undefine", $virsh_definition); + print "Done!\n"; + $anvil->nice_exit({exit_code => 0}); + } + } return(0); } @@ -1695,7 +1802,7 @@ sub manage_optical if (not -f $iso) { print "[ Error ] - The target: [".$iso."] doesn't exist, can't insert it into the optical drive.\n"; - update_definition($anvil, "undefine"); + update_definition($anvil, "undefine", ""); $anvil->nice_exit({exit_code => 1}); } else @@ -1707,7 +1814,7 @@ sub manage_optical # If the server is running, update the on-disk and in-DB definition. 
print "Defining the server: [".$server_name."] to prepare for 'virsh' modification of the server.\n"; - update_definition($anvil, "define"); + update_definition($anvil, "define", ""); # Now we can modify the server using virsh. if ($anvil->data->{switches}{insert}) @@ -1742,7 +1849,7 @@ sub manage_optical print "'virsh' Output: [".$output."]\n"; print "Updating the stored definition and undefining the server now:\n"; - update_definition($anvil, "undefine"); + update_definition($anvil, "undefine", ""); print "Done!\n"; return(0); @@ -1929,7 +2036,7 @@ sub check_drbd_peer_access my $all_online = 1; foreach my $this_host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{drbd_node}}) { - my $host_uuid = $anvil->Get->host_uuid_from_name({debug => 2, host_name => $this_host}); + my $host_uuid = $anvil->Get->host_uuid_from_name({host_name => $this_host}); my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:this_host' => $this_host, @@ -2284,8 +2391,11 @@ sub validate_server # Update the definition on disk and in the DB, and define or undefine if requested. sub update_definition { - my ($anvil, $task) = @_; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { task => $task }}); + my ($anvil, $task, $definition) = @_; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + task => $task, + definition => $definition, + }}); my $short_host_name = $anvil->Get->short_host_name; my $server_name = $anvil->data->{switches}{server_name}; @@ -2330,39 +2440,55 @@ sub update_definition my $disk_definition = $anvil->data->{server}{$short_host_name}{$server_name}{from_disk}{xml} ? $anvil->data->{server}{$short_host_name}{$server_name}{from_disk}{xml} : ""; my $virsh_definition = $anvil->data->{server}{$short_host_name}{$server_name}{from_virsh}{xml} ? 
$anvil->data->{server}{$short_host_name}{$server_name}{from_virsh}{xml} : ""; - my $use_definition = ""; - if (($server_running_here) or (($server_state eq "running") && ($virsh_definition))) + my $use_definition = $virsh_definition; + if (not $use_definition) { - # Get the live definition - if ($server_running_here) + if (($server_running_here) or (($server_state eq "running") && ($virsh_definition))) { - my $shell_call = $anvil->data->{path}{exe}{setsid}." --wait ".$anvil->data->{path}{exe}{virsh}." dumpxml --inactive ".$server_name; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - - my ($live_virsh_definition, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - live_virsh_definition => $live_virsh_definition, - return_code => $return_code, - }}); - - my ($problem) = $anvil->Server->parse_definition({ - server => $server_name, - source => "from_virsh", - definition => $live_virsh_definition, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); - if (not $problem) + # Get the live definition + if ($server_running_here) { - $use_definition = $live_virsh_definition; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_definition => $use_definition }}); + my $shell_call = $anvil->data->{path}{exe}{setsid}." --wait ".$anvil->data->{path}{exe}{virsh}." 
dumpxml --inactive ".$server_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - $anvil->Server->parse_definition({ - debug => 3, - host => $short_host_name, - server => $server_name, - source => "from_virsh", + my ($live_virsh_definition, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + live_virsh_definition => $live_virsh_definition, + return_code => $return_code, + }}); + + my ($problem) = $anvil->Server->parse_definition({ + server => $server_name, + source => "from_virsh", definition => $live_virsh_definition, }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + if (not $problem) + { + $use_definition = $live_virsh_definition; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_definition => $use_definition }}); + + $anvil->Server->parse_definition({ + debug => 3, + host => $short_host_name, + server => $server_name, + source => "from_virsh", + definition => $live_virsh_definition, + }); + } + else + { + $use_definition = $virsh_definition; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_definition => $use_definition }}); + + $anvil->Server->parse_definition({ + debug => 3, + host => $short_host_name, + server => $server_name, + source => "from_virsh", + definition => $virsh_definition, + }); + } } else { @@ -2380,31 +2506,18 @@ sub update_definition } else { - $use_definition = $virsh_definition; + $use_definition = $disk_definition; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_definition => $use_definition }}); $anvil->Server->parse_definition({ debug => 3, host => $short_host_name, server => $server_name, - source => "from_virsh", + source => 
"from_disk", definition => $virsh_definition, }); } } - else - { - $use_definition = $disk_definition; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { use_definition => $use_definition }}); - - $anvil->Server->parse_definition({ - debug => 3, - host => $short_host_name, - server => $server_name, - source => "from_disk", - definition => $virsh_definition, - }); - } if (not $use_definition) { diff --git a/tools/striker-collect-debug b/tools/striker-collect-debug new file mode 100755 index 00000000..f440f1cd --- /dev/null +++ b/tools/striker-collect-debug @@ -0,0 +1,737 @@ +#!/usr/bin/perl +# +# This program will collect data from all accessible machines and compile it into a common tarball. This is +# designed to make it easier to diagnose faults. +# +# Exit codes; +# 0 = Normal exit. +# 1 = No database connection. +# +# TODO: +# +# USAGE: +# + +use strict; +use warnings; +use Anvil::Tools; +require POSIX; +use Term::Cap; +use Text::Diff; +use Data::Dumper; + +my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0]; +my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0]; +if (($running_directory =~ /^\./) && ($ENV{PWD})) +{ + $running_directory =~ s/^\./$ENV{PWD}/; +} + +# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete. +$| = 1; + +my $anvil = Anvil::Tools->new(); + +### TODO: Remove this before final release +$anvil->Log->level({set => 2}); +$anvil->Log->secure({set => 1}); +########################################## + +# Read switches (target ([user@]host[:port]) and the file with the target's password. +$anvil->Get->switches({list => ["anvil", "hosts"], man => $THIS_FILE}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); + +# Connect to the database(s). 
If we have no connections, we'll proceed anyway as one of the 'run_once' tasks +# is to setup the database server. +$anvil->Database->connect(); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"}); +if (not $anvil->data->{sys}{database}{connections}) +{ + # No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try + # again after we exit. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0305"}); + sleep 10; + $anvil->nice_exit({exit_code => 1}); +} + +# Make sure we're running as 'root' +# $< == real UID, $> == effective UID +if (($< != 0) && ($> != 0)) +{ + # Not root + print $anvil->Words->string({key => "error_0005"})."\n"; + $anvil->nice_exit({exit_code => 1}); +} + +# Make sure we're a striker. +if ($anvil->Get->host_type ne "striker") +{ + print "This has to be run on a Striker dashboard.\n"; + $anvil->nice_exit({exit_code => 1}); +} + +print "Data collection has begun.\n"; +print "Depending on how many systems we're collecting from, this could take a while.\n"; + +process_switches($anvil); + +collect_data($anvil); + +# Create the tarball now. +print "Data collection complete, creating the tarball now... "; +my $tarball = "/root/anvil-debug_".$anvil->data->{sys}{date_and_time}.".tar.bz2"; +my $shell_call = $anvil->data->{path}{exe}{tar}." -cvjf ".$tarball." 
".$anvil->data->{sys}{compile_directory}; +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:tarball' => $tarball, + 's2:shell_call' => $shell_call, +}}); + +my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, +}}); +print "Done!\n"; + +print "\n[ Complete ] - The debug data is here: [".$tarball."]\n"; +print "[ Warning ] - The collected logs likely include sensitive information! Share is carefully!\n"; + + + +$anvil->nice_exit({exit_code => 0}); + + +############################################################################################################# +# Functions # +############################################################################################################# + +sub collect_data +{ + my ($anvil) = @_; + + # Make sure the collection directory exists. + $anvil->data->{sys}{date_and_time} = $anvil->Get->date_and_time({file_name => 1}); + $anvil->data->{sys}{compile_directory} = "/tmp/anvil-debug_".$anvil->data->{sys}{date_and_time}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "sys::date_and_time" => $anvil->data->{sys}{date_and_time}, + "sys::compile_directory" => $anvil->data->{sys}{compile_directory}, + }}); + + my $failed = $anvil->Storage->make_directory({directory => $anvil->data->{sys}{compile_directory}}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }}); + if ($failed) + { + print "Failed to create the directory: [".$anvil->data->{sys}{compile_directory}."]. 
The error should be logged.\n"; + $anvil->nice_exit({exit_code => 1}); + } + + my $hosts = @{$anvil->data->{collect_from}}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { hosts => $hosts }}); + foreach my $host_type ("striker", "node", "dr") + { + foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) + { + my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_name' => $host_name, + 's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + 's4:this_host_type' => $this_host_type, + }}); + next if $host_type ne $this_host_type; + + # Are we collecting from a subset only? + if ($hosts) + { + # Yes, is this host one of them? + my $found = 0; + foreach my $this_host_uuid (@{$anvil->data->{collect_from}}) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + host_uuid => $host_uuid, + this_host_uuid => $this_host_uuid, + }}); + if ($this_host_uuid eq $host_uuid) + { + $found = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { found => $found }}); + last; + } + } + next if not $found; + } + + # Make sure there's a directory for this host. 
+ my $target_directory = $anvil->data->{sys}{compile_directory}."/".$short_host_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { target_directory => $target_directory }}); + if (not -d $target_directory) + { + my $failed = $anvil->Storage->make_directory({ + directory => $target_directory, + mode => "777", + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }}); + if ($failed) + { + print "Failed to create the directory: [".$target_directory."]. The error should be logged.\n"; + $anvil->nice_exit({exit_code => 1}); + } + } + + # Is this the local host or a remote one? + if ($host_uuid eq $anvil->Get->host_uuid) + { + ### Collecting local data. + collect_local_data($anvil, $target_directory); + } + else + { + # Collecting data from a remote machine + my $problem = collect_remote_data($anvil, $host_uuid, $target_directory); + if ($problem) + { + # Create a file saying we couldn't access this machine. + my $body = "No access to: [".$host_name."] found.\n"; + my $file = $target_directory."/no_access.txt"; + $anvil->Storage->write_file({ + file => $file, + body => $body, + overwrite => 1, + backup => 0, + }); + } + } + } + } + + return(0); +} + +sub collect_remote_data +{ + my ($anvil, $host_uuid, $target_directory) = @_; + + my $host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_name}; + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + my $failed_body = "File not copied from: [".$host_name."].\n"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_name' => $host_name, + 's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + 's4:this_host_type' => $this_host_type, + 's5:target_directory' => $target_directory, + }}); + + # Dump the previous boot logs to a file. 
+ print "\nGrabbing logs and data from the remote system: [".$short_host_name."].\n"; + print "- Testing access...\n"; + my $matches = $anvil->Network->find_access({ + debug => 2, + target => $host_name, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }}); + $anvil->data->{peer}{$short_host_name}{access}{ip} = ""; + $anvil->data->{peer}{$short_host_name}{access}{network} = ""; + foreach my $preferred_network ("bcn", "mn", "ifn", "sn") + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }}); + foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}}) + { + next if $network_name !~ /^$preferred_network/; + my $target_ip = $anvil->data->{network_access}{$network_name}{target_ip_address}; + my $test_access = $anvil->Remote->test_access({target => $target_ip}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:network_name' => $network_name, + 's2:target_ip' => $target_ip, + 's3:test_access' => $test_access, + }}); + + if ($test_access) + { + # We're good. + print "- Found access over the network: [".$network_name."] using the target IP: [".$target_ip."]\n"; + $anvil->data->{peer}{$short_host_name}{access}{ip} = $target_ip; + $anvil->data->{peer}{$short_host_name}{access}{network} = $network_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "s1:peer::${short_host_name}::access::ip" => $anvil->data->{peer}{$short_host_name}{access}{ip}, + "s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network}, + }}); + } + } + } + + if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + print "No access!!\n"; + print "- Not able to collect data from this host, skipping.\n"; + return(1); + } + + print "- Writing out system logs from the previous boot... 
"; + my $shell_call = $anvil->data->{path}{exe}{journalctl}." -b -1 > /tmp/journalctl-previous-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + + # Copying the file + print "Done! Copying to here... "; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/tmp/journalctl-previous-boot.log", + destination => $target_directory."/", + }); + my $test_file = $target_directory."/tmp/journalctl-previous-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + if (not -e $test_file) + { + print "Done.\n"; + } + else + { + print "Failed!\n"; + print "- For some reason, this file was not collected.\n"; + $anvil->Storage->write_file({ + file => $test_file, + body => $failed_body, + overwrite => 1, + backup => 0, + }); + } + + # Dump the current boot logs + print "- Grabbing system logs from this boot... "; + $shell_call = $anvil->data->{path}{exe}{journalctl}." -b 0 > /tmp/journalctl-current-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + # Copying the file + print "Done! Copying to here... 
"; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/tmp/journalctl-current-boot.log", + destination => $target_directory."/", + }); + $test_file = $target_directory."/journalctl-current-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + if (not -e $test_file) + { + print "Done.\n"; + } + else + { + print "Failed!\n"; + print "- For some reason, this file was not collected.\n"; + $anvil->Storage->write_file({ + file => $test_file, + body => $failed_body, + overwrite => 1, + backup => 0, + }); + } + + # If we're a striker, dump the database also. + if ($this_host_type eq "striker") + { + # What's the password and address? + if (not exists $anvil->data->{database}{$host_uuid}) + { + # The remote striker isn't known + print "- The host is a Striker, but we don't have database access info, skipping DB dump.\n"; + } + else + { + print "- Dumping and compressing remote database data, PLEASE BE PATIENT!... "; + my $pg_file = "/root/.pgpass"; + my $pg_body = "*:*:*:admin:".$anvil->data->{database}{$host_uuid}{password}; + $anvil->Storage->write_file({ + file => $pg_file, + body => $pg_body, + mode => "600", + overwrite => 0, + backup => 0, + }); + my $shell_call = $anvil->data->{path}{exe}{pg_dump}." -h ".$anvil->data->{peer}{$short_host_name}{access}{ip}." -U admin anvil 2>/dev/null | ".$anvil->data->{path}{exe}{bzip2}." --stdout > ".$target_directory."/anvil.out.bz2"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + # Failed + print "Failed!\n"; + print "Expected the return code '0', but got: [".$return_code."]. 
The error, if any, was:\n"; + print "========\n"; + print $output."\n"; + print "========\n"; + $anvil->nice_exit({exit_code => 1}); + } + unlink $pg_file; + print "Done!\n"; + } + } + + print "- Grabbing hosts file... "; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/etc/hosts", + destination => $target_directory."/", + }); + $test_file = $target_directory."/hosts"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + if (not -e $test_file) + { + print "Done.\n"; + } + else + { + print "Failed!\n"; + print "- For some reason, this file was not collected.\n"; + $anvil->Storage->write_file({ + file => $test_file, + body => $failed_body, + overwrite => 1, + backup => 0, + }); + } + + print "- Grabbing Anvil! log... "; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/var/log/anvil.log", + destination => $target_directory."/", + }); + $test_file = $target_directory."/anvil.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + if (not -e $test_file) + { + print "Done.\n"; + } + else + { + print "Failed!\n"; + print "- For some reason, this file was not collected.\n"; + $anvil->Storage->write_file({ + file => $test_file, + body => $failed_body, + overwrite => 1, + backup => 0, + }); + } + + # If this is a node, grab the shared files. + if ($this_host_type eq "node") + { + print "- Collecting the cluster information base (CIB)... "; + $shell_call = $anvil->data->{path}{exe}{pcs}." 
cluster cib > /tmp/cib.xml"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + + # Copying the file + print "Done! Copying to here... "; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/tmp/cib.xml", + destination => $target_directory."/", + }); + my $test_file = $target_directory."/cib.xml"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_file => $test_file }}); + if (not -e $test_file) + { + print "Done.\n"; + } + else + { + print "Failed!\n"; + print "- For some reason, this file was not collected.\n"; + $anvil->Storage->write_file({ + file => $test_file, + body => $failed_body, + overwrite => 1, + backup => 0, + }); + } + } + + # If this is not a striker, collect definition files. + if ($this_host_type ne "striker") + { + print "- Collecting server definitions... "; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/mnt/shared/definitions", + destination => $target_directory."/", + }); + print "Done!\n"; + + print "- Collecting replicated storage config... 
"; + $anvil->Storage->rsync({ + source => "root\@".$anvil->data->{peer}{$short_host_name}{access}{ip}.":/etc/drbd.d", + destination => $target_directory."/", + }); + print "Done!\n"; + } + + return(0); +} + +sub collect_local_data +{ + my ($anvil, $target_directory) = @_; + + my $host_uuid = $anvil->Get->host_uuid(); + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:target_directory' => $target_directory, + 's2:host_uuid' => $host_uuid, + 's3:this_host_type' => $this_host_type, + }}); + + # Dump the previous boot logs to a file. + print "\nGrabbing logs and data from the local system.\n"; + print "- Grabbing system logs from the previous boot... "; + my $shell_call = $anvil->data->{path}{exe}{journalctl}." -b -1 > ".$target_directory."/journalctl-previous-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + + # Dump the current boot logs + print "- Grabbing system logs from this boot... "; + $shell_call = $anvil->data->{path}{exe}{journalctl}." -b 0 > ".$target_directory."/journalctl-current-boot.log"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + + # If we're a striker, dump the database also. + if ($this_host_type eq "striker") + { + print "- Dumping and compressing database data, PLEASE BE PATIENT!... 
"; + my $shell_call = $anvil->data->{path}{exe}{su}." postgres -c \"".$anvil->data->{path}{exe}{pg_dump}." anvil\" 2>/dev/null | ".$anvil->data->{path}{exe}{bzip2}." --stdout > ".$target_directory."/anvil.out.bz2"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + # Failed + print "Failed!\n"; + print "Expected the return code '0', but got: [".$return_code."]. The error, if any, was:\n"; + print "========\n"; + print $output."\n"; + print "========\n"; + $anvil->nice_exit({exit_code => 1}); + } + print "Done!\n"; + } + + print "- Grabbing hosts file... "; + $shell_call = $anvil->data->{path}{exe}{cp}." /etc/hosts ".$target_directory."/"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + + print "- Grabbing Anvil! log... "; + $shell_call = $anvil->data->{path}{exe}{cp}." /var/log/anvil.log ".$target_directory."/"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + + # If this is a node, grab the shared files. + if ($this_host_type eq "node") + { + print "- Collecting the cluster information base (CIB)... 
"; + $shell_call = $anvil->data->{path}{exe}{pcs}." cluster cib > ".$target_directory."/cib.xml"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + } + + # If this is not a striker, collect definition files. + if ($this_host_type ne "striker") + { + print "- Collecting server definitions... "; + $shell_call = $anvil->data->{path}{exe}{rsync}." -av /mnt/shared/definitions ".$target_directory."/"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + print "Done!\n"; + } + + return(0); +} + +sub process_switches +{ + my ($anvil) = @_; + + $anvil->data->{collect_from} = []; + $anvil->Database->get_hosts(); + + if ($anvil->data->{switches}{anvil}) + { + if ($anvil->data->{switches}{anvil} eq "#!SET!#") + { + # Show a list of Anvil! systems. + print "Available Anvil! systems. Use '--anvil ' to collect data from a specific Anvil! node.\n"; + foreach my $anvil_name (sort {$a cmp $b} keys %{$anvil->data->{anvils}{anvil_name}}) + { + print "- Name: [".$anvil_name."], UUID: [".$anvil->data->{anvils}{anvil_name}{$anvil_name}{anvil_uuid}."]\n"; + } + $anvil->nice_exit({exit_code => 0}); + } + + # Make sure the anvil is valid. 
+ my ($anvil_name, $anvil_uuid) = $anvil->Get->anvil_from_switch({ + debug => 2, + anvil => $anvil->data->{switches}{anvil}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:anvil_name' => $anvil_name, + 's2:anvil_uuid' => $anvil_uuid, + }}); + + if (not $anvil_name) + { + # Bad name. + print "[ Error ] - Unable to get the Anvil! name and UUID from the string: [".$anvil->data->{switches}{anvil}."]\n"; + $anvil->nice_exit({exit_code => 1}); + } + + # Add the host_uuids to the collect_from array. + push @{$anvil->data->{collect_from}}, $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; + push @{$anvil->data->{collect_from}}, $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid}; + } + + if ($anvil->data->{switches}{hosts}) + { + if ($anvil->data->{switches}{hosts} eq "#!SET!#") + { + # Show a list of all machines. + print "Available Anvil! cluster systems. Use '--host ' to collect data from specific hosts.\n"; + foreach my $host_type ("striker", "node", "dr") + { + print "- Striker Dashboards:\n" if $host_type eq "striker"; + print "\n- Anvil! sub-nodes:\n" if $host_type eq "node"; + print "\n- Disaster recovery hosts:\n" if $host_type eq "dr"; + foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) + { + my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_name' => $host_name, + 's2:host_uuid' => $host_uuid, + 's3:this_host_type' => $this_host_type, + }}); + next if $host_type ne $this_host_type; + + print " - Host: [".$host_name."], UUID: [".$host_uuid."]\n"; + } + } + + $anvil->nice_exit({exit_code => 0}); + } + + foreach my $host (split/,/, $anvil->data->{switches}{hosts}) + { + # Make sure this host is valid. 
+ my ($host_uuid) = $anvil->Database->get_host_uuid_from_string({string => $host}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host' => $host, + 's2:host_uuid' => $host_uuid, + }}); + if (not $host_uuid) + { + print "[ Error ] - Unable to get the host UUID from the host string: [".$host."]\n"; + $anvil->nice_exit({exit_code => 1}); + } + push @{$anvil->data->{collect_from}}, $host_uuid; + } + } + + # If we were restricted to an anvil or host, make sure we've added the Strikers. + if (($anvil->data->{switches}{anvil}) or ($anvil->data->{switches}{hosts})) + { + foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) + { + my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_name' => $host_name, + 's2:host_uuid' => $host_uuid, + 's3:this_host_type' => $this_host_type, + }}); + next if $this_host_type ne "striker"; + + my $seen = 0; + foreach my $this_host_uuid (@{$anvil->data->{collect_from}}) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:this_host_uuid' => $this_host_uuid, + 's2:host_uuid' => $host_uuid, + }}); + if ($this_host_uuid eq $host_uuid) + { + $seen = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { seen => $seen }}); + last; + } + } + + if (not $seen) + { + push @{$anvil->data->{collect_from}}, $host_uuid; + } + } + } + + return(0); +} From 3215e178efcfc4e835ddd22bbce3ad04151501c9 Mon Sep 17 00:00:00 2001 From: digimer Date: Thu, 6 Jul 2023 13:02:59 -0400 Subject: [PATCH 05/14] * Updated striker-collect-debug to support '--output-file /path/to/file.tar.bz2'. 
Signed-off-by: digimer --- man/striker-collect-debug.8 | 4 ++ tools/striker-collect-debug | 81 +++++++++++++++++++++++++++++++++---- 2 files changed, 77 insertions(+), 8 deletions(-) diff --git a/man/striker-collect-debug.8 b/man/striker-collect-debug.8 index e3c01804..1853dc98 100644 --- a/man/striker-collect-debug.8 +++ b/man/striker-collect-debug.8 @@ -34,6 +34,10 @@ This restricts the data to be collected to the Striker dashboards and the specif \fB\-\-hosts\fR .TP This can be used to specify which specific hosts data is collected from. Note that this can be used in conjuction with \fB\-\-anvil\fR to add additional hosts to collect data from, like DR hosts. +.TP +\fB\-\-output\-file\fR +.TP +This allows you to specify the output compressed tarball that the files will be saved in. By default, the output file is \fB/root/anvil-debug_\fIdate_and_time\fB.tar.bz2\fR. If this is a directory (ending in \fB/\fR), the normal file name is created, just in a different directory. If the path ends in a file that doesn't have the \fB.tar.bz2\fR suffix, that suffix will be added automatically. The output file will always be a bzip2-compressed tarball. .IP .SH AUTHOR Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. diff --git a/tools/striker-collect-debug b/tools/striker-collect-debug index f440f1cd..fa85e856 100755 --- a/tools/striker-collect-debug +++ b/tools/striker-collect-debug @@ -38,7 +38,10 @@ $anvil->Log->secure({set => 1}); ########################################## # Read switches (target ([user@]host[:port]) and the file with the target's password.
-$anvil->Get->switches({list => ["anvil", "hosts"], man => $THIS_FILE}); +$anvil->Get->switches({list => [ + "anvil", + "hosts", + "output-file"], man => $THIS_FILE}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); @@ -71,16 +74,27 @@ if ($anvil->Get->host_type ne "striker") $anvil->nice_exit({exit_code => 1}); } +# Make sure the collection directory exists. +$anvil->data->{sys}{date_and_time} = $anvil->Get->date_and_time({file_name => 1}); +$anvil->data->{sys}{compile_directory} = "/tmp/anvil-debug_".$anvil->data->{sys}{date_and_time}; +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "sys::date_and_time" => $anvil->data->{sys}{date_and_time}, + "sys::compile_directory" => $anvil->data->{sys}{compile_directory}, +}}); + print "Data collection has begun.\n"; print "Depending on how many systems we're collecting from, this could take a while.\n"; +# Get the directory portion of the output path and make sure it exists. +my $tarball = process_output($anvil); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { tarball => $tarball }}); + process_switches($anvil); collect_data($anvil); # Create the tarball now. print "Data collection complete, creating the tarball now... "; -my $tarball = "/root/anvil-debug_".$anvil->data->{sys}{date_and_time}.".tar.bz2"; my $shell_call = $anvil->data->{path}{exe}{tar}." -cvjf ".$tarball." 
".$anvil->data->{sys}{compile_directory}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:tarball' => $tarball, @@ -106,18 +120,69 @@ $anvil->nice_exit({exit_code => 0}); # Functions # ############################################################################################################# -sub collect_data +sub process_output { my ($anvil) = @_; - # Make sure the collection directory exists. - $anvil->data->{sys}{date_and_time} = $anvil->Get->date_and_time({file_name => 1}); - $anvil->data->{sys}{compile_directory} = "/tmp/anvil-debug_".$anvil->data->{sys}{date_and_time}; + my $tarball = "/root/anvil-debug_".$anvil->data->{sys}{date_and_time}.".tar.bz2"; + if ($anvil->data->{switches}{'output-file'}) + { + my $new_directory = $anvil->data->{switches}{'output-file'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { new_directory => $new_directory }}); + if ($new_directory !~ /^\//) + { + print "[ Error ] - The output path needs to be a path.\n"; + $anvil->nice_exit({exit_code => 1}); + } + else + { + # Append .tar.bz2. + $tarball = $new_directory; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { tarball => $tarball }}); + } + } + + # Break the directory off and make sure the output directory exists. 
+ my $output_file = ($tarball =~ /^.*\/(.*)$/)[0]; + my $output_directory = ($tarball =~ /^(.*?)\/$output_file$/)[0]; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - "sys::date_and_time" => $anvil->data->{sys}{date_and_time}, - "sys::compile_directory" => $anvil->data->{sys}{compile_directory}, + output_file => $output_file, + output_directory => $output_directory, }}); + if (not $output_file) + { + $output_file = "anvil-debug_".$anvil->data->{sys}{date_and_time}.".tar.bz2"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output_file => $output_file }}); + } + elsif ($output_file !~ /\.tar\.bz2/) + { + $output_file .= ".tar.bz2"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output_file => $output_file }}); + } + + if ($output_directory ne "/") + { + print "- Creating the output directory: [".$output_directory."]... "; + my $failed = $anvil->Storage->make_directory({directory => $output_directory}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }}); + if ($failed) + { + print "Failed!\nUnable to create the directory: [".$anvil->data->{sys}{compile_directory}."]. The error should be logged.\n"; + $anvil->nice_exit({exit_code => 1}); + } + } + + $tarball = $output_directory."/".$output_file; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { tarball => $tarball }}); + + return($tarball); +} + +sub collect_data +{ + my ($anvil) = @_; + my $failed = $anvil->Storage->make_directory({directory => $anvil->data->{sys}{compile_directory}}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { failed => $failed }}); if ($failed) From d56b7f9a842b3a0972d7df800a94863e6e69b74d Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 7 Jul 2023 17:54:57 -0400 Subject: [PATCH 06/14] * Created (but not finished!) 
the new striker-update-cluster tool. * Updated Cluster->get_primary_host_uuid() to only load anvils if not already loaded. Signed-off-by: digimer --- Anvil/Tools.pm | 2 + Anvil/Tools/Cluster.pm | 9 +- man/Makefile.am | 3 +- man/striker-update-cluster.8 | 0 tools/Makefile.am | 3 +- tools/striker-collect-debug | 5 - tools/striker-update-cluster | 578 +++++++++++++++++++++++++++++++++++ 7 files changed, 590 insertions(+), 10 deletions(-) create mode 100644 man/striker-update-cluster.8 create mode 100755 tools/striker-update-cluster diff --git a/Anvil/Tools.pm b/Anvil/Tools.pm index a7efaf0c..ea9900c7 100644 --- a/Anvil/Tools.pm +++ b/Anvil/Tools.pm @@ -1261,6 +1261,7 @@ sub _set_paths 'shutdown' => "/usr/sbin/shutdown", snmpget => "/usr/bin/snmpget", snmpset => "/usr/bin/snmpset", + 'sort' => "/usr/bin/sort", 'ssh-keygen' => "/usr/bin/ssh-keygen", 'ssh-keyscan' => "/usr/bin/ssh-keyscan", 'stat' => "/usr/bin/stat", @@ -1282,6 +1283,7 @@ sub _set_paths swapon => "/usr/sbin/swapon", sysctl => "/usr/sbin/sysctl", systemctl => "/usr/bin/systemctl", + tail => "/usr/bin/tail", tar => "/usr/bin/tar", timeout => "/usr/bin/timeout", touch => "/usr/bin/touch", diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index 831d7c45..660e6ae9 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -2441,7 +2441,7 @@ sub get_peers =head2 get_primary_host_uuid -This takes an Anvil! UUID and returns with node is currently the "primary" node. That is to say, which node has the most servers running on it, by allocated RAM. For example, if node 1 has two servers, each with 8 GiB of RAN and node 2 has one VM with 32 GiB of RAM, node 2 will be considered primary as it would take longest to migrate servers off. +This takes an Anvil! UUID and returns the host UUID of the node that is currently the "primary" node. That is to say, the node that has the most servers running on it, by allocated RAM.
For example, if node 1 has two servers, each with 8 GiB of RAM and node 2 has one VM with 32 GiB of RAM, node 2 will be considered primary as it would take longest to migrate servers off. If all is equal, node 1 is considered primary. If only one node is a cluster member, it is considered primary. If neither node is up, an empty string is returned. @@ -2478,8 +2478,11 @@ sub get_primary_host_uuid return(""); } - # Get the two node UUIDs. - $anvil->Database->get_anvils({debug => $debug}); + # Get the two node UUIDs, if not already loaded + if (not exists $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}) + { + $anvil->Database->get_anvils({debug => $debug}); + } if (not exists $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}) { diff --git a/man/Makefile.am b/man/Makefile.am index f5125e8a..78446e8e 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -26,4 +26,5 @@ dist_man8_MANS = \ scancore.8 \ striker-check-machines.8 \ striker-collect-debug.8 \ - striker-initialize-host.8 + striker-initialize-host.8 \ + striker-update-cluster diff --git a/man/striker-update-cluster.8 b/man/striker-update-cluster.8 new file mode 100644 index 00000000..e69de29b diff --git a/tools/Makefile.am b/tools/Makefile.am index 615c2026..153f39e6 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -67,7 +67,8 @@ dist_sbin_SCRIPTS = \ striker-prep-database \ striker-purge-target \ striker-scan-network \ - striker-show-db-counts + striker-show-db-counts \ + striker-update-cluster fencedir = ${FASEXECPREFIX}/sbin diff --git a/tools/striker-collect-debug b/tools/striker-collect-debug index fa85e856..8710eb10 100755 --- a/tools/striker-collect-debug +++ b/tools/striker-collect-debug @@ -32,11 +32,6 @@ $| = 1; my $anvil = Anvil::Tools->new(); -### TODO: Remove this before final release -$anvil->Log->level({set => 2}); -$anvil->Log->secure({set => 1}); -########################################## - # Read switches (target ([user@]host[:port]) and the file with the target's password.
$anvil->Get->switches({list => [ "anvil", diff --git a/tools/striker-update-cluster b/tools/striker-update-cluster new file mode 100755 index 00000000..118fa6ed --- /dev/null +++ b/tools/striker-update-cluster @@ -0,0 +1,578 @@ +#!/usr/bin/perl +# +# This program will disable our daemons on all machines, then update each striker. It then walks through all +# DR hosts and Anvil! nodes. With nodes, it migrates servers to the peer, takes the node out of the cluster, +# updates it, reboots if the kernel was updated, and then rejoins the cluster, migrates the VMs and the does +# the same process on the peer sub-node. +# +# Exit codes; +# 0 = Normal exit. +# 1 = No database connection. +# +# TODO: +# +# USAGE: +# + +use strict; +use warnings; +use Anvil::Tools; +require POSIX; +use Term::Cap; +use Text::Diff; +use Data::Dumper; + +my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0]; +my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0]; +if (($running_directory =~ /^\./) && ($ENV{PWD})) +{ + $running_directory =~ s/^\./$ENV{PWD}/; +} + +# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete. +$| = 1; + +my $anvil = Anvil::Tools->new(); + +### TODO: Remove this before final release +$anvil->Log->level({set => 2}); +$anvil->Log->secure({set => 1}); +########################################## + +# Read switches (target ([user@]host[:port]) and the file with the target's password. +$anvil->Get->switches({list => ["force"], man => $THIS_FILE}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); + +# Connect to the database(s). If we have no connections, we'll proceed anyway as one of the 'run_once' tasks +# is to setup the database server. 
+$anvil->Database->connect(); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 3, secure => 0, key => "log_0132"}); +if (not $anvil->data->{sys}{database}{connections}) +{ + # No databases, update the job, sleep for a bit and then exit. The daemon will pick it up and try + # again after we exit. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0305"}); + sleep 10; + $anvil->nice_exit({exit_code => 1}); +} + +# Make sure we're running as 'root' +# $< == real UID, $> == effective UID +if (($< != 0) && ($> != 0)) +{ + # Not root + print $anvil->Words->string({key => "error_0005"})."\n"; + $anvil->nice_exit({exit_code => 1}); +} + +# Make sure we're a striker. +if ($anvil->Get->host_type ne "striker") +{ + print "This has to be run on a Striker dashboard.\n"; + $anvil->nice_exit({exit_code => 1}); +} + +print "Update beginning. Verifying all known machines are accessible...\n"; +my $all_access = verify_access($anvil); +# Abort when any machine is unreachable, unless the user explicitly overrides this with '--force'. +if ((not $all_access) && (not $anvil->data->{switches}{force})) +{ + print "[ Error ] - Not all systems are accessible. Update aborted!\n"; + $anvil->nice_exit({exit_code => 1}); +} +print "Success!\n"; + +print "[ Warning ] - All nodes need to be up and running for the update to run on nodes. +[ Warning ] - Servers will be migrated between subnodes, which can cause reduced performance during +[ Warning ] - these migrations.
If a sub-node is not active, it will be activated as part of the +[ Warning ] - upgrade process.\n"; +print "\n".$anvil->Words->string({key => "message_0021"})."\n"; +# Read the user's confirmation from the terminal. +my $answer = <STDIN>; +# The read returns undef on EOF (closed or non-interactive input); treat that as "no". +$answer = "" if not defined $answer; +chomp $answer; +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { answer => $answer }}); + +if ($answer =~ /^y/i) +{ + print $anvil->Words->string({key => "message_0175"})."\n"; + # Declared here so the script compiles under 'use strict'; it is only logged in this branch. + my $record_job = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { record_job => $record_job }}); +} +else +{ + print $anvil->Words->string({key => "message_0022"})."\n"; + $anvil->nice_exit({exit_code => 0}); +} + +disable_daemons($anvil); + +# Update systems +update_strikers_and_dr($anvil); + +# Update the Anvil! nodes +update_nodes($anvil); + + +$anvil->nice_exit({exit_code => 0}); + + +############################################################################################################# +# Functions # +############################################################################################################# + +sub update_nodes +{ + my ($anvil) = @_; + + # Here, we loop through anvil systems, and find which sub nodes will be updated first, and which will + # be updated second.
+ foreach my $anvil_name (sort {$a cmp $b} keys %{$anvil->data->{anvils}{anvil_name}}) + { + my $anvil_uuid = $anvil->data->{anvils}{anvil_name}{$anvil_name}{anvil_uuid}; + my $anvil_description = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_description}; + my $anvil_node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; + my $anvil_node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid}; + my $primary_host_uuid = $anvil->Cluster->get_primary_host_uuid({anvil_uuid => $anvil_uuid}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:anvil_name' => $anvil_name, + 's2:anvil_uuid' => $anvil_uuid, + 's3:anvil_description' => $anvil_description, + 's4:anvil_node1_host_uuid' => $anvil_node1_host_uuid, + 's5:anvil_node2_host_uuid' => $anvil_node2_host_uuid, + 's6:primary_host_uuid' => $primary_host_uuid, + }}); + + + } + + foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) + { + my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_name' => $host_name, + 's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + 's4:this_host_type' => $this_host_type, + }}); + # Only Anvil! nodes are handled here; the type of the host being walked is in 'this_host_type'. + next if $this_host_type ne "node"; + + $anvil->data->{sys}{host}{$short_host_name}{reboot_needed} = 0; + + if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + print "- No access to the node: [".$short_host_name."], skipping.\n"; + next; + } + + # These are always remote. + print "- Beginning OS update of: [".$short_host_name."]\n"; + my $shell_call = $anvil->data->{path}{exe}{dnf}."
clean all"; + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + + print "- Cache cleared, calling update now.\n"; + print "- NOTE: This can seem like it's hung! You can watch the progress using 'journalctl -f' on another terminal to\n"; + print "- watch the progress via the system logs.\n"; + $output = ""; + $error = ""; + $return_code = ""; + $shell_call = $anvil->data->{path}{exe}{dnf}." -y update"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + + # A non-zero return code from the update call is treated as a failure and the captured output is shown. + if ($return_code) + { + print "[ Error ] - There was a problem updating the system! Expected a return code of '0', but got: [".$return_code."]\n"; + print "[ Error ] - The output, if any, was\n"; + print "==] STDOUT [==\n"; + print $output."\n"; + print "==] STDERR [==\n"; + print $error."\n"; + print "==============\n"; + } + else + { + print "Success!
Checking if a reboot is needed.\n"; + check_if_reboot_needed($anvil, $host_uuid); + } + + } + + return(0); +} + +sub update_strikers_and_dr +{ + my ($anvil) = @_; + + foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) + { + my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_name' => $host_name, + 's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + 's4:this_host_type' => $this_host_type, + }}); + # NOTE(review): despite the function name, only 'striker' hosts pass this filter - confirm whether DR hosts belong here. + next if $this_host_type ne "striker"; + + $anvil->data->{sys}{host}{$short_host_name}{reboot_needed} = 0; + + if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + print "- No access to the Striker dashboard: [".$short_host_name."], skipping.\n"; + next; + } + + print "- Beginning OS update of: [".$short_host_name."]\n"; + my $output = ""; + my $error = ""; + my $return_code = ""; + my $shell_call = $anvil->data->{path}{exe}{dnf}."
clean all"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + else + { + ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } + print "- Cache cleared, calling update now.\n"; + print "- NOTE: This can seem like it's hung! You can watch the progress using 'journalctl -f' on another terminal to\n"; + print "- watch the progress via the system logs.\n"; + $output = ""; + $error = ""; + $return_code = ""; + $shell_call = $anvil->data->{path}{exe}{dnf}." -y update"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + else + { + ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } + + if ($return_code) + { + print "[ Error ] - There was a problem updating the system!
Expected a return code of '0', but got: [".$return_code."]\n"; + print "[ Error ] - The output, if any, was\n"; + if ($host_uuid eq $anvil->Get->host_uuid) + { + print "==] Output [==\n"; + print $output."\n"; + print "==============\n"; + } + else + { + print "==] STDOUT [==\n"; + print $output."\n"; + print "==] STDERR [==\n"; + print $error."\n"; + print "==============\n"; + } + } + else + { + print "Success! Checking if a reboot is needed.\n"; + check_if_reboot_needed($anvil, $host_uuid); + } + + # TODO: This is meant to run anvil-version-change, but for now it re-runs the same 'dnf -y update' call as above. + $output = ""; + $error = ""; + $return_code = ""; + $shell_call = $anvil->data->{path}{exe}{dnf}." -y update"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + else + { + ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } + } + + return(0); +} + +sub check_if_reboot_needed +{ + my ($anvil, $host_uuid) = @_; + + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + # Version sort ('-V') so that, for example, release '-162' is ordered after '-70'; a plain lexical sort gets this wrong. + my $shell_call = $anvil->data->{path}{exe}{rpm}." -q kernel | ".$anvil->data->{path}{exe}{'sort'}." -V | ".$anvil->data->{path}{exe}{tail}."
-n 1"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + short_host_name => $short_host_name, + shell_call => $shell_call, + }}); + + # Get the newest installed kernel + my $installed_kernel = ""; + my $active_kernel = ""; + my $error = ""; + my $return_code = 999; + if ($host_uuid eq $anvil->Get->host_uuid) + { + ($installed_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + installed_kernel => $installed_kernel, + return_code => $return_code, + }}); + } + else + { + ($installed_kernel, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + installed_kernel => $installed_kernel, + error => $error, + return_code => $return_code, + }}); + } + # Reduce the package name to 'X.Y.Z-R'; every dot is escaped so it only matches a literal '.'. + $installed_kernel =~ s/^kernel-(\d+\.\d+\.\d+-\d+)\..*$/$1/; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { installed_kernel => $installed_kernel }}); + + # Get the running kernel + $error = ""; + $return_code = 999; + $shell_call = $anvil->data->{path}{exe}{uname}."
-r"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + ($active_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + active_kernel => $active_kernel, + return_code => $return_code, + }}); + } + else + { + ($active_kernel, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + active_kernel => $active_kernel, + error => $error, + return_code => $return_code, + }}); + } + # Reduce the running release to 'X.Y.Z-R'; every dot is escaped so it only matches a literal '.'. + $active_kernel =~ s/(\d+\.\d+\.\d+-\d+)\..*$/$1/; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active_kernel => $active_kernel }}); + + if ($installed_kernel eq $active_kernel) + { + print "The kernel has not been updated.\n"; + } + else + { + print "Reboot needed!\n"; + $anvil->data->{sys}{host}{$short_host_name}{reboot_needed} = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "sys::host::${short_host_name}::reboot_needed" => $anvil->data->{sys}{host}{$short_host_name}{reboot_needed}, + }}); + } + + return(0); +} + +sub disable_daemons +{ + my ($anvil) = @_; + + # Array reference of the daemons to stop on every host; it is dereferenced in the loop below. + my $daemons = ["anvil-daemon", "scancore"]; + foreach my $host_type ("dr", "node", "striker") + { + foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) + { + my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_name' => $host_name, +
's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + 's4:this_host_type' => $this_host_type, + }}); + next if $host_type ne $this_host_type; + + print "- Disabling daemons on: [".$short_host_name."]... "; + if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + print "Offline! Skipping.\n"; + next; + } + + # Stop each daemon, locally or over SSH depending on which host this is. + foreach my $daemon (@{$daemons}) + { + my $shell_call = $anvil->data->{path}{exe}{systemctl}." stop ".$daemon; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my $output = ""; + my $error = ""; + my $return_code = 999; + if ($host_uuid eq $anvil->Get->host_uuid) + { + # Local + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + else + { + # Remote + ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } + if (not $return_code) + { + print $daemon." stopped... "; + } + else + { + print $daemon." didn't stop!... "; + } + } + print "Done!\n"; + } + } + + return(0); +} + +sub verify_access +{ + my ($anvil) = @_; + + # Make sure all are available before we start.
+ my $all_access = 1; + foreach my $host_type ("dr", "node", "striker") + { + foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) + { + my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_name' => $host_name, + 's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + 's4:this_host_type' => $this_host_type, + }}); + next if $host_type ne $this_host_type; + + print "- Verifying access to: [".$short_host_name."]... "; + + $anvil->data->{peer}{$short_host_name}{access}{ip} = ""; + $anvil->data->{peer}{$short_host_name}{access}{network} = ""; + # Walk the networks in order of preference and keep the first one that answers. + # NOTE(review): nothing visible in this script populates 'network_access' for the host being tested - confirm it is loaded before this loop. + foreach my $preferred_network ("bcn", "mn", "ifn", "sn") + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }}); + foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}}) + { + next if $network_name !~ /^$preferred_network/; + my $target_ip = $anvil->data->{network_access}{$network_name}{target_ip_address}; + my $test_access = $anvil->Remote->test_access({target => $target_ip}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:network_name' => $network_name, + 's2:target_ip' => $target_ip, + 's3:test_access' => $test_access, + }}); + + if ($test_access) + { + # We're good.
+ $anvil->data->{peer}{$short_host_name}{access}{ip} = $target_ip; + $anvil->data->{peer}{$short_host_name}{access}{network} = $network_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "s1:peer::${short_host_name}::access::ip" => $anvil->data->{peer}{$short_host_name}{access}{ip}, + "s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network}, + }}); + # Stop here so a less preferred network cannot overwrite this one. + last; + } + } + # A working address was found on this preferred network, no need to test the remaining ones. + last if $anvil->data->{peer}{$short_host_name}{access}{ip}; + } + + if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + print "No access!!\n"; + print "- Not able to collect data from this host, skipping.\n"; + $all_access = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_access => $all_access }}); + } + else + { + # Terminate the "Verifying access to..." progress line on success as well. + print "Connected on: [".$anvil->data->{peer}{$short_host_name}{access}{ip}."] via: [".$anvil->data->{peer}{$short_host_name}{access}{network}."]\n"; + } + } + } + + + + return($all_access); +} \ No newline at end of file From 3016fb875b2b755d70dfc4607c0a12aeb0584ac8 Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 14 Jul 2023 22:29:07 -0400 Subject: [PATCH 07/14] * Reworded striker-update-cluster to use anvil-update-system for on-system OS updates. * Updated DRBD->get_status() to take the new 'host' parameter to allow the caller to define the hash key string used in the stored data. * Updated Get->anvil_version() (and a few other places) to use the new 'striker-ui-api' shell user, replacing the 'apache' user. * Updated Remote->test_access() to take the new 'close' parameter to close the SSH session used when testing access to the target. * Fixed a logging bug in anvil-manage-power. * Updated anvil-update-system to take the '--no-reboot' and 'clear-cache' command line switches.
Signed-off-by: digimer --- Anvil/Tools.pm | 3 - Anvil/Tools/DRBD.pm | 29 +- Anvil/Tools/Get.pm | 8 +- Anvil/Tools/Job.pm | 23 - Anvil/Tools/Network.pm | 2 +- Anvil/Tools/Remote.pm | 26 +- Anvil/Tools/Storage.pm | 6 +- man/Makefile.am | 3 +- man/anvil-boot-server.8 | 2 +- man/anvil-manage-power.8 | 45 ++ man/anvil-update-system.8 | 39 ++ man/striker-update-cluster.8 | 53 ++ share/words.xml | 11 +- tools/anvil-manage-power | 22 +- tools/anvil-update-system | 120 +++- tools/striker-update-cluster | 1171 ++++++++++++++++++++++++++-------- 16 files changed, 1217 insertions(+), 346 deletions(-) create mode 100644 man/anvil-manage-power.8 create mode 100644 man/anvil-update-system.8 diff --git a/Anvil/Tools.pm b/Anvil/Tools.pm index ea9900c7..6c3d5568 100644 --- a/Anvil/Tools.pm +++ b/Anvil/Tools.pm @@ -851,9 +851,6 @@ sub _set_defaults }, }; $anvil->data->{sys} = { - apache => { - user => "admin", - }, daemon => { dhcpd => "dhcpd.service", firewalld => "firewalld.service", diff --git a/Anvil/Tools/DRBD.pm b/Anvil/Tools/DRBD.pm index 990b7f68..fc4aa327 100644 --- a/Anvil/Tools/DRBD.pm +++ b/Anvil/Tools/DRBD.pm @@ -2146,6 +2146,10 @@ If any data for the host was stored in a previous call, it will be deleted befor Parameters; +=head3 host (optional) + +By default, the hash key C<< host_name >> listed above is either the local system's short host name, or the C<< target >>. If you'd like to use a specific host name in the hash key, you can use this parameter to set it. + =head3 password (optional) This is the password to use when connecting to a remote machine. If not set, but C<< target >> is, an attempt to connect without a password will be made. @@ -2172,22 +2176,42 @@ sub get_status my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "DRBD->get_status()" }}); + my $host = defined $parameter->{host} ? 
$parameter->{host} : ""; my $password = defined $parameter->{password} ? $parameter->{password} : ""; my $port = defined $parameter->{port} ? $parameter->{port} : ""; my $remote_user = defined $parameter->{remote_user} ? $parameter->{remote_user} : "root"; my $target = defined $parameter->{target} ? $parameter->{target} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + host => $host, password => $anvil->Log->is_secure($password), port => $port, remote_user => $remote_user, target => $target, }}); + # If we weren't passed a host, use this machine's short host name. + my $is_local = $anvil->Network->is_local({host => $target}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { is_local => $is_local }}); + if (not $host) + { + # Host not set, set one. + if ($is_local) + { + $host = $anvil->Get->short_host_name(); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host => $host }}); + } + else + { + # Remote, using the target as the host. + $host = $target; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { host => $host }}); + } + } + # Is this a local call or a remote call? my $shell_call = $anvil->data->{path}{exe}{drbdsetup}." status --json"; my $output = ""; - my $host = $anvil->Get->short_host_name(); - my $is_local = $anvil->Network->is_local({host => $target}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }}); if ($is_local) { # Local. @@ -2200,7 +2224,6 @@ sub get_status else { # Remote call. 
- $host = $target; ($output, my $error, $anvil->data->{drbd}{status}{$host}{return_code}) = $anvil->Remote->call({ debug => $debug, shell_call => $shell_call, diff --git a/Anvil/Tools/Get.pm b/Anvil/Tools/Get.pm index 71d15693..d9d590c0 100644 --- a/Anvil/Tools/Get.pm +++ b/Anvil/Tools/Get.pm @@ -326,7 +326,7 @@ sub anvil_version schema_cache_file => $schema_cache_file, user => $user, }}); - if ($user eq "apache") + if (($user eq "apache") or ($user eq "striker-ui-api")) { # Try to read the local cached version. if (-e $anvil_cache_file) @@ -1867,8 +1867,8 @@ sub host_uuid debug => $debug, file => $anvil->data->{path}{data}{host_uuid}, body => $uuid, - user => "apache", - group => "apache", + user => "striker-ui-api", + group => "striker-ui-api", mode => "0666", overwrite => 0, }); @@ -2529,7 +2529,7 @@ sub switches $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { found => $found }}); if (not $found) { - print "Switch '--".$set_switch." not recognized.\n"; + print "Switch '--".$set_switch."' is not recognized.\n"; $problem = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { problem => $problem }}); } diff --git a/Anvil/Tools/Job.pm b/Anvil/Tools/Job.pm index b31ec09d..460ca803 100644 --- a/Anvil/Tools/Job.pm +++ b/Anvil/Tools/Job.pm @@ -756,29 +756,6 @@ WHERE $job_status =~ s/message_0058,!!downloaded!.*?!!,!!installed!.*?!!,!!verified!.*?!!,!!lines!.*?!!/message_0058,!!downloaded!$downloaded!!,!!installed!$installed!!,!!verified!$verified!!,!!lines!$lines!!/sm; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "<< job_status" => $job_status }}); } - # This is used by 'anvil-download-file' - if ($job_status =~ /message_0142/gs) - { - ### NOTE: Is this needed anymore? -# my $downloaded = $anvil->data->{counts}{downloaded} ? 
$anvil->Convert->add_commas({number => $anvil->data->{counts}{downloaded}}) : 0; -# my $installed = $anvil->data->{counts}{installed} ? $anvil->Convert->add_commas({number => $anvil->data->{counts}{installed}}) : 0; -# my $verified = $anvil->data->{counts}{verified} ? $anvil->Convert->add_commas({number => $anvil->data->{counts}{verified}}) : 0; -# my $lines = $anvil->data->{counts}{lines} ? $anvil->Convert->add_commas({number => $anvil->data->{counts}{lines}}) : 0; -# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { -# "s1:counts::downloaded" => $anvil->data->{counts}{downloaded}, -# "s2:downloaded" => $downloaded, -# "s3:counts::installed" => $anvil->data->{counts}{installed}, -# "s4:installed" => $installed, -# "s5:counts::verified" => $anvil->data->{counts}{verified}, -# "s6:verified" => $verified, -# "s7:counts::lines" => $anvil->data->{counts}{lines}, -# "s8:lines" => $lines, -# }}); -# -# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { ">> job_status" => $job_status }}); -# $job_status =~ s/message_0142,!!downloaded!.*?!!,!!installed!.*?!!,!!verified!.*?!!,!!lines!.*?!!/message_0058,!!downloaded!$downloaded!!,!!installed!$installed!!,!!verified!$verified!!,!!lines!$lines!!/sm; -# $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "<< job_status" => $job_status }}); - } $job_uuid = $anvil->Database->insert_or_update_jobs({ file => $THIS_FILE, diff --git a/Anvil/Tools/Network.pm b/Anvil/Tools/Network.pm index 3df3192a..0ca8bd65 100644 --- a/Anvil/Tools/Network.pm +++ b/Anvil/Tools/Network.pm @@ -1212,7 +1212,7 @@ Paramters; =head3 target (required) -This is the host we're looking for connection options with. +This is the host (name or UUID) we're looking for connection options with. 
=cut sub find_access diff --git a/Anvil/Tools/Remote.pm b/Anvil/Tools/Remote.pm index 8bcea088..937db22c 100644 --- a/Anvil/Tools/Remote.pm +++ b/Anvil/Tools/Remote.pm @@ -311,6 +311,7 @@ sub call # NOTE: The shell call might contain sensitive data, so we show '--' if 'secure' is set and $anvil->Log->secure is not. $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { 'close' => $close, + no_cache => $no_cache, password => $anvil->Log->is_secure($password), secure => $secure, shell_call => (not $secure) ? $shell_call : $anvil->Log->is_secure($shell_call), @@ -634,6 +635,19 @@ sub call { $error = $anvil->Words->string({key => $message_key, variables => $variables}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => $message_key, variables => $variables}); + + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { + 'close' => $close, + password => $anvil->Log->is_secure($password), + secure => $secure, + shell_call => (not $secure) ? $shell_call : $anvil->Log->is_secure($shell_call), + ssh_fh => $ssh_fh, + start_time => $start_time, + timeout => $timeout, + port => $port, + target => $target, + ssh_fh_key => $ssh_fh_key, + }}); } } @@ -676,6 +690,10 @@ sub call error => $ssh_fh->error, }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, secure => $secure, list => { error => $error }}); + + # Close the connection. + $close = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, secure => $secure, list => { 'close' => $close }}); } # Take the last new line off. @@ -923,6 +941,10 @@ This attempts to log into the target to verify that the target is up and reachab Parameters; +=head3 close (optional, default '1') + +If set, the SSH connection used to test the access to the remote host wil be closed. 
This can be useful if there might be a delay between when the connection is tested and when it is used again. + =head3 password (optional) This is the password used to connect to the remote target as the given user. @@ -950,12 +972,14 @@ sub test_access my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Remote->test_access()" }}); + my $close = defined $parameter->{'close'} ? $parameter->{'close'} : 1; my $password = defined $parameter->{password} ? $parameter->{password} : ""; my $port = defined $parameter->{port} ? $parameter->{port} : 22; my $target = defined $parameter->{target} ? $parameter->{target} : ""; my $user = defined $parameter->{user} ? $parameter->{user} : getpwuid($<); my $access = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, secure => 0, list => { + 'close' => $close, password => $anvil->Log->is_secure($password), port => $port, target => $target, @@ -969,7 +993,7 @@ sub test_access shell_call => $anvil->data->{path}{exe}{echo}." 1", target => $target, remote_user => $user, - 'close' => 1, + 'close' => $close, no_cache => 1, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { diff --git a/Anvil/Tools/Storage.pm b/Anvil/Tools/Storage.pm index 1588a847..1815f95e 100644 --- a/Anvil/Tools/Storage.pm +++ b/Anvil/Tools/Storage.pm @@ -452,7 +452,7 @@ sub change_mode This changes the owner and/or group of a file or directory. - $anvil->Storage->change_owner({path => "/tmp/foo", user => "apache", group => "apache" }); + $anvil->Storage->change_owner({path => "/tmp/foo", user => "striker-ui-api", group => "striker-ui-api" }); If it fails to write the file, an alert will be logged and 'C<< 1 >>' will be returned. Otherwise, 'C<< 0 >>' will be returned.
@@ -4972,11 +4972,11 @@ sub update_config body => $new_file, debug => $debug, file => $anvil->data->{path}{configs}{'anvil.conf'}, - group => "apache", + group => "striker-ui-api", mode => "0640", overwrite => 1, secure => 1, - user => "apache", + user => "striker-ui-api", password => $password, port => $port, target => $target, diff --git a/man/Makefile.am b/man/Makefile.am index 78446e8e..5464bfeb 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -18,6 +18,7 @@ dist_man8_MANS = \ anvil-manage-dr.8 \ anvil-manage-files.8 \ anvil-manage-keys.1 \ + anvil-manage-power.8 \ anvil-manage-server.8 \ anvil-manage-server-storage.8 \ anvil-manage-storage-groups.8 \ @@ -27,4 +28,4 @@ dist_man8_MANS = \ striker-check-machines.8 \ striker-collect-debug.8 \ striker-initialize-host.8 \ - striker-update-cluster + striker-update-cluster.8 diff --git a/man/anvil-boot-server.8 b/man/anvil-boot-server.8 index 783faecb..27ed8c3c 100644 --- a/man/anvil-boot-server.8 +++ b/man/anvil-boot-server.8 @@ -40,7 +40,7 @@ This is the server UUID of the server to boot. Generally this isn't needed, exce \fB\-\-wait\fR When using '\fB\-\-server\fR all', the request to boot each server will normally not wait for the server to boot. When this is set, this behaviour is changed and the boot will wait before moving on to boot the next server. .TP -Be away that when this is used, if a server fails to boot, no further servers will be started. +Be aware that when this is used, if a server fails to boot, no further servers will be started. .IP .SH AUTHOR Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. diff --git a/man/anvil-manage-power.8 b/man/anvil-manage-power.8 new file mode 100644 index 00000000..a35c367f --- /dev/null +++ b/man/anvil-manage-power.8 @@ -0,0 +1,45 @@ +.\" Manpage for the Anvil! power management tool +.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. +.TH anvil-manage-power "8" "July 11 2023" "Anvil! 
Intelligent Availability™ Platform" +.SH NAME +anvil-manage-power \- This program can power off, reboot, or set a flag indicating one of these actions is required. +.SH SYNOPSIS +.B anvil-manage-power +\fI\, \/\fR[\fI\,options\/\fR] +.SH DESCRIPTION +This program can mark a machine as needing to be powered off or rebooted, or perform those actions directly or as a job. +.TP +\-?, \-h, \fB\-\-help\fR +Show this man page. +.TP +\fB\-\-log-secure\fR +When logging, record sensitive data, like passwords. +.TP +\-v, \-vv, \-vvv +Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. +.SS "Commands:" +.TP +\fB\-\-no-wait\fR +.TP +Normally, this program will not reboot a machine until the uptime is over five minutes. This is done to provide a chance for someone to log in and disable anvil-daemon in the case of a reboot loop. This switch prevents waiting for that 5 minute delay. +.TP +\fB\-\-poweroff\fR, \fB\-\-power\-off\fR +.TP +This powers off the host. +.TP +\fB\-\-reboot\fR +.TP +This reboots the host. +.TP +\fB\-\-reboot\-needed\fR [0,1] +.TP +This sets (1) or clears (0) the 'reboot needed' flag for the host system. +.TP +\fB\-\-y\fR, \fB\-\-yes\fR +.TP +If passed, requests to reboot or power off won't ask for confirmation. +.IP +.SH AUTHOR +Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. +.SH "REPORTING BUGS" +Report bugs to users@clusterlabs.org diff --git a/man/anvil-update-system.8 b/man/anvil-update-system.8 new file mode 100644 index 00000000..7b36ba1d --- /dev/null +++ b/man/anvil-update-system.8 @@ -0,0 +1,39 @@ +.\" Manpage for the Anvil! cluster update tool. +.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. +.TH anvil-update-system "8" "July 14 2023" "Anvil!
Intelligent Availability™ Platform" +.SH NAME +anvil-update-system \- This program updates the local operating system +.SH SYNOPSIS +.B anvil-update-system +\fI\, \/\fR[\fI\,options\/\fR] +.SH DESCRIPTION +This program updates the local operating system. If the kernel is updated, a reboot will be performed. +.TP +.B Note: +.TP +If the host is an Anvil! subnode, the subnode will be removed from the Anvil! node (and servers migrated off, or, shut down if the peer subnode is offline). +.TP +.SH OPTIONS +.TP +\-?, \-h, \fB\-\-help\fR +Show this man page. +.TP +\fB\-\-log-secure\fR +When logging, record sensitive data, like passwords. +.TP +\-v, \-vv, \-vvv +Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. +.SS "Commands:" +.TP +\fB\-\-clear\-cache\fR +.TP +This will force the dnf cache to be cleared before the OS update is started. This slows the update down a bit, but ensures the latest updates are installed. +.TP +\fB\-\-no\-reboot\fR +.TP +If the kernel is updated, the system will normally be rebooted. This switch prevents the reboot from occurring. +.IP +.SH AUTHOR +Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. +.SH "REPORTING BUGS" +Report bugs to users@clusterlabs.org diff --git a/man/striker-update-cluster.8 b/man/striker-update-cluster.8 index e69de29b..0c15566e 100644 --- a/man/striker-update-cluster.8 +++ b/man/striker-update-cluster.8 @@ -0,0 +1,53 @@ +.\" Manpage for the Anvil! cluster update tool. +.\" Contact mkelly@alteeve.com to report issues, concerns or suggestions. +.TH striker-update-cluster "8" "July 11 2023" "Anvil! Intelligent Availability™ Platform" +.SH NAME +striker-update-cluster \- This program updates all physical machines in an Anvil! cluster +.SH SYNOPSIS +.B striker-update-cluster +\fI\, \/\fR[\fI\,options\/\fR] +.SH DESCRIPTION +This program sequentially updates Striker dashboards, DR hosts and Anvil! nodes (the paired sub-nodes).
It does this without needing to take hosted servers offline. +.TP +.B Note: +.TP +This program requires all machines be online, and Anvil! nodes being paired and sync'ed. When nodes are updated, the inactive subnode will be removed from the node, updated, rebooted if necessary, and then rejoined to the node. Then hosted servers will migrate to the now-updated subnode, and the process repeated for the other subnode. Anvil! nodes are updated sequentially, so the process can take some time to complete, but should not require a maintenance window. +.TP +The upgrade process will live-migrate all hosted servers! If any hosted server is either under heavy load, or the replication link (the BCN or MN) is relatively lower bandwidth, this could cause performance concerns. As such, it's ideal to run the upgrades at a time less sensitive to performance impacts. +.TP +.SH OPTIONS +.TP +\-?, \-h, \fB\-\-help\fR +Show this man page. +.TP +\fB\-\-log-secure\fR +When logging, record sensitive data, like passwords. +.TP +\-v, \-vv, \-vvv +Set the log level to 1, 2 or 3 respectively. Be aware that level 3 generates a significant amount of log data. +.SS "Commands:" +.TP +\fB\-\-clear\-cache\fR +.TP +This will force the dnf cache to be cleared before the OS update is started. This slows the update down a bit, but ensures the latest updates are installed. +.TP +\fB\-\-force\fR +.TP +If any Striker dashboards or DR hosts are unavailable, or if an entire node (paired subnodes) is offline, this switch will allow you to force the upgrade attempt. +.TP +\fB\-y\fR, \fB\-\-yes\fR +.TP +Automatically continue with the upgrade without prompting for confirmation. +.TP +\fB\-\-no\-reboot\fR +.TP +If the kernel is updated on a remote system, the system will normally be rebooted. This switch prevents the reboot from occurring. +.TP +\fB\-\-reboot\-self\fR +.TP +By default, if the local system needs to be updated, a message is printed but the local system is NOT rebooted.
This switch will instead cause this host to reboot at the end of the cluster update. +.IP +.SH AUTHOR +Written by Madison Kelly, Alteeve staff and the Anvil! project contributors. +.SH "REPORTING BUGS" +Report bugs to users@clusterlabs.org diff --git a/share/words.xml b/share/words.xml index d5f0ec0c..57d838df 100644 --- a/share/words.xml +++ b/share/words.xml @@ -1555,6 +1555,8 @@ Note: This is a permanent action! If you protect this server again later, a full Calling select ScanCore scan agents to ensure the database is updated. Reload (adjust) a DRBD resource This job is to reload (adjust) a DRBD resource. It's run as a job as it blocks until the adjust is run on all nodes. + Update the base operating system. + This uses 'dnf' to do an OS update on the host. If this is run on a node, 'anvil-safe-stop' will be called to withdraw the subnode from the node's cluster. If the peer subnode is also offline, hosted servers will be shut down. Starting: [#!variable!program!#]. @@ -2408,7 +2410,10 @@ The file: [#!variable!file!#] needs to be updated. The difference is: There was an unknown error while connecting as: [#!variable!user!#] to: [#!variable!remote_user!#@#!variable!target!#]. The error was: [#!variable!error!#] We were unable to log in to: [#!variable!connection!#]. Please check that the password is correct or that passwordless SSH is configured properly. An SSH session was successfully opened to: [#!variable!target!#]. - The remote shell call: [#!variable!shell_call!#] to: [#!variable!connection!#] failed with the error: [#!variable!error!#]. + The remote shell call: [#!variable!shell_call!#] to: [#!variable!connection!#] failed with the error: +==== +#!variable!error!# +==== The SSH session to: [#!variable!target!#] was successfully closed. The SSH session to: [#!variable!target!#] was closed because 'no_cache' was set and there was an open SSH connection. 
Wrote the system UUID to the file: [#!variable!file!#] to enable the web based tools to read this system's UUID. @@ -2899,6 +2904,10 @@ Proceed? [y/N] Picked up the special operation job. Reloading (adjusting) the DRBD resource: [#!variable!resource!#]. This will not complete until all peers have also reloaded this resource. DRBD resource: [#!variable!resource!#] has been reloaded. + Checking if the subnode is out of the node's cluster before updating the OS. + The subnode is in the node's cluster, asking it to withdraw. This could take some time if servers need to be migrated. + Cleared 'dnf' cache. + The kernel was updated, so a reboot is required. Rebooting now. Normal Password diff --git a/tools/anvil-manage-power b/tools/anvil-manage-power index 1bf093e1..35dfd04e 100755 --- a/tools/anvil-manage-power +++ b/tools/anvil-manage-power @@ -34,17 +34,15 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) my $anvil = Anvil::Tools->new(); -# Read switches -$anvil->data->{switches}{'poweroff'} = ""; -$anvil->data->{switches}{'power-off'} = ""; -$anvil->data->{switches}{'reboot'} = ""; -$anvil->data->{switches}{'y'} = ""; -$anvil->data->{switches}{'yes'} = ""; -$anvil->data->{switches}{'reboot-needed'} = ""; -$anvil->data->{switches}{'job-uuid'} = ""; -$anvil->data->{switches}{'no-delay'} = ""; -$anvil->Get->switches; -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); +# Read switches (target ([user@]host[:port]) and the file with the target's password. 
+$anvil->Get->switches({list => [ + "no-wait", + "power-off", + "poweroff", + "reboot", + "reboot-needed"], man => $THIS_FILE}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); if ($anvil->data->{switches}{'power-off'}) { @@ -191,7 +189,7 @@ sub do_poweroff # We'll wait until the system has at least 5 minutes of uptime, unless '--no-wait' was given. my $uptime = $anvil->data->{switches}{'no-wait'} ? 0 : $anvil->Get->uptime; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - "switches::no-wait" => $anvil->data->{switches}{'no-delay'}, + "switches::no-wait" => $anvil->data->{switches}{'no-wait'}, uptime => $uptime, }}); diff --git a/tools/anvil-update-system b/tools/anvil-update-system index f73c0d44..ebaa1260 100755 --- a/tools/anvil-update-system +++ b/tools/anvil-update-system @@ -36,10 +36,12 @@ if (($running_directory =~ /^\./) && ($ENV{PWD})) my $anvil = Anvil::Tools->new(); -# Read switches -$anvil->data->{switches}{'job-uuid'} = ""; -$anvil->Get->switches; -$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); +# Read switches (target ([user@]host[:port]) and the file with the target's password. +$anvil->Get->switches({list => [ + "clear-cache", + "no-reboot"], man => $THIS_FILE}); +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}}); +$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); # Log that we've started. 
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, secure => 0, key => "log_0115", variables => { program => $THIS_FILE }}); @@ -90,7 +92,32 @@ my $reboot_needed = $anvil->System->reboot_needed({debug => 2}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }}); if ($reboot_needed) { - update_progress($anvil, 100, "message_0039"); + if (not $anvil->data->{switches}{'no-reboot'}) + { + # Record that we're going to reboot now. + update_progress($anvil, 100, "message_0317"); + + # Clear maintenance mode. + $anvil->System->maintenance_mode({set => 0}); + + # Record that we're rebooting so that 'striker-update-cluster' knows to wait for a reboot. + my $query = "UPDATE jobs SET job_status = 'rebooted', modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)." WHERE job_uuid = ".$anvil->Database->quote($job_uuid).";"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); + $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__}); + + sleep 2; + my $shell_call = $anvil->data->{path}{exe}{systemctl}." reboot"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code }}); + $anvil->nice_exit({exit_code => 0}); + } + else + { + # Record that a reboot is needed. + update_progress($anvil, 100, "message_0039"); + } } else { @@ -140,17 +167,57 @@ sub run_os_update # This needs to be set to avoid warnings when called without a job-uuid. $anvil->data->{sys}{last_update} = 0; + # Make sure that, if we're a node, we're out of the cluster. 
+ my $host_type = $anvil->Get->host_type(); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_type => $host_type }}); + if ($host_type eq "node") + { + # Call anvil-safe-stop + update_progress($anvil, 3, "message_0314"); + + my $problem = $anvil->Cluster->parse_cib({debug => 3}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { problem => $problem }}); + if (not $problem) + { + # Call anvil-safe-stop + update_progress($anvil, 4, "message_0315"); + + my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-stop'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + } + + # Should we clear the cache? + if ($anvil->data->{switches}{'clear-cache'}) + { + # Yes. + my $shell_call = $anvil->data->{path}{exe}{dnf}." clean all"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + update_progress($anvil, 5, "message_0316"); + } + # NOTE: We run this directly to better monitor progress and update the progress. my $transaction_shown = 0; my $success = 0; my $to_update = 0; my $percent_step = 0; - my $progress = 5; + my $progress = 6; my $counted_lines = 0; my $next_step = 0; my $verifying = 0; my $output = ""; - my $shell_call = $anvil->data->{path}{exe}{dnf}." clean expire-cache && ".$anvil->data->{path}{exe}{dnf}." -y update --best --allowerasing; ".$anvil->data->{path}{exe}{echo}." 
return_code:\$?"; + my $shell_call = $anvil->data->{path}{exe}{dnf}." -y update --best --allowerasing; ".$anvil->data->{path}{exe}{echo}." return_code:\$?"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }}); open (my $file_handle, $shell_call." 2>&1 |") or $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, priority => "err", key => "log_0014", variables => { shell_call => $shell_call, error => $! }}); while(<$file_handle>) @@ -162,14 +229,6 @@ sub run_os_update $anvil->data->{counts}{lines}++; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "counts::lines" => $anvil->data->{counts}{lines}, line => $line }}); - if ($line =~ /^kernel /) - { - # Reboot will be needed. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0687", variables => { reason => "#!string!log_0690!#" }}); - my $reboot_needed = $anvil->System->reboot_needed({set => 1}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { reboot_needed => $reboot_needed }}); - } - # If there were no updates, let the user know. if ($line =~ /^Nothing to do/i) { @@ -286,6 +345,37 @@ sub run_os_update my ($systemctl_output, $return_code) = $anvil->System->call({debug => 3, shell_call => $anvil->data->{path}{exe}{systemctl}." daemon-reload", source => $THIS_FILE, line => __LINE__}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { systemctl_output => $systemctl_output, return_code => $return_code }}); + ### See if the kernel has been updated. + # Get the newest installed kernel + my $shell_call = $anvil->data->{path}{exe}{rpm}." -q kernel | ".$anvil->data->{path}{exe}{'sort'}." | ".$anvil->data->{path}{exe}{tail}." 
-n 1"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($installed_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + installed_kernel => $installed_kernel, + return_code => $return_code, + }}); + $installed_kernel =~ s/^kernel-(\d+.\d+\.\d+-\d+)\..*$/$1/; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { installed_kernel => $installed_kernel }}); + + # Get the running kernel + $shell_call = $anvil->data->{path}{exe}{uname}." -r"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + (my $active_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + active_kernel => $active_kernel, + return_code => $return_code, + }}); + $active_kernel =~ s/(\d+.\d+\.\d+-\d+)\..*$/$1/; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active_kernel => $active_kernel }}); + + if ($installed_kernel ne $active_kernel) + { + # Reboot needed + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0687", variables => { reason => "#!string!log_0690!#" }}); + my $reboot_needed = $anvil->System->reboot_needed({set => 1}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_needed => $reboot_needed }}); + } + # Did it work? 
if (not $success) { diff --git a/tools/striker-update-cluster b/tools/striker-update-cluster index 118fa6ed..259b9188 100755 --- a/tools/striker-update-cluster +++ b/tools/striker-update-cluster @@ -40,7 +40,13 @@ $anvil->Log->secure({set => 1}); ########################################## # Read switches (target ([user@]host[:port]) and the file with the target's password. -$anvil->Get->switches({list => ["force"], man => $THIS_FILE}); +$anvil->Get->switches({list => [ + "clear-cache", + "force", + "no-reboot", + "reboot-self", + "y", + "yes"], man => $THIS_FILE}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => $anvil->data->{switches}}); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0115", variables => { program => $THIS_FILE }}); @@ -75,42 +81,86 @@ if ($anvil->Get->host_type ne "striker") print "Update beginning. Verifying all known machines are accessible...\n"; my $all_access = verify_access($anvil); -if ((not $all_access) && ($anvil->data->{switches}{force})) +if ((not $all_access) && (not $anvil->data->{switches}{force})) { print "[ Error ] - Not all systems are accessible. Update aborted!\n"; $anvil->nice_exit({exit_code => 1}); } print "Success!\n"; -print "[ Warning ] - All nodes need to be up and running for the update to run on nodes. -[ Warning ] - Servers will be migrated between subnodes, which can cause reduced performance during -[ Warning ] - the these migrations. 
If a sub-node is not active, it will be activated as part of the -[ Warning ] - upgrade process.\n"; -print "\n".$anvil->Words->string({key => "message_0021"})."\n"; -my $answer = ; -chomp $answer; -$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { answer => $answer }}); - -if ($answer =~ /^y/i) +if (($anvil->data->{switches}{y}) or ($anvil->data->{switches}{yes})) { - print $anvil->Words->string({key => "message_0175"})."\n"; - $record_job = 1; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { record_job => $record_job }}); + print "[ Note ] - Proceeding without confirmation, '-y' or '--yes' used.\n"; } else { - print $anvil->Words->string({key => "message_0022"})."\n"; - $anvil->nice_exit({exit_code => 0}); + print "[ Note ] - All nodes need to be up and running for the update to run on nodes. + [ Note ] - Any out-of-sync storage needs to complete before a node can be updated. + [ Warning ] - Servers will be migrated between subnodes, which can cause reduced performance during + [ Warning ] - the these migrations. 
If a sub-node is not active, it will be activated as part of the + [ Warning ] - upgrade process.\n"; + print "\n".$anvil->Words->string({key => "message_0021"})."\n"; + my $answer = ; + chomp $answer; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { answer => $answer }}); + + if ($answer =~ /^y/i) + { + print $anvil->Words->string({key => "message_0175"})."\n"; + } + else + { + print $anvil->Words->string({key => "message_0022"})."\n"; + $anvil->nice_exit({exit_code => 0}); + } } -disable_daemons($anvil); +manage_daemons($anvil, "stop"); # Update systems update_strikers_and_dr($anvil); +die; + # Update DR Host update_nodes($anvil); +print "Updates complete!\n"; + +my $host_uuid = $anvil->Get->host_uuid; +my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; +$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_uuid' => $host_uuid, + 's2:short_host_name' => $short_host_name, +}}); +if ($anvil->data->{sys}{reboot_needed}) +{ + if ($anvil->data->{switches}{'reboot-self'}) + { + print "[ Note ] - The local system needs to be rebooted, and '--reboot-self' was used. Rebooting in 60 seconds! Use ctrl+c to abort!\n"; + my $waiting = 60; + while ($waiting) + { + print $waiting.", "; + sleep 5; + $waiting -= 5; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $waiting }}); + } + print "\nRebooting now!\n"; + + my $shell_call = $anvil->data->{path}{exe}{systemctl}." 
reboot"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code }}); + + print "Reboot requested, exiting.\n"; + } + else + { + print "[ Note ] - This host needs to be rebooted to activate the new kernel. Please update as soon as you can.\n"; + } +} $anvil->nice_exit({exit_code => 0}); @@ -132,6 +182,9 @@ sub update_nodes my $anvil_node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; my $anvil_node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid}; my $primary_host_uuid = $anvil->Cluster->get_primary_host_uuid({anvil_uuid => $anvil_uuid}); + my $secondary_host_uuid = $primary_host_uuid eq $anvil_node1_host_uuid ? 
$anvil_node2_host_uuid : $anvil_node1_host_uuid; + my $node1_short_host_name = $anvil->data->{hosts}{host_uuid}{$anvil_node1_host_uuid}{short_host_name}; + my $node2_short_host_name = $anvil->data->{hosts}{host_uuid}{$anvil_node2_host_uuid}{short_host_name}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:anvil_name' => $anvil_name, 's2:anvil_uuid' => $anvil_uuid, @@ -139,126 +192,119 @@ sub update_nodes 's4:anvil_node1_host_uuid' => $anvil_node1_host_uuid, 's5:anvil_node2_host_uuid' => $anvil_node2_host_uuid, 's6:primary_host_uuid' => $primary_host_uuid, + 's7:secondary_host_uuid' => $secondary_host_uuid, + 's8:node1_short_host_name' => $node1_short_host_name, + 's9:node2_short_host_name' => $node2_short_host_name, }}); - - } - - foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) - { - my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; - my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; - my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 's1:host_name' => $host_name, - 's2:host_uuid' => $host_uuid, - 's3:short_host_name' => $short_host_name, - 's4:this_host_type' => $this_host_type, - }}); - next if $host_type ne "node"; - - $anvil->data->{sys}{host}{$short_host_name}{reboot_needed} = 0; - - if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + # Before we proceed, are both nodes online? If so, great. If not, are both offline? If only + # one is online, abort. Check now in case things have changed since our first scan + print "Preparing to update the Anvil! node: [".$anvil_name."]. Verifying subnode access:\n"; + foreach my $host_uuid ($secondary_host_uuid, $primary_host_uuid) { - print "- No access to the DR host: [".$short_host_name."], skipping.\n"; - next; - } - - # These are always remote. 
- print "- Beginning OS update of: [".$short_host_name."]\n"; - my $shell_call = $anvil->data->{path}{exe}{dnf}." clean all"; - my ($output, $error, $return_code) = $anvil->Remote->call({ - shell_call => $shell_call, - target => $anvil->data->{peer}{$short_host_name}{access}{ip}, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - error => $error, - return_code => $return_code, - }}); - - print "- Cache cleared, calling update now.\n"; - print "- NOTE: This can seem like it's hung! You can watch the progress using 'journalctl -f' on another terminal to\n"; - print "- watch the progress via the system logs.\n"; - $output = ""; - $error = ""; - $return_code = ""; - $shell_call = $anvil->data->{path}{exe}{dnf}." -y update"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - ($output, $error, $return_code) = $anvil->Remote->call({ - shell_call => $shell_call, - target => $anvil->data->{peer}{$short_host_name}{access}{ip}, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - error => $error, - return_code => $return_code, - }}); - - if ($return_code) - { - print "[ Error ] - There was a problem updating the system! Expected a return code of '0', but got: [".$return_code."]\n"; - print "[ Error [ - The output, if any, was\n"; - print "==] STDOUT [==\n"; - print $output."\n"; - print "==] STDERR [==\n"; - print $error."\n"; - print "==============\n"; - } - else - { - print "Success! 
Checking if a reboot is needed.\n"; - check_if_reboot_needed($anvil, $host_uuid); + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_uuid' => $host_uuid, + 's2:short_host_name' => $short_host_name, + }}); + print "- Verifying access to subnode: [".$short_host_name."]\n"; + $anvil->data->{peer}{$short_host_name}{access}{ip} = ""; + $anvil->data->{peer}{$short_host_name}{access}{network} = ""; + foreach my $preferred_network ("bcn", "mn", "ifn", "sn") + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }}); + foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}}) + { + next if $network_name !~ /^$preferred_network/; + my $target_ip = $anvil->data->{network_access}{$network_name}{target_ip_address}; + my $test_access = $anvil->Remote->test_access({target => $target_ip}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:network_name' => $network_name, + 's2:target_ip' => $target_ip, + 's3:test_access' => $test_access, + }}); + + if ($test_access) + { + # We're good. 
+ $anvil->data->{peer}{$short_host_name}{access}{ip} = $target_ip; + $anvil->data->{peer}{$short_host_name}{access}{network} = $network_name; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "s1:peer::${short_host_name}::access::ip" => $anvil->data->{peer}{$short_host_name}{access}{ip}, + "s2:peer::${short_host_name}::access::network" => $anvil->data->{peer}{$short_host_name}{access}{network}, + }}); + print "- Access found uver the: [".$network_name."] networking using the IP: [".$target_ip."]\n"; + last; + } + } + } + if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + print "[ Warning ] - Access not found!\n"; + } } - } - - return(0); -} - -sub update_strikers_and_dr -{ - my ($anvil) = @_; - - foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) - { - my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; - my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; - my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 's1:host_name' => $host_name, - 's2:host_uuid' => $host_uuid, - 's3:short_host_name' => $short_host_name, - 's4:this_host_type' => $this_host_type, - }}); - next if $host_type ne "striker"; - - $anvil->data->{sys}{host}{$short_host_name}{reboot_needed} = 0; - - if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) + if ((($anvil->data->{peer}{$node1_short_host_name}{access}{ip}) && (not $anvil->data->{peer}{$node2_short_host_name}{access}{ip})) or + ((not $anvil->data->{peer}{$node1_short_host_name}{access}{ip}) && ($anvil->data->{peer}{$node2_short_host_name}{access}{ip}))) { - print "- No access to the Striker dashboard: [".$short_host_name."], skipping.\n"; - next; + # Only one node online, skip this Anvil node. + if ($anvil->data->{switches}{force}) + { + # Skip this Anvil! 
system + print "[ Warning ] - '--force' used, skipping this node.\n"; + print "[ NOTE ] - This node may not be able to communicate with the Striker dashboards until updated manually!\n"; + next; + } + else + { + print "[ Error ] - Exiting update! Please bring the missing subnode back online and try again!\n"; + $anvil->nice_exit({exit_code => 1}); + } } - print "- Beginning OS update of: [".$short_host_name."]\n"; - my $output = ""; - my $error = ""; - my $return_code = ""; - my $shell_call = $anvil->data->{path}{exe}{dnf}." clean all"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - if ($host_uuid eq $anvil->Get->host_uuid) + # Update the secondary first, as it should have no VMs on it. + foreach my $host_uuid ($secondary_host_uuid, $primary_host_uuid) { - ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + # Withdraw the node from the cluster. + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $peer_host_uuid = $host_uuid eq $primary_host_uuid ? $secondary_host_uuid : $primary_host_uuid; + my $peer_short_host_name = $anvil->data->{hosts}{host_uuid}{$peer_host_uuid}{short_host_name}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - return_code => $return_code, + 's1:host_uuid' => $host_uuid, + 's2:short_host_name' => $short_host_name, + 's3:peer_host_uuid' => $peer_host_uuid, + 's4:peer_short_host_name' => $peer_short_host_name, }}); - } - else - { - ($output, $error, $return_code) = $anvil->Remote->call({ + + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:host_uuid' => $host_uuid, + 's2:short_host_name' => $short_host_name, + }}); + + print "Preparing to update: [".$short_host_name."]. Withdrawing the subnode from the Anvil! 
node.\n"; + print "- [ Note ] - If the node has servers that need to be migrated off, or if the node is SyncSource for storage,\n"; + print "- [ Note ] - this could take some time to complete.\n"; + + # Register an anvil-safe-stop job and then wait. + my $job_uuid = $anvil->Database->insert_or_update_jobs({ + debug => 2, + job_command => $anvil->data->{path}{exe}{'anvil-safe-stop'}, + job_host_uuid => $host_uuid, + job_description => "job_0339", + job_name => "cgi-bin::set_membership::leave", + job_progress => 0, + job_title => "job_0338" + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); + print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n"; + + # Log into the target machine and make sure anvil-daemon is running. + print "- Making sure anvil-daemon is running... "; + my $shell_call = $anvil->data->{path}{exe}{systemctl}." start anvil-daemon.service"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $error, $return_code) = $anvil->Remote->call({ + 'close' => 1, + no_cache => 1, shell_call => $shell_call, target => $anvil->data->{peer}{$short_host_name}{access}{ip}, }); @@ -267,180 +313,713 @@ sub update_strikers_and_dr error => $error, return_code => $return_code, }}); - } - print "- Cache cleared, calling update now.\n"; - print "- NOTE: This can seem like it's hung! You can watch the progress using 'journalctl -f' on another terminal to\n"; - print "- watch the progress via the system logs.\n"; - $output = ""; - $error = ""; - $return_code = ""; - $shell_call = $anvil->data->{path}{exe}{dnf}." 
-y update"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - if ($host_uuid eq $anvil->Get->host_uuid) - { - ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - return_code => $return_code, - }}); - } - else - { - ($output, $error, $return_code) = $anvil->Remote->call({ - shell_call => $shell_call, - target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + if (not $return_code) + { + print " running.\n"; + } + else + { + print " not running!\n"; + } + + # Verify that the node is no longer in the cluster. + my $waiting = 1; + my $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + while ($waiting) + { + my $problem = $anvil->Cluster->parse_cib({target => $anvil->data->{peer}{$short_host_name}{access}{ip}}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + + if ($problem) + { + # This is good, it didn't parse so it's out of the cluster. + print "- The subnode is out of the node cluster. Proceeding.\n"; + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + else + { + if (time > $next_log) + { + $anvil->Database->get_job_details({job_uuid => $job_uuid}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "jobs::job_progress" => $anvil->data->{jobs}{job_progress}, + "jobs::job_status" => $anvil->data->{jobs}{job_status}, + }}); + if ($anvil->data->{jobs}{job_progress} == 0) + { + print "[ Warning ] - The job has not been picked up yet. 
Is 'anvil-daemon' running on: [".$short_host_name."]?\n"; + } + else + { + print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n"; + } + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + } + sleep 5; + } + } + + # Record the start time so that we can be sure the subnode has rebooted (uptime is + # less than the current time minus this start time), if the host reboots as part of + # the update. + my $reboot_time = time; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_time => $reboot_time }}); + + # Do the OS update. + print "- Beginning OS update of: [".$short_host_name."]\n"; + my $rebooted = 0; + $shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}; + if ($anvil->data->{switches}{'no-reboot'}) + { + $shell_call .= " --no-reboot"; + } + if ($anvil->data->{switches}{'clear-cache'}) + { + $shell_call .= " --clear-cache"; + } + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + $job_uuid = ""; + $job_uuid = $anvil->Database->insert_or_update_jobs({ + debug => 2, + job_command => $shell_call, + job_description => "job_0468", + job_host_uuid => $host_uuid, + job_name => "system::update-system", + job_progress => 0, + job_title => "job_0467" }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); + print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n"; + + # Verify that the node is no longer in the cluster. 
+ $waiting = 1; + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + while ($waiting) + { + $anvil->Database->get_job_details({job_uuid => $job_uuid}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "jobs::job_progress" => $anvil->data->{jobs}{job_progress}, + "jobs::job_status" => $anvil->data->{jobs}{job_status}, + }}); + if ($anvil->data->{jobs}{job_progress} == 100) + { + print "- Done! The host: [".$short_host_name."] has been updated\n"; + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + + # Did it reboot? + if ($anvil->data->{jobs}{job_status} eq "rebooted") + { + $rebooted = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }}); + } + } + else + { + if (time > $next_log) + { + if ($anvil->data->{jobs}{job_progress} == 0) + { + print "[ Warning ] - The job has not been picked up yet. Is 'anvil-daemon' running on: [".$short_host_name."]?\n"; + } + else + { + print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n"; + } + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + } + sleep 5; + } + } + + print "- Update completed successfully! Checking if a reboot is needed.\n"; + my $run_anvil_safe_start = 0; + if ($rebooted) + { + print "- Rebooted! 
Will wait for it to come back up.\n"; + wait_for_reboot($anvil, $host_uuid, $reboot_time); + } + else + { + print "- Reboot not needed, kernel appears to be up to date.\n"; + + $run_anvil_safe_start = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { run_anvil_safe_start => $run_anvil_safe_start }}); + } + + # Wait for the node to rejoin the cluster. As before, this is a time + # unrestricted wait loop. + print "- Waiting for the subnode to rejoin the node.\n"; + $waiting = 1; + my $start_called = 0; + $next_log = time + 60; + my $manual_start = time + 60; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - error => $error, - return_code => $return_code, + next_log => $next_log, + manual_start => $manual_start, }}); - } - - if ($return_code) - { - print "[ Error ] - There was a problem updating the system! Expected a return code of '0', but got: [".$return_code."]\n"; - print "[ Error [ - The output, if any, was\n"; + + while($waiting) + { + # Should we call a start to the cluster? + if ((not $start_called) && ($run_anvil_safe_start)) + { + print "- Calling 'anvil-safe-start' to rejoin the subnode to the node.\n"; + $start_called = 1; + my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-start'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + start_called => $start_called, + shell_call => $shell_call, + }}); + + my ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } + + # Pull the CIB and make sure both nodes are ready, and that DRBD resources + # are all UpToDate if this is the reboot from the first node. 
+ my ($problem) = $anvil->Cluster->parse_cib({target => $anvil->data->{peer}{$short_host_name}{access}{ip}}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }}); + + # Are both nodes ready? + if (not $problem) + { + # Both nodes are in the cluster, but are they full members yet? + my $both_ready = 1; + my $node_count = 0; + foreach my $node_name (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{data}{node}}) + { + my $ready = $anvil->data->{cib}{parsed}{data}{node}{$node_name}{node_state}{ready}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + node_name => $node_name, + ready => $ready, + }}); + if (not $ready) + { + $both_ready = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { both_ready => $both_ready }}); + } + $node_count++; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { node_count => $node_count }}); + } + + # Did we see two nodes and are both ready? + if (($node_count == 2) && ($both_ready)) + { + # Yes! If this is the first subnode, we need to wait for DRBD + # to be UpToDate. If it's the second, we just wait for the + # connections to be up. + # NOTE: We call the peer to get the DRBD data as it's got a + # better view of the storage + print "- Both subnodes are online, will now check replicated storage.\n"; + $anvil->DRBD->get_status({ + host => $peer_short_host_name, + target => $anvil->data->{peer}{$peer_short_host_name}{access}{ip}, + }); + + if ($host_uuid eq $primary_host_uuid) + { + ### NOTE: Should we wait for all connections + ### to be up? + # This is the second node, we don't have to wait. + print "- This is the second node, no need to wait for replication to complete.\n"; + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + else + { + # This is the first node. 
Wait for all volumes to be + # UpToDate. + if (time > $next_log) + { + print "- Waiting for all volumes to be UpToDate before updating the other subnode.\n"; + } + my $all_uptodate = 1; + my $resources = 0; + foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}}) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }}); + foreach my $peer_name (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}}) + { + # We don't care about DR hosts for this upgrade + my $peer_uuid = $anvil->Get->host_uuid_from_name({host_name => $peer_name}); + my $peer_type = $anvil->data->{hosts}{host_uuid}{$peer_uuid}{host_type}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:peer_name' => $peer_name, + 's2:peer_uuid' => $peer_uuid, + 's3:peer_type' => $peer_type, + }}); + next if $peer_type ne "node"; + foreach my $volume (sort {$a <=> $b} keys %{$anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}}) + { + # This is this subnode's disk state, + # as the DRBD data was collected + # from the peer. 
+ my $disk_state = $anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'peer-disk-state'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:volume' => $volume, + 's2:disk_state' => $disk_state, + }}); + + if (lc($disk_state) ne "uptodate") + { + $all_uptodate = 0; + my $eta_in_seconds = $anvil->data->{drbd}{status}{$peer_short_host_name}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'estimated-seconds-to-finish'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + all_uptodate => $all_uptodate, + eta_in_seconds => $eta_in_seconds, + }}); + if (time > $next_log) + { + if ($eta_in_seconds) + { + print "- The resource: [".$resource."/".$volume."] is not synced yet, ETA is: [".$eta_in_seconds."] to complete resync.\n"; + } + else + { + print "- The resource: [".$resource."/".$volume."] is not yet UpToDate.\n"; + } + } + } + } # End foreach volume + } # End foreach peer + } # End foreach resource + + if ($all_uptodate) + { + print "- All resources appear to be ready,\n"; + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + } # End if host is first or second subnode + } # End if both ready + elsif (time > $next_log) + { + print "- Both subnodes are not online yet, still waiting.\n"; + } + } # End if CIB was parsed + elsif (time > $next_log) + { + print "- Unable to parse the node's cluster information base, will try again soon.\n"; + } + + if (time > $next_log) + { + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + } + + if ($waiting) + { + sleep 5; + } + } # End while waiting for subnode to return + + # Run anvil-version-change + print "- Running 'anvil-version-changes'.\n"; + $output = ""; + $error = ""; + $return_code = ""; + $shell_call = 
$anvil->data->{path}{exe}{'anvil-version-changes'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); if ($host_uuid eq $anvil->Get->host_uuid) { - print "==] Output [==\n"; - print $output."\n"; - print "==============\n"; + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); } else { - print "==] STDOUT [==\n"; - print $output."\n"; - print "==] STDERR [==\n"; - print $error."\n"; - print "==============\n"; + ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); } + print "- Done!\n"; } - else - { - print "Success! Checking if a reboot is needed.\n"; - check_if_reboot_needed($anvil, $host_uuid); - } - - # Run anvil-version-change - $output = ""; - $error = ""; - $return_code = ""; - $shell_call = $anvil->data->{path}{exe}{dnf}." -y update"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - if ($host_uuid eq $anvil->Get->host_uuid) + } + + return(0); +} + +sub update_strikers_and_dr +{ + my ($anvil) = @_; + + foreach my $host_type ("striker", "dr") + { + if ($host_type eq "dr") { - ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - return_code => $return_code, - }}); + # Restart daemons. 
+ manage_daemons($anvil, "start"); } - else + foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) { - ($output, $error, $return_code) = $anvil->Remote->call({ - shell_call => $shell_call, - target => $anvil->data->{peer}{$short_host_name}{access}{ip}, - }); + my $host_uuid = $anvil->data->{sys}{hosts}{by_name}{$host_name}; + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $this_host_type = $anvil->data->{hosts}{host_uuid}{$host_uuid}{host_type}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - error => $error, - return_code => $return_code, + 's1:host_name' => $host_name, + 's2:host_uuid' => $host_uuid, + 's3:short_host_name' => $short_host_name, + 's4:this_host_type' => $this_host_type, }}); + next if $this_host_type ne $host_type; + + if ($host_type eq "striker") + { + print "Starting the update of the Striker dashboard: [".$short_host_name."].\n"; + } + else + { + print "Starting the update of the DR host: [".$short_host_name."].\n"; + } + + # If this is the local system, set the variable to track if we need to reboot. + # Otherwise, see if we have access to the peer. + if ($host_uuid eq $anvil->Get->host_uuid) + { + $anvil->data->{sys}{reboot_needed} = 0; + } + elsif(not $anvil->data->{peer}{$short_host_name}{access}{ip}) + { + if ($host_type eq "striker") + { + print "- No access to the Striker dashboard: [".$short_host_name."], skipping.\n"; + } + else + { + print "- No access to the DR host: [".$short_host_name."], skipping.\n"; + } + next; + } + + # Record the start time so that we can be sure the subnode has rebooted (uptime is + # less than the current time minus this start time), if the host reboots as part of + # the update. 
+ my $reboot_time = time; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_time => $reboot_time }}); + + print "- Beginning OS update of: [".$short_host_name."]\n"; + my $rebooted = 0; + my $output = ""; + my $error = ""; + my $return_code = ""; + if ($anvil->data->{switches}{'clear-cache'}) + { + my $shell_call = $anvil->data->{path}{exe}{dnf}." clean all"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + else + { + ($output, $error, $return_code) = $anvil->Remote->call({ + timeout => 0, + no_cache => 1, + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } + print "- Cache cleared.\n"; + } + print "- Calling update now.\n"; + print "- NOTE: This can seem like it's hung! You can watch the progress using 'journalctl -f' on another terminal to\n"; + print "- watch the progress via the system logs. You can also check wiht 'ps aux | grep dnf'.\n"; + if ($host_uuid eq $anvil->Get->host_uuid) + { + my $shell_call = $anvil->data->{path}{exe}{dnf}." 
-y update --best --allowerasing"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + if ($return_code) + { + print "[ Error ] - There was a problem updating the system! Expected a return code of '0', but got: [".$return_code."]\n"; + print "[ Error [ - The output, if any, was\n"; + print "==] Output [==\n"; + print $output."\n"; + print "==============\n"; + } + + # Get the newest installed kernel + $shell_call = $anvil->data->{path}{exe}{rpm}." -q kernel | ".$anvil->data->{path}{exe}{'sort'}." | ".$anvil->data->{path}{exe}{tail}." -n 1"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + (my $installed_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + installed_kernel => $installed_kernel, + return_code => $return_code, + }}); + $installed_kernel =~ s/^kernel-(\d+.\d+\.\d+-\d+)\..*$/$1/; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { installed_kernel => $installed_kernel }}); + + # Get the running kernel + $shell_call = $anvil->data->{path}{exe}{uname}." 
-r"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + (my $active_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + active_kernel => $active_kernel, + return_code => $return_code, + }}); + $active_kernel =~ s/(\d+.\d+\.\d+-\d+)\..*$/$1/; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active_kernel => $active_kernel }}); + + if ($installed_kernel eq $active_kernel) + { + print "- The kernel has not been updated.\n"; + } + else + { + print "- The kernel appears to have been upgraded, reboot needed!\n"; + $anvil->data->{sys}{reboot_needed} = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "sys::reboot_needed" => $anvil->data->{sys}{reboot_needed}, + }}); + } + } + else + { + # Call anvil-update-system and then wait. + print "- Beginning OS update of: [".$short_host_name."]\n"; + my $shell_call = $anvil->data->{path}{exe}{'anvil-update-system'}; + if ($anvil->data->{switches}{'no-reboot'}) + { + $shell_call .= " --no-reboot"; + } + if ($anvil->data->{switches}{'clear-cache'}) + { + $shell_call .= " --clear-cache"; + } + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my $job_uuid = $anvil->Database->insert_or_update_jobs({ + debug => 2, + job_command => $shell_call, + job_description => "job_0468", + job_host_uuid => $host_uuid, + job_name => "system::update-system", + job_progress => 0, + job_title => "job_0467" + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); + print "- [ Note ] - Job registered with UUID: [".$job_uuid."], waiting for it to complete.\n"; + + # Log into the target machine and make sure anvil-daemon is running. 
+ print "- Making sure anvil-daemon is running... "; + $shell_call = $anvil->data->{path}{exe}{systemctl}." start anvil-daemon.service"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $error, $return_code) = $anvil->Remote->call({ + 'close' => 1, + no_cache => 1, + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + if (not $return_code) + { + print " running.\n"; + } + else + { + print " not running!\n"; + } + + # Verify that the node is no longer in the cluster. + my $waiting = 1; + my $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + while ($waiting) + { + $anvil->Database->get_job_details({job_uuid => $job_uuid}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "jobs::job_progress" => $anvil->data->{jobs}{job_progress}, + "jobs::job_status" => $anvil->data->{jobs}{job_status}, + }}); + if ($anvil->data->{jobs}{job_progress} == 100) + { + print "- Done! The host: [".$short_host_name."] has been updated\n"; + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + + # Did it reboot? + if ($anvil->data->{jobs}{job_status} eq "rebooted") + { + $rebooted = 1; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }}); + } + } + else + { + if (time > $next_log) + { + if ($anvil->data->{jobs}{job_progress} == 0) + { + print "[ Warning ] - The job has not been picked up yet. 
Is 'anvil-daemon' running on: [".$short_host_name."]?\n"; + } + else + { + print "[ Note ] - [".$anvil->Get->date_and_time({time_only => 1})."] - The job progress is: [".$anvil->data->{jobs}{job_progress}."], continuing to wait.\n"; + } + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + } + sleep 5; + } + } + + } + + if ($rebooted) + { + print "- Rebooted! Will wait for it to come back up.\n"; + wait_for_reboot($anvil, $host_uuid, $reboot_time); + } + else + { + print "- Reboot not needed, kernel appears to be up to date.\n"; + } + + # Run anvil-version-change + print "- Running 'anvil-version-changes' now.\n"; + $output = ""; + $error = ""; + $return_code = ""; + my $shell_call = $anvil->data->{path}{exe}{'anvil-version-changes'}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + if ($host_uuid eq $anvil->Get->host_uuid) + { + ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + return_code => $return_code, + }}); + } + else + { + ($output, $error, $return_code) = $anvil->Remote->call({ + shell_call => $shell_call, + target => $anvil->data->{peer}{$short_host_name}{access}{ip}, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + output => $output, + error => $error, + return_code => $return_code, + }}); + } } } return(0); } -sub check_if_reboot_needed +sub wait_for_reboot { - my ($anvil, $host_uuid) = @_; - + my ($anvil, $host_uuid, $reboot_time) = @_; my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; - my $shell_call = $anvil->data->{path}{exe}{rpm}." -q kernel | ".$anvil->data->{path}{exe}{'sort'}." | ".$anvil->data->{path}{exe}{tail}." 
-n 1"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - short_host_name => $short_host_name, - shell_call => $shell_call, + 's1:host_uuid' => $host_uuid, + 's2:short_host_name' => $short_host_name, }}); - # Get the newest installed kernel - my $installed_kernel = ""; - my $active_kernel = ""; - my $error = ""; - my $return_code = 999; - if ($host_uuid eq $anvil->Get->host_uuid) - { - ($installed_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - installed_kernel => $installed_kernel, - return_code => $return_code, - }}); - } - else - { - ($installed_kernel, $error, $return_code) = $anvil->Remote->call({ - shell_call => $shell_call, - target => $anvil->data->{peer}{$short_host_name}{access}{ip}, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - installed_kernel => $installed_kernel, - error => $error, - return_code => $return_code, - }}); - } - $installed_kernel =~ s/^kernel-(\d+.\d+\.\d+-\d+)\..*$/$1/; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { installed_kernel => $installed_kernel }}); - - # Get the running kernel - $error = ""; - $return_code = 999; - $shell_call = $anvil->data->{path}{exe}{uname}." 
-r"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - if ($host_uuid eq $anvil->Get->host_uuid) - { - ($active_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - active_kernel => $active_kernel, - return_code => $return_code, - }}); - } - else - { - ($active_kernel, $error, $return_code) = $anvil->Remote->call({ - shell_call => $shell_call, - target => $anvil->data->{peer}{$short_host_name}{access}{ip}, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - active_kernel => $active_kernel, - error => $error, - return_code => $return_code, - }}); - } - $active_kernel =~ s/(\d+.\d+\.\d+-\d+)\..*$/$1/; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active_kernel => $active_kernel }}); + # Wait until the node comes back up. + print "- The target has been rebooted. We'll wait for the target to come back online.\n"; - if ($installed_kernel eq $active_kernel) + # This is an infinite loop, there is no timeout for this. 
+ my $waiting = 1; + my $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + while($waiting) { - print "The kernel has not been updated.\n"; - } - else - { - print "Reboot needed!\n"; - $anvil->data->{sys}{host}{$short_host_name}{reboot_needed} = 1; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - "sys::host::${short_host_name}::reboot_needed" => $anvil->data->{sys}{host}{$short_host_name}{reboot_needed}, - }}); + # Test access + my $test_access = $anvil->Remote->test_access({target => $anvil->data->{peer}{$short_host_name}{access}{ip}}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_access => $test_access }}); + + if ($test_access) + { + # What's the machine's uptime? + my $uptime = $anvil->Get->uptime({target => $anvil->data->{peer}{$short_host_name}{access}{ip}}); + my $time_since_reboot = time - $reboot_time; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + uptime => $uptime, + time_since_reboot => $time_since_reboot, + }}); + + if (($uptime) && ($uptime < $time_since_reboot)) + { + # Rebooted! + print "- Rebooted! Subnode is back up.\n"; + + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } + } + + if ($waiting) + { + if (time > $next_log) + { + # Tell the user we're still waiting. 
+ print "- [".$anvil->Get->date_and_time({time_only => 1})."] - We're still waiting for the subnode: [".$short_host_name."] to reboot.\n"; + $next_log = time + 60; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); + } + + sleep 5; + } } return(0); } -sub disable_daemons +sub manage_daemons { - my ($anvil) = @_; + my ($anvil, $task) = @_; + + $task = "start" if not $task; - my $daemons = ("anvil-daemon", "scancore"); + if ($task eq "stop") + { + print "Disabling Anvil! daemons on all hosts...\n"; + } + else + { + print "Enabling Anvil! daemons on all hosts...\n"; + } + my $daemons = ["anvil-daemon", "scancore"]; foreach my $host_type ("dr", "node", "striker") { foreach my $host_name (sort {$a cmp $b} keys %{$anvil->data->{sys}{hosts}{by_name}}) @@ -456,7 +1035,14 @@ sub disable_daemons }}); next if $host_type ne $this_host_type; - print "- Disabling dameons on: [".$short_host_name."]... "; + if ($task eq "stop") + { + print "- Disabling dameons on: [".$short_host_name."]... "; + } + else + { + print "- Enabling dameons on: [".$short_host_name."]... "; + } if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) { print "Offline! Skipping.\n"; @@ -464,9 +1050,9 @@ sub disable_daemons } # Local - foreach my $daemons (@{$daemons}) + foreach my $daemon (@{$daemons}) { - my $shell_call = $anvil->data->{path}{exe}{systemctl}." stop ".$daemon; + my $shell_call = $anvil->data->{path}{exe}{systemctl}." ".$task." ".$daemon; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my $output = ""; @@ -483,8 +1069,12 @@ sub disable_daemons } else { - # Remote + # Remote, it'll be a while before we hit some clients, so close this + # connection so later access to the machines don't fail with ssh + # connection timeouts. 
($output, $error, $return_code) = $anvil->Remote->call({ + 'close' => 1, + no_cache => 1, shell_call => $shell_call, target => $anvil->data->{peer}{$short_host_name}{access}{ip}, }); @@ -496,11 +1086,25 @@ sub disable_daemons } if (not $return_code) { - print $daemon." stopped... "; + if ($task eq "stop") + { + print $daemon." stopped... "; + } + else + { + print $daemon." started... "; + } } else { - print $daemon." didn't stop!... "; + if ($task eq "stop") + { + print $daemon." didn't stop!... "; + } + else + { + print $daemon." didn't start!... "; + } } } print "Done!\n"; @@ -514,6 +1118,9 @@ sub verify_access { my ($anvil) = @_; + # Load host and Anvil! data. + $anvil->Database->get_hosts(); + # Make sure all are available before we start. my $all_access = 1; foreach my $host_type ("dr", "node", "striker") @@ -532,19 +1139,29 @@ sub verify_access next if $host_type ne $this_host_type; print "- Verifying access to: [".$short_host_name."]... "; + my $matches = $anvil->Network->find_access({ + debug => 2, + target => $host_uuid, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }}); $anvil->data->{peer}{$short_host_name}{access}{ip} = ""; $anvil->data->{peer}{$short_host_name}{access}{network} = ""; foreach my $preferred_network ("bcn", "mn", "ifn", "sn") { + next if $anvil->data->{peer}{$short_host_name}{access}{ip}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { preferred_network => $preferred_network }}); foreach my $network_name (sort {$a cmp $b} keys %{$anvil->data->{network_access}}) { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { network_name => $network_name }}); next if $network_name !~ /^$preferred_network/; + my $target_ip = $anvil->data->{network_access}{$network_name}{target_ip_address}; - my $test_access = $anvil->Remote->test_access({target => $target_ip}); + my $test_access = $anvil->Remote->test_access({ 
+ 'close' => 1, + target => $target_ip, + }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 's1:network_name' => $network_name, 's2:target_ip' => $target_ip, 's3:test_access' => $test_access, }}); @@ -552,6 +1169,7 @@ sub verify_access if ($test_access) { # We're good. + print "Connected on: [".$target_ip."] via: [".$network_name."]\n"; $anvil->data->{peer}{$short_host_name}{access}{ip} = $target_ip; $anvil->data->{peer}{$short_host_name}{access}{network} = $network_name; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { @@ -564,15 +1182,12 @@ sub verify_access if (not $anvil->data->{peer}{$short_host_name}{access}{ip}) { - print "No access!!\n"; - print "- Not able to collect data from this host, skipping.\n"; + print "No access! Skipping.\n"; $all_access = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_access => $all_access }}); } } } - - return($all_access); } \ No newline at end of file From 02c3d204ead9f974cd973b5e6d18ae7d5f7c10cf Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 14 Jul 2023 22:52:51 -0400 Subject: [PATCH 08/14] * Updated anvil-update-system to set 'job_data' to track reboots, and striker-update-cluster to read it. Signed-off-by: digimer --- Anvil/Tools/Database.pm | 1 + tools/anvil-update-system | 21 ++++++++++++++++----- tools/striker-update-cluster | 20 ++++++++++---------- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/Anvil/Tools/Database.pm b/Anvil/Tools/Database.pm index f6c0ebe6..aedab183 100644 --- a/Anvil/Tools/Database.pm +++ b/Anvil/Tools/Database.pm @@ -4491,6 +4491,7 @@ WHERE } +### TODO: Delete this and convert over to Jobs->get_job_details() =head2 get_job_details This gets the details for a given job. If the job is found, a hash reference is returned containing the tables that were read in. 
diff --git a/tools/anvil-update-system b/tools/anvil-update-system index ebaa1260..78b0497a 100755 --- a/tools/anvil-update-system +++ b/tools/anvil-update-system @@ -101,9 +101,20 @@ if ($reboot_needed) $anvil->System->maintenance_mode({set => 0}); # Record that we're rebooting so that 'striker-update-cluster' knows to wait for a reboot. - my $query = "UPDATE jobs SET job_status = 'rebooted', modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)." WHERE job_uuid = ".$anvil->Database->quote($job_uuid).";"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); - $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__}); + if ($anvil->data->{switches}{'job-uuid'}) + { + my $query = " +UPDATE + jobs +SET + job_data = 'rebooted', + modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)." +WHERE + job_uuid = ".$anvil->Database->quote($anvil->data->{switches}{'job-uuid'})." +;"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }}); + $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__}); + } sleep 2; my $shell_call = $anvil->data->{path}{exe}{systemctl}." reboot"; @@ -347,9 +358,9 @@ sub run_os_update ### See if the kernel has been updated. # Get the newest installed kernel - my $shell_call = $anvil->data->{path}{exe}{rpm}." -q kernel | ".$anvil->data->{path}{exe}{'sort'}." | ".$anvil->data->{path}{exe}{tail}." -n 1"; + $shell_call = $anvil->data->{path}{exe}{rpm}." -q kernel | ".$anvil->data->{path}{exe}{'sort'}." | ".$anvil->data->{path}{exe}{tail}." 
-n 1"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - my ($installed_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); + (my $installed_kernel, $return_code) = $anvil->System->call({shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { installed_kernel => $installed_kernel, return_code => $return_code, diff --git a/tools/striker-update-cluster b/tools/striker-update-cluster index 259b9188..5e9eabd5 100755 --- a/tools/striker-update-cluster +++ b/tools/striker-update-cluster @@ -120,8 +120,6 @@ manage_daemons($anvil, "stop"); # Update systems update_strikers_and_dr($anvil); -die; - # Update DR Host update_nodes($anvil); @@ -342,10 +340,10 @@ sub update_nodes { if (time > $next_log) { - $anvil->Database->get_job_details({job_uuid => $job_uuid}); + $anvil->Job->get_job_details({job_uuid => $job_uuid}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs::job_progress" => $anvil->data->{jobs}{job_progress}, - "jobs::job_status" => $anvil->data->{jobs}{job_status}, + "jobs::job_data" => $anvil->data->{jobs}{job_data}, }}); if ($anvil->data->{jobs}{job_progress} == 0) { @@ -380,6 +378,7 @@ sub update_nodes { $shell_call .= " --clear-cache"; } + $shell_call .= $anvil->Log->switches(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); $job_uuid = ""; $job_uuid = $anvil->Database->insert_or_update_jobs({ @@ -400,10 +399,10 @@ sub update_nodes $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); while ($waiting) { - $anvil->Database->get_job_details({job_uuid => $job_uuid}); + $anvil->Job->get_job_details({job_uuid => $job_uuid}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs::job_progress" => 
$anvil->data->{jobs}{job_progress}, - "jobs::job_status" => $anvil->data->{jobs}{job_status}, + "jobs::job_data" => $anvil->data->{jobs}{job_data}, }}); if ($anvil->data->{jobs}{job_progress} == 100) { @@ -412,7 +411,7 @@ sub update_nodes $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); # Did it reboot? - if ($anvil->data->{jobs}{job_status} eq "rebooted") + if ($anvil->data->{jobs}{job_data} eq "rebooted") { $rebooted = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }}); @@ -821,6 +820,7 @@ sub update_strikers_and_dr { $shell_call .= " --clear-cache"; } + $shell_call .= $anvil->Log->switches(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my $job_uuid = $anvil->Database->insert_or_update_jobs({ debug => 2, @@ -864,10 +864,10 @@ sub update_strikers_and_dr $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { next_log => $next_log }}); while ($waiting) { - $anvil->Database->get_job_details({job_uuid => $job_uuid}); + $anvil->Job->get_job_details({job_uuid => $job_uuid}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs::job_progress" => $anvil->data->{jobs}{job_progress}, - "jobs::job_status" => $anvil->data->{jobs}{job_status}, + "jobs::job_data" => $anvil->data->{jobs}{job_data}, }}); if ($anvil->data->{jobs}{job_progress} == 100) { @@ -876,7 +876,7 @@ sub update_strikers_and_dr $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); # Did it reboot? 
- if ($anvil->data->{jobs}{job_status} eq "rebooted") + if ($anvil->data->{jobs}{job_data} eq "rebooted") { $rebooted = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }}); From 4dc1b0e1172d90efdc9aef9c3a02ef72dbdf62d3 Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 14 Jul 2023 23:00:16 -0400 Subject: [PATCH 09/14] * Added a check to Network->get_company_from_mac() to manually set the company to KVM/qemu if the prefix is 52:54:00. Signed-off-by: digimer --- Anvil/Tools/Network.pm | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Anvil/Tools/Network.pm b/Anvil/Tools/Network.pm index 0ca8bd65..dd76a714 100644 --- a/Anvil/Tools/Network.pm +++ b/Anvil/Tools/Network.pm @@ -1662,6 +1662,11 @@ sub get_company_from_mac $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { company => $company }}); } + if ((not $company) && ($mac =~ /^52:54:00/)) + { + $company = "KVM/qemu"; + } + return($company); } From 458cb267da8a782063140a743428cde9ce7dd078 Mon Sep 17 00:00:00 2001 From: digimer Date: Sat, 15 Jul 2023 00:04:12 -0400 Subject: [PATCH 10/14] * Fixed a bug in Cluster->get_primary_host_uuid() where servers were not loaded before trying to calculate RAM use. Signed-off-by: digimer --- Anvil/Tools/Cluster.pm | 1 + Anvil/Tools/Network.pm | 1 + 2 files changed, 2 insertions(+) diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index 660e6ae9..a50a68ab 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -2603,6 +2603,7 @@ sub get_primary_host_uuid my $node2_ram_in_use_by_servers = 0; # Loop through servers. 
+ $anvil->Database->get_servers({debug => $debug}); foreach my $server_name (sort {$a cmp $b} keys %{$anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}}) { my $server_uuid = $anvil->data->{servers}{anvil_uuid}{$anvil_uuid}{server_name}{$server_name}{server_uuid}; diff --git a/Anvil/Tools/Network.pm b/Anvil/Tools/Network.pm index dd76a714..b5534734 100644 --- a/Anvil/Tools/Network.pm +++ b/Anvil/Tools/Network.pm @@ -1665,6 +1665,7 @@ sub get_company_from_mac if ((not $company) && ($mac =~ /^52:54:00/)) { $company = "KVM/qemu"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { company => $company }}); } return($company); From 751687129add9334cd52defef7f295a30b71591e Mon Sep 17 00:00:00 2001 From: digimer Date: Sat, 15 Jul 2023 16:19:21 -0400 Subject: [PATCH 11/14] * Updated anvil-daemon to not exit on RAM use if anvil-update-system is running. * Fixed a bug in anvil-safe-stop where it wouldn't trigger a migration when the peer is online. * Updated anvil-update-system to set job_data to 'failed' and exit with rc 4 if the os update failed. * Got striker-update-cluster to error out and exit if a called 'anvil-update-system' job failed. Signed-off-by: digimer --- share/words.xml | 4 ++++ tools/anvil-daemon | 12 +++++++++--- tools/anvil-safe-stop | 10 ++++++---- tools/anvil-update-system | 26 ++++++++++++++++++++++++++ tools/striker-update-cluster | 17 +++++++++++++++++ 5 files changed, 62 insertions(+), 7 deletions(-) diff --git a/share/words.xml b/share/words.xml index 57d838df..fae981e7 100644 --- a/share/words.xml +++ b/share/words.xml @@ -603,6 +603,10 @@ The error was: There was a problem with finding a common storage network between: [#!variable!node1_name!#] and: [#!variable!node2_name!#]. Found node 1 to have the IP: [#!variable!node1_ip!#] and node 2: [#!variable!node2_ip!#]. Is there a problem with '/etc/hosts'? Failed to find a network to use for storage replication. Is there a problem with '/etc/hosts'? 
'.]]> + Failed to withdraw the subnode from the node's cluster. Expected the 'anvil-safe-stop' call to return '0', but got: [#!variable!return_code!#]. The output, if anything, was: +======== +#!variable!output!# +======== diff --git a/tools/anvil-daemon b/tools/anvil-daemon index 4643c72c..441fa2c7 100755 --- a/tools/anvil-daemon +++ b/tools/anvil-daemon @@ -252,8 +252,8 @@ sub check_ram }}); if ($problem) { - # See if an 'anvil-sync-shared' job is running and, if so, don't exit. The file copy is - # counted and not an actual problem. + # See if an 'anvil-sync-shared', or an 'anvil-update-system' job is running and, if so, + # don't exit. The file copy or OS update is counted and not an actual problem. $anvil->Database->get_jobs({debug => 2}); foreach my $job_uuid (keys %{$anvil->data->{jobs}{running}}) { @@ -264,7 +264,13 @@ sub check_ram job_progress => $job_progress, }}); - if (($job_progress != 100) && ($job_command =~ /anvil-sync-shared/)) + if ( + ($job_progress != 100) && + ( + ($job_command =~ /anvil-update-system/) or + ($job_command =~ /anvil-sync-shared/) + ) + ) { # Don't abort. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0139", variables => { diff --git a/tools/anvil-safe-stop b/tools/anvil-safe-stop index 6c6b99cf..0ea2962e 100755 --- a/tools/anvil-safe-stop +++ b/tools/anvil-safe-stop @@ -274,9 +274,9 @@ sub process_servers 's2:progress_steps' => $progress_steps, }}); - # If we have one or more local servers, we need to know if both of us are in the cluster. If we're - # not, or the peer isn't, we can't migrate. - my $can_migrate = 0; + # If we have one or more local servers, we need to know if both subnodes are in the node's cluster. + # If we're not, or the peer isn't, we can't migrate. 
+ my $can_migrate = 1; if ($server_count) { my $problem = $anvil->Cluster->parse_cib({debug => 2}); @@ -287,18 +287,20 @@ sub process_servers }}); if ($problem) { + # We're not in the node's cluster, we can't migrate. $can_migrate = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }}); } elsif ((not $anvil->data->{cib}{parsed}{'local'}{ready}) or (not $anvil->data->{cib}{parsed}{peer}{ready})) { + # One of the subnodes is not in the cluster, so we can't migrate. $can_migrate = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { can_migrate => $can_migrate }}); } if ((not $anvil->data->{switches}{'stop-servers'}) && (not $can_migrate)) { - # Abort. + # We would have to stop the servers, and the user didn't tell us to do that, abort. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0372"}); $anvil->Job->update_progress({progress => 100, message => "error_0372"}); $anvil->nice_exit({exit_code => 1}); diff --git a/tools/anvil-update-system b/tools/anvil-update-system index 78b0497a..71898a6a 100755 --- a/tools/anvil-update-system +++ b/tools/anvil-update-system @@ -15,6 +15,7 @@ # 1 = No database connections available. # 2 = The job UUID was passed, but it wasn't valid. # 3 = It looks like the update failed, reset progress to '0'. +# 4 = Failed to withdraw the node from the cluster. # # TODO: # - Rebuild this to be 'striker-update-system' and have it update local strikers and all nodes. @@ -201,6 +202,31 @@ sub run_os_update output => $output, return_code => $return_code, }}); + + if ($return_code) + { + # Something went wrong, abort. + update_progress($anvil, 100, "error_0420,!!return_code!".$return_code."!!,!!output!".$output."!!"); + + # Set the job_data to 'failed' so that striker-update-cluster' knows to abort. 
+ if ($anvil->data->{switches}{'job-uuid'}) + { + my $query = " +UPDATE + jobs +SET + job_data = 'failed', + modified_date = ".$anvil->Database->quote($anvil->Database->refresh_timestamp)." +WHERE + job_uuid = ".$anvil->Database->quote($anvil->data->{switches}{'job-uuid'})." +;"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { query => $query }}); + $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__}); + } + + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, key => "error_0035", variables => { output => $output } }); + $anvil->nice_exit({exit_code => 4}); + } } } diff --git a/tools/striker-update-cluster b/tools/striker-update-cluster index 5e9eabd5..2017e5ec 100755 --- a/tools/striker-update-cluster +++ b/tools/striker-update-cluster @@ -416,6 +416,14 @@ sub update_nodes $rebooted = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }}); } + + # Did it fail? + if ($anvil->data->{jobs}{job_data} eq "failed") + { + # Abort! + print "[ Error ] - There was a problem updating the subnode! Anvil! cluster update aborted.\n"; + $anvil->nice_exit({exit_code => 1}); + } } else { @@ -477,6 +485,7 @@ sub update_nodes }}); my ($output, $error, $return_code) = $anvil->Remote->call({ + debug => 2, shell_call => $shell_call, target => $anvil->data->{peer}{$short_host_name}{access}{ip}, }); @@ -881,6 +890,14 @@ sub update_strikers_and_dr $rebooted = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { rebooted => $rebooted }}); } + + # Did it fail? + if ($anvil->data->{jobs}{job_data} eq "failed") + { + # Abort! + print "[ Error ] - There was a problem updating the system! Anvil! 
cluster update aborted.\n"; + $anvil->nice_exit({exit_code => 1}); + } } else { From d741f4aa6fad95868b7904e9460ad5a7badf7da4 Mon Sep 17 00:00:00 2001 From: digimer Date: Sat, 15 Jul 2023 22:23:30 -0400 Subject: [PATCH 12/14] * Updated anvil-daemon to not exit on high RAM use if any job is running. * Updated anvil-update-system to reboot a target whose kernel updated using an anvil-manage-power job. * Started making striker-update-cluster run as a job (not at all complete). Fixed a bug where the wrong IP was being used when finding access to a target. Signed-off-by: digimer --- share/words.xml | 4 +++ tools/anvil-daemon | 13 +++------ tools/anvil-manage-power | 3 ++ tools/anvil-update-system | 24 ++++++++++------ tools/striker-update-cluster | 56 ++++++++++++++++++++++++++++++++---- 5 files changed, 76 insertions(+), 24 deletions(-) diff --git a/share/words.xml b/share/words.xml index fae981e7..f38586de 100644 --- a/share/words.xml +++ b/share/words.xml @@ -1561,6 +1561,8 @@ Note: This is a permanent action! If you protect this server again later, a full This job is to reload (adjust) a DRBD resource. It's run as a job as it blocks until the adjust is run on all nodes. Update the base operating system. This uses 'dnf' to do an OS update on the host. If this is run on a node, 'anvil-safe-stop' will be called to withdraw the subnode from the node's cluster. If the peer subnode is also offline, hosted servers will be shut down. + Update beginning. Verifying all known machines are accessible... + Starting: [#!variable!program!#]. @@ -2912,6 +2914,8 @@ Proceed? [y/N] The subnode is in the node's cluster, asking it to withdraw. This could take some time if servers need to be migrated. Cleared 'dnf' cache. The kernel was updated, so a reboot is required. Rebooting now. + Registering a job to reboot this host. + Preparing to update the entire Anvil! cluster.
Normal Password diff --git a/tools/anvil-daemon b/tools/anvil-daemon index 441fa2c7..67cd9a4e 100755 --- a/tools/anvil-daemon +++ b/tools/anvil-daemon @@ -252,8 +252,8 @@ sub check_ram }}); if ($problem) { - # See if an 'anvil-sync-shared', or an 'anvil-update-system' job is running and, if so, - # don't exit. The file copy or OS update is counted and not an actual problem. + # See if any jobs are running, and if so, hold because those jobs might be doing things (like + # OS updates or file syncs) that could make anvil-daemon appear to be using more memory. $anvil->Database->get_jobs({debug => 2}); foreach my $job_uuid (keys %{$anvil->data->{jobs}{running}}) { @@ -264,17 +264,12 @@ sub check_ram job_progress => $job_progress, }}); - if ( - ($job_progress != 100) && - ( - ($job_command =~ /anvil-update-system/) or - ($job_command =~ /anvil-sync-shared/) - ) - ) + if (($job_progress != 100) && ($job_progress != 0)) { # Don't abort. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0139", variables => { job_command => $job_command, + job_progress => $job_progress, ram_used => $anvil->Convert->bytes_to_human_readable({'bytes' => $ram_used}), ram_used_bytes => $anvil->Convert->add_commas({number => $ram_used}), }}); diff --git a/tools/anvil-manage-power b/tools/anvil-manage-power index 35dfd04e..7b8eb093 100755 --- a/tools/anvil-manage-power +++ b/tools/anvil-manage-power @@ -186,6 +186,9 @@ sub do_poweroff my ($anvil, $task) = @_; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { task => $task }}); + # In case we're being called by another job, we'll sleep for a few second to let those close out. + sleep 3; + # We'll wait until the system has at least 5 minutes of uptime, unless '--no-wait' was given. my $uptime = $anvil->data->{switches}{'no-wait'} ? 
0 : $anvil->Get->uptime; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { diff --git a/tools/anvil-update-system b/tools/anvil-update-system index 71898a6a..4b4f05de 100755 --- a/tools/anvil-update-system +++ b/tools/anvil-update-system @@ -95,9 +95,6 @@ if ($reboot_needed) { if (not $anvil->data->{switches}{'no-reboot'}) { - # Record that we're going to reboot now. - update_progress($anvil, 100, "message_0317"); - # Clear maintenance mode. $anvil->System->maintenance_mode({set => 0}); @@ -117,13 +114,22 @@ WHERE $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__}); } - sleep 2; - my $shell_call = $anvil->data->{path}{exe}{systemctl}." reboot"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + # Register a job to reboot. + update_progress($anvil, 98, "message_0318"); + my ($job_uuid) = $anvil->Database->insert_or_update_jobs({ + file => $THIS_FILE, + line => __LINE__, + job_command => $anvil->data->{path}{exe}{'anvil-manage-power'}." --reboot -y".$anvil->Log->switches, + job_data => "", + job_name => "reboot::system", + job_title => "job_0009", + job_description => "job_0006", + job_progress => 0, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }}); - my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call, source => $THIS_FILE, line => __LINE__}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code }}); - $anvil->nice_exit({exit_code => 0}); + # Record that we're going to reboot now. 
+ update_progress($anvil, 100, "message_0317"); } else { diff --git a/tools/striker-update-cluster b/tools/striker-update-cluster index 2017e5ec..c93ad7a1 100755 --- a/tools/striker-update-cluster +++ b/tools/striker-update-cluster @@ -79,7 +79,28 @@ if ($anvil->Get->host_type ne "striker") $anvil->nice_exit({exit_code => 1}); } -print "Update beginning. Verifying all known machines are accessible...\n"; +# If we still don't have a job-uuit, go into interactive mode. +$anvil->data->{sys}{progress} = 0; +if ($anvil->data->{switches}{'job-uuid'}) +{ + # Load the job data. + $anvil->Job->clear(); + $anvil->Job->get_job_details({debug => 2}); + $anvil->Job->update_progress({ + progress => $anvil->data->{sys}{progress}++, + job_picked_up_by => $$, + job_picked_up_at => time, + 'print' => 1, + message => "message_0319", + }); +} + +# Update beginning. Verifying all known machines are accessible... +$anvil->Job->update_progress({ + 'print' => 1, + progress => $anvil->data->{sys}{progress}++, + message => "job_0469", +}); my $all_access = verify_access($anvil); if ((not $all_access) && (not $anvil->data->{switches}{force})) { @@ -206,6 +227,11 @@ sub update_nodes 's2:short_host_name' => $short_host_name, }}); print "- Verifying access to subnode: [".$short_host_name."]\n"; + my $matches = $anvil->Network->find_access({ + debug => 2, + target => $host_uuid, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }}); $anvil->data->{peer}{$short_host_name}{access}{ip} = ""; $anvil->data->{peer}{$short_host_name}{access}{network} = ""; foreach my $preferred_network ("bcn", "mn", "ifn", "sn") @@ -298,7 +324,7 @@ sub update_nodes # Log into the target machine and make sure anvil-daemon is running. print "- Making sure anvil-daemon is running... "; - my $shell_call = $anvil->data->{path}{exe}{systemctl}." start anvil-daemon.service"; + my $shell_call = $anvil->data->{path}{exe}{systemctl}." 
enable --now anvil-daemon.service"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my ($output, $error, $return_code) = $anvil->Remote->call({ 'close' => 1, @@ -364,7 +390,10 @@ sub update_nodes # less than the current time minus this start time), if the host reboots as part of # the update. my $reboot_time = time; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { reboot_time => $reboot_time }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + reboot_time => $reboot_time, + short_host_name => $short_host_name, + }}); # Do the OS update. print "- Beginning OS update of: [".$short_host_name."]\n"; @@ -972,6 +1001,12 @@ sub wait_for_reboot 's2:short_host_name' => $short_host_name, }}); + my $matches = $anvil->Network->find_access({ + debug => 2, + target => $host_uuid, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { matches => $matches }}); + # Wait until the node comes back up. print "- The target has been rebooted. We'll wait for the target to come back online.\n"; @@ -982,17 +1017,23 @@ sub wait_for_reboot while($waiting) { # Test access - my $test_access = $anvil->Remote->test_access({target => $anvil->data->{peer}{$short_host_name}{access}{ip}}); + my $target = $anvil->data->{peer}{$short_host_name}{access}{ip}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + target => $target, + short_host_name => $short_host_name, + }}); + my $test_access = $anvil->Remote->test_access({target => $target}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_access => $test_access }}); if ($test_access) { # What's the machine's uptime? 
- my $uptime = $anvil->Get->uptime({target => $anvil->data->{peer}{$short_host_name}{access}{ip}}); + my $uptime = $anvil->Get->uptime({debug => 2, target => $anvil->data->{peer}{$short_host_name}{access}{ip}}); my $time_since_reboot = time - $reboot_time; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { uptime => $uptime, time_since_reboot => $time_since_reboot, + short_host_name => $short_host_name, }}); if (($uptime) && ($uptime < $time_since_reboot)) @@ -1028,6 +1069,9 @@ sub manage_daemons $task = "start" if not $task; + my $do_task = $task eq "start" ? "enable --now" : "stop"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { do_task => $do_task }}); + if ($task eq "stop") { print "Disabling Anvil! daemons on all hosts...\n"; @@ -1069,7 +1113,7 @@ sub manage_daemons # Local foreach my $daemon (@{$daemons}) { - my $shell_call = $anvil->data->{path}{exe}{systemctl}." ".$task." ".$daemon; + my $shell_call = $anvil->data->{path}{exe}{systemctl}." ".$do_task." ".$daemon; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my $output = ""; From 42b44ac86409884830aab258ff6f8a70eb6f6d3b Mon Sep 17 00:00:00 2001 From: digimer Date: Sun, 16 Jul 2023 00:08:53 -0400 Subject: [PATCH 13/14] * Updated the log showing why anvil-daemon isn't exiting when a job is running with the job's current progress. Signed-off-by: digimer --- share/words.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/share/words.xml b/share/words.xml index f38586de..22b825e7 100644 --- a/share/words.xml +++ b/share/words.xml @@ -3613,7 +3613,7 @@ We will sleep a bit and try again. [ Warning ] - Failed to connect to the host: [#!variable!host!#]! Unable to up the resource, so the server may not start. If the peer can't be recovered, manually forcing the local resource(s) to UpToDate may be required. 
[ Warning ] - Timed out waiting for the connections to the peers, and the local resource(s) is not in 'UpToDate' state. Booting the server will likely fail. [ Warning ] - Timed out waiting for the connections to the peers. - [ Warning ] - We're using: [#!variable!ram_used!#] (#!variable!ram_used_bytes!# Bytes). but there is a job: [#!variable!job_command!#] is runnng, which might be why the RAM is high. NOT exiting while this program is running. + [ Warning ] - We're using: [#!variable!ram_used!#] (#!variable!ram_used_bytes!# Bytes). but there is a job: [#!variable!job_command!#] is runnng, (progress is: [#!variable!job_progress!#]), which might be why the RAM is high. NOT exiting while this program is running. [ Warning ] - A no-longer active PID: [#!variable!pid!#] (used by: [#!variable!caller!#] had marked the database: [#!variable!db!#] as "in_use", but the PID is gone now. Reaping the flag. [ Warning ] - We waited for: [#!variable!wait_time!#] seconds for all users of the local database to exit. Giving up waiting and taking the database down now. [ Warning ] - The command: [#!variable!command!#] is still using our database. From f262da544da0eb348b23c27f0f50f6391b252969 Mon Sep 17 00:00:00 2001 From: digimer Date: Sun, 16 Jul 2023 00:18:29 -0400 Subject: [PATCH 14/14] Removed '--best --allowerasing' from dnf update. Signed-off-by: digimer --- tools/anvil-update-system | 2 +- tools/striker-update-cluster | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/anvil-update-system b/tools/anvil-update-system index 4b4f05de..07c1926d 100755 --- a/tools/anvil-update-system +++ b/tools/anvil-update-system @@ -260,7 +260,7 @@ WHERE my $next_step = 0; my $verifying = 0; my $output = ""; - my $shell_call = $anvil->data->{path}{exe}{dnf}." -y update --best --allowerasing; ".$anvil->data->{path}{exe}{echo}." return_code:\$?"; + my $shell_call = $anvil->data->{path}{exe}{dnf}." -y update; ".$anvil->data->{path}{exe}{echo}." 
return_code:\$?"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { shell_call => $shell_call }}); open (my $file_handle, $shell_call." 2>&1 |") or $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, secure => 0, priority => "err", key => "log_0014", variables => { shell_call => $shell_call, error => $! }}); while(<$file_handle>) diff --git a/tools/striker-update-cluster b/tools/striker-update-cluster index c93ad7a1..23a41944 100755 --- a/tools/striker-update-cluster +++ b/tools/striker-update-cluster @@ -794,7 +794,7 @@ sub update_strikers_and_dr print "- watch the progress via the system logs. You can also check wiht 'ps aux | grep dnf'.\n"; if ($host_uuid eq $anvil->Get->host_uuid) { - my $shell_call = $anvil->data->{path}{exe}{dnf}." -y update --best --allowerasing"; + my $shell_call = $anvil->data->{path}{exe}{dnf}." -y update"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {