From faf1399440cb5ef34f9f6386e1ee3564d050f73a Mon Sep 17 00:00:00 2001 From: Digimer Date: Mon, 12 Apr 2021 20:46:30 -0400 Subject: [PATCH] * Continued work on anvil-safe-start. Got it to the point where it detects shared networks with its peer node and waits for all networks to be up. * Fixed a bug in scan-drbd where the volume_uuid wasn't being stored in the proper hash, breaking insertions into scan_drbd_peers in some cases. * Updated System->pids() to work with remote targets (will be used later to check for parallel runs of anvil-safe-start). Signed-off-by: Digimer --- Anvil/Tools/System.pm | 112 ++++++++++++----- scancore-agents/scan-drbd/scan-drbd | 12 +- share/words.xml | 5 + tools/anvil-safe-start | 184 +++++++++++++++++++++++++++- 4 files changed, 274 insertions(+), 39 deletions(-) diff --git a/Anvil/Tools/System.pm b/Anvil/Tools/System.pm index 8c51e3fa..10c79ecb 100644 --- a/Anvil/Tools/System.pm +++ b/Anvil/Tools/System.pm @@ -36,6 +36,7 @@ my $THIS_FILE = "System.pm"; # maintenance_mode # manage_authorized_keys # manage_firewall +# pids # parse_lshw # read_ssh_config # reload_daemon @@ -3539,7 +3540,7 @@ sub manage_firewall =head2 pids -This parses C<< ps aux >> and stores the information about running programs in C<< pids:::: >>. +This parses C<< ps aux >> and stores the information about running programs in C<< pids:::: >>. If called against a remote host, the data is stored in C<< remote_pids:::: >>. Optionally, if the C<< program_name >> parameter is set, an array of PIDs for that program will be returned. @@ -3553,6 +3554,22 @@ If set to C<< 1 >>, the PID of this program is ignored. This is an option string that is searched for in the 'command' portion of the 'ps aux' call. If this string matches, the PID is added to the array reference returned by this method. +=head3 password (optional) + +If you are testing IPMI from a remote machine, this is the password used to connect to that machine. 
If not passed, an attempt to connect with passwordless SSH will be made (but this won't be the case in most instances). Ignored if C<< target >> is not given. + +=head3 port (optional, default 22) + +This is the TCP port number to use if connecting to a remote machine over SSH. Ignored if C<< target >> is not given. + +=head3 remote_user (optional, default root) + +If C<< target >> is set, this is the user we will use when logging in to the target machine. + +=head3 target (optional) + +This is the IP address or (resolvable) host name of the target machine whose running processes are to be read. + =cut sub pids { @@ -3562,23 +3579,58 @@ sub pids my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "System->pids()" }}); - my $ignore_me = defined $parameter->{ignore_me} ? $parameter->{ignore_me} : 0; - my $program_name = defined $parameter->{program_name} ? $parameter->{program_name} : ""; + my $ignore_me = defined $parameter->{ignore_me} ? $parameter->{ignore_me} : 0; + my $program_name = defined $parameter->{program_name} ? $parameter->{program_name} : ""; + my $password = defined $parameter->{password} ? $parameter->{password} : ""; + my $port = defined $parameter->{port} ? $parameter->{port} : ""; + my $remote_user = defined $parameter->{remote_user} ? $parameter->{remote_user} : ""; + my $target = defined $parameter->{target} ? $parameter->{target} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { ignore_me => $ignore_me, program_name => $program_name, }}); - # If we stored this data before, delete it as it is now stale. - if (exists $anvil->data->{pids}) + my $my_pid = $$; + my $pids = []; + my $shell_call = $anvil->data->{path}{exe}{ps}." 
aux"; + my $pid_key = "pids"; + my $output = ""; + my $return_code = ""; + if ($anvil->Network->is_local({host => $target})) { - delete $anvil->data->{pids}; + + # Local call + ($output, $return_code) = $anvil->System->call({debug => $debug, shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + output => $output, + return_code => $return_code, + }}); } - my $my_pid = $$; - my $pids = []; - my $shell_call = $anvil->data->{path}{exe}{ps}." aux"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { shell_call => $shell_call }}); - my ($output, $return_code) = $anvil->System->call({debug => $debug, shell_call => $shell_call}); + else + { + # Remote call, clear the 'my_pid' + $my_pid = ""; + $pid_key = "remote_pids"; + ($output, my $error, $return_code) = $anvil->Remote->call({ + debug => $debug, + shell_call => $shell_call, + target => $target, + port => $port, + password => $password, + remote_user => $remote_user, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + error => $error, + output => $output, + }}); + } + + # If we stored this data before, delete it as it is now stale. 
+ if (exists $anvil->data->{$pid_key}) + { + delete $anvil->data->{$pid_key}; + } + foreach my $line (split/\n/, $output) { $line = $anvil->Words->clean_spaces({ string => $line }); @@ -3628,26 +3680,26 @@ sub pids } # Store by PID - $anvil->data->{pids}{$pid}{user} = $user; - $anvil->data->{pids}{$pid}{cpu} = $cpu; - $anvil->data->{pids}{$pid}{memory} = $memory; - $anvil->data->{pids}{$pid}{virtual_memory_size} = $virtual_memory_size; - $anvil->data->{pids}{$pid}{resident_set_size} = $resident_set_size; - $anvil->data->{pids}{$pid}{control_terminal} = $control_terminal; - $anvil->data->{pids}{$pid}{state_codes} = $state_codes; - $anvil->data->{pids}{$pid}{start_time} = $start_time; - $anvil->data->{pids}{$pid}{'time'} = $time; - $anvil->data->{pids}{$pid}{command} = $command; + $anvil->data->{$pid_key}{$pid}{user} = $user; + $anvil->data->{$pid_key}{$pid}{cpu} = $cpu; + $anvil->data->{$pid_key}{$pid}{memory} = $memory; + $anvil->data->{$pid_key}{$pid}{virtual_memory_size} = $virtual_memory_size; + $anvil->data->{$pid_key}{$pid}{resident_set_size} = $resident_set_size; + $anvil->data->{$pid_key}{$pid}{control_terminal} = $control_terminal; + $anvil->data->{$pid_key}{$pid}{state_codes} = $state_codes; + $anvil->data->{$pid_key}{$pid}{start_time} = $start_time; + $anvil->data->{$pid_key}{$pid}{'time'} = $time; + $anvil->data->{$pid_key}{$pid}{command} = $command; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - "pids::${pid}::cpu" => $anvil->data->{pids}{$pid}{cpu}, - "pids::${pid}::memory" => $anvil->data->{pids}{$pid}{memory}, - "pids::${pid}::virtual_memory_size" => $anvil->data->{pids}{$pid}{virtual_memory_size}, - "pids::${pid}::resident_set_size" => $anvil->data->{pids}{$pid}{resident_set_size}, - "pids::${pid}::control_terminal" => $anvil->data->{pids}{$pid}{control_terminal}, - "pids::${pid}::state_codes" => $anvil->data->{pids}{$pid}{state_codes}, - "pids::${pid}::start_time" => 
$anvil->data->{pids}{$pid}{start_time}, - "pids::${pid}::time" => $anvil->data->{pids}{$pid}{'time'}, - "pids::${pid}::command" => $anvil->data->{pids}{$pid}{command}, + "${pid_key}::${pid}::cpu" => $anvil->data->{$pid_key}{$pid}{cpu}, + "${pid_key}::${pid}::memory" => $anvil->data->{$pid_key}{$pid}{memory}, + "${pid_key}::${pid}::virtual_memory_size" => $anvil->data->{$pid_key}{$pid}{virtual_memory_size}, + "${pid_key}::${pid}::resident_set_size" => $anvil->data->{$pid_key}{$pid}{resident_set_size}, + "${pid_key}::${pid}::control_terminal" => $anvil->data->{$pid_key}{$pid}{control_terminal}, + "${pid_key}::${pid}::state_codes" => $anvil->data->{$pid_key}{$pid}{state_codes}, + "${pid_key}::${pid}::start_time" => $anvil->data->{$pid_key}{$pid}{start_time}, + "${pid_key}::${pid}::time" => $anvil->data->{$pid_key}{$pid}{'time'}, + "${pid_key}::${pid}::command" => $anvil->data->{$pid_key}{$pid}{command}, }}); if ($command =~ /$program_name/) diff --git a/scancore-agents/scan-drbd/scan-drbd b/scancore-agents/scan-drbd/scan-drbd index 295994f3..da2934e0 100755 --- a/scancore-agents/scan-drbd/scan-drbd +++ b/scancore-agents/scan-drbd/scan-drbd @@ -741,6 +741,8 @@ WHERE scan_drbd_peer_scan_drbd_volume_uuid => $scan_drbd_peer_scan_drbd_volume_uuid, }}); + die if not $scan_drbd_peer_scan_drbd_volume_uuid; + my $query = " INSERT INTO scan_drbd_peers @@ -830,7 +832,7 @@ sub process_volumes my $update = 0; if ($new_scan_drbd_volume_device_path ne $old_scan_drbd_volume_device_path) { - $update = 1; + $update = 1; my $variables = { resource_name => $scan_drbd_resource_name, volume_number => $scan_drbd_volume_number, @@ -853,7 +855,7 @@ sub process_volumes if ($new_scan_drbd_volume_device_minor ne $old_scan_drbd_volume_device_minor) { # The minor-number changing is almost certainly a human-changed thing. 
- $update = 1; + $update = 1; my $variables = { resource_name => $scan_drbd_resource_name, volume_number => $scan_drbd_volume_number, @@ -866,7 +868,7 @@ sub process_volumes if ($new_scan_drbd_volume_size ne $old_scan_drbd_volume_size) { # Looks like the volume was resized. - $update = 1; + $update = 1; my $variables = { resource_name => $scan_drbd_resource_name, volume_number => $scan_drbd_volume_number, @@ -1252,14 +1254,16 @@ WHERE # Record the scan_drbd_volume_uuid in an easy to find place for later when looking for changes. my $on_resource_name = $anvil->data->{old}{uuid_to_resource}{$scan_drbd_volume_scan_drbd_resource_uuid}{name}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { on_resource_name => $on_resource_name }}); - + $anvil->data->{volume_to_uuid}{$scan_drbd_volume_uuid}{number} = $scan_drbd_volume_number; $anvil->data->{volume_to_uuid}{$scan_drbd_volume_uuid}{on_resource} = $scan_drbd_volume_scan_drbd_resource_uuid; $anvil->data->{volume_to_uuid}{$on_resource_name}{volume_number}{$scan_drbd_volume_number}{uuid} = $scan_drbd_volume_uuid; + $anvil->data->{volume_uuid}{resource}{$on_resource_name}{volume}{$scan_drbd_volume_number}{uuid} = $scan_drbd_volume_uuid; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "volume_to_uuid::${scan_drbd_volume_uuid}::number" => $anvil->data->{volume_to_uuid}{$scan_drbd_volume_uuid}{number}, "volume_to_uuid::${scan_drbd_volume_uuid}::on_resource" => $anvil->data->{volume_to_uuid}{$scan_drbd_volume_uuid}{on_resource}, "volume_to_uuid::${on_resource_name}::volume_number::${scan_drbd_volume_number}::uuid" => $anvil->data->{volume_to_uuid}{$on_resource_name}{volume_number}{$scan_drbd_volume_number}{uuid}, + "volume_uuid::resource::${on_resource_name}::volume::${scan_drbd_volume_number}::uuid" => $anvil->data->{volume_uuid}{resource}{$on_resource_name}{volume}{$scan_drbd_volume_number}{uuid}, }}); } undef $count; diff --git a/share/words.xml 
b/share/words.xml index 98608447..af09938f 100644 --- a/share/words.xml +++ b/share/words.xml @@ -1446,6 +1446,10 @@ The file: [#!variable!file!#] needs to be updated. The difference is: Disabled 'anvil-safe-start' locally on this node. Disabled 'anvil-safe-start' on both nodes in this Anvil! system. This node is not in an Anvil! yet, so there's no reason to run this program. + Successful access over the network: [#!variable!network!#] to the peer: [#!variable!peer!#] using the peer's IP: [#!variable!peer_ip!#]. + Failed to access the peer: [#!variable!peer!#] over the network: [#!variable!network!#] via the peer's IP: [#!variable!peer_ip!#]. + At least one network connection to the peer: [#!variable!peer!#] is still down. Waiting a bit and then will check again. + All connections to the peer: [#!variable!peer!#] are up! The host name: [#!variable!target!#] does not resolve to an IP address. @@ -1774,6 +1778,7 @@ Are you sure that you want to delete the server: [#!variable!server_name!#]? [Ty The 'anvil-safe-start' tool is disabled on this node and enabled on the peer. The 'anvil-safe-start' tool is disabled, exiting. Use '--force' to run anyway. The 'anvil-safe-start' tool is disabled, but '--force' was used, so proceeding. + It appears that another instance of 'anvil-safe-start' is already running. Please wait for it to complete (or kill it manually if needed). Saved the mail server information successfully! diff --git a/tools/anvil-safe-start b/tools/anvil-safe-start index 77ad7f64..b7e8a496 100755 --- a/tools/anvil-safe-start +++ b/tools/anvil-safe-start @@ -18,6 +18,7 @@ use strict; use warnings; use Anvil::Tools; +use NetAddr::IP; require POSIX; my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0]; @@ -79,15 +80,160 @@ if (not $anvil->data->{sys}{database}{connections}) } } +### Process +# 1. Check if I am enabled and that no other copies are running. +# 2. Can I ping my peer on all three networks? Loop until true. +# - Wait here indefinitely +# 3. ... +# 6. 
Using Start Groups/Delays (and ignoring 'clean' off VMs), boot servers. + + # Check to see if we should run. Also checks/sets enable/disable requests. prerun_checks($anvil); +# Wait until I can ping the peer on all three networks. This will not return until access is available on all +# networks. There is no timeout. +wait_for_access($anvil); + + $anvil->nice_exit({exit_code => 0}); ############################################################################################################# # Functions # ############################################################################################################# +# Check for which networks we have and verify that we can ping our peer on each. This function will not +# return until all networks are up. +sub wait_for_access +{ + my ($anvil) = @_; + + my $host_uuid = $anvil->Get->host_uuid(); + my $short_host_name = $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}; + my $peer_host_uuid = $anvil->data->{sys}{peer_host_uuid}; + my $peer_short_host_name = $anvil->data->{hosts}{host_uuid}{$peer_host_uuid}{short_host_name}; + my $peer_password = $anvil->data->{sys}{peer_password}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + host_uuid => $host_uuid, + short_host_name => $short_host_name, + peer_host_uuid => $peer_host_uuid, + peer_short_host_name => $peer_short_host_name, + peer_password => $anvil->Log->is_secure($peer_password), + }}); + + my $waiting = 1; + while ($waiting) + { + # This will get set back to '1' if any access test below fails. + $waiting = 0; + + # Load IPs (again, to catch changes that might be delaying startup) + $anvil->Network->load_ips({ + clear => 1, + host => $short_host_name, + host_uuid => $host_uuid, + + }); + $anvil->Network->load_ips({ + clear => 1, + host => $peer_short_host_name, + host_uuid => $peer_host_uuid, + + }); + + # Loop through our interfaces and then loop our peers. Test access over them and set + # 'waiting' back to '1' if the connection fails. 
+ foreach my $interface (sort {$a cmp $b} keys %{$anvil->data->{network}{$short_host_name}{interface}}) + { + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + interface => $interface, + waiting => $waiting, + }}); + + # Only care about our networks. + next if $waiting; + if (($interface !~ /^bcn/) && ($interface !~ /^sn/) && ($interface !~ /^ifn/)) + { + # Not an interface we care about + next; + } + + my $this_network = ($interface =~ /^(.*?)_/)[0]; + my $ip_address = $anvil->data->{network}{$short_host_name}{interface}{$interface}{ip}; + my $subnet_mask = $anvil->data->{network}{$short_host_name}{interface}{$interface}{subnet_mask}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:this_network' => $this_network, + 's2:ip_address' => $ip_address, + 's3:subnet_mask' => $subnet_mask, + }}); + + ### NOTE: I know I could match interface names, but that's not certain enough. It's + ### possible (if unlikely) that the network name+number differs on our peer. So + ### this is safer. + # Loop through my peer's interfaces and see if we're sharing this one. + my $local_network = NetAddr::IP->new($ip_address."/".$subnet_mask); + my $peer_match_found = 0; + foreach my $peer_interface (sort {$a cmp $b} keys %{$anvil->data->{network}{$peer_short_host_name}{interface}}) + { + last if $peer_match_found; + my $peer_ip_address = $anvil->data->{network}{$peer_short_host_name}{interface}{$peer_interface}{ip}; + my $peer_subnet_mask = $anvil->data->{network}{$peer_short_host_name}{interface}{$peer_interface}{subnet_mask}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + peer_interface => $peer_interface, + peer_ip_address => $peer_ip_address, + peer_subnet_mask => $peer_subnet_mask, + }}); + + # Is this the matching network? 
+ next if $subnet_mask ne $peer_subnet_mask; + + my $peer_network = NetAddr::IP->new($peer_ip_address."/".$peer_subnet_mask); + if ($peer_network->within($local_network)) + { + # Match, test access. + $peer_match_found = 1; + my $access = $anvil->Remote->test_access({ + target => $peer_ip_address, + password => $peer_password, + }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }}); + if ($access) + { + # This network is good. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0604", variables => { + peer => $peer_short_host_name, + network => $this_network, + peer_ip => $peer_ip_address, + }}); + } + else + { + # No access, wait and try it again. + $waiting = 1; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0605", variables => { + peer => $peer_short_host_name, + network => $this_network, + peer_ip => $peer_ip_address, + }}); + } + } + + } + } + + if ($waiting) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0606", variables => { peer => $peer_short_host_name }}); + sleep 5; + } + } + + # All networks are up. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, priority => "alert", key => "log_0607", variables => { peer => $peer_short_host_name }}); + + return(0); +} + # This checks to verify that we're a node, and if so, if this tool is enabled. If it's disabled or this isn't # a node, this method will exit. sub prerun_checks @@ -123,7 +269,17 @@ sub prerun_checks my $node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; my $node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid}; - my $peer_host_uuid = $host_uuid eq $node1_host_uuid ? 
$node2_host_uuid : $node1_host_uuid; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + node1_host_uuid => $node1_host_uuid, + node2_host_uuid => $node2_host_uuid, + }}); + + $anvil->data->{sys}{peer_host_uuid} = $host_uuid eq $node1_host_uuid ? $node2_host_uuid : $node1_host_uuid; + $anvil->data->{sys}{peer_password} = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_password}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + "sys::peer_host_uuid" => $anvil->data->{sys}{peer_host_uuid}, + "sys::peer_password" => $anvil->Log->is_secure($anvil->data->{sys}{peer_password}), + }}); # Are we being asked to enable or disable? my $nodes = [$host_uuid]; @@ -147,12 +303,12 @@ sub prerun_checks if (not $anvil->data->{switches}{'local'}) { # Add our peer as well. - push @{$nodes}, $peer_host_uuid; + push @{$nodes}, $anvil->data->{sys}{peer_host_uuid}; } foreach my $host_uuid (@{$nodes}) { my ($variable_uuid) = $anvil->Database->insert_or_update_variables({ - debug => 2, + debug => 3, variable_name => "tool::anvil-safe-start::enabled", variable_value => $set_to, variable_default => 1, @@ -171,7 +327,7 @@ sub prerun_checks # Read my variables. my ($local_enabled, $variable_uuid, $mtime, $modified_date) = $anvil->Database->read_variable({ - debug => 2, + debug => 3, variable_name => "tool::anvil-safe-start::enabled", variable_source_table => "hosts", variable_source_uuid => $host_uuid, @@ -192,9 +348,10 @@ sub prerun_checks { # Yes, check our peer as well. 
my ($peer_enabled, $variable_uuid, $mtime, $modified_date) = $anvil->Database->read_variable({ + debug => 3, variable_name => "tool::anvil-safe-start::enabled", variable_source_table => "hosts", - variable_source_uuid => $peer_host_uuid, + variable_source_uuid => $anvil->data->{sys}{peer_host_uuid}, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_enabled => $peer_enabled, @@ -238,6 +395,22 @@ sub prerun_checks $anvil->nice_exit({exit_code => 0}); } + # Is another instance running? + my $pids = $anvil->System->pids({ + debug => 3, + ignore_me => 1, + program_name => $THIS_FILE, + }); + my $other_instances = @{$pids}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { other_instances => $other_instances }}); + + if ($other_instances) + { + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "message_0233"}); + $anvil->nice_exit({exit_code => 0}); + } + + # Last test, enabled or forced? if (not $local_enabled) { # Disabled. Forced? @@ -245,6 +418,7 @@ sub prerun_checks { # Forced, run anyway. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "message_0232"}); + return(0); } else {