diff --git a/ocf/alteeve/server b/ocf/alteeve/server index 995cc1f2..087dc97d 100755 --- a/ocf/alteeve/server +++ b/ocf/alteeve/server @@ -271,7 +271,7 @@ $anvil->nice_exit({exit_code => 255}); # This will either verify that 'libvirtd' and 'drbd' are running (and start them if not) is called with # "start". If called with "stop", a check is made on both nodes. If all VMs are gone, "libvirtd" and "drbd" # are stopped. -sub check_services +sub check_daemons { my ($anvil, $task) = @_; @@ -368,7 +368,10 @@ sub check_services if ($return_code eq "3") { # Stopped, start it.. - print "Starting: [".$daemon."] on: [".$peer_name."]\n"; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0486", variables => { + daemon => $daemon, + host => $peer_name, + }}); my ($output, $error, $return_code) = $anvil->Remote->call({ target => $peer_name, shell_call => $anvil->data->{path}{exe}{systemctl}." start ".$daemon, @@ -395,22 +398,34 @@ sub check_services if ($return_code eq "0") { $running = 1; - print "Verified start of: [".$daemon."] on: [".$peer_name."]\n"; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0487", variables => { + daemon => $daemon, + host => $peer_name, + }}); } else { $loops++; - if ($loops > 3) + if ($loops > 5) { + ### TODO: We may want to NOT die here, if + ### we're booting a server (though we + ### will if we're migrating). # Give up - print "[ Error ] - Start of: [".$daemon."] on: [".$peer_name."] appears to have failed!\n"; - die; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0135", variables => { + daemon => $daemon, + host => $peer_name, + }}); + $anvil->nice_exit({exit_code => 1}); } else { # Wait for a second. 
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0488", variables => { + daemon => $daemon, + host => $peer_name, + }}); sleep 1; - print "Waiting for: [".$daemon."] to start on: [".$peer_name."]...\n"; } } } @@ -418,7 +433,10 @@ sub check_services elsif ($return_code eq "0") { # Running, nothing to do. - print "The daemon: [".$daemon."] is already running on: [".$peer_name."].\n"; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0485", variables => { + daemon => $daemon, + host => $peer_name, + }}); } } } @@ -484,7 +502,16 @@ sub check_services }}); if ((not $local_vm_count) && (not $remote_vm_count)) { - print "No servers running on either node, stopping daemons.\n"; + if ($peer_ready) + { + # No servers running on either node. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0490"}); + } + else + { + # No servers running here and the peer is not in the cluster. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0491"}); + } foreach my $daemon ("libvirtd.service", "drbd.service") { my $running_local = 0; @@ -498,12 +525,12 @@ sub check_services if ($local_return_code eq "3") { # Already stopped. - print "The daemon: [".$daemon."] is already stopped locally.\n"; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0492", variables => { daemon => $daemon }}); } elsif ($local_return_code eq "0") { # Running, stop it. - print "Stopping: [".$daemon."] locally\n"; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0493", variables => { daemon => $daemon }}); my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." 
stop ".$daemon}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, @@ -523,12 +550,18 @@ sub check_services if ($remote_return_code eq "3") { # Already stopped. - print "The daemon: [".$daemon."] is already stopped on: [".$peer_name."].\n"; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0494", variables => { + daemon => $daemon, + host => $peer_name, + }}); } elsif ($remote_return_code eq "0") { # Running, stop it. - print "Stopping: [".$daemon."] on: [".$peer_name."]\n"; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0495", variables => { + daemon => $daemon, + host => $peer_name, + }}); my ($output, $error, $return_code) = $anvil->Remote->call({ target => $peer_name, shell_call => $anvil->data->{path}{exe}{systemctl}." stop ".$daemon, @@ -541,9 +574,13 @@ sub check_services } } } + else + { + # Servers are still running, don't stop the daemons. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0496"}); + } } - return(0); } @@ -576,7 +613,7 @@ sub start_server my ($anvil) = @_; # Before we do anything, make sure that 'libvirtd' and 'drbd' services are running. - check_services($anvil, "start"); + check_daemons($anvil, "start"); # Start procedure; # 1. Read the XML definition file and find the backing storage and bridges. Soft error if read fails. @@ -903,6 +940,9 @@ sub stop_server stop_drbd_resource($anvil); } + # If this was the last running server, stop the daemons. + check_daemons($anvil, "stop"); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "log_0324", variables => { server => $server }}); $anvil->nice_exit({exit_code => 0}); } @@ -934,6 +974,9 @@ sub migrate_server { my ($anvil) = @_; + # Before migrating, make sure the daemons are running on the peer. 
+ check_daemons($anvil, "start"); + ### NOTE: For now, we're not going to block if the target is not UpToDate. There are times when a ### user might want to do this (ie: sync will be done soon and the need to evacuate the node ### ASAP is high). Maybe we'll enforce this and require a '--force' switch later? diff --git a/share/words.xml b/share/words.xml index b3b16c53..b37b9c88 100644 --- a/share/words.xml +++ b/share/words.xml @@ -194,6 +194,7 @@ The error was: Failed to remove the symlink: [#!variable!symlink!#]! Failed to read or parse the CIB! Is pacemaker running? Failed to start the daemon: [#!variable!daemon!#] on the local system, unable to boot the server. + Failed to start the daemon: [#!variable!daemon!#] on [#!variable!host!#], unable to boot the server. Current Network Interfaces and States @@ -712,7 +713,7 @@ We will keep looking. We were asked to promote: [#!variable!server!#], which makes no sense and is not supported. Ignoreing. We were asked to demote: [#!variable!server!#], which makes no sense and is not supported. Ignoreing. We were asked to notify, but this is not a promotable (we're stateless) agent. Ignoring. - We were invoked with an unexpected (or no) command. Environment variables and arguments below. + We were invoked with an unexpected (or no) command. Environment variables and arguments have been logged. We've been asked to start the server: [#!variable!server!#]. It appears that the list the currently running servers returned a non-zero return code: [#!variable!return_code!#]. We will proceed as we may be able to fix this. The output, if any, was: [#!variable!output!#]. Sanity checks passed, ready to start: [#!variable!server!#]. @@ -917,6 +918,17 @@ If the targets are unique, did you copy the full database directory? A unique id Verifying that the daemon: [#!variable!daemon!#] has started. Waiting for the daemon: [#!variable!daemon!#] to start... The daemon: [#!variable!daemon!#] was already running locally, no need to start. 
+ Starting the daemon: [#!variable!daemon!#] on: [#!variable!host!#]. + Verifying that the daemon: [#!variable!daemon!#] has started on: [#!variable!host!#]. + Waiting for the daemon: [#!variable!daemon!#] to start on: [#!variable!host!#]... + The daemon: [#!variable!daemon!#] was already running on: [#!variable!host!#], no need to start. + There are no servers running on either node, stopping daemons. + There are no servers running locally and the peer is not in the cluster, stopping daemons. + The daemon: [#!variable!daemon!#] is already stopped locally, nothing to do. + Stopping the daemon: [#!variable!daemon!#] locally. + The daemon: [#!variable!daemon!#] is already stopped on: [#!variable!host!#], nothing to do. + Stopping the daemon: [#!variable!daemon!#] on: [#!variable!host!#]. + One or more servers are still running on the Anvil!, not stopping daemons. The host name: [#!variable!target!#] does not resolve to an IP address. diff --git a/tools/test.pl b/tools/test.pl index 850c89b2..1aa140f9 100755 --- a/tools/test.pl +++ b/tools/test.pl @@ -25,261 +25,5 @@ print "Connecting to the database(s);\n"; $anvil->Database->connect(); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, secure => 0, key => "log_0132"}); -$anvil->data->{switches}{start} = ""; -$anvil->data->{switches}{stop} = ""; $anvil->Get->switches; -my $peer = $anvil->Cluster->get_peers(); -my $i_am = $anvil->data->{sys}{anvil}{i_am}; -my $peer_is = $anvil->data->{sys}{anvil}{peer_is}; -my $my_name = $i_am ? $anvil->data->{sys}{anvil}{$i_am}{host_name} : "--"; -my $peer_name = $peer_is ? $anvil->data->{sys}{anvil}{$peer_is}{host_name} : "--"; -print "I am: .. [".$i_am."], my host name is: . 
[".$my_name."]\n"; -print "Peer is: [".$peer_is."], peer host name is: [".$peer_name."]\n"; -print "- Returned peer: [".$peer."]\n"; - -if ($anvil->data->{switches}{start}) -{ - foreach my $daemon ("libvirtd.service", "drbd.service") - { - my $running_local = 0; - my $running_peer = 0; - - my ($local_output, $local_return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - local_output => $local_output, - local_return_code => $local_return_code, - }}); - if ($local_return_code eq "3") - { - # Stopped, start it.. - print "Starting: [".$daemon."] locally\n"; - my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." start ".$daemon}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - return_code => $return_code, - }}); - - my $loops = 0; - my $running = 0; - until ($running) - { - my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - return_code => $return_code, - }}); - if ($return_code eq "0") - { - $running = 1; - print "Verified start of: [".$daemon."]\n"; - } - else - { - $loops++; - if ($loops > 3) - { - # Give up - print "[ Error ] - Start of: [".$daemon."] appears to have failed!\n"; - die; - } - else - { - # Wait for a second. - sleep 1; - print "Waiting for: [".$daemon."] to start...\n"; - } - } - } - } - elsif ($local_return_code eq "0") - { - # Running, nothing to do. - print "The daemon: [".$daemon."] is already running locally.\n"; - } - - my ($remote_output, $remote_error, $remote_return_code) = $anvil->Remote->call({ - target => $peer_name, - shell_call => $anvil->data->{path}{exe}{systemctl}." 
status ".$daemon, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - remote_output => $remote_output, - remote_error => $remote_error, - remote_return_code => $remote_return_code, - }}); - if ($remote_return_code eq "3") - { - # Stopped, start it.. - print "Starting: [".$daemon."] on: [".$peer_name."]\n"; - my ($output, $error, $return_code) = $anvil->Remote->call({ - target => $peer_name, - shell_call => $anvil->data->{path}{exe}{systemctl}." start ".$daemon, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - error => $error, - return_code => $return_code, - }}); - - my $loops = 0; - my $running = 0; - until ($running) - { - my ($output, $error, $return_code) = $anvil->Remote->call({ - target => $peer_name, - shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - error => $error, - return_code => $return_code, - }}); - if ($return_code eq "0") - { - $running = 1; - print "Verified start of: [".$daemon."] on: [".$peer_name."]\n"; - } - else - { - $loops++; - if ($loops > 3) - { - # Give up - print "[ Error ] - Start of: [".$daemon."] on: [".$peer_name."] appears to have failed!\n"; - die; - } - else - { - # Wait for a second. - sleep 1; - print "Waiting for: [".$daemon."] to start on: [".$peer_name."]...\n"; - } - } - } - } - elsif ($remote_return_code eq "0") - { - # Running, nothing to do. - print "The daemon: [".$daemon."] is already running on: [".$peer_name."].\n"; - } - } -} -elsif ($anvil->data->{switches}{stop}) -{ - my $stop = 0; - - # Check both nodes if a server is running on either node. - my $local_vm_count = 0; - my $remote_vm_count = 0; - - # Call virsh list --all - my ($local_output, $local_return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{virsh}." 
list --all"}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - local_output => $local_output, - local_return_code => $local_return_code, - }}); - if (not $local_return_code) - { - # Parse output - foreach my $line (split/\n/, $local_output) - { - $line = $anvil->Words->clean_spaces({ string => $line }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); - - if ($line =~ /(\d+)\s+(.*?)\s+running/) - { - $local_vm_count++; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_vm_count => $local_vm_count }}); - } - } - } - - my ($remote_output, $remote_error, $remote_return_code) = $anvil->Remote->call({ - target => $peer_name, - shell_call => $anvil->data->{path}{exe}{virsh}." list --all", - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - remote_output => $remote_output, - remote_error => $remote_error, - remote_return_code => $remote_return_code, - }}); - if (not $remote_return_code) - { - # Parse output - foreach my $line (split/\n/, $remote_output) - { - $line = $anvil->Words->clean_spaces({ string => $line }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); - - if ($line =~ /(\d+)\s+(.*?)\s+running/) - { - $remote_vm_count++; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { remote_vm_count => $remote_vm_count }}); - } - } - } - - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - local_vm_count => $local_vm_count, - remote_vm_count => $remote_vm_count, - }}); - if ((not $local_vm_count) && (not $remote_vm_count)) - { - print "No servers running on either node, stopping daemons.\n"; - foreach my $daemon ("libvirtd.service", "drbd.service") - { - my $running_local = 0; - my $running_peer = 0; - - my ($local_output, $local_return_code) = 
$anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - local_output => $local_output, - local_return_code => $local_return_code, - }}); - if ($local_return_code eq "3") - { - # Already stopped. - print "The daemon: [".$daemon."] is already stopped locally.\n"; - } - elsif ($local_return_code eq "0") - { - # Running, stop it. - print "Stopping: [".$daemon."] locally\n"; - my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." stop ".$daemon}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - return_code => $return_code, - }}); - } - - my ($remote_output, $remote_error, $remote_return_code) = $anvil->Remote->call({ - target => $peer_name, - shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - remote_output => $remote_output, - remote_error => $remote_error, - remote_return_code => $remote_return_code, - }}); - if ($remote_return_code eq "3") - { - # Already stopped. - print "The daemon: [".$daemon."] is already stopped on: [".$peer_name."].\n"; - } - elsif ($remote_return_code eq "0") - { - # Running, stop it. - print "Stopping: [".$daemon."] on: [".$peer_name."]\n"; - my ($output, $error, $return_code) = $anvil->Remote->call({ - target => $peer_name, - shell_call => $anvil->data->{path}{exe}{systemctl}." stop ".$daemon, - }); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - error => $error, - return_code => $return_code, - }}); - } - } - } -}