#!/usr/bin/perl # # This is the resource agent used to manage servers on the Anvil! Intelligent Availability platform. # # License: GNU General Public License (GPL) v2+ # (c) 1997-2021 - Alteeve's Niche! Inc. # # WARNING: This is a pretty purpose-specific resource agent. No effort was made to test this on an rgmanager # cluster or on any configuration outside how the Anvil! m3 uses it. If you plan to adapt it to # another purpose, let us know and we'll try to help. # # NOTE: This method, for the sake of speed and reliability, does not connect to the Anvil! database. If you # do work on this RA, be sure that a check is made for database connections before SQL calls are made # in module methods. # # Based on: https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc # # Error types from pacemaker's perspective; # # - Soft Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource # in-place - usually by restarting the resource on the same node. # - Hard Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource # which failed with this error by restarting the resource on a different node. # - Fatal Error - This is a cluster-wide error, it would make no sense to recover such a resource on a # different node, let alone in-place. When a resource fails with this error, Pacemaker will # attempt to shut down the resource, and wait for administrator intervention. # # Exit codes; # 0 - OCF_SUCCESS # - The action completed successfully. This is the expected return code for any successful start, stop, # migrate_to, meta_data, help, and usage action. # - For monitor, however, a modified convention applies: # - If the server is running we return, OCF_SUCCESS. If not running and gracefully stopped or migrated # off, return OCF_NOT_RUNNING. # # 1 - OCF_ERR_GENERIC # - The action returned a generic error. 
This is used only when none of the more specific error codes, # defined below, accurately describes the problem. # - Pacemaker interprets this exit code as a soft error. # # 2 - OCF_ERR_ARGS # - The resource’s configuration is not valid on this machine. This can happen if the server fails to boot # because of a missing bridge, for example. # # 3 - OCF_ERR_UNIMPLEMENTED # - The resource agent was instructed to execute an action that we do not implement. # - Not all resource agent actions are mandatory. We don't implement 'promote', 'demote' or 'notify'. We do # implement 'migrate_to' and 'migrate_from'. If we're misconfigured as a master/slave resource, for # example, then we will alert the user about this misconfiguration by returning OCF_ERR_UNIMPLEMENTED. # # 4 - OCF_ERR_PERM # - The action failed due to insufficient permissions. This may be due to a node not being able to open a # definition file or resource config. # - Pacemaker interprets this exit code as a hard error. # # 5 - OCF_ERR_INSTALLED # - The action failed because a required component is missing on the node where the action was executed. # This may be due to a required binary not being executable, or the DRBD resource config file not # existing. # - Pacemaker interprets this exit code as a hard error. # # 6 - OCF_ERR_CONFIGURED # - The action failed because the user misconfigured the resource in pacemaker. For example, the user may # have configured an alphanumeric string for a parameter that really should be an integer. # - Pacemaker interprets this exit code as a fatal error. # # 7 - OCF_NOT_RUNNING # - The resource was found not to be running. This is an exit code that may be returned by the monitor # action exclusively. Note that this implies that the resource has either gracefully shut down, or has # never been started. # # 8 - OCF_RUNNING_MASTER # 9 - OCF_FAILED_MASTER # - These OCF exit codes are not used here. 
#
# NOTE: Anvil::Tools is loaded below, but this agent does not connect to the Anvil! database. This keeps the
#       overhead of each invocation low.
use strict;
use warnings;
use Anvil::Tools;
use XML::Simple;
use JSON;
use Math::BigInt;
use Data::Dumper;

# Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete.
$| = 1;

# The name of this file is just 'server', which isn't helpful, so we manually set it to our RA name.
my $THIS_FILE         = "ocf:alteeve:server";
my $running_directory = ($0 =~ /^(.*?)\/server/)[0];
# If '$0' doesn't contain '/server', the capture is undefined, so check that before pattern matching on it.
if ((defined $running_directory) && ($running_directory =~ /^\./) && ($ENV{PWD}))
{
	$running_directory =~ s/^\./$ENV{PWD}/;
}

# NOTE: Setting 'log_level' and 'log_secure' here will get overridden in the main loop. Use the Log methods
#       in the loop as well to override defaults in code.
my $anvil = Anvil::Tools->new();

### Read or Set the environment variables
# Each variable below is copied from the environment when it is defined there. Otherwise, the default shown
# here is stored.
my %environment_defaults = (
	OCF_RESKEY_name                    => "",			# The name of the server we're managing.
	OCF_RESKEY_CRM_meta_on_node        => "",			# This is our node name
	OCF_RESKEY_CRM_meta_on_node_uuid   => "",			# This says "UUID", but it's the pacemaker node ID. Not used here.
	OCF_RESKEY_CRM_meta_timeout        => "",			# The timeout for the called action in milliseconds (ie: 20000).
	PCMK_debug                         => "0",			# If this is set, we'll bump our log level as well. Debug is disabled by default.
	# These are other variables that are set, but we don't currently care about them
	OCF_EXIT_REASON_PREFIX             => "ocf-exit-reason:",
	OCF_RA_VERSION_MAJOR               => "",			# 1
	OCF_RA_VERSION_MINOR               => "",			# 0
	OCF_RESKEY_crm_feature_set         => "",			# Pacemaker OCF version - 3.7.1
	OCF_RESOURCE_INSTANCE              => "",			# Name of server / resource being acted on.
	OCF_RESOURCE_PROVIDER              => "alteeve",
	OCF_RESOURCE_TYPE                  => "server",
	OCF_ROOT                           => "/usr/lib/ocf",
	# These are set during a migration
	OCF_RESKEY_CRM_meta_migrate_source => "",
	OCF_RESKEY_CRM_meta_migrate_target => "",
	OCF_RESKEY_CRM_meta_record_pending => "",
);
foreach my $variable (sort {$a cmp $b} keys %environment_defaults)
{
	$anvil->data->{environment}{$variable} = defined $ENV{$variable} ? $ENV{$variable} : $environment_defaults{$variable};
}

# Any variable=value arguments in the resource are set under 'OCF_RESKEY_CRM_meta_'
foreach my $key (sort {$a cmp $b} keys %ENV)
{
	next if $key !~ /^OCF_RESKEY_CRM_meta_/;
	$anvil->data->{environment}{$key} = $ENV{$key};
}

# If pacemaker is in debug, so are we,
if ($anvil->data->{environment}{PCMK_debug})
{
	$anvil->Log->level({set => 2});
}

# Originally, this was designed to start and stop a server's DRBD resources on demand. Early testing appears
# to show this prone to higher risk of fencing if something goes wrong. As such, we're changing the default
# behaviour to leave DRBD resources up. Set this to '1' (here or by switch) to revert back to the old
# behaviour.
$anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources} = 0;

# We're used by anvil-boot-server and anvil-stop-server. They don't set environment variables, but instead
# use switches. Pick those up, if passed.
$anvil->data->{switches}{migrate_to}     = "";	# Sets 'meta_migrate_target'
$anvil->data->{switches}{'migrate-to'}   = "";
$anvil->data->{switches}{migrate_from}   = "";	# Sets 'meta_migrate_source'. When set without 'migrate_to', does a status check after migration
$anvil->data->{switches}{'migrate-from'} = "";
$anvil->data->{switches}{server}         = "";	# Sets 'name'.
$anvil->data->{switches}{start}          = "";
$anvil->data->{switches}{stop}           = "";
$anvil->data->{switches}{monitor}        = "";
$anvil->Get->switches();

# The hyphenated forms are aliases; the underscore forms win if both were passed.
if (($anvil->data->{switches}{'migrate-to'}) && not ($anvil->data->{switches}{migrate_to}))
{
	$anvil->data->{switches}{migrate_to} = $anvil->data->{switches}{'migrate-to'};
}
if (($anvil->data->{switches}{'migrate-from'}) && not ($anvil->data->{switches}{migrate_from}))
{
	$anvil->data->{switches}{migrate_from} = $anvil->data->{switches}{'migrate-from'};
}
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
	"switches::migrate_to"   => $anvil->data->{switches}{migrate_to},
	"switches::migrate_from" => $anvil->data->{switches}{migrate_from},
	"switches::server"       => $anvil->data->{switches}{server},
	"switches::start"        => $anvil->data->{switches}{start},
	"switches::stop"         => $anvil->data->{switches}{stop},
	"switches::monitor"      => $anvil->data->{switches}{monitor},
}});

# This switch reverts to the old "stop the DRBD resources with the server" behaviour described above, so it
# must set the 'stop_drbd_resources' flag (and must not touch the migration source).
if ($anvil->data->{switches}{stop_drbd_resources})
{
	$anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources} = 1;
}

# Something for the logs
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0298"});

=cut
Manual calls;

# Start a server;
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server name> --start

# Stop a server
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server name> --stop

# Monitor a server
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server name> --monitor

# Migrate (run on current host)
/usr/lib/ocf/resource.d/alteeve/server -vv --log-secure --server <server name> --migrate-to <target host> --migrate-from <source host>

=cut

foreach my $key (sort {$a cmp $b} keys %{$anvil->data->{environment}})
{
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		"environment::${key}" => $anvil->data->{environment}{$key},
	}});
}
foreach my $key (sort {$a cmp $b} keys %{$anvil->data->{switches}})
{
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		"switches::${key}" => $anvil->data->{switches}{$key},
	}});
}

# Set environment variables from switches, if otherwise not set.
if (($anvil->data->{switches}{server}) && (not $anvil->data->{environment}{OCF_RESKEY_name}))
{
	$anvil->data->{environment}{OCF_RESKEY_name} = $anvil->data->{switches}{server};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		"environment::OCF_RESKEY_name" => $anvil->data->{environment}{OCF_RESKEY_name},
	}});
}
# NOTE: Only the migration target is set here. 'OCF_RESKEY_CRM_meta_on_node' is our own node name and a
#       manual migration is run on the current host, so it must not be replaced with the target.
if (($anvil->data->{switches}{migrate_to}) && (not $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target}))
{
	$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} = $anvil->data->{switches}{migrate_to};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		"environment::OCF_RESKEY_CRM_meta_migrate_target" => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target},
	}});
}
if (($anvil->data->{switches}{migrate_from}) && (not $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source}))
{
	$anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} = $anvil->data->{switches}{migrate_from};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		"environment::OCF_RESKEY_CRM_meta_migrate_source" => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source},
	}});
}

# This is for debugging.
if (not $anvil->data->{switches}{monitor})
{
	show_environment($anvil, 3);
}

### What are we being asked to do?
# start        - Starts the resource.
# stop         - Shuts down the resource.
# monitor      - (status aliases here) Queries the resource for its state.
# meta-data    - Dumps the resource agent metadata.
# promote      - Turns a resource into the Master role (Master/Slave resources only).
# demote       - Turns a resource into the Slave role (Master/Slave resources only).
# migrate_to   - migration target
# migrate_from - Implement live migration of resources.
# validate-all - Validates a resource’s configuration.
# help         - (usage maps here) Displays a usage message when the resource agent is invoked from the command line, rather than by the cluster manager.
# notify       - Inform resource about changes in state of other clones.
if ($anvil->data->{switches}{migrate_to})
{
	# Migrate the server to the target.
	migrate_server($anvil);
}
elsif ($anvil->data->{switches}{migrate_from})
{
	# This is called after a migration is complete, so we're basically just doing a status check.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0529", variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} }});
	server_status($anvil);
}
elsif ($anvil->data->{switches}{start})
{
	# Start the server
	start_server($anvil);
}
elsif ($anvil->data->{switches}{stop})
{
	# Stop the server
	stop_server($anvil);
}
elsif (($anvil->data->{switches}{monitor}) or ($anvil->data->{switches}{status}))
{
	# Report the status of the server.
	server_status($anvil);
}
elsif (($anvil->data->{switches}{metadata}) or ($anvil->data->{switches}{'meta-data'}) or ($anvil->data->{switches}{metadaata}))
{
	# Dump our metadata. The 'metadaata' spelling is a historic typo, kept so existing callers don't break.
	show_metadata($anvil);
}
elsif ($anvil->data->{switches}{promote})
{
	# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0299", variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} }});
	$anvil->nice_exit({exit_code => 3});
}
elsif ($anvil->data->{switches}{demote})
{
	# We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3)
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0300", variables => { server => $anvil->data->{environment}{OCF_RESKEY_name} }});
	$anvil->nice_exit({exit_code => 3});
}
elsif ($anvil->data->{switches}{'validate-all'})
{
	# Validate our local config and setup.
	validate_all($anvil);
	$anvil->nice_exit({exit_code => 0});
}
elsif (($anvil->data->{switches}{help}) or ($anvil->data->{switches}{usage}))
{
	# Show the usage information
	show_usage($anvil);
	$anvil->nice_exit({exit_code => 0});
}
elsif ($anvil->data->{switches}{notify})
{
	# We don't implement this
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level =>0, key => "log_0301"});
	$anvil->nice_exit({exit_code => 3});
}
else
{
	# We were called in some unexpected way. Log an error, show usage and exit.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level =>0, key => "log_0302"});
	show_environment($anvil, 0);
	$anvil->nice_exit({exit_code => 1});
}

# If we hit here, something very wrong happened.
$anvil->nice_exit({exit_code => 255});


#############################################################################################################
# Functions                                                                                                 #
#############################################################################################################

# When called with the task "start", this verifies that 'libvirtd' is running locally and, if the peer is
# ready, on the peer as well. A stopped daemon is started and then polled until it reports running. If it
# is still not running after the retries are used up, an error is logged and we exit with '1'.
# 
# The code that used to stop the daemons when called with "stop" is disabled (see below), so any task other
# than "start" only parses the CIB.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
#   $task  - The task being performed; only "start" is acted on.
# 
# Returns '0'. Exits with '1' if the CIB can't be parsed or a daemon fails to start.
sub check_daemons
{
	my ($anvil, $task) = @_;
	
	my $problem = $anvil->Cluster->parse_cib({debug => 3});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { problem => $problem }});
	if ($problem)
	{
		# Pacemaker isn't running, or some other problem. Someone must have called this script 
		# directly or something.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0133"});
		$anvil->nice_exit({exit_code => 1});
	}
	
	# Is the peer running? We'll use this to know whether to try and start daemons on the peer.
	my $peer_name  = $anvil->data->{cib}{parsed}{peer}{name};
	my $peer_ready = $anvil->data->{cib}{parsed}{peer}{ready};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
		peer_name  => $peer_name,
		peer_ready => $peer_ready, 
	}});
	
	if ($task eq "start")
	{
		### It doesn't look like we need to start drbd. Up'ing the first resource works without it.
		#foreach my $daemon ("libvirtd.service", "drbd.service")
		foreach my $daemon ("libvirtd.service")
		{
			# 'systemctl status' returning '3' is treated as "stopped", and '0' as "running".
			my ($status_output, $status_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
				output      => $status_output,
				return_code => $status_code,
			}});
			if ($status_code eq "3")
			{
				# It is stopped, start it..
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0482", variables => { daemon => $daemon }});
				my ($start_output, $start_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." start ".$daemon});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
					output      => $start_output,
					return_code => $start_code,
				}});
				
				# Poll until it is up, giving up once the status check has failed six times.
				my $loops   = 0;
				my $running = 0;
				until ($running)
				{
					my ($check_output, $check_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
						output      => $check_output,
						return_code => $check_code,
					}});
					if ($check_code eq "0")
					{
						# It's running
						$running = 1;
						$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0483", variables => { daemon => $daemon }});
					}
					else
					{
						$loops++;
						if ($loops > 5)
						{
							# Give up
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0134", variables => { daemon => $daemon }});
							$anvil->nice_exit({exit_code => 1});
						}
						else
						{
							# Wait for a second.
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0484", variables => { daemon => $daemon }});
							sleep 1;
						}
					}
				}
			}
			elsif ($status_code eq "0")
			{
				# Running, nothing to do.
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0485", variables => { daemon => $daemon }});
			}
			
			if ($peer_ready)
			{
				my ($peer_output, $peer_error, $peer_code) = $anvil->Remote->call({
					target     => $peer_name, 
					shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon,
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					output      => $peer_output,
					error       => $peer_error,
					return_code => $peer_code,
				}});
				if ($peer_code eq "3")
				{
					# Stopped, start it..
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0486", variables => { 
						daemon => $daemon,
						host   => $peer_name,
					}});
					my ($start_output, $start_error, $start_code) = $anvil->Remote->call({
						target     => $peer_name, 
						shell_call => $anvil->data->{path}{exe}{systemctl}." start ".$daemon,
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
						output      => $start_output,
						error       => $start_error,
						return_code => $start_code,
					}});
					
					# Same polling as the local case, but run against the peer.
					my $loops   = 0;
					my $running = 0;
					until ($running)
					{
						my ($check_output, $check_error, $check_code) = $anvil->Remote->call({
							target     => $peer_name, 
							shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon,
						});
						$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { 
							output      => $check_output,
							error       => $check_error,
							return_code => $check_code,
						}});
						if ($check_code eq "0")
						{
							$running = 1;
							$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0487", variables => { 
								daemon => $daemon,
								host   => $peer_name,
							}});
						}
						else
						{
							$loops++;
							if ($loops > 5)
							{
								### TODO: We may want to NOT die here, if 
								###       we're booting a server (though we
								###       will if we're migrating).
								# Give up
								$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0135", variables => { 
									daemon => $daemon,
									host   => $peer_name,
								}});
								$anvil->nice_exit({exit_code => 1});
							}
							else
							{
								# Wait for a second.
								$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0488", variables => { 
									daemon => $daemon,
									host   => $peer_name,
								}});
								sleep 1;
							}
						}
					}
				}
				elsif ($peer_code eq "0")
				{
					# Running, nothing to do.
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0489", variables => { 
						daemon => $daemon,
						host   => $peer_name,
					}});
				}
			}
		}
	}
	
=cut
	# It's simpler (and thus safer) to not stop daemons.
	if ($task eq "stop")
	{
		print "Stopping daemons\n";
		my $stop = 0;
		
		# Check both nodes if a server is running on either node.
		my $local_vm_count  = 0;
		my $remote_vm_count = 0;
		
		# Call virsh list --all
		my ($local_output, $local_return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{virsh}." list --all"});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			local_output      => $local_output,
			local_return_code => $local_return_code,
		}});
		if (not $local_return_code)
		{
			# Parse output
			foreach my $line (split/\n/, $local_output)
			{
				$line = $anvil->Words->clean_spaces({ string => $line });
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }});
				
				if ($line =~ /(\d+)\s+(.*?)\s+running/)
				{
					$local_vm_count++;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_vm_count => $local_vm_count }});
				}
			}
		}
		
		my ($remote_output, $remote_error, $remote_return_code) = $anvil->Remote->call({
			target     => $peer_name, 
			shell_call => $anvil->data->{path}{exe}{virsh}." list --all",
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			remote_output      => $remote_output,
			remote_error       => $remote_error,
			remote_return_code => $remote_return_code,
		}});
		if (not $remote_return_code)
		{
			# Parse output
			foreach my $line (split/\n/, $remote_output)
			{
				$line = $anvil->Words->clean_spaces({ string => $line });
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }});
				
				if ($line =~ /(\d+)\s+(.*?)\s+running/)
				{
					$remote_vm_count++;
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { remote_vm_count => $remote_vm_count }});
				}
			}
		}
		
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			local_vm_count  => $local_vm_count,
			remote_vm_count => $remote_vm_count,
		}});
		if ((not $local_vm_count) && (not $remote_vm_count))
		{
			if ($peer_ready)
			{
				# No servers running on either node. 
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0490"});
			}
			else
			{
				# No servers running here and the peer is not in the cluster. 
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0491"});
			}
			foreach my $daemon ("libvirtd.service", "drbd.service")
			{
				my $running_local = 0;
				my $running_peer  = 0;
				
				my ($local_output, $local_return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					local_output      => $local_output,
					local_return_code => $local_return_code,
				}});
				if ($local_return_code eq "3")
				{
					# Already stopped.
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0492", variables => { daemon => $daemon }});
				}
				elsif ($local_return_code eq "0")
				{
					# Running, stop it.
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0493", variables => { daemon => $daemon }});
					my ($output, $return_code) = $anvil->System->call({shell_call => $anvil->data->{path}{exe}{systemctl}." stop ".$daemon});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						output      => $output,
						return_code => $return_code,
					}});
				}
				
				my ($remote_output, $remote_error, $remote_return_code) = $anvil->Remote->call({
					target     => $peer_name, 
					shell_call => $anvil->data->{path}{exe}{systemctl}." status ".$daemon,
				});
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
					remote_output      => $remote_output,
					remote_error       => $remote_error,
					remote_return_code => $remote_return_code,
				}});
				if ($remote_return_code eq "3")
				{
					# Already stopped.
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0494", variables => { 
						daemon => $daemon,
						host   => $peer_name,
					}});
				}
				elsif ($remote_return_code eq "0")
				{
					# Running, stop it.
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0495", variables => { 
						daemon => $daemon,
						host   => $peer_name,
					}});
					my ($output, $error, $return_code) = $anvil->Remote->call({
						target     => $peer_name, 
						shell_call => $anvil->data->{path}{exe}{systemctl}." stop ".$daemon,
					});
					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
						output      => $output,
						error       => $error,
						return_code => $return_code,
					}});
				}
			}
		}
		else
		{
			# Servers are still running, don't stop the daemons.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0496"});
		}
	}
=cut
	
	return(0);
}

=cut
STATES

The State field lists what state each domain is currently in. A domain can be in one of the following possible states:

running - The domain is currently running on a CPU

idle - The domain is idle, and not running or runnable.
This can be caused because the domain is waiting on IO (a traditional wait state) or has gone to sleep because there was nothing else for it to do.

paused - The domain has been paused, usually occurring through the administrator running virsh suspend. When in a paused state the domain will still consume allocated resources like memory, but will not be eligible for scheduling by the hypervisor.

in shutdown - The domain is in the process of shutting down, i.e. the guest operating system has been notified and should be in the process of stopping its operations gracefully.

shut off - The domain is not running. Usually this indicates the domain has been shut down completely, or has not been started.

crashed - The domain has crashed, which is always a violent ending. Usually this state can only occur if the domain has been configured not to restart on crash.

pmsuspended - The domain has been suspended by guest power management, e.g. entered into s3 state.

=cut

# This boots the server if possible.
# 
# The server booted is the one named in 'environment::OCF_RESKEY_name'. This does not return to the caller
# in normal operation; it exits with '0' when the boot succeeded and '1' (OCF_ERR_GENERIC) when it failed.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
sub start_server
{
	my ($anvil) = @_;
	
	# Before we do anything, make sure that the 'libvirtd' service is running ('check_daemons' no longer
	# checks 'drbd').
	check_daemons($anvil, "start");
	
	# Start procedure;
	# 1.   Read the XML definition file and find the backing storage and bridges. Soft error if read fails.
	# 2.   Make sure the name matches.
	# 3.   Make sure we have enough free RAM.
	# 4.   Make sure the emulator exists (can be an issue after migrating from an different gen Anvil!).
	# 5.1. Make sure optical drives with mounted data have the disk present. Soft error if not.
	# 5.2. Find any backing DRBD devices
	# 6.   For each DRBD device;
	# 6.1. Make sure the backing LV is ACTIVE. Soft error if not.
	# 6.2. Check if the drbd resource is up. If not, up it.
	# 6.3. Make sure the backing disk is UpToDate. Soft error if not.
	# 6.4. Make sure the backing device is 'Connected' or 'Connecting'. Call a connect if not.
	# 7.   Make sure all bridges exist and soft error if not.
	# 8.   Start the server.
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0303", variables => { server => $server }});
	
	# When called manually, the node name may not be in the environment. Fall back to '--target', if passed.
	if ((not $anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node}) && ($anvil->data->{switches}{target}))
	{
		$anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node} = $anvil->data->{switches}{target};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			"environment::OCF_RESKEY_CRM_meta_on_node" => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node},
		}});
	}
	
	# Make sure things are sane.
	validate_all($anvil);
	
	# Is the server already running somewhere?
	find_server($anvil);
	
	# Start the resource, if needed.
	start_drbd_resource($anvil);
	
	# Still alive? Boot!
	my ($success) = $anvil->Server->boot_virsh({debug => 2, server => $server});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { success => $success }});
	if ($success)
	{
		# Success!
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0309", variables => { server => $server }});
		$anvil->nice_exit({exit_code => 0});
	}
	else
	{
		# WTF?
		# NOTE(review): 'state' is filled from the recorded 'host_name'; confirm that is what the
		#               'log_0310' message expects.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0310", variables => { 
			server  => $server,
			'state' => defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : "#!string!unit_0003!#",
		}});
		
		# A failed boot is a generic error. OCF_ERR_CONFIGURED (6) would be treated by pacemaker as a
		# fatal, cluster-wide error and would stop it from trying to recover the server.
		$anvil->nice_exit({exit_code => 1});
	}
	
	# If we're still alive, then we didn't see the server in the list of running servers, which is really weird.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0311", variables => { server => $server }});
	$anvil->nice_exit({exit_code => 1});
}

# This stops the DRBD resource(s) that ran under a server.
# For each DRBD resource recorded for the server named in 'environment::OCF_RESKEY_name', the resource is
# brought down on the peer first (when the peer's IP address is known) and then locally.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
# 
# Returns '0'.
sub stop_drbd_resource
{
	my ($anvil) = @_;
	
	# The short host name keys both the server's resource list and the DRBD config, so look it up once.
	my $host   = $anvil->Get->short_host_name();
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	my $peer   = defined $anvil->data->{drbd}{config}{$host}{peer} ? $anvil->data->{drbd}{config}{$host}{peer} : "";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		server => $server,
		host   => $host,
		peer   => $peer,
	}});
	
	# Stop the DRBD resource.
	foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$host}{$server}{resource}})
	{
		# Only look up the peer's address when a peer is known, so that an undefined value is never
		# used as a hash key or logged.
		my $peer_ip = "";
		if (($peer ne "") && (defined $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address}))
		{
			$peer_ip = $anvil->data->{drbd}{config}{$host}{resource}{$resource}{connection}{$peer}{ip_address};
		}
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0408", variables => { 
			server   => $server,
			peer     => $peer,
			peer_ip  => $peer_ip,
			resource => $resource,
		}});
		
		# Bring the peer's resource down. Without an address there is nothing to target, so this is 
		# skipped; the local 'down' below still runs.
		if ($peer_ip ne "")
		{
			$anvil->DRBD->manage_resource({
				debug    => 3, 
				resource => $resource,
				task     => "down",
				target   => $peer_ip,
			});
		}
		
		# Bring the local resource down
		$anvil->DRBD->manage_resource({
			debug    => 3, 
			resource => $resource,
			task     => "down",
		});
	}
	
	return(0);
}

# This starts the drbd resource(s) for the requested server, if needed.
sub start_drbd_resource
{
	my ($anvil) = @_;
	
	my $local_host = $anvil->Get->short_host_name();
	my $server     = $anvil->data->{environment}{OCF_RESKEY_name};
	my $host       = $anvil->Get->short_host_name;
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		server => $server,
		host   => $host,
	}});
	
	# Do we need startup?
	my $local_startup_needed = 0;
	$anvil->DRBD->get_status({debug => 3});
	
	foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}})
	{
		# Is the current resource up locally already?
		my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? 
$anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:resource' => $resource, 's2:role' => $role, }}); if ((lc($role) ne "secondary") && (lc($role) ne "primary")) { $local_startup_needed = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }}); last; } else { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0434", variables => { resource => $resource, role => $role, }}); } } # Do I need to start the DRBD resource locally? If so, do so. $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { local_startup_needed => $local_startup_needed }}); if ($local_startup_needed) { foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) { # Bring the local resource up $anvil->DRBD->manage_resource({ debug => 2, resource => $resource, task => "up", }); # Now wait for it to come up. 
my $waiting = 1; my $wait_until = time + 5; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:time' => time, 's2:wait_until' => $wait_until, }}); while($waiting) { $anvil->DRBD->get_status({debug => 3}); my $all_up = 1; foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}}) { my $disk_state = lc($anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'}); $disk_state = "" if not defined $disk_state; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:resource' => $resource, 's2:volume' => $volume, 's3:disk_state' => $disk_state, }}); if (($disk_state ne "inconsistent") && ($disk_state ne "outdated") && ($disk_state ne "consistent") && ($disk_state ne "uptodate")) { $all_up = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_up => $all_up }}); } } if ($all_up) { $waiting = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); } if ($waiting) { sleep 1; } elsif (time > $wait_until) { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0138"}); $waiting = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); } } } # If auto-promote isn't set, promote the resource. if (not $anvil->data->{drbd}{config}{$local_host}{'auto-promote'}) { foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0420", variables => { server => $server, resource => $resource, }}); # Make the local resource primary. 
$anvil->DRBD->manage_resource({ resource => $resource, task => "primary", }); } } } # See if we're inconsistent and, if so, if we can connect our peers. sleep 2; $anvil->DRBD->get_status({debug => 3}); my $peer_startup_needed = 1; foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) { # Is the current resource up locally already? my $role = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{role} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:resource' => $resource, 's2:role' => $role, }}); # Check all volumes. foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}}) { my $disk_state = defined $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} ? $anvil->data->{drbd}{status}{$host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disk_state => $disk_state }}); if ((lc($disk_state) eq "consistent") or (lc($disk_state) eq "outdated") or (lc($disk_state) eq "failed") or (not $disk_state)) { # This will trigger trying to ssh into peer(s) and up'ing their resource. $peer_startup_needed = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }}); last; } } } ### NOTE: We always check the peer now, in case it's resource is down and ours happens to be up. # Do we need to start the resource on our peers? #$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_startup_needed => $peer_startup_needed }}); #if (not $peer_startup_needed) #{ # $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0431"}); # return(0); #} # Start DRBD on the peer(s). 
foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource }}); foreach my $host (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}}) { my $is_local = $anvil->Network->is_local({host => $host}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:host' => $host, 's2:is_local' => $is_local, }}); my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$host}{'connection-state'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { connection_state => $connection_state }}); if (lc($connection_state) ne "connected") { # Try to connect to the peer and up this reasource. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0694", variables => { host => $host, resource => $resource, connection_state => $connection_state, }}); my ($access) = $anvil->Remote->test_access({target => $host}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }}); if ($access) { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0695", variables => { host => $host, resource => $resource, }}); $anvil->DRBD->manage_resource({ debug => 2, resource => $resource, task => "up", target => $host, }); } else { # No access $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0136", variables => { host => $host }}); } } } } # Loop until all our resources are Connected or UpToDate my $waiting = 1; my $wait_until = time + 5; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:time' => time, 's2:wait_until' => $wait_until, }}); while($waiting) { sleep 1; $anvil->DRBD->get_status({debug => 
3}); my $all_resources_ok = 1; foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) { # This is set to '1' is either the volumes are UpToDate or Sync'ing. $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{ok} = 0; foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}}) { # This will be used to mark if a volume is being sync'ed later, if needed. $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok} = 0; my $disk_state = lc($anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{'disk-state'}); $disk_state = "" if not defined $disk_state; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { disk_state => $disk_state }}); if ($disk_state ne "uptodate") { $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok} = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "drbd::status::${local_host}::resource::${resource}::devices::volume::${volume}::ok" => $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok}, }}); } } if (not $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{ok}) { # See if we're a SyncTarget foreach my $connection (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}}) { my $connection_state = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{'connection-state'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { connection => $connection, connection_state => $connection_state, }}); foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$connection}{volume}}) { $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 
2, list => { "drbd::status::${local_host}::resource::${resource}::devices::volume::${volume}::ok" => $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok}, }}); next if $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok}; my $replication_state = lc($anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{connection}{$connection}{volume}{$volume}{'replication-state'}); $replication_state = "" if not defined $replication_state; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 's1:volume' => $volume, 's2:replication_state' => $replication_state, }}); if ($replication_state =~ /sync/) { # We're good to go. $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok} = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "drbd::status::${local_host}::resource::${resource}::devices::volume::${volume}::ok" => $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok}, }}); } } } } # Loop through all volumes on all resources and see if they're OK. If they are, mark the resource as OK. 
my $resource_ok = 1; foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}}) { my $volume_ok = $anvil->data->{drbd}{status}{$local_host}{resource}{$resource}{devices}{volume}{$volume}{ok}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { volume => $volume, volume_ok => $volume_ok, }}); if (not $volume_ok) { $resource_ok = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource_ok => $resource_ok }}); } } if (not $resource_ok) { $all_resources_ok = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_resources_ok => $all_resources_ok }}); } } if ($all_resources_ok) { $waiting = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); } elsif (time > $wait_until) { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0137"}); $waiting = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); } } return(0); } # This uses the DRBD information to find other peers and see if the server is running on them. 
sub find_server
{
	# Looks for the server on every IP address listed in the local DRBD config. 
	# 
	# Parameters;
	#   $anvil - The Anvil::Tools handle.
	# 
	# Returns '0' if the server was not found running anywhere. If it was found, this exits directly;
	#   0 - OCF_SUCCESS        - The server is already running on this host.
	#   6 - OCF_ERR_CONFIGURED - The server is running on a DR host.
	#   5 - OCF_ERR_INSTALLED  - The server is running on another (peer) host.
	my ($anvil) = @_;
	
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	my $host   = $anvil->Get->short_host_name;
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0415", variables => { server => $server }});
	
	foreach my $ip_address (sort {$a cmp $b} keys %{$anvil->data->{drbd}{config}{$host}{ip_addresses}})
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { ip_address => $ip_address }});
		$anvil->Server->find({
			debug       => 3, 
			target      => $ip_address, 
			remote_user => "root",
		});
	}
	
	# Record what was found, for debugging.
	foreach my $this_server (sort {$a cmp $b} keys %{$anvil->data->{server}{location}})
	{
		my $this_status = $anvil->data->{server}{location}{$this_server}{status};
		my $this_host   = $anvil->data->{server}{location}{$this_server}{host_name};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			this_server => $this_server,
			status      => $this_status, 
			host        => $this_host, 
		}});
	}
	
	if ((exists $anvil->data->{server}{location}{$server}) && ($anvil->data->{server}{location}{$server}{host_name}))
	{
		# The server is running. If it is running here, exit with success. If it's running elsewhere,
		# exit with a failure.
		my $server_status = $anvil->data->{server}{location}{$server}{status};
		my $server_host   = $anvil->data->{server}{location}{$server}{host_name};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			status => $server_status,
			host   => $server_host, 
		}});
		
		if ($server_host eq $anvil->Get->host_name)
		{
			# Already running, we're good, and we're done.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0416", variables => { server => $server }});
			$anvil->nice_exit({exit_code => 0});
		}
		elsif ($server_host =~ /dr(\d+)$/)
		{
			# The server is running elsewhere. If the peer host is DR, exit with 
			# OCF_ERR_CONFIGURED (6) so that pacemaker doesn't try to also start the server on 
			# the other node, because we don't know the state of it here.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0417", variables => { 
				server => $server,
				host   => $server_host,
			}});
			$anvil->nice_exit({exit_code => 6});
		}
		else
		{
			# It looks like it's running on the peer. So we'll exit OCF_ERR_INSTALLED (5) to tell
			# pacemaker to try to start it on our peer. This must be a hard error (5), not the 
			# fatal OCF_ERR_CONFIGURED (6), or pacemaker would not try the other node.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0418", variables => { 
				server => $server,
				host   => $server_host,
			}});
			$anvil->nice_exit({exit_code => 5});
		}
	}
	
	return(0);
}

# This shuts down the server if possible.
# 
# Parameters;
#   $anvil - The Anvil::Tools handle.
# 
# This does not return. It exits with '1' (OCF_ERR_GENERIC) if the shutdown call reported a failure, and
# with '0' (OCF_SUCCESS) otherwise.
sub stop_server
{
	my ($anvil) = @_;
	
	# Stopping the server is simply a question of "is the server running?" and, if so, stop it. Once
	# stopped, and if enabled, we stop the DRBD resource on both nodes.
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0582", variables => { server => $server }});
	
	# Read in an parse the server's XML.
	$anvil->System->check_storage();
	$anvil->Server->get_status({server => $server});
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0313", variables => { server => $server }});
	my $success = $anvil->Server->shutdown_virsh({server => $server});
	if (not $success)
	{
		# Something went wrong. Details should be in the logs.
		$anvil->nice_exit({exit_code => 1});
	}
	
	# Now stop the DRBD resource(s).
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		'environment::OCF_RESKEY_CRM_meta_stop_drbd_resources' => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources},
	}});
	if ($anvil->data->{environment}{OCF_RESKEY_CRM_meta_stop_drbd_resources})
	{
		stop_drbd_resource($anvil);
	}
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0324", variables => { server => $server }});
	$anvil->nice_exit({exit_code => 0});
}

# This checks the status of the server.
sub server_status { my ($anvil) = @_; # If the named server is running, return OCF_SUCCESS (rc: 0), otherwise OCF_NOT_RUNNING (rc: 7). If # the server is failed, return OCF_ERR_GENERIC (1). my $state = ""; my $server = $anvil->data->{environment}{OCF_RESKEY_name}; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0521", variables => { server => $server }}); if (not $anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout}) { # Set a sane default of 20 seconds. $anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} = 20000; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "log_0331", variables => { timeout => $anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} }}); } # Is 'libvirtd' running? We'll wait up to half the timeout for it to start (in case it _just_ started) # before timing out. my $wait_until = time + ($anvil->data->{environment}{OCF_RESKEY_CRM_meta_timeout} / 2000); # Devide by 2000 to convert to seconds and total second halved. 
my $look_for_pid = 0; my $libvirtd_wait = 1; my $warning_shown = 0; while($libvirtd_wait) { my $running = $anvil->System->check_daemon({daemon => "libvirtd.service"}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { running => $running }}); if ($running) { $libvirtd_wait = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { libvirtd_wait => $libvirtd_wait }}); } else { # On EL8 and above, libvirtd starts on demand, so this error isn't if (not $warning_shown) { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0522", variables => { wait_time => ($wait_until - time) }}); $warning_shown = 1; } sleep 1; if (time > $wait_until) { # Libvirtd isn't running, try to find the PID of the server (in case it's # running and libvirtd isn't) $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, priority => "alert", key => "warning_0057"}); $look_for_pid = 1; $libvirtd_wait = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { look_for_pid => $look_for_pid, libvirtd_wait => $libvirtd_wait, }}); } } } # If libvirtd wasn't running, we'll manually look for a PID. if ($look_for_pid) { my $server_up = 0; my $shell_call = $anvil->data->{path}{exe}{ps}." 
aux"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code, }}); foreach my $line (split/\n/, $output) { next if $line !~ /qemu-kvm/; $line = $anvil->Words->clean_spaces({ string => $line }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); if ($line =~ /guest=(.*?),/) { my $this_server = $1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { this_server => $this_server }}); if ($this_server eq $server) { # Found it. $server_up = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_up => $server_up }}); last; } } } if ($server_up) { # The server is running. Exit with OCF_SUCCESS (rc 0); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0523"}); $anvil->nice_exit({exit_code => 0}); } else { # The server is not running. Exit with OCF_NOT_RUNNING (rc: 7) $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0524"}); $anvil->nice_exit({exit_code => 7}); } } else { # Parse the virsh state. If it's listed as 'crashed', return OCF_ERR_GENERIC (rc: 1). If it's # 'in shutdown', 'loop' gets set to 1 and this will loop indefinitely. We don't put a timer # on it, we let pacemaker handle that. my $loop = 1; while($loop) { $loop = 0; my $found = 0; my $shell_call = $anvil->data->{path}{exe}{virsh}." 
list --all"; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code, }}); foreach my $line (split/\n/, $output) { $line = $anvil->Words->clean_spaces({ string => $line }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { line => $line }}); if ($line =~ /\s\Q$server\E\s+(.*)/) { my $state = $1; $found = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { found => $found, 'state' => $state, }}); =cut * Server states; running - The domain is currently running on a CPU idle - The domain is idle, and not running or runnable. This can be caused because the domain is waiting on IO (a traditional wait state) or has gone to sleep because there was nothing else for it to do. paused - The domain has been paused, usually occurring through the administrator running virsh suspend. When in a paused state the domain will still consume allocated resources like memory, but will not be eligible for scheduling by the hypervisor. in shutdown - The domain is in the process of shutting down, i.e. the guest operating system has been notified and should be in the process of stopping its operations gracefully. shut off - The domain is not running. Usually this indicates the domain has been shut down completely, or has not been started. crashed - The domain has crashed, which is always a violent ending. Usually this state can only occur if the domain has been configured not to restart on crash. pmsuspended - The domain has been suspended by guest power management, e.g. entered into s3 state. =cut ### TODO: Should we treat 'idle' same as crashed? if ($state eq "crashed") { # Woops. Exit with OCF_ERR_GENERIC (rc: 1). 
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0058", variables => { server => $server }}); $anvil->nice_exit({exit_code => 1}); } elsif ($state eq "in shutdown") { # Wait. $loop = 1; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0525", variables => { server_name => $server }}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { loop => $loop }}); sleep 1; last; } elsif ($state eq "shut off") { # Exit with OCF_NOT_RUNNING (rc: 7); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0526", variables => { server_name => $server }}); $anvil->nice_exit({exit_code => 7}); } else { # In some fashion or another, the server is running. Exit with OCF_SUCCESS (rc: 0) $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0527", variables => { 'state' => $state, server_name => $server, }}); $anvil->nice_exit({exit_code => 0}); } } } # If it wasn't found at all, exit. if (not $found) { # Exit with OCF_NOT_RUNNING (rc: 7); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0526", variables => { server_name => $server }}); $anvil->nice_exit({exit_code => 7}); } } } # This method should never return. Just in case it does though, exit with generic error. $anvil->nice_exit({exit_code => 1}); } # Migrate the server sub migrate_server { my ($anvil) = @_; ### NOTE: For now, we're not going to block if the target is not UpToDate. There are times when a ### user might want to do this (ie: sync will be done soon and the need to evacuate the node ### ASAP is high). Maybe we'll enforce this and require a '--force' switch later? # If we were given 'migrate_to', we need to make sure the storage is UpToDate on the peer for all # backing resources. We can't check the target's bridges, but the migation will fail if one is # missing. 
# If we're given 'migrate_from', we're pulling the server towards us, so we can check both brdiges # and storage. my $local_host = $anvil->Get->short_host_name(); my $server = $anvil->data->{environment}{OCF_RESKEY_name}; my $source = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source}; my $target = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target}; my $meta_on_node = $anvil->data->{environment}{OCF_RESKEY_CRM_meta_on_node}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server => $server, source => $source, target => $target, meta_on_node => $meta_on_node, }}); # Make sure switches are at least defined. $anvil->data->{switches}{migrate_to} = "" if not defined $anvil->data->{switches}{migrate_to}; $anvil->data->{switches}{migrate_from} = "" if not defined $anvil->data->{switches}{migrate_from}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'switches::migrate_to' => $anvil->data->{switches}{migrate_to}, 'switches::migrate_from' => $anvil->data->{switches}{migrate_from}, }}); # Log what we're doing. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0528", variables => { server => $server, target_host => $target, }}); # If there is a '.mnX' (migration network X) entry that can be resolved, we'll change the # target to use that. This is a dedicated, usually back-to-back network used in nodes specifically # for migration. my $test_target = $target; $test_target =~ s/\..*$//; $test_target .= ".mn1"; # Might want to make this a loop to support MN2+ later my $test_ip = $anvil->Convert->host_name_to_ip({debug => 2, host_name => $test_target}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { test_target => $test_target, test_ip => $test_ip, }}); if ($test_ip) { # Can we access the peer with this? 
my ($access) = $anvil->Remote->test_access({debug => 3, target => $test_ip}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }}); # Did we get access? if ($access) { # Yup! Switch the target. $target = $test_ip; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0663", variables => { target => $test_target, ip => $target, }}); } } # Before migrating, make sure the daemons are running on the peer. check_daemons($anvil, "start"); # The actual migration command will involve enabling dual primary, then beginning the migration. The # virsh call will depend on if we're pushing or pulling. Once the migration completes, regardless of # success or failure, dual primary will be disabled again. my $migrated = 0; if ($target) { # Can I even connect to the target? my ($access) = $anvil->Remote->test_access({debug => 3, target => $target}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { access => $access }}); if (not $access) { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0429", variables => { server => $server, target => $target, }}); ### TODO: I wonder if this should be exit'ed with '6'? $anvil->nice_exit({exit_code => 5}); } # Find the server $anvil->Server->find({debug => 3}); my $server_host = defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : ""; my $server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_host => $server_host, server_status => $server_status, }}); # Is it already on the target? if (not $server_status) { # Maybe... 
$anvil->Server->find({debug => 3, target => $target}); $server_host = defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : ""; $server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_host => $server_host, server_status => $server_status, }}); if (($server_host eq $target) && (($server_status) && ($server_status eq "running"))) { # Already over there, we're done. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0430", variables => { server => $server, target => $target, }}); $anvil->nice_exit({exit_code => 0}); } } if (not $server_host) { $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0344", variables => { server => $server }}); $anvil->nice_exit({exit_code => 1}); } # Get a view of the servers locally and on our peer. validate_all($anvil); # Get the DRBD status. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0387"}); $anvil->DRBD->get_status({debug => 3}); # Make sure all resource(s) are ready for the server. 
my $all_up_to_date = 1; my $host = $anvil->Get->short_host_name; my $peer_name = $anvil->data->{drbd}{config}{$host}{peer}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host => $host, peer_name => $peer_name, }}); foreach my $resource (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{resource}}) { my $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'}; my $peer_node_id = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'peer-node-id'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { peer_node_id => $peer_node_id, resource => $resource, connection_state => $connection_state, }}); if (lc($connection_state) ne "connected") { # Try to bring the resource up on the peer now. $anvil->DRBD->manage_resource({ resource => $resource, task => "up", target => $target, }); # We'll give it 20 seconds. my $wait = 20; while($wait) { $anvil->DRBD->get_status({debug => 3}); $connection_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{'connection-state'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { resource => $resource, connection_state => $connection_state, }}); if (lc($connection_state) ne "connected") { # It's up! $wait = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'wait' => $wait }}); } else { $wait--; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'wait' => $wait }}); if (not $wait) { # We're done waiting. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0428", variables => { server => $server, target => $target, resource => $resource, connection_state => $connection_state, }}); ### TODO: I wonder if this should be exit'ed with '6'? 
$anvil->nice_exit({exit_code => 5}); } } } } foreach my $volume (sort {$a cmp $b} keys %{$anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}}) { my $peer_disk_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'peer-disk-state'}; my $percent_in_sync = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'percent-in-sync'}; my $replication_state = $anvil->data->{drbd}{status}{$host}{resource}{$resource}{connection}{$peer_name}{volume}{$volume}{'replication-state'}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { volume => $volume, peer_disk_state => $peer_disk_state, percent_in_sync => $percent_in_sync, replication_state => $replication_state, }}); if (lc($peer_disk_state) ne "uptodate") { $all_up_to_date = 0; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_up_to_date => $all_up_to_date }}); } } } $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { all_up_to_date => $all_up_to_date }}); if (not $all_up_to_date) { ### TODO: If we decide later to block migration to Inconsistent peers, here's where we'd do it. } # If we're still alive, we're ready to migrate. ($migrated) = $anvil->Server->migrate_virsh({ debug => 2, server => $server, source => $source, target => $target, }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }}); } elsif ($source) { ### NOTE: Pacemaker doesn't seem to ever pull servers. # Pull the server here. Start by verifying it's on the 'meta_on_node' host. # Scan locally and on our peer $anvil->Server->find({debug => 2}); $anvil->Server->find({debug => 2, target => $meta_on_node, refresh => 0}); my $host = defined $anvil->data->{server}{location}{$server}{host_name} ? 
$anvil->data->{server}{location}{$server}{host_name} : ""; my $short_host = ($host =~ /^(.*?)\..*$/)[0]; my $status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { host => $host, short_host => $short_host, status => $status, target => $target, }}); # Convert the host to a short name, in case the node's name is the short version. my $server_host = defined $anvil->data->{server}{location}{$server}{host_name} ? $anvil->data->{server}{location}{$server}{host_name} : ""; my $server_status = defined $anvil->data->{server}{location}{$server}{status} ? $anvil->data->{server}{location}{$server}{status} : ""; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_host => $server_host, server_status => $server_status, }}); # This is called after a migration. If that is the case here, the target will be us. Just # make sure it is running and, if so, return '0'. The 'meta_on_node' is the new host. if (($target eq $anvil->Get->host_name) or ($target eq $anvil->Get->short_host_name) or ($target eq $meta_on_node)) { # If it's running, we're succesfully out. if ((($host eq $target) or ($short_host eq $target)) && ($status eq "running")) { # Success! $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0347", variables => { server => $server }}); $anvil->nice_exit({exit_code => 0}); } # If we're still alive, we'll proceed as if we're pulling the server to us, and maybe # that will work. $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0348", variables => { server => $server }}); } # Validate as if we were about to boot the server. validate_all($anvil); # Call the pull migation. 
($migrated) = $anvil->Server->migrate_virsh({
			debug  => 2,
			server => $server,
			source => $source,
			target => $target
		});
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }});
	}
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { migrated => $migrated }});
	if (not $migrated)
	{
		# Exit
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0357"});
		$anvil->nice_exit({exit_code => 1});
	}
	
	# If we made it here, we succeeded.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0360"});
	$anvil->nice_exit({exit_code => 0});
}

# Validation checks that we have the definition XML, resource config and that needed apps are installed.
#
# Parameters;
#   $anvil - The Anvil! tools object used for logging, data access and exiting.
#
# Returns '0' when every check passes. The individual checks do not return on failure; they call
# 'nice_exit' with the matching OCF exit code themselves.
sub validate_all
{
	my ($anvil) = @_;
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0361"});
	
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	my $source = defined $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} ? $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_source} : "";
	my $target = defined $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} ? $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} : "";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		server => $server, 
		source => $source, 
		target => $target, 
	}});
	
	# Log what we're doing.
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0581", variables => { server => $server }});
	
	# Read in and parse the server's XML.
	$anvil->System->check_storage({debug => 3});
	$anvil->Server->get_status({debug => 2, server => $server});
	
	# Is the name in the definition file what we expect (and did we read the XML data at all)?
	validate_name($anvil);
	
	# Make sure the emulator it wants is the one we have.
	validate_emulator($anvil);
	
	# These tests are only needed if we're about to boot the server
	if (($anvil->data->{switches}{start}) or ($source))
	{
		# Check that we have enough RAM.
		validate_ram($anvil);
	}
	
	# Validate bridges
	validate_bridges($anvil);
	
	# Validate storage (Disks and optical media)
	validate_storage($anvil);
	
	return(0);
}

# This ensures that the bridges the server connects to exist on this node.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' if a bridge was found for every interface in the server's on-disk definition. Exits with
# OCF_ERR_INSTALLED (5) otherwise.
sub validate_bridges
{
	my ($anvil) = @_;
	
	# Get my bridge list
	$anvil->Get->bridges({debug => 3});
	
	# Walk the network interfaces in the server's on-disk definition and check the bridge each one uses.
	my $local_host = $anvil->Get->short_host_name();
	my $server     = $anvil->data->{environment}{OCF_RESKEY_name};
	foreach my $mac (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{from_disk}{device}{interface}})
	{
		# See if we have this bridge
		my $found  = 0;
		my $bridge = $anvil->data->{server}{$local_host}{$server}{from_disk}{device}{interface}{$mac}{bridge};
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { bridge => $bridge }});
		
		# NOTE(review): This loop never compares '$interface_name' against '$bridge', so any local bridge 
		#               flagged as 'found' satisfies the check for every interface. If the keys under 
		#               'network::bridges::bridge' are bridge names, this should probably test the 
		#               server's bridge specifically - confirm against 'Get->bridges' before changing.
		foreach my $interface_name (sort {$a cmp $b} keys %{$anvil->data->{$local_host}{network}{bridges}{bridge}})
		{
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { interface_name => $interface_name }});
			if ((exists $anvil->data->{$local_host}{network}{bridges}{bridge}{$interface_name}) && 
			    ($anvil->data->{$local_host}{network}{bridges}{bridge}{$interface_name}{found}))
			{
				$found = 1;
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { found => $found }});
				last;
			}
		}
		
		if ($found)
		{
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0368", variables => { bridge => $bridge }});
		}
		else
		{
			# Missing bridge.
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0369", variables => { bridge => $bridge }});
			$anvil->nice_exit({exit_code => 5});
		}
	}
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0366"});
	return(0);
}

# This looks up the disks and optical media connected to this server.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' if all checks pass. Exits with OCF_ERR_INSTALLED (5) if an optical disc image is missing, or 
# OCF_ERR_PERM (4) if it exists but can't be read. The DRBD checks in 'validate_storage_drbd' may also exit.
sub validate_storage
{
	my ($anvil) = @_;
	
	# When checking on a running server, use 'from_virsh'.
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	my $target = defined $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} ? $anvil->data->{environment}{OCF_RESKEY_CRM_meta_migrate_target} : "";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		server => $server,
		target => $target, 
	}});
	
	my $local_host = $anvil->Get->short_host_name();
	my $xml_source = "from_disk";
	if ($anvil->data->{server}{$local_host}{$server}{from_virsh}{host_name})
	{
		$xml_source = "from_virsh";
	}
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		server     => $server,
		xml_source => $xml_source, 
	}});
	
	### TODO: If we're called with a status and find an ISO file missing and eject it instead of failing.
	###       For now, we just fault out.
	# Do the optical discs in the drive exist? If not, we'll eject it if we're about to boot and fail if 
	# we're about to migrate. We skip this check if we're migrating off or shutting down the server.
	if ((exists $anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}) && 
	    (not $target)                                                                      && 
	    (not $anvil->data->{switches}{stop}))
	{
		foreach my $device_target (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}{target}})
		{
			# Drives with no path set are skipped.
			if ($anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}{target}{$device_target}{path})
			{
				my $file = $anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{cdrom}{target}{$device_target}{path};
				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { file => $file }});
				if (not -e $file)
				{
					# It doesn't exist. Exit with OCF_ERR_INSTALLED (5).
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0398", variables => { file => $file }});
					$anvil->nice_exit({exit_code => 5});
				}
				elsif (not -r $file)
				{
					# We can't read it. Exit with OCF_ERR_PERM (4).
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0399", variables => { file => $file }});
					$anvil->nice_exit({exit_code => 4});
				}
				else
				{
					# We're OK.
					$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 2, key => "log_0400", variables => { file => $file }});
				}
			}
		}
	}
	
	# Verify DRBD devices now
	validate_storage_drbd($anvil);
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0367"});
	return(0);
}

# This makes sure that the needed backing DRBD devices are on this node. If a backing logical volume is not 
# active, an attempt is made to activate it. If that fails, it errors out.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' if all checks pass. Exits with OCF_ERR_INSTALLED (5) if a device has no DRBD resource, if the 
# backing LV is not found, or if the LV can't be activated.
sub validate_storage_drbd
{
	my ($anvil) = @_;
	
	# Now check storage. The DRBD config and the server/LVM data are all keyed by this node's short host 
	# name, so it is looked up once and shared.
	my $server     = $anvil->data->{environment}{OCF_RESKEY_name};
	my $xml_source = "from_disk";
	my $local_host = $anvil->Get->short_host_name();
	my $host       = $local_host;
	
	# Did I find a resource for each disk?
	foreach my $device_path (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{device}})
	{
		next if not $device_path;
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			"server::${local_host}::${server}::device::${device_path}::resource" => $anvil->data->{server}{$local_host}{$server}{device}{$device_path}{resource},
		}});
		if (not $anvil->data->{server}{$local_host}{$server}{device}{$device_path}{resource})
		{
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0414", variables => { drbd_device => $device_path }});
			$anvil->nice_exit({exit_code => 5});
		}
	}
	
	foreach my $device_target (sort {$a cmp $b} keys %{$anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{disk}{target}})
	{
		my $drbd_device   = $anvil->data->{server}{$local_host}{$server}{$xml_source}{device}{disk}{target}{$device_target}{path};
		my $drbd_resource = defined $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{resource} ? $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{resource} : "";
		my $on_lv         = defined $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{on}       ? $anvil->data->{drbd}{config}{$host}{drbd_path}{$drbd_device}{on}       : "";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
			host          => $host,
			drbd_device   => $drbd_device,
			drbd_resource => $drbd_resource, 
			on_lv         => $on_lv, 
		}});
		if (not $drbd_resource)
		{
			# See if we can find the resource in the 'by-res' hash.
			$drbd_resource = defined $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{resource}   ? $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{resource}   : "";
			$on_lv         = defined $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{backing_lv} ? $anvil->data->{drbd}{config}{$host}{'by-res'}{$drbd_device}{backing_lv} : "";
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
				drbd_resource => $drbd_resource, 
				on_lv         => $on_lv, 
			}});
		}
		
		# Is the logical volume here and active?
		if ((not $on_lv) or (not exists $anvil->data->{lvm}{$local_host}{lv}{$on_lv}))
		{
			# LV not found
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0374", variables => { drbd_device => $drbd_device, lv_path => $on_lv }});
			$anvil->nice_exit({exit_code => 5});
		}
		elsif (not $anvil->data->{lvm}{$local_host}{lv}{$on_lv}{active})
		{
			# LV not active. If we're starting the server or we're the migration target, try to 
			# activate it.
			my $active = $anvil->System->activate_lv({debug => 3, path => $on_lv});
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { active => $active }});
			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, key => "log_0413", variables => { lv_path => $on_lv }});
			if (not $active)
			{
				# Boo :(
				$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 0, priority => "err", key => "log_0375", variables => { drbd_device => $drbd_device, lv_path => $on_lv }});
				$anvil->nice_exit({exit_code => 5});
			}
		}
		
		# LV is good if I am still alive.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, key => "log_0376", variables => { 
			drbd_device => $drbd_device, 
			lv_path     => $on_lv,
		}});
	}
	
	### NOTE: Checking/Managing firewall ports is expensive option, so DRBD ports are permanently opened 
	###       when a resource is created.
	
	return(0);
}

# This verifies that the requested emulator exists and can be used.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' if the emulator exists and is executable. Exits with OCF_ERR_INSTALLED (5) if it doesn't exist 
# and OCF_ERR_PERM (4) if it isn't executable.
sub validate_emulator
{
	my ($anvil) = @_;
	
	# What emulator is this using? If the definition didn't provide one, fall back to an empty string so 
	# that the file tests below fail cleanly instead of testing an undefined value.
	my $local_host = $anvil->Get->short_host_name();
	my $server     = $anvil->data->{environment}{OCF_RESKEY_name};
	my $emulator   = defined $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{emulator} ? $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{emulator} : "";
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		emulator => $emulator,
		"server::${local_host}::${server}::from_disk::info::emulator" => $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{emulator}
	}});
	if (not -e $emulator)
	{
		# It doesn't exist. Exit with OCF_ERR_INSTALLED (5).
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0401", variables => { 
			emulator        => $emulator,
			definition_file => $anvil->data->{server}{definition_file},
		}});
		$anvil->nice_exit({exit_code => 5});
	}
	if (not -x $emulator)
	{
		# We can't execute it. Exit with OCF_ERR_PERM (4).
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0402", variables => { emulator => $emulator }});
		$anvil->nice_exit({exit_code => 4});
	}
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0364"});
	return(0);
}

# This makes sure the name we see in the definition file matches what we expect.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' if the definition was read and its name matches 'OCF_RESKEY_name'. Exits with OCF_ERR_GENERIC
# (1) otherwise.
sub validate_name
{
	my ($anvil) = @_;
	
	my $local_host = $anvil->Get->short_host_name();
	my $server     = $anvil->data->{environment}{OCF_RESKEY_name};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		server => $server,
		"server::${local_host}::${server}::from_disk::info::name" => $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{name},
	}});
	
	# If we failed to read the XML, the server probably doesn't exist.
	if (not $anvil->data->{server}{$local_host}{$server}{from_disk}{xml})
	{
		# NOTE(review): The 'name' logged here comes from 'server::definition_xml', which nothing else in 
		#               this part of the file references - confirm that it is still populated.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0403", variables => { 
			server => $server, 
			name   => $anvil->data->{server}{definition_xml}->{name},
		}});
		$anvil->nice_exit({exit_code => 1});
	}
	
	# Is the name in the definition file what we expect?
	if ($server ne $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{name})
	{
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0403", variables => { 
			server => $server, 
			name   => $anvil->data->{server}{$local_host}{$server}{from_disk}{info}{name},
		}});
		$anvil->nice_exit({exit_code => 1});
	}
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0363"});
	return(0);
}

# This checks that there is enough RAM to run this server.
#
# Parameters;
#   $anvil - The Anvil! tools object.
#
# Returns '0' if the free memory reported by 'Get->free_memory' is at least the memory in the server's 
# on-disk definition. Exits with OCF_ERR_GENERIC (1) otherwise.
sub validate_ram
{
	my ($anvil) = @_;
	
	# How much RAM does the server need and how much do we have free?
	my $local_host       = $anvil->Get->short_host_name();
	my $server           = $anvil->data->{environment}{OCF_RESKEY_name};
	my $server_ram_bytes = $anvil->data->{server}{$local_host}{$server}{from_disk}{memory};
	my $available        = $anvil->Get->free_memory({debug => 3});
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		server_ram_bytes => $anvil->Convert->add_commas({number => $server_ram_bytes})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $server_ram_bytes}).")",
		available        => $anvil->Convert->add_commas({number => $available})." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $available}).")",
	}});
	
	if ($server_ram_bytes > $available)
	{
		# Not enough free memory. The 'available_ram*' values report what is free on this node, the 
		# 'ram*' values report what the server needs.
		$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "log_0404", variables => { 
			name                => $server, 
			ram                 => $anvil->Convert->bytes_to_human_readable({'bytes' => $server_ram_bytes}), 
			ram_bytes           => $anvil->Convert->add_commas({number => $server_ram_bytes}),
			available_ram       => $anvil->Convert->bytes_to_human_readable({'bytes' => $available}), 
			available_ram_bytes => $anvil->Convert->add_commas({number => $available}),
		}});
		$anvil->nice_exit({exit_code => 1});
	}
	
	$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0365"});
	return(0);
}

### TODO: Make sure the appropriate SN ports are opened.
# This stops (drbdadm down ) the storage for a given server on both nodes.
#
# NOTE: This is currently a stub. It accepts the Anvil! tools object, a task and a resource name, does 
#       nothing with them, and returns '0'.
sub manage_drbd_resource
{
	my ($anvil, $task, $resource) = @_;
	
	return(0);
}

# This reads the XML definition data into an XML data hash.
#
# NOTE: This is currently a stub. It only logs the server name taken from 'OCF_RESKEY_name' and returns '0'.
sub read_server_definition
{
	my ($anvil) = @_;
	
	my $server = $anvil->data->{environment}{OCF_RESKEY_name};
	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
		server => $server,
	}});
	
	return(0);
}

# This logs the details of this call.
sub show_environment
{
	# Parameters;
	#   $anvil - The Anvil! tools object.
	#   $level - The log level passed to every 'Log->variables' call made here.
	#
	# Always returns '0'.
	my ($anvil, $level) = @_;
	
	# Command line switches; 'raw' and empty values are not logged.
	my $switches = $anvil->data->{switches};
	foreach my $switch (sort keys %{$switches})
	{
		next if $switch eq "raw";
		my $setting = $switches->{$switch};
		next if $setting eq "";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "switches::${switch}" => $setting }});
	}
	
	# Environment values tracked in our data hash; empty values are not logged.
	my $environment = $anvil->data->{environment};
	foreach my $variable (sort keys %{$environment})
	{
		my $setting = $environment->{$variable};
		next if $setting eq "";
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "environment::${variable}" => $setting }});
	}
	
	# Everything else in the process environment that wasn't already covered above.
	foreach my $variable (sort keys %ENV)
	{
		if (not exists $anvil->data->{environment}{$variable})
		{
			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "ENV::${variable}" => $ENV{$variable} }});
		}
	}
	
	# Lastly, the raw command line arguments, in the order they were given.
	foreach my $argument (@ARGV)
	{
		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $level, list => { "ARGV" => $argument }});
	}
	
	return(0);
}

# This just prints a quick usage message for now.
#
# At present nothing is printed; it simply exits with '0' via 'nice_exit'.
sub show_usage
{
	my ($anvil) = @_;
	
	### TODO: How to use this...
	
	$anvil->nice_exit({exit_code => 0});
}

# This prints out the metadata and exits.
#
# The metadata text is printed to STDOUT and then 'nice_exit' is called with exit code '0'.
sub show_metadata
{
	my ($anvil) = @_;
	
	# This is a pretty simple agent, by design. We only take a server name for now.
	my $metadata = ' 1.0 This resource agent manages KVM+qemu virtual servers on an Anvil! m3 Intelligent Availability(tm) system. It manages underlying components like DRBD 9 storage resources, brodge connections and so forth. Anvil! m3 server resource agent This is the name of the server as reported by virsh. Server name ';
	print $metadata;
	
	$anvil->nice_exit({exit_code => 0});
}