#!/usr/bin/perl # # This is the resource agent used to manage servers on the Anvil! Intelligent Availability platform. # # License: GNU General Public License (GPL) v2+ # (c) 1997-2018 - Alteeve's Niche! Inc. # # WARNING: This is a pretty purpose-specific resource agent. No effort was made to test this on an rgmanager # cluster or on any configuration outside how the Anvil! m3 uses it. If you plan to adapt it to # another purpose, let us know and we'll try to help. # # Based on: https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc # # Error types from pacemaker's perspective; # # - Soft Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource # in-place - usually by restarting the resource on the same node. # - Hard Error - Unless specifically configured otherwise, pacemaker will attempt to recover a resource # which failed with this error by restarting the resource on a different node. # - Fatal Error - This is a cluster-wide error, it would make no sense to recover such a resource on a # different node, let alone in-place. When a resource fails with this error, Pacemaker will # attempt to shut down the resource, and wait for administrator intervention. # # Exit codes; # 0 - OCF_SUCCESS # - The action completed successfully. This is the expected return code for any successful start, stop, # migrate_to, meta_data, help, and usage action. # - For monitor, however, a modified convention applies: # - If the server is running we return, OCF_SUCCESS. If not running and gracefully stopped or migrated # off, return OCF_NOT_RUNNING. # # 1 - OCF_ERR_GENERIC # - The action returned a generic error. This is used only when none of the more specific error codes, # defined below, accurately describes the problem. # - Pacemaker interprets this exit code as a soft error. # # 2 - OCF_ERR_ARGS # - The resource’s configuration is not valid on this machine. This can happen if the serve fails to boot # because of a missing bridge, for example. # # 3 - OCF_ERR_UNIMPLEMENTED # - The resource agent was instructed to execute an action that we do not implement. # - Not all resource agent actions are mandatory. We don't implement 'promote' or 'demote'. We do implement # 'migrate_to', 'migrate_from', and 'notify'. If we're misconfigured as a master/slave resource, for # example, then will alert the user about this misconfiguration by returning OCF_ERR_UNIMPLEMENTED. # # 4 - OCF_ERR_PERM # - The action failed due to insufficient permissions. This may be due to a node not being able to open a # definition file or resource config. # - Pacemaker interprets this exit code as a hard error. # # 5 - OCF_ERR_INSTALLED # - The action failed because a required component is missing on the node where the action was executed. # This may be due to a required binary not being executable, or a the DRBD resource config file not # existing. # - Pacemaker interprets this exit code as a hard error. # # 6 - OCF_ERR_CONFIGURED # - The action failed because the user misconfigured the resource in pacemaker. For example, the user may # have configured an alphanumeric string for a parameter that really should be an integer. # - Pacemaker interprets this exit code as a fatal error. # # 7 - OCF_NOT_RUNNING # - The resource was found not to be running. This is an exit code that may be returned by the monitor # action exclusively. Note that this implies that the resource has either gracefully shut down, or has # never been started. # # 8 - OCF_RUNNING_MASTER # 9 - OCF_FAILED_MASTER # - These OCF exit codes are not used here. # # NOTE: We don't use Anvil::Tools to keep overhead low and to keep this agent independent as possible. use strict; use warnings; use XML::Simple; use JSON; use Math::BigInt; use Data::Dumper; # Turn off buffering so that the pinwheel will display while waiting for the SSH call(s) to complete. $| = 1; my $THIS_FILE = ($0 =~ /^.*\/(.*)$/)[0]; my $running_directory = ($0 =~ /^(.*?)\/$THIS_FILE$/)[0]; if (($running_directory =~ /^\./) && ($ENV{PWD})) { $running_directory =~ s/^\./$ENV{PWD}/; } my $conf = { 'log' => { facility => "local0", level => 2, line_numbers => 1, tag => "ocf:alteeve:".$THIS_FILE, }, # If a program isn't at the defined path, $ENV{PATH} will be searched. path => { config => { definition => "/mnt/anvil/definitions/#!NAME!#.xml", }, exe => { brctl => "/usr/sbin/brctl", cibadmin => "/usr/sbin/cibadmin", crm_error => "/usr/sbin/crm_error", drbdadm => "/usr/sbin/drbdadm", drbdsetup => "/usr/sbin/drbdsetup", echo => "/usr/bin/echo", free => "/usr/bin/free", getent => "/usr/bin/getent", logger => "/usr/bin/logger", stonith_admin => "/usr/sbin/stonith_admin", virsh => "/usr/bin/virsh", }, }, environment => { # This is the name of the server we're managing. # Example values: OCF_RESKEY_name => defined $ENV{OCF_RESKEY_name} ? $ENV{OCF_RESKEY_name} : "", # srv01-c7 # This is our node name OCF_RESKEY_CRM_meta_on_node => defined $ENV{OCF_RESKEY_CRM_meta_on_node} ? $ENV{OCF_RESKEY_CRM_meta_on_node} : "", # m3-a02n01.alteeve.com # This says "UUID", but it's the node ID. OCF_RESKEY_CRM_meta_on_node_uuid => defined $ENV{OCF_RESKEY_CRM_meta_on_node_uuid} ? $ENV{OCF_RESKEY_CRM_meta_on_node_uuid} : "", # 1 # This is the timeout for the called action in millisecond. OCF_RESKEY_CRM_meta_timeout => defined $ENV{OCF_RESKEY_CRM_meta_timeout} ? $ENV{OCF_RESKEY_CRM_meta_timeout} : "", # 20000 # If this is set, we'll bump our log level as well. PCMK_debug => defined $ENV{PCMK_debug} ? $ENV{PCMK_debug} : "", # 0 # These are other variables that are set, but we don't currently care about them OCF_EXIT_REASON_PREFIX => defined $ENV{OCF_EXIT_REASON_PREFIX} ? $ENV{OCF_EXIT_REASON_PREFIX} : "", # ocf-exit-reason: OCF_RA_VERSION_MAJOR => defined $ENV{OCF_RA_VERSION_MAJOR} ? $ENV{OCF_RA_VERSION_MAJOR} : "", # 1 OCF_RA_VERSION_MINOR => defined $ENV{OCF_RA_VERSION_MINOR} ? $ENV{OCF_RA_VERSION_MINOR} : "", # 0 OCF_RESKEY_crm_feature_set => defined $ENV{OCF_RESKEY_crm_feature_set} ? $ENV{OCF_RESKEY_crm_feature_set} : "", # 3.0.12 OCF_RESOURCE_INSTANCE => defined $ENV{OCF_RESOURCE_INSTANCE} ? $ENV{OCF_RESOURCE_INSTANCE} : "", # srv01-c7 OCF_RESOURCE_PROVIDER => defined $ENV{OCF_RESOURCE_PROVIDER} ? $ENV{OCF_RESOURCE_PROVIDER} : "", # alteeve OCF_RESOURCE_TYPE => defined $ENV{OCF_RESOURCE_TYPE} ? $ENV{OCF_RESOURCE_TYPE} : "", # server OCF_ROOT => defined $ENV{OCF_ROOT} ? $ENV{OCF_ROOT} : "", # /usr/lib/ocf }, }; # If pacemaker is in debug, so are we, if ($conf->{environment}{PCMK_debug}) { $conf->{'log'}{level} = 3; } # Find executables. find_executables($conf); # Get any command line switches. get_switches($conf); ### TEST: to be removed later if ($conf->{switches}{test}) { $conf->{environment}{OCF_RESKEY_name} = "srv01-c7"; $conf->{environment}{OCF_RESKEY_CRM_meta_on_node} = "m3-a02n01.alteeve.com"; $conf->{environment}{OCF_RESKEY_CRM_meta_timeout} = 20000; } # Something for the logs to_log($conf, {message => "ocf:alteeve:server invoked.", 'line' => __LINE__, level => 2}); # This is for debugging. show_environment($conf, 2); ### What are we being asked to do? # start  - Starts the resource. # stop  - Shuts down the resource. # monitor  - (status aliases here) Queries the resource for its state. # meta-data  - Dumps the resource agent metadata. # promote  - Turns a resource into the Master role (Master/Slave resources only). # demote  - Turns a resource into the Slave role (Master/Slave resources only). # migrate_to - migration target # migrate_from - Implement live migration of resources. # validate-all - Validates a resource’s configuration. # help  - (usage maps here) Displays a usage message when the resource agent is invoked from the command line, rather than by the cluster manager. # notify  - Inform resource about changes in state of other clones. if ($conf->{switches}{start}) { # Start the server start_server($conf); } elsif ($conf->{switches}{stop}) { # Stop the server stop_server($conf); } elsif (($conf->{switches}{monitor}) or ($conf->{switches}{status})) { # Report the status of the server. server_status($conf); } elsif (($conf->{switches}{metadaata}) or ($conf->{switches}{'meta-data'})) { show_metadata($conf); } elsif ($conf->{switches}{promote}) { # We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3) to_log($conf, {message => "We were asked to promote: [".$conf->{environment}{OCF_RESKEY_name}."], which makes no sense and is not supported. Ignoreing.", 'line' => __LINE__, level => 0, priority => "err"}); exit(3); } elsif ($conf->{switches}{demote}) { # We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3) to_log($conf, {message => "We were asked to demote: [".$conf->{environment}{OCF_RESKEY_name}."], which makes no sense and is not supported. Ignoreing.", 'line' => __LINE__, level => 0, priority => "err"}); exit(3); } elsif (($conf->{switches}{migrate_to}) && ($conf->{switches}{migrate_from})) { # We don't support this, so we return OCF_ERR_UNIMPLEMENTED (3) migrate_server($conf); } elsif ($conf->{switches}{'validate-all'}) { # Validate our local config and setup. validate($conf); exit(0); } elsif (($conf->{switches}{help}) && ($conf->{switches}{usage})) { # Show the usage information show_usage($conf); exit(0); } elsif ($conf->{switches}{notify}) { # We don't implement this to_log($conf, {message => "We were asked to notify, but this is not a promotable (we're stateless) agent. Ignoring.", 'line' => __LINE__, level => 0, priority => "warn"}); exit(3); } else { # We were called in some unexpected way. Log an error, show usage and exit. show_environment($conf, 0); to_log($conf, {message => "We were invoked with an unexpected (or no) command. Environment variables and arguments below.", 'line' => __LINE__, level => 0, priority => "warn"}); exit(1); } # If we hit here, something very wrong happened. exit(255); ############################################################################################################# # Functions # ############################################################################################################# # This boots the server if possible. sub start_server { my ($conf) = @_; # Start procedure; # 1. Read the XML definition file and find the backing storage and bridges. Soft error if read fails. # 2. Make sure the name matches. # 3. Make sure we have enough free RAM. # 4. Make sure the emulator exists (can be an issue after migrating from an different gen Anvil!). # 5.1. Make sure optical drives with mounted data have the disk present. Soft error if not. # 5.2. Find any backing DRBD devices # 6. For each DRBD device; # 6.1. Make sure the backing LV is ACTIVE. Soft error if not. # 6.2. Check if the drbd resource is up. If not, up it. # 6.3. Make sure the backing disk is UpToDate. Soft error if not. # 6.4. Make sure the backing device is 'Connected' or 'Connecting'. Call a connect if not. # 7. Make sure all bridges exist and soft error if not. # 8. Start the server. to_log($conf, {message => "We've been asked to start the server: [".$conf->{environment}{OCF_RESKEY_name}."]..", 'line' => __LINE__, level => 2}); validate_all($conf); exit(0); } # This shuts down the server if possible. sub stop_server { my ($conf) = @_; exit(0); } # This checks the status of the server. sub server_status { my ($conf) = @_; # If the named server is running, return OCF_SUCCESS (0), otherwise OCF_NOT_RUNNING (7). If the # server is failed, return OCF_ERR_GENERIC (1). my $state = ""; my $server = $conf->{environment}{OCF_RESKEY_name}; ### NOTE: When pacemaker is first starting, virsh won't be up right away. So if we get a return code ### of '1', we'll try again up to 50% of 'environment::OCF_RESKEY_CRM_meta_timeout'. if (not $conf->{environment}{OCF_RESKEY_CRM_meta_timeout}) { # Set a sane default of 20 seconds. $conf->{environment}{OCF_RESKEY_CRM_meta_timeout} = 20000; to_log($conf, {message => "The environment variable 'OCF_RESKEY_CRM_meta_timeout' was not set, so setting it to: [".$conf->{environment}{OCF_RESKEY_CRM_meta_timeout}."].", 'line' => __LINE__, level => 1, priority => "warn"}); } my $return_code = undef; my $output = ""; my $current_time = time; my $timeout = $current_time + int(($conf->{environment}{OCF_RESKEY_CRM_meta_timeout} /= 1000) / 2); my $waiting = 1; to_log($conf, {message => "current_time: [$current_time], timeout: [$timeout].", 'line' => __LINE__, level => 2}); while($waiting) { # Make the call ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{virsh}." list"); to_log($conf, {message => "return_code: [$return_code].", 'line' => __LINE__, level => 2}); if (not $return_code) { $waiting = 0; to_log($conf, {message => "waiting: [$waiting].", 'line' => __LINE__, level => 2}); } elsif (time > $timeout) { # We've waited long enough. $waiting = 0; to_log($conf, {message => "The 'virsh' call exited with the return code: [$return_code]. The 'libvirtd' may have failed to start. We won't wait any longer.", 'line' => __LINE__, level => 1, priority => "warn"}); } else { to_log($conf, {message => "The 'virsh' call exited with the return code: [$return_code]. The 'libvirtd' service might be starting, so we will check again shortly.", 'line' => __LINE__, level => 2}); sleep 2; } } # If I got a non-zero return code, something went wrong with the virsh call. if ($return_code) { to_log($conf, {message => "It would appear that libvirtd is not operating (or not operating correctly). Expected the return code '0' but got: [$return_code].", 'line' => __LINE__, level => 0, priority => "err"}); if ($output) { to_log($conf, {message => "Output of: [".$conf->{path}{exe}{virsh}." list] follows;", 'line' => __LINE__, level => 0, priority => "err"}); to_log($conf, {message => "Output: [$output]", 'line' => __LINE__, level => 0, priority => "err"}); } exit(1); } # If we're still alive, process the output foreach my $line (split/\n/, $output) { $line =~ s/^\s+//; $line =~ s/\s+$//; $line =~ s/\s+/ /g; if ($line =~ /^(\d+) $server (.*)$/) { $state = $2; to_log($conf, {message => "server: [$server], state: [$state]", 'line' => __LINE__, level => 2}); last; } } # If there is a state, see what the state is. if ($state) { ### What is the state? ## States we return OCF_SUCCESS (0). # running - The domain is currently running on a CPU # paused - The domain has been paused, usually occurring through the administrator # running virsh suspend. When in a paused state the domain will still consume # allocated resources like memory, but will not be eligible for scheduling by # the hypervisor. # pmsuspended - The domain has been suspended by guest power management, e.g. entered into s3 # state. # in shutdown - The domain is in the process of shutting down, i.e. the guest operating # system has been notified and should be in the process of stopping its # operations gracefully. ## States we'll return OCF_NOT_RUNNING (7). # shut off - The domain is not running. Usually this indicates the domain has been shut # down completely, or has not been started. ## States we'll return OCF_ERR_GENERIC (1). # idle - The domain is idle, and not running or runnable. This can be caused because # the domain is waiting on IO (a traditional wait state) or has gone to sleep # because there was nothing else for it to do. # crashed - The domain has crashed, which is always a violent ending. Usually this state # can only occur if the domain has been configured not to restart on crash. if (($state eq "running") or ($state eq "paused") or ($state eq "pmsuspended") or ($state eq "in shutdown")) { to_log($conf, {message => "The server: [$server] is: [$state], which is OK.", 'line' => __LINE__, level => 1}); exit(0); } elsif ($state eq "shut off") { to_log($conf, {message => "The server: [$server] is: [$state].", 'line' => __LINE__, level => 1}); exit(7); } elsif (($state eq "idle") or ($state eq "crashed")) { to_log($conf, {message => "The server: [$server] is in a bad state: [$state]!", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } else { # WTF? to_log($conf, {message => "The server: [$server] is in an unexpected state: [$state]!", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } } else { # Not running. Exit with OCF_NOT_RUNNING to_log($conf, {message => "The server: [$server] is not running on this node.", 'line' => __LINE__, level => 1}); exit(7); } exit(0); } # Migrate the server sub migrate_server { my ($conf) = @_; # If we were given 'migrate_to', then just verify that the node name makes sense. If we were given # 'migrate_from', we need to find the peer. # Return failed until this is actually implemented. exit(1); } # Validation checks that we have the definition XML, resource config and that needed apps are installed. sub validate_all { my ($conf) = @_; to_log($conf, {message => "Running validation tests...", 'line' => __LINE__, level => 2}); # Read in the server's definition file (if found and readable). read_server_definition($conf); to_log($conf, {message => "- Server definition was read.", 'line' => __LINE__, level => 2}); # Does the internal server name match? validate_name($conf); to_log($conf, {message => "- Server name is valid.", 'line' => __LINE__, level => 2}); # Make sure the emulator it wants is the one we have. validate_emulator($conf); to_log($conf, {message => "- Eumlator is valid.", 'line' => __LINE__, level => 2}); # These tests are only needed if we're about to boot the server if ($conf->{switches}{start}) { # Check that we have enough RAM. validate_ram($conf); to_log($conf, {message => "- Sufficient RAM is available.", 'line' => __LINE__, level => 2}); } # Validate bridges validate_bridges($conf); to_log($conf, {message => "- Network bridge(s) are available.", 'line' => __LINE__, level => 2}); # Validate storage (Disks and optical media) validate_storage($conf); to_log($conf, {message => "- Storage is valid and ready.", 'line' => __LINE__, level => 2}); exit(0); } # This ensures that the bridges the server connects to exist on this node. sub validate_bridges { my ($conf) = @_; # Find the Optical drives and DRBD devices. foreach my $device_ref (@{$conf->{server}{definition_xml}->{devices}}) { foreach my $interface_ref (@{$device_ref->{interface}}) { foreach my $source_ref (@{$interface_ref->{source}}) { my $bridge = $source_ref->{bridge}; $conf->{server}{bridges}{$bridge} = 1; to_log($conf, {message => "server::bridges::${bridge}: [".$conf->{server}{bridges}{$bridge}."].", 'line' => __LINE__, level => 3}); } } } # Get a list of available bridges. my ($return_code, $output) = shell_call($conf, $conf->{path}{exe}{brctl}." show"); my $first_line = 1; foreach my $line (split/\n/, $output) { to_log($conf, {message => "line: [$line].", 'line' => __LINE__, level => 3}); if ($first_line) { # We skip the first line instead of parse the string to avoid getting caught by # translations. $first_line = 0; } elsif ($line =~ /^(\S+)\s/) { my $bridge = $1; $conf->{'local'}{bridge}{$bridge} = 1; to_log($conf, {message => "local::bridge::${bridge}: [$bridge].", 'line' => __LINE__, level => 3}); } } # Verify bridges now foreach my $bridge (sort {$a cmp $b} keys %{$conf->{server}{bridges}}) { if ($conf->{'local'}{bridge}{$bridge}) { to_log($conf, {message => "The bridge: [$bridge] is available for this server.", 'line' => __LINE__, level => 2}); } else { # Missing bridge. to_log($conf, {message => "The server wants to connect to the bridge: [$bridge] which we do not have on this node.", 'line' => __LINE__, level => 0, priority => "err"}); exit(5); } } return(0); } # This looks up the disks and optical media connected to this server. sub validate_storage { my ($conf) = @_; # Find the bridge(s) this server uses. foreach my $device_ref (@{$conf->{server}{definition_xml}->{devices}}) { foreach my $disk_ref (@{$device_ref->{disk}}) { my $type = $disk_ref->{device}; to_log($conf, {message => "type: [$type].", 'line' => __LINE__, level => 2}); if ($type eq "disk") { foreach my $source_ref (@{$disk_ref->{source}}) { my $device_path = $source_ref->{dev}; $conf->{server}{disks}{$device_path} = 1; to_log($conf, {message => "server::disks::${device_path}: [".$conf->{server}{disks}{$device_path}."].", 'line' => __LINE__, level => 2}); } } elsif ($type eq "cdrom") { foreach my $source_ref (@{$disk_ref->{source}}) { my $file = $source_ref->{file}; $conf->{server}{optical}{$file} = 1; to_log($conf, {message => "server::optical::${file}: [".$conf->{server}{optical}{$file}."].", 'line' => __LINE__, level => 2}); } } } } # Verify optical disks now validate_storage_optical($conf); # Verify DRBD devices now validate_storage_drbd($conf); return(0); } # THis makes sure that the needed backing DRBD devices are on this node. If so, and if they are not up, they # will be brought up. If that fails, it errors out. sub validate_storage_drbd { my ($conf) = @_; # Read in the DRBD configuration XML. my ($return_code, $drbd_body) = shell_call($conf, $conf->{path}{exe}{drbdadm}." dump-xml"); if ($return_code) { # Something went wrong. to_log($conf, {message => "The attempt to read the DRBD configuration returned a non-zero code: [$return_code]. The returned output (if any) was: [$drbd_body].", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } # Parse the XML my $drbd_xml = ""; my $xml = XML::Simple->new(); eval { $drbd_xml = $xml->XMLin($drbd_body, KeyAttr => ["name", "vnr"], ForceArray => 1) }; if ($@) { chomp $@; my $error = "[ Error ] - The was a problem parsing: [$drbd_body]. The error was:\n"; $error .= "===========================================================\n"; $error .= $@."\n"; $error .= "===========================================================\n"; to_log($conf, {message => $error, 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } foreach my $resource (sort {$a cmp $b} keys %{$drbd_xml->{resource}}) { to_log($conf, {message => "resource: [$resource].", 'line' => __LINE__, level => 2}); my $peer = ""; my $local = ""; foreach my $connection_ref (@{$drbd_xml->{resource}->{$resource}->{connection}}) { my $protocol = $connection_ref->{section}->{net}->{option}->{protocol}->{value}; my $fencing = $connection_ref->{section}->{net}->{option}->{fencing}->{value}; to_log($conf, {message => "protocol: [$resource], fencing: [$fencing].", 'line' => __LINE__, level => 3}); # If this isn't set to 'resource-and-stonith', it's a DR connection and we'll ignore # it. next if $fencing ne "resource-and-stonith"; # Look at the hosts foreach my $host (sort {$a cmp $b} keys %{$connection_ref->{host}}) { my $address = $connection_ref->{host}->{$host}->{address}->[0]->{content}; my $port = $connection_ref->{host}->{$host}->{address}->[0]->{port}; my $short_hostname = $host; $short_hostname =~ s/\..*$//; my $local_hostname = $conf->{environment}{OCF_RESKEY_CRM_meta_on_node}; to_log($conf, {message => "host: [$host ($short_hostname)], address: [$address:$port], local_hostname: [$local_hostname].", 'line' => __LINE__, level => 2}); # Is this me or the peer? if (($local_hostname eq $short_hostname) or ($local_hostname =~ /^$short_hostname\./)) { # This is us. $local = $host; to_log($conf, {message => "Recording the local connection details for the resource: [$resource] -> [$address:$port].", 'line' => __LINE__, level => 2}); $conf->{server}{drbd}{'local'}{hostname} = $host, $conf->{server}{drbd}{'local'}{short_hostname} = $short_hostname, $conf->{server}{drbd}{'local'}{address} = $address, $conf->{server}{drbd}{'local'}{port} = $port, } else { # This is our peer $peer = $host; to_log($conf, {message => "Recording the peer's connection details for the resource: [$resource] -> [$address:$port].", 'line' => __LINE__, level => 2}); $conf->{server}{drbd}{peer}{hostname} = $host, $conf->{server}{drbd}{peer}{short_hostname} = $short_hostname, $conf->{server}{drbd}{peer}{address} = $address, $conf->{server}{drbd}{peer}{port} = $port, } } } to_log($conf, {message => "local: [$local], peer: [$peer].", 'line' => __LINE__, level => 2}); foreach my $volume (sort {$a cmp $b} keys %{$drbd_xml->{resource}->{$resource}->{host}->{$local}->{volume}}) { my $backing_device = $drbd_xml->{resource}->{$resource}->{host}->{$local}->{volume}->{$volume}->{disk}->[0]; my $device_path = $drbd_xml->{resource}->{$resource}->{host}->{$local}->{volume}->{$volume}->{device}->[0]->{content}; my $device_minor = $drbd_xml->{resource}->{$resource}->{host}->{$local}->{volume}->{$volume}->{device}->[0]->{minor}; to_log($conf, {message => "volume: [$volume], backing_device: [$backing_device], device_path: [$device_path], device_minor: [$device_minor].", 'line' => __LINE__, level => 3}); $conf->{server}{drbd}{'local'}{device}{$device_path}{lv} = $backing_device; $conf->{server}{drbd}{'local'}{device}{$device_path}{minor} = $device_minor; to_log($conf, {message => "server::drbd::local::device::${device_path}::lv: [".$conf->{server}{drbd}{'local'}{device}{$device_path}{lv}."], server::drbd::local::device::${device_path}::minor: [".$conf->{server}{drbd}{'local'}{device}{$device_path}{minor}."].", 'line' => __LINE__, level => 2}); } } foreach my $device_path (sort {$a cmp $b} keys %{$conf->{server}{disks}}) { to_log($conf, {message => "Checking that the DRBD device: [$device_path] is ready.", 'line' => __LINE__, level => 2}); to_log($conf, {message => "server::drbd::local::device::${device_path}::lv: [".$conf->{server}{drbd}{'local'}{device}{$device_path}{lv}."].", 'line' => __LINE__, level => 2}); if (not $conf->{server}{drbd}{'local'}{device}{$device_path}{lv}) { # The backing LV doesn't exist. to_log($conf, {message => "The server wants to use: [$device_path] as a hard drive, but we couldn't find the backing logical volume on this node.", 'line' => __LINE__, level => 0, priority => "err"}); exit(5); } elsif (not -e $conf->{server}{drbd}{'local'}{device}{$device_path}{lv}) { # The backing LV doesn't exist. to_log($conf, {message => "The server wants to use: [$device_path] as a hard drive, but the backing logical volume: [".$conf->{server}{drbd}{'local'}{device}{$device_path}{lv}."] doesn't exist on this node.", 'line' => __LINE__, level => 0, priority => "err"}); exit(5); } else { to_log($conf, {message => "The server wants to use: [$device_path] as a hard drive, which is backed by the logical volume: [".$conf->{server}{drbd}{'local'}{device}{$device_path}{lv}."]. Checking that these are ready.", 'line' => __LINE__, level => 1}); } } # Now read in the status of the drbd devices $return_code = undef; ($return_code, my $status_json) = shell_call($conf, $conf->{path}{exe}{drbdsetup}." status --json"); if ($return_code) { # Something went wrong. to_log($conf, {message => "The attempt to read the DRBD status returned a non-zero code: [$return_code]. The returned output (if any) was: [$status_json].", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } my $json = JSON->new->allow_nonref; my $drbd_status = $json->decode($status_json); # Pull out my data #my $local_disk_state = foreach my $connection_ref (@{$drbd_status->[0]->{connections}}) { } print Dumper $drbd_status; return(0); } # This makes sure that any media in the server's optical drive exists here and is readable. sub validate_storage_optical { my ($conf) = @_; foreach my $file (sort {$a cmp $b} keys %{$conf->{server}{optical}}) { to_log($conf, {message => "Checking that the optical disc image: [$file] exists.", 'line' => __LINE__, level => 2}); # If the file doesn't exist, exit with OCF_ERR_INSTALLED (5). If we can't read it, exit with # OCF_ERR_PERM (4). if (not -e $file) { # It doesn't exist. Exit with OCF_ERR_INSTALLED (5). to_log($conf, {message => "The server has the ISO: [$file] mounted in its optical drive, but that file doesn't exist on this system.", 'line' => __LINE__, level => 0, priority => "err"}); exit(5); } elsif (not -r $file) { # We can't read it. Exit with OCF_ERR_PERM (4). to_log($conf, {message => "The server has the ISO: [$file] mounted in its optical drive, which we have, but we can't read it. Check permissions and for SELinux denials.", 'line' => __LINE__, level => 0, priority => "err"}); exit(4); } else { # We're OK. to_log($conf, {message => "The server has the ISO: [$file] mounted in its optical drive, which we have.", 'line' => __LINE__, level => 2}); } } return(0); } # This verifies that the requested emulator exists and can be used. sub validate_emulator { my ($conf) = @_; # What emulator is this using? my $emulator = $conf->{server}{definition_xml}->{devices}->[0]->{emulator}->[0]; to_log($conf, {message => "emulator: [$emulator]", 'line' => __LINE__, level => 2}); if (not -e $emulator) { # It doesn't exist. Exit with OCF_ERR_INSTALLED (5). to_log($conf, {message => "The server wants to use the emulator: [$emulator] which doesn't exist on this node. Was this server migrated from a different generation Anvil! system? Please update '...' in the server's definition file: [".$conf->{server}{definition_file}."].", 'line' => __LINE__, level => 0, priority => "err"}); exit(5); } if (not -x $emulator) { # We can't execute it. Exit with OCF_ERR_PERM (4). to_log($conf, {message => "The server wants to use the emulator: [$emulator] which exists, but we can't run. Please check permissions and for SELinux denials.", 'line' => __LINE__, level => 0, priority => "err"}); exit(4); } return(0); } # This makes sure the name we see in the definition file matches what we expect. sub validate_name { my ($conf) = @_; my $server = $conf->{environment}{OCF_RESKEY_name}; if ($server ne $conf->{server}{definition_xml}->{name}->[0]) { to_log($conf, {message => "The configured server name: [$server] does not match the name of the server in the definition file: [".$conf->{server}{definition_xml}->{name}."]!", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } return(0); } # This checks that there is enough RAM to run this server. sub validate_ram { my ($conf) = @_; # How mcuh RAM does the server need? my $server_ram_value = $conf->{server}{definition_xml}->{memory}->[0]->{content}; my $server_ram_units = $conf->{server}{definition_xml}->{memory}->[0]->{unit}; to_log($conf, {message => "server_ram_value: [$server_ram_value], server_ram_units: [$server_ram_units].", 'line' => __LINE__, level => 2}); # Convert to bytes my $server_ram_bytes = $server_ram_value; if ($server_ram_units =~ /^k/i) { $server_ram_bytes = ($server_ram_value * (2 ** 10)); } elsif ($server_ram_units =~ /^m/i) { $server_ram_bytes = ($server_ram_value * (2 ** 20)); } elsif ($server_ram_units =~ /^g/i) { $server_ram_bytes = ($server_ram_value * (2 ** 30)); } elsif ($server_ram_units =~ /^t/i) { $server_ram_bytes = ($server_ram_value * (2 ** 40)); } elsif ($server_ram_units =~ /^p/i) { $server_ram_bytes = Math::BigInt->new('2')->bpow('50')->bmul($server_ram_value); } elsif ($server_ram_units =~ /^e/i) { $server_ram_bytes = Math::BigInt->new('2')->bpow('60')->bmul($server_ram_value); } elsif ($server_ram_units =~ /^z/i) { $server_ram_bytes = Math::BigInt->new('2')->bpow('70')->bmul($server_ram_value); } elsif ($server_ram_units =~ /^y/i) { $server_ram_bytes = Math::BigInt->new('2')->bpow('80')->bmul($server_ram_value); } to_log($conf, {message => "server_ram_bytes: [$server_ram_bytes].", 'line' => __LINE__, level => 3}); # How much RAM do we have available? my $available = 0; my ($free_rc, $free_output) = shell_call($conf, $conf->{path}{exe}{free}." --bytes"); foreach my $line (split/\n/, $free_output) { to_log($conf, {message => "line: [$line].", 'line' => __LINE__, level => 3}); if ($line =~ /Mem:\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)$/) { my $total = $1; my $used = $2; my $free = $3; my $shared = $4; my $cache = $5; $available = $6; to_log($conf, {message => "total: [$total], used: [$used], free: [$free], shared: [$shared], cache: [$cache], available: [$available]", 'line' => __LINE__, level => 3}); } } to_log($conf, {message => "server_ram_bytes: [".comma($conf, $server_ram_bytes)." bytes].", 'line' => __LINE__, level => 2}); to_log($conf, {message => "available: ...... [".comma($conf, $available)." bytes].", 'line' => __LINE__, level => 2}); if ($server_ram_bytes > $available) { # Not enough free memory. to_log($conf, {message => "The configured server name: [".$conf->{environment}{OCF_RESKEY_name}."] needs: [".comma($conf, $server_ram_bytes)." bytes] of RAM, but only: [".comma($conf, $available)." bytes] are available!", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } return(0); } # This reads the XML definition data into an XML data hash. sub read_server_definition { my ($conf) = @_; my $server = $conf->{environment}{OCF_RESKEY_name}; my $definition_file = $conf->{path}{config}{definition}; $definition_file =~ s/#!NAME!#/$server/; my $server_xml = ""; to_log($conf, {message => "server: [$server], definition_file: [$definition_file]", 'line' => __LINE__, level => 3}); # If the file doesn't exist, return OCF_ERR_INSTALLED (5). If the file exists but we can't read it, # return OCF_ERR_PERM (4). if (not -e $definition_file) { to_log($conf, {message => "The definition file: [$definition_file] for the server: [$server] does not exist here!", 'line' => __LINE__, level => 0, priority => "err"}); exit(5); } elsif (not -r $definition_file) { to_log($conf, {message => "The definition file: [$definition_file] for the server: [$server] can not be read!", 'line' => __LINE__, level => 0, priority => "err"}); exit(4); } # Still alive? Read it in. my ($definition_xml) = read_file($conf, $definition_file); to_log($conf, {message => "definition_xml: [$definition_xml]", 'line' => __LINE__, level => 3}); my $xml = XML::Simple->new(); eval { $server_xml = $xml->XMLin($definition_xml, KeyAttr => {}, ForceArray => 1) }; if ($@) { chomp $@; my $error = "[ Error ] - The was a problem parsing: [$definition_file]. The error was:\n"; $error .= "===========================================================\n"; $error .= $@."\n"; $error .= "===========================================================\n"; to_log($conf, {message => $error, 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } $conf->{server}{definition_xml} = $server_xml; $conf->{server}{definition_file} = $definition_file; return(0); } # This reads in a file and returns the contents as a single string variable. sub read_file { my ($conf, $file) = @_; my $body = ""; open (my $file_handle, "<".$file) or to_log($conf, {message => "Failed to read: [$file]. The error was: $!", 'line' => __LINE__, level => 0, priority => "err", exit_code => 1}); while(<$file_handle>) { # This should not generate output. chomp; my $line = $_; to_log($conf, {message => "Output: [$line]", 'line' => __LINE__, level => 3}); $body .= $line."\n"; } close $file_handle; to_log($conf, {message => "body: [$body]", 'line' => __LINE__, level => 3}); return($body); } # This makes a system call and returns the return code and the output as string variable. sub shell_call { my ($conf, $shell_call) = @_; $shell_call .= " 2>&1; ".$conf->{path}{exe}{echo}." return_code:\$?"; my $return_code = 9999; my $output = ""; to_log($conf, {message => "Calling: [$shell_call]", 'line' => __LINE__, level => 2}); open (my $file_handle, $shell_call." 2>&1 |") or to_log($conf, {message => "Failed to call: [".$shell_call."]. The error was: $!", 'line' => __LINE__, level => 0, priority => "err", exit_code => 1}); while(<$file_handle>) { # This should not generate output. chomp; my $line = $_; to_log($conf, {message => "line: [$line]", 'line' => __LINE__, level => 3}); if ($line =~ /^return_code:(\d+)$/) { $return_code = $1; to_log($conf, {message => "return_code: [$return_code]", 'line' => __LINE__, level => 3}); next; } $output .= $line."\n"; to_log($conf, {message => "Output: [$line]", 'line' => __LINE__, level => 3}); } close $file_handle; to_log($conf, {message => "return_code: [$return_code], output: [$output]", 'line' => __LINE__, level => 3}); return($return_code, $output); } # This logs the details of this call. sub show_environment { my ($conf, $level) = @_; foreach my $key (sort {$a cmp $b} keys %{$conf->{switches}}) { next if $key eq "raw"; next if $conf->{switches}{$key} eq ""; to_log($conf, {message => "Command line switch: [$key] -> [".$conf->{switches}{$key}."]", 'line' => __LINE__, level => $level}); } foreach my $key (sort {$a cmp $b} keys %{$conf->{environment}}) { next if $conf->{environment}{$key} eq ""; to_log($conf, {message => "OCF Environment variable: [$key] -> [".$conf->{environment}{$key}."]", 'line' => __LINE__, level => $level}); } foreach my $key (sort {$a cmp $b} keys %ENV) { next if exists $conf->{environment}{$key}; to_log($conf, {message => "System Environment variable: [$key] -> [".$ENV{$key}."]", 'line' => __LINE__, level => ($level + 1)}); } return(0); } # This just prints a quick usage message for now. sub show_usage { my ($conf) = @_; print "TODO: How to use this...\n"; exit(0); } # This prints out the metadata and exits. sub show_metadata { my ($conf) = @_; # This is a pretty simple agent, by design. We only take a server name for now. print ' 0.1 This resource agent manages KVM+qemu virtual servers on an Anvil! m3 Intelligent Availability(tm) system. It manages underlying components like DRBD 9 storage resources, brodge connections and so forth. Anvil! m3 server resource agent This is the name of the server as reported by virsh. Server name '; exit(0); } # This gathers command line switches and stores them in 'swithes::'. sub get_switches { my ($conf) = @_; my $last_argument = ""; $conf->{switches}{raw} = ""; foreach my $argument (@ARGV) { to_log($conf, {message => "argument: [$argument]", 'line' => __LINE__, level => 3}); if ($last_argument eq "raw") { # Don't process anything. $conf->{switches}{raw} .= " ".$argument; to_log($conf, {message => "switches::raw: [".$conf->{switches}{raw}."]", 'line' => __LINE__, level => 3}); } elsif ($argument =~ /^-/) { # If the argument is just '--', appeand everything after it to 'raw'. if ($argument eq "--") { $last_argument = "raw"; $conf->{switches}{raw} = ""; to_log($conf, {message => "switches::raw: [".$conf->{switches}{raw}."]", 'line' => __LINE__, level => 3}); } else { ($last_argument) = ($argument =~ /^-{1,2}(.*)/)[0]; to_log($conf, {message => "last_argument: [$last_argument]", 'line' => __LINE__, level => 3}); if ($last_argument =~ /=/) { # Break up the variable/value. ($last_argument, my $value) = (split /=/, $last_argument, 2); $conf->{switches}{$last_argument} = $value; to_log($conf, {message => "switches::${last_argument}: [".$conf->{switches}{$last_argument}."]", 'line' => __LINE__, level => 3}); } else { $conf->{switches}{$last_argument} = "#!SET!#"; to_log($conf, {message => "switches::${last_argument}: [".$conf->{switches}{$last_argument}."]", 'line' => __LINE__, level => 3}); } } } else { if ($last_argument) { $conf->{switches}{$last_argument} = $argument; to_log($conf, {message => "switches::${last_argument}: [".$conf->{switches}{$last_argument}."]", 'line' => __LINE__, level => 3}); $last_argument = ""; to_log($conf, {message => "last_argument: [$last_argument]", 'line' => __LINE__, level => 3}); } else { # Got a value without an argument. That's OK. $conf->{switches}{$argument} = "#!SET!#"; to_log($conf, {message => "switches::${argument}: [".$conf->{switches}{$argument}."]", 'line' => __LINE__, level => 3}); } } } # Clean up the initial space added to 'raw'. to_log($conf, {message => "switches::raw: [".$conf->{switches}{raw}."]", 'line' => __LINE__, level => 3}); if ($conf->{switches}{raw}) { $conf->{switches}{raw} =~ s/^ //; to_log($conf, {message => "switches::raw: [".$conf->{switches}{raw}."]", 'line' => __LINE__, level => 3}); } return(0); } # This adds commas to long numbers. sub comma { my ($conf, $number) = @_; return undef if not defined $number; # Strip out any existing commas. $number =~ s/,//g; # Record and remove the sign, if present. my $sign = ""; if ($number =~ /^\+/) { $number =~ s/^\+//g; $sign = "+"; } elsif ($number =~ /^\-/) { $number =~ s/^\-//g; $sign = "-"; } # Split on the left-most period. my ($whole, $decimal) = split/\./, $number, 2; $whole = "" if not defined $whole; $decimal = "" if not defined $decimal; # Now die if either number has a non-digit character in it. if (($whole =~ /\D/) or ($decimal =~ /\D/)) { to_log($conf, {message => "We were asked to insert commas into a dumber that is not actually a number: [$number]. This is likely a symptom of a larger problem.", 'line' => __LINE__, level => 0, priority => "err"}); exit(1); } local($_) = $whole ? $whole : ""; 1 while s/^(-?\d+)(\d{3})/$1,$2/; $whole = $_; my $return = $decimal ? $whole.".".$decimal : $whole; if ($sign) { $return = $sign.$return; } return ($return); } # Log file entries sub to_log { my ($conf, $parameters) = @_; my $facility = defined $parameters->{facility} ? $parameters->{facility} : $conf->{'log'}{facility}; my $level = defined $parameters->{level} ? $parameters->{level} : 1; my $line = defined $parameters->{'line'} ? $parameters->{'line'} : 0; my $message = defined $parameters->{message} ? $parameters->{message} : ""; my $priority = defined $parameters->{priority} ? $parameters->{priority} : ""; my $exit_code = defined $parameters->{exit_code} ? $parameters->{exit_code} : ""; # Leave if we don't care about this message return if $level > $conf->{'log'}{level}; return if not $message; # Build the message. We log the line if (($conf->{'log'}{line_numbers}) && ($line)) { $message = $line."; ".$message; } my $priority_string = $facility; if ($priority) { $priority_string .= ".".$priority; } elsif ($level eq "0") { $priority_string .= ".notice"; } elsif (($level eq "1") or ($level eq "2")) { $priority_string .= ".info"; } else { $priority_string .= ".debug"; } # Clean up the string for bash $message =~ s/"/\\\"/gs; #$message =~ s/\(/\\\(/gs; my $shell_call = $conf->{path}{exe}{logger}." --priority ".$priority_string." --tag ".$conf->{'log'}{tag}." -- \"".$message."\""; open (my $file_handle, $shell_call." 2>&1 |") or die "Failed to call: [".$shell_call."]. The error was: $!\n"; while(<$file_handle>) { # This should not generate output. chomp; my $line = $_; print "Unexpected logging output: [".$line."]\n"; } close $file_handle; if ($exit_code =~ /^\d+$/) { exit($exit_code); } return(0); } # This checks the given paths and, if something isn't found, it searches PATH trying to find it. sub find_executables { my ($conf) = @_; # Variables. my $check = ""; my $bad = 0; # Log entries can only happen if I've found 'logger', so an extra check will be made on 'to_log' # calls. my @dirs = split/:/, $ENV{PATH}; foreach my $exe (sort {$b cmp $a} keys %{$conf->{path}{exe}}) { if ( not -e $conf->{path}{exe}{$exe} ) { to_log($conf, {message => "The program: [$exe] is not at: [".$conf->{path}{exe}{$exe}."]. Looking for it now...", 'line' => __LINE__, level => 1}); foreach my $path (@dirs) { $check = "$path/$exe"; $check =~ s/\/\//\//g; to_log($conf, {message => "Checking: [$check]", 'line' => __LINE__, level => 2}); if ( -e $check ) { if (-e $conf->{path}{exe}{logger}) { to_log($conf, {message => "Found it! Changed path for: [$exe] from: [".$conf->{path}{exe}{$exe}."] to: [$check]", 'line' => __LINE__, level => 1}); } else { warn "DEBUG: Found it! Changed path for: [$exe] from: [".$conf->{path}{exe}{$exe}."] to: [$check]\n"; } $conf->{path}{exe}{$exe} = $check; } else { to_log($conf, {message => "Not found.", 'line' => __LINE__, level => 2}); } } } else { to_log($conf, {message => "Found!", 'line' => __LINE__, level => 3}); next; } # Make sure it exists now. to_log($conf, {message => "Checking again if: [$exe] is at: [".$conf->{path}{exe}{$exe}."].", 'line' => __LINE__, level => 3}); if (not -e $conf->{path}{exe}{$exe}) { $bad = 1; if (-e $conf->{path}{exe}{logger}) { to_log($conf, {message => "Failed to find executable: [$exe]. Unable to proceed.", 'line' => __LINE__, level => 0}); } else { warn "Failed to find executable: [$exe]. Unable to proceed.\n"; } } } if ($bad) { exit(1); } return(0); }