diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm
index 3f737456..4408d171 100644
--- a/Anvil/Tools/Cluster.pm
+++ b/Anvil/Tools/Cluster.pm
@@ -236,9 +236,12 @@ sub add_server
 	
 	### NOTE: 'INFINITY' doesn't work in some cases, so we set 1 day timeouts. If windows can't install
 	###       an OS update in 24 hours, there's probably deeper issues.
+	### 
+	### NOTE: If you update this command, check that scan-cluster->check_resources() is also updated!
+	### 
 	### TODO: If the target_role is 'started' because the server was running, we may need to later do an
 	###       update to set it to 'stopped' after we've verified it's in the cluster below.
-	my $resource_command = $anvil->data->{path}{exe}{pcs}." resource create ".$server_name." ocf:alteeve:server name=\"".$server_name."\" meta allow-migrate=\"true\" target-role=\"".$target_role."\" op monitor interval=\"60\" start timeout=\"60\" on-fail=\"block\" stop timeout=\"300\" on-fail=\"block\" migrate_to timeout=\"600\" on-fail=\"block\"";
+	my $resource_command = $anvil->data->{path}{exe}{pcs}." resource create ".$server_name." ocf:alteeve:server name=\"".$server_name."\" meta allow-migrate=\"true\" target-role=\"".$target_role."\" op monitor interval=\"60\" start timeout=\"60\" on-fail=\"block\" stop timeout=\"300\" on-fail=\"block\" migrate_to timeout=\"600\" on-fail=\"block\" migrate_from timeout=\"600\" on-fail=\"block\"";
 	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { resource_command => $resource_command }});
 	
 	my ($output, $return_code) = $anvil->System->call({shell_call => $resource_command});
diff --git a/scancore-agents/scan-cluster/scan-cluster b/scancore-agents/scan-cluster/scan-cluster
index ccef1f10..fb62d06f 100755
--- a/scancore-agents/scan-cluster/scan-cluster
+++ b/scancore-agents/scan-cluster/scan-cluster
@@ -99,7 +99,7 @@ check_config($anvil);
 # Check the fence delay
 check_fence_delay($anvil);
 
-# Check for failed resources
+# Check for failed resources or resources that need updates
 check_resources($anvil);
 
 # Shut down.
@@ -114,7 +114,10 @@ $anvil->ScanCore->agent_shutdown({agent => $THIS_FILE});
 sub check_resources
 {
 	my ($anvil) = @_;
-	
+	
+	my ($problem) = $anvil->Cluster->parse_cib({debug => 2});
+	$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
+	
 	foreach my $server (sort {$a cmp $b} keys %{$anvil->data->{crm_mon}{parsed}{'pacemaker-result'}{resources}{resource}})
 	{
 		# This is used for alerts, if needed below.
@@ -269,6 +272,45 @@ sub check_resources
 			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "scan_cluster_alert_0015", variables => $variables});
 			$anvil->Alert->register({debug => 2, alert_level => "notice", message => "scan_cluster_alert_0015", variables => $variables, set_by => $THIS_FILE});
 		}
+		
+		# Check to see if server's parameters need updating
+		$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server => $server }});
+		my $update = 0;
+		foreach my $op (sort {$a cmp $b} keys %{$anvil->data->{cib}{parsed}{cib}{resources}{primitive}{$server}{operations}{op}})
+		{
+			my $name = $anvil->data->{cib}{parsed}{cib}{resources}{primitive}{$server}{operations}{op}{$op}{name};
+			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { name => $name }});
+			if (($name eq "migrate_to") or ($name eq "migrate_from"))
+			{
+				my $on_fail = exists $anvil->data->{cib}{parsed}{cib}{resources}{primitive}{$server}{operations}{op}{$op}{'on-fail'} ? $anvil->data->{cib}{parsed}{cib}{resources}{primitive}{$server}{operations}{op}{$op}{'on-fail'} : "";
+				my $timeout = exists $anvil->data->{cib}{parsed}{cib}{resources}{primitive}{$server}{operations}{op}{$op}{timeout}   ? $anvil->data->{cib}{parsed}{cib}{resources}{primitive}{$server}{operations}{op}{$op}{timeout}   : 0;
+				$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
+					on_fail => $on_fail,
+					timeout => $timeout, 
+				}});
+				if (($on_fail ne "block") or ($timeout != 600))
+				{
+					$update = 1;
+					$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { update => $update }});
+				}
+			}
+		}
+		
+		if ($update)
+		{
+			
+			my $resource_command = $anvil->data->{path}{exe}{pcs}." resource update ".$server." ocf:alteeve:server name=\"".$server."\" meta allow-migrate=\"true\" op monitor interval=\"60\" start timeout=\"60\" on-fail=\"block\" stop timeout=\"300\" on-fail=\"block\" migrate_to timeout=\"600\" on-fail=\"block\" migrate_from timeout=\"600\" on-fail=\"block\"";
+			$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0739", variables => { 
+				server  => $server,
+				command => $resource_command, 
+			}});
+			
+			my ($output, $return_code) = $anvil->System->call({shell_call => $resource_command});
+			$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 
+				output      => $output,
+				return_code => $return_code, 
+			}});
+		}
 	}
 }
 
diff --git a/share/words.xml b/share/words.xml
index e6e93ac0..e735f11a 100644
--- a/share/words.xml
+++ b/share/words.xml
@@ -972,8 +972,8 @@ resource #!variable!server!# {
 Link State
 Duplex
 Link Drops
-
-
+
+
 Table
 public
 history
@@ -2389,6 +2389,7 @@
 The file: [#!variable!file!#] needs to be updated. The difference is:
 The DR host: [#!variable!host!#] was not linked to the Anvil! node: [#!variable!anvil!#], nothing to do.
 The job: [#!variable!command!#] (with job UUID: [#!variable!job_uuid!#]) is being skipped for now, already started a job (started job_uuid: [#!variable!started_job!#]) with this command on this loop.
 There are no databases available at this time.
+The server: [#!variable!server!#] needs its Pacemaker configuration updated. Running: [#!variable!command!#].
 The host name: [#!variable!target!#] does not resolve to an IP address.
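
Note on the update check: the decision logic added to check_resources() above is small enough to exercise on its own. The following is a minimal standalone Perl sketch of that logic, not part of the patch itself; it assumes a simplified hash in place of Anvil's parsed CIB data (the op keys and the "srv01-test" name are made up for illustration), where the real agent reads $anvil->data->{cib}{parsed}{...} populated by Cluster->parse_cib().

	#!/usr/bin/perl
	# Standalone sketch of the migrate_to/migrate_from update check.
	use strict;
	use warnings;
	
	# Hypothetical op data for one server resource, keyed like the parsed CIB.
	my $ops = {
		'srv01-test_monitor_60'     => { name => 'monitor',      timeout => 60,  'on-fail' => 'block' },
		'srv01-test_migrate_to_0'   => { name => 'migrate_to',   timeout => 600, 'on-fail' => 'block' },
		'srv01-test_migrate_from_0' => { name => 'migrate_from', timeout => 120, 'on-fail' => ''      },
	};
	
	my $update = 0;
	foreach my $op (sort {$a cmp $b} keys %{$ops})
	{
		# Only the live-migration operations are checked.
		my $name = $ops->{$op}{name};
		next unless (($name eq "migrate_to") or ($name eq "migrate_from"));
		
		my $on_fail = exists $ops->{$op}{'on-fail'} ? $ops->{$op}{'on-fail'} : "";
		my $timeout = exists $ops->{$op}{timeout}   ? $ops->{$op}{timeout}   : 0;
		
		# Flag an update if either op is missing 'on-fail=block' or the 600s timeout.
		$update = 1 if (($on_fail ne "block") or ($timeout != 600));
	}
	
	# The agent rebuilds the full 'pcs resource update' command when this is set;
	# here we only report the decision.
	print "update needed: ".($update ? "yes" : "no")."\n";

In the real agent, the rebuilt 'pcs resource update' command mirrors the 'pcs resource create' command in Cluster.pm's add_server(), which is why the new NOTE in that file warns to keep the two in sync.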