This fixes a race condition when multiple servers are provisioned at (nearly) the same time.

* In DRBD->get_next_resource(), implemented a "hold" system where the DRBD minor and TCP port(s) returned are marked as being held for one minute, so subsequent calls won't use the same numbers.
* In anvil-daemon, added a check in run_jobs() where only one instance of a given job command will be started per 2-second loop. This should help reduce the chance of simultaneous race conditions in general.
* Removed from anvil-provision-server and most other tools the call to Job->get_job_uuid(). If the program is called without the job_uuid, don't try to find it. This allows a human (or script) to make repeated calls to a program without one of those calls running a pending job instead.

Signed-off-by: digimer <mkelly@alteeve.ca>
main
digimer 2 years ago
parent e7537b0ca3
commit 895f1ec262
  1. 3
      Anvil/Tools/Cluster.pm
  2. 138
      Anvil/Tools/DRBD.pm
  3. 3
      notes
  4. 2
      share/words.xml
  5. 8
      tools/anvil-boot-server
  6. 22
      tools/anvil-daemon
  7. 8
      tools/anvil-delete-server
  8. 13
      tools/anvil-download-file
  9. 5
      tools/anvil-manage-power
  10. 8
      tools/anvil-manage-server
  11. 8
      tools/anvil-manage-server-storage
  12. 8
      tools/anvil-migrate-server
  13. 13
      tools/anvil-provision-server
  14. 8
      tools/anvil-rename-server
  15. 8
      tools/anvil-safe-stop
  16. 8
      tools/anvil-shutdown-server
  17. 8
      tools/anvil-sync-shared
  18. 8
      tools/anvil-update-system
  19. 8
      tools/striker-boot-machine

@ -143,6 +143,9 @@ sub add_server
if (exists $anvil->data->{cib}{parsed}{cib}{resources}{primitive}{$server_name}{type})
{
# The server already exists
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
"cib::parsed::cib::resources::primitive::${server_name}::type" => $anvil->data->{cib}{parsed}{cib}{resources}{primitive}{$server_name}{type},
}});
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "err", key => "error_0213", variables => { server_name => $server_name }});
return("!!error!!");
}

@ -1921,6 +1921,75 @@ ORDER BY
}
else
{
# See if this minor is held by someone.
my $variable_name = "drbd::hold::minor::".$free_minor."::until";
my ($variable_value, $variable_uuid, undef) = $anvil->Database->read_variable({
debug => $debug,
variable_name => $variable_name,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
's1:variable_name' => $variable_name,
's2:variable_value' => $variable_value,
's3:variable_uuid' => $variable_uuid,
}});
if (($variable_value) && ($variable_value !~ /^\d+$/))
{
# Bad value, clear it.
$variable_uuid = $anvil->Database->insert_or_update_variables({
debug => $debug,
variable_uuid => $variable_uuid,
variable_value => "0",
update_value_only => "",
});
$variable_value = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
variable_uuid => $variable_uuid,
variable_value => $variable_value
}});
}
# A hold record exists for this minor. The stored value is the unix time the hold lasts *until*
# (holds are written as 'time+60'), so the hold is still active while the current time is before
# that value, and it is stale once the current time has passed it.
if ($variable_uuid)
{
my $now_time = time;
# An empty or cleared value is treated as '0' (no hold) so the arithmetic below never operates on
# a non-numeric string.
my $hold_until = $variable_value ? $variable_value : 0;
# NOTE: 'age' is negative while the hold is active; it is the number of seconds since the hold
# expired.
my $age = $now_time - $hold_until;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
age => $age,
now_time => $now_time,
}});
if ($hold_until > $now_time)
{
# The hold has not expired yet, someone else has claimed this minor. Move on.
$free_minor++;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { free_minor => $free_minor }});
next;
}
else
{
# Either the hold has expired or it is invalid, clear it.
$variable_uuid = $anvil->Database->insert_or_update_variables({
debug => $debug,
variable_uuid => $variable_uuid,
variable_value => "0",
update_value_only => "",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { variable_uuid => $variable_uuid }});
}
}
# To prevent race conditions, put a one minute hold on the minor number.
$variable_uuid = $anvil->Database->insert_or_update_variables({
debug => $debug,
variable_name => $variable_name,
variable_value => time+60,
variable_default => "0",
variable_description => "striker_0301",
variable_section => "hold",
variable_source_uuid => "NULL",
variable_source_table => "",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
$looking = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { looking => $looking }});
}
@ -1956,6 +2025,74 @@ ORDER BY
}
else
{
# See if this TCP port is held by someone.
my $variable_name = "drbd::hold::tcp_port::".$check_port."::until";
my ($variable_value, $variable_uuid, undef) = $anvil->Database->read_variable({
debug => $debug,
variable_name => $variable_name,
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
's1:variable_name' => $variable_name,
's2:variable_value' => $variable_value,
's3:variable_uuid' => $variable_uuid,
}});
if (($variable_value) && ($variable_value !~ /^\d+$/))
{
# Bad value, clear it.
$variable_uuid = $anvil->Database->insert_or_update_variables({
debug => $debug,
variable_uuid => $variable_uuid,
variable_value => "0",
update_value_only => "",
});
$variable_value = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
variable_uuid => $variable_uuid,
variable_value => $variable_value
}});
}
# A hold record exists for this TCP port. The stored value is the unix time the hold lasts *until*
# (holds are written as 'time+60'), so the hold is still active while the current time is before
# that value, and it is stale once the current time has passed it.
if ($variable_uuid)
{
my $now_time = time;
# An empty or cleared value is treated as '0' (no hold) so the arithmetic below never operates on
# a non-numeric string.
my $hold_until = $variable_value ? $variable_value : 0;
# NOTE: 'age' is negative while the hold is active; it is the number of seconds since the hold
# expired.
my $age = $now_time - $hold_until;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
age => $age,
now_time => $now_time }});
if ($hold_until > $now_time)
{
# The hold has not expired yet, someone else has claimed this port. Move on.
$check_port++;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { check_port => $check_port }});
next;
}
else
{
# Either the hold has expired or it is invalid, clear it.
$variable_uuid = $anvil->Database->insert_or_update_variables({
debug => $debug,
variable_uuid => $variable_uuid,
variable_value => "0",
update_value_only => "",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { variable_uuid => $variable_uuid }});
}
}
# To prevent a race condition, put a one minute hold on this port number.
$variable_uuid = $anvil->Database->insert_or_update_variables({
debug => $debug,
variable_name => $variable_name,
variable_value => time+60,
variable_default => "0",
variable_description => "striker_0301",
variable_section => "hold",
variable_source_uuid => "NULL",
variable_source_table => "",
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { variable_uuid => $variable_uuid }});
# This is a free port.
$free_ports .= $check_port.",";
$port_count++;
@ -1977,6 +2114,7 @@ ORDER BY
}
}
# Mark these ports as assigned.
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
free_minor => $free_minor,
free_ports => $free_ports,

@ -20,6 +20,9 @@ Common queries;
* SELECT a.scan_hardware_uuid, b.host_name, a.scan_hardware_cpu_cores AS cores, a.scan_hardware_cpu_threads AS threads, pg_size_pretty(a.scan_hardware_ram_total) AS ram_total, pg_size_pretty(a.scan_hardware_memory_total) AS memory_total, pg_size_pretty(a.scan_hardware_memory_free) AS memory_free FROM scan_hardware a, hosts b WHERE a.scan_hardware_host_uuid = b.host_uuid ORDER BY b.host_name ASC;
for lv in $(lvscan | grep deploy| awk '{print $2}' | sed s/\'//g); do lvremove -y $lv; done; rm -f /etc/drbd.d/an-test-deploy*; lvscan; ls -lah /etc/drbd.d/
# Fail a resource for testing purposes.
crm_resource --fail --resource srv02-b -N vm-a01n01

@ -2371,6 +2371,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0734">The DR host: [#!variable!host!#] as been linked to the Anvil! node: [#!variable!anvil!#].</key>
<key name="log_0735">The DR host: [#!variable!host!#] as been _unlinked_ to the Anvil! node: [#!variable!anvil!#].</key>
<key name="log_0736">The DR host: [#!variable!host!#] was not linked to the Anvil! node: [#!variable!anvil!#], nothing to do.</key>
<key name="log_0737">The job: [#!variable!command!#] (with job UUID: [#!variable!job_uuid!#]) is being skipped for now, already started a job (started job_uuid: [#!variable!started_job!#]) with this command on this loop.</key>
<!-- Messages for users (less technical than log entries), though sometimes used for logs, too. -->
<key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key>
@ -3213,6 +3214,7 @@ If you are comfortable that the target has changed for a known reason, you can s
<key name="striker_0298">TCP Port</key>
<key name="striker_0299">Migration Network link #!variable!number!#</key>
<key name="striker_0300">This is where you configure the optional network dedicated to RAM-copy during live migrations.</key>
<key name="striker_0301">This puts a temporary hold on a DRBD minor number or TCP port so that it isn't used again in the time between when it was queried as the next free number, and before it can be used.</key>
<!-- These are generally units and appended to numbers -->
<key name="suffix_0001">#!variable!number!#/sec</key>

@ -43,14 +43,6 @@ if (not $anvil->data->{sys}{database}{connections})
$anvil->nice_exit({exit_code => 1});
}
# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
if ($anvil->data->{switches}{'job-uuid'})
{
# Load the job data.

@ -1481,6 +1481,28 @@ sub run_jobs
updated_seconds_ago => $updated_seconds_ago,
}});
# To minimize the chance of race conditions, any given command will be called only once at a
# time. If two jobs of the same command exist, only one will be called.
if ($job_progress != 100)
{
my $short_command = $job_command;
$short_command =~ s/\s.*$//;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { short_command => $short_command }});
if (exists $anvil->data->{sys}{started}{$short_command})
{
# Skip it.
my $started_job = $anvil->data->{sys}{started}{$short_command};
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0737", variables => {
started_job => $started_job,
job_uuid => $job_uuid,
command => $short_command,
}});
next;
}
$anvil->data->{sys}{started}{$short_command} = $job_uuid;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "sys::started::${short_command}" => $anvil->data->{sys}{started}{$short_command} }});
}
# If this is a start-up call, only start jobs whose status is 'anvil_startup'.
if (($startup) && ($job_status ne "anvil_startup"))
{

@ -54,14 +54,6 @@ if (not $anvil->data->{sys}{database}{connections})
$anvil->data->{sys}{anvil_uuid} = $anvil->Cluster->get_anvil_uuid();
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'sys::anvil_uuid' => $anvil->data->{sys}{anvil_uuid} }});
# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({debug => 2, program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
# If we still don't have a job-uuid, go into interactive mode.
if ($anvil->data->{switches}{'job-uuid'})
{

@ -105,19 +105,6 @@ sub get_job_details
{
my ($anvil) = @_;
# If I don't have a job-uuid, see if any jobs are pending
if (not $anvil->data->{switches}{'job-uuid'})
{
my $job_uuid = $anvil->Job->get_job_uuid({debug => 2, program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
if ($anvil->Validate->uuid({uuid => $job_uuid}))
{
# Got one!
$anvil->data->{switches}{'job-uuid'} = $job_uuid;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'} }});
}
}
# If we've got a job-uuid, load the details.
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'switches::job-uuid' => $anvil->data->{switches}{'job-uuid'} }});
if ($anvil->data->{switches}{'job-uuid'})

@ -228,11 +228,6 @@ sub do_poweroff
$job_uuid = $anvil->data->{switches}{'job-uuid'};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
}
else
{
$job_uuid = $anvil->Job->get_job_uuid({debug => 2, program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_uuid => $job_uuid }});
}
# Make sure the 'reboot needed' flag is set. When 'anvil-daemon' starts, it will use this to confirm
# that it is starting post-reboot and clear it.

@ -52,14 +52,6 @@ if (not $anvil->data->{sys}{database}{connections})
$anvil->nice_exit({exit_code => 1});
}
# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
# If we still don't have a job-uuid, go into interactive mode.
if ($anvil->data->{switches}{'job-uuid'})
{

@ -70,14 +70,6 @@ if (not $anvil->data->{sys}{database}{connections})
$anvil->nice_exit({exit_code => 1});
}
# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
$anvil->Database->get_hosts();
$anvil->Database->get_anvils();
$anvil->Database->get_servers();

@ -59,14 +59,6 @@ if (not $anvil->data->{sys}{database}{connections})
$anvil->nice_exit({exit_code => 1});
}
# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
if ($anvil->data->{switches}{'job-uuid'})
{
# Load the job data.

@ -66,14 +66,6 @@ if (not $anvil->data->{sys}{database}{connections})
$anvil->nice_exit({exit_code => 1});
}
# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
# If we still don't have a job-uuid, go into interactive mode.
if ($anvil->data->{switches}{'job-uuid'})
{
@ -354,7 +346,10 @@ sub add_server_to_cluster
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0207"});
# Is our peer in the cluster? For that matter, are we?
my $problem = $anvil->Cluster->add_server({server_name => $anvil->data->{job}{server_name}});
my $problem = $anvil->Cluster->add_server({
debug => 2,
server_name => $anvil->data->{job}{server_name},
});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { problem => $problem }});
if ($problem)
{

@ -54,14 +54,6 @@ if (not $anvil->data->{sys}{database}{connections})
$anvil->nice_exit({exit_code => 1});
}
# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
# If we still don't have a job-uuid, go into interactive mode.
if ($anvil->data->{switches}{'job-uuid'})
{

@ -74,14 +74,6 @@ if (not $anvil->data->{sys}{database}{connections})
$anvil->nice_exit({exit_code => 1});
}
# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
# If we still don't have a job-uuid, go into interactive mode.
if ($anvil->data->{switches}{'job-uuid'})
{

@ -53,14 +53,6 @@ if (not $anvil->data->{sys}{database}{connections})
$anvil->nice_exit({exit_code => 1});
}
# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
if ($anvil->data->{switches}{'job-uuid'})
{
# Load the job data.

@ -40,14 +40,6 @@ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "
$anvil->Database->connect;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 2, key => "log_0132"});
# If we don't have a job-uuid, look for one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({debug => 2, program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
# If we have a job-uuid, process it.
if ($anvil->data->{switches}{'job-uuid'})
{

@ -54,14 +54,6 @@ if (not $anvil->data->{sys}{database}{connections})
$anvil->nice_exit({exit_code => 1});
}
# Did we get called with a job UUID? If not, try to find a pending job and take it.
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
if (not $anvil->data->{switches}{'job-uuid'})
{
# See if a job is waiting to run.
$anvil->data->{switches}{job_uuid} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
if ($anvil->data->{switches}{'job-uuid'})
{
# Load the job details. If anything is returned, there was a problem.

@ -48,14 +48,6 @@ if (not $anvil->data->{sys}{database}{connections})
$anvil->nice_exit({exit_code => 1});
}
# If we don't have a job UUID, try to find one.
if (not $anvil->data->{switches}{'job-uuid'})
{
# Load the job data.
$anvil->data->{switches}{'job-uuid'} = $anvil->Job->get_job_uuid({program => $THIS_FILE});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "switches::job-uuid" => $anvil->data->{switches}{'job-uuid'} }});
}
if ($anvil->data->{switches}{'job-uuid'})
{
# Load the job data.

Loading…
Cancel
Save