The main purpose of this commit is to add serial execution of similar jobs to help reduce race conditions for scripted jobs, like multiple server creation.

* Fixed a small logging bug in DRBD->allow_two_primaries().
* Updated Database->get_jobs() to record jobs sorted by modified_date so that jobs can be run in the order they were recorded.
* Updated anvil-daemon to track which commands need to be run, and when two or more of the same command need to be run, they're run serially, with each subsequent run starting after the previous one completes.

Signed-off-by: digimer <mkelly@alteeve.ca>
main
digimer 2 years ago
parent 38d088a998
commit 47f7a35df3
  1. 2
      Anvil/Tools/DRBD.pm
  2. 28
      Anvil/Tools/Database.pm
  3. 1
      share/words.xml
  4. 146
      tools/anvil-daemon

@ -210,7 +210,7 @@ sub allow_two_primaries
}
my $key = $set_to eq "yes" ? "log_0350" : "log_0642";
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, key => "", variables => {
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level => 1, key => $key, variables => {
resource => $resource,
target_name => $peer_name,
target_node_id => $target_node_id,

@ -4648,6 +4648,10 @@ sub get_jobs
{
delete $anvil->data->{jobs}{running};
}
if (exists $anvil->data->{jobs}{modified_date})
{
delete $anvil->data->{jobs}{modified_date};
}
my $query = "
SELECT
@ -4757,6 +4761,30 @@ WHERE
"jobs::running::${job_uuid}::job_status" => $anvil->data->{jobs}{running}{$job_uuid}{job_status},
"jobs::running::${job_uuid}::modified_date" => $anvil->data->{jobs}{running}{$job_uuid}{modified_date},
}});
# Make it possible to sort by modified date for serial execution of similar jobs.
$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_command} = $job_command;
$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_data} = $job_data;
$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_picked_up_by} = $job_picked_up_by;
$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_picked_up_at} = $job_picked_up_at;
$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_updated} = $job_updated;
$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_name} = $job_name;
$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_progress} = $job_progress;
$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_title} = $job_title;
$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_description} = $job_description;
$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_status} = $job_status;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => {
"jobs::modified_date::${modified_date}::job_uuid::${job_uuid}::job_command" => $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_command},
"jobs::modified_date::${modified_date}::job_uuid::${job_uuid}::job_data" => $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_data},
"jobs::modified_date::${modified_date}::job_uuid::${job_uuid}::job_picked_up_by" => $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_picked_up_by},
"jobs::modified_date::${modified_date}::job_uuid::${job_uuid}::job_picked_up_at" => $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_picked_up_at},
"jobs::modified_date::${modified_date}::job_uuid::${job_uuid}::job_updated" => $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_updated},
"jobs::modified_date::${modified_date}::job_uuid::${job_uuid}::job_name" => $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_name},
"jobs::modified_date::${modified_date}::job_uuid::${job_uuid}::job_progress" => $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_progress},
"jobs::modified_date::${modified_date}::job_uuid::${job_uuid}::job_title" => $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_title},
"jobs::modified_date::${modified_date}::job_uuid::${job_uuid}::job_description" => $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_description},
"jobs::modified_date::${modified_date}::job_uuid::${job_uuid}::job_status" => $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_status},
}});
}
my $return_count = @{$return};

@ -2392,6 +2392,7 @@ The file: [#!variable!file!#] needs to be updated. The difference is:
<key name="log_0738">There are no databases available at this time.</key>
<key name="log_0739">The server: [#!variable!server!#] needs it's pacemaker configuration updated. Running: [#!variable!command!#].</key>
<key name="log_0740">Running the scan-agent: [#!variable!agent!#] now to ensure that the database has an updated view of resources.</key>
<key name="log_0741">I was about to start: [#!variable!command!#] with the job UUID: [#!variable!this_job_uuid!#]. However, another job using the same command with the job UUID: [#!variable!other_job_uuid!#]. To avoid race conditions, only one process with a given command is run at the same time.</key>
<!-- Messages for users (less technical than log entries), though sometimes used for logs, too. -->
<key name="message_0001">The host name: [#!variable!target!#] does not resolve to an IP address.</key>

@ -1412,51 +1412,53 @@ sub run_jobs
# Get a list of pending or incomplete jobs.
my $ended_within = $startup ? 1 : 300;
my $return = $anvil->Database->get_jobs({ended_within => $ended_within});
my $count = @{$return};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
ended_within => $ended_within,
'return' => $return,
count => $count,
}});
foreach my $hash_ref (@{$return})
{
my $job_uuid = $hash_ref->{job_uuid};
my $job_command = $hash_ref->{job_command};
my $job_data = $hash_ref->{job_data};
my $job_picked_up_by = $hash_ref->{job_picked_up_by};
my $job_picked_up_at = $hash_ref->{job_picked_up_at};
my $job_updated = $hash_ref->{job_updated};
my $job_name = $hash_ref->{job_name};
my $job_progress = $hash_ref->{job_progress};
my $job_title = $hash_ref->{job_title};
my $job_description = $hash_ref->{job_description};
my $job_status = $hash_ref->{job_status};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { ended_within => $ended_within }});
$anvil->Database->get_jobs({ended_within => $ended_within});
foreach my $modified_date (sort {$a cmp $b} keys %{$anvil->data->{jobs}{modified_date}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { modified_date => $modified_date }});
foreach my $job_uuid (sort {$a cmp $b} keys %{$anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}})
{
# Reload the jobs so we get an updated view of them.
$anvil->Database->get_jobs({ended_within => $ended_within});
# Collect the data.
my $job_command = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_command};
my $short_command = $job_command;
$short_command =~ s/\s.*$//;
my $job_data = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_data};
my $job_picked_up_by = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_picked_up_by};
my $job_picked_up_at = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_picked_up_at};
my $job_updated = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_updated};
my $job_name = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_name};
my $job_progress = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_progress};
my $job_title = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_title};
my $job_description = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_description};
my $job_status = $anvil->data->{jobs}{modified_date}{$modified_date}{job_uuid}{$job_uuid}{job_status};
my $started_seconds_ago = $job_picked_up_at ? (time - $job_picked_up_at) : 0;
my $updated_seconds_ago = $job_updated ? (time - $job_updated) : 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
job_uuid => $job_uuid,
job_command => $job_command,
job_data => $job_data,
job_picked_up_by => $job_picked_up_by,
job_picked_up_at => $job_picked_up_at,
job_updated => $job_updated,
job_name => $job_name,
job_progress => $job_progress,
job_title => $job_title,
job_description => $job_description,
job_status => $job_status,
started_seconds_ago => $started_seconds_ago,
updated_seconds_ago => $updated_seconds_ago,
's01:job_uuid' => $job_uuid,
's02:job_command' => $job_command,
's03:short_command' => $short_command,
's04:job_data' => $job_data,
's05:job_picked_up_by' => $job_picked_up_by,
's06:job_picked_up_at' => $job_picked_up_at,
's07:job_updated' => $job_updated,
's08:job_name' => $job_name,
's09:job_progress' => $job_progress,
's10:job_title' => $job_title,
's11:job_description' => $job_description,
's12:job_status' => $job_status,
's13:started_seconds_ago' => $started_seconds_ago,
's14:updated_seconds_ago' => $updated_seconds_ago,
}});
# To minimize the chance of race conditions, any given command will be called only once at a
# time. If two jobs of the same command exist, only one will be called.
# To minimize the chance of race conditions, any given command will be called only
# once at a time. If two jobs of the same command exist, only one will be called.
if ($job_progress != 100)
{
my $short_command = $job_command;
$short_command =~ s/\s.*$//;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { short_command => $short_command }});
if (exists $anvil->data->{sys}{started}{$short_command})
{
# Skip it.
@ -1482,7 +1484,19 @@ sub run_jobs
next;
}
if ($job_progress ne "100")
if ($job_progress == 100)
{
# This is a job that might have just completed, clear the started value.
$anvil->data->{jobs}{$job_uuid}{started} = 0;
$job_picked_up_at = 0;
$job_picked_up_by = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
job_picked_up_at => $job_picked_up_at,
job_picked_up_by => $job_picked_up_by,
"jobs::${job_uuid}::started" => $anvil->data->{jobs}{$job_uuid}{started},
}});
}
else
{
$anvil->data->{sys}{jobs_running} = 1;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "sys::jobs_running" => $anvil->data->{sys}{jobs_running} }});
@ -1501,9 +1515,9 @@ sub run_jobs
# If the job is done, just clear the 'job_picked_up_by' and be done.
if ($job_progress ne "100")
{
# It's possible that the job updated to 100% and exited after we
# gathered the job data, so we won't restart until we've seen it not
# running and not at 100% after 5 loops.
# It's possible that the job updated to 100% and exited after
# we gathered the job data, so we won't restart until we've
# seen it not running and not at 100% after 5 loops.
if ((not exists $anvil->data->{lost_job_count}{$job_uuid}) or (not defined $anvil->data->{lost_job_count}{$job_uuid}))
{
$anvil->data->{lost_job_count}{$job_uuid} = 0;
@ -1511,7 +1525,8 @@ sub run_jobs
}
if ($anvil->data->{lost_job_count}{$job_uuid} > 5)
{
# The previous job is gone, but the job isn't finished. Start it again.
# The previous job is gone, but the job isn't
# finished. Start it again.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, priority => "alert", key => "warning_0007", variables => {
command => $job_command,
pid => $job_picked_up_by,
@ -1542,6 +1557,12 @@ sub run_jobs
$job_picked_up_by = 0;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { job_picked_up_by => $job_picked_up_by }});
}
elsif ($job_progress ne "100")
{
# The job is running.
$anvil->data->{jobs_started}{$short_command} = $job_uuid;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs_started::${short_command}" => $anvil->data->{jobs_started}{$short_command} }});
}
}
# Convert the double-banged strings into a proper message.
@ -1590,7 +1611,7 @@ sub run_jobs
$jobs_file .= $json_string.",\n";
# If the job is done, move on.
next if $job_progress eq "100";
next if $job_progress == 100;
next if $anvil->data->{switches}{'no-start'};
# If 'startup' is set, we only care if 'job_status' is 'anvil_startup'
@ -1604,9 +1625,39 @@ sub run_jobs
next;
}
# If the job is not running, start it.
# If the job is not running, and we've not started any other of the same command this
# loop, start it.
if (not $job_picked_up_by)
{
if (exists $anvil->data->{jobs_started}{$short_command})
{
# Is the job_uuid associated with this command done?
my $started_job_uuid = $anvil->data->{jobs_started}{$short_command};
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { started_job_uuid => $started_job_uuid }});
if (exists $anvil->data->{jobs}{running}{$started_job_uuid})
{
if ($anvil->data->{jobs}{running}{$started_job_uuid}{job_progress} != 100)
{
# Don't start it in this pass.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0741", variables => {
command => $job_command,
this_job_uuid => $job_uuid,
other_job_uuid => $started_job_uuid,
}});
next;
}
else
{
# The previous job is done, delete it.
$anvil->data->{jobs_started}{$short_command} = "";
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"jobs_started::${short_command}" => $anvil->data->{jobs_started}{$short_command},
}});
}
}
}
my $command = $job_command." --job-uuid ".$job_uuid;
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 1, key => "log_0210", variables => { command => $command }});
@ -1648,6 +1699,11 @@ sub run_jobs
# Record that we've tried to start this job, so that we don't try to restart it for any reason for at least a minute.
$anvil->data->{jobs}{$job_uuid}{started} = time;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { 'jobs::$job_uuid::started' => $anvil->data->{jobs}{$job_uuid}{started} }});
# Record that a job with this command has started
$anvil->data->{jobs_started}{$short_command} = $job_uuid;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "jobs_started::${short_command}" => $anvil->data->{jobs_started}{$short_command} }});
}
}
}

Loading…
Cancel
Save