* Reworked the anvil-join-anvil job parsing to help diagnose occasional faults. Also changed a fatal parse error to one that allows the run to be retried.

Signed-off-by: Digimer <digimer@alteeve.ca>
main
Digimer 4 years ago
parent 0aac9ead8c
commit 5b4bfa747c
  1. 4
      ocf/alteeve/server
  2. 6
      share/words.xml
  3. 54
      tools/anvil-join-anvil

@ -199,13 +199,13 @@ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 0, level
foreach my $key (sort {$a cmp $b} keys %{$anvil->data->{environment}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => {
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"environment::${key}" => $anvil->data->{environment}{$key},
}});
}
foreach my $key (sort {$a cmp $b} keys %{$anvil->data->{switches}})
{
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => {
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {
"switches::${key}" => $anvil->data->{switches}{$key},
}});
}

@ -419,6 +419,7 @@ The attempt to start the servers appears to have failed. The return code '0' was
<key name="error_0305">Unable to connect to the database, unable to manage a server at this time.</key>
<key name="error_0306">Unable to connect to the database, unable to provision a server at this time.</key>
<key name="error_0307">Failed to perform requested task(s) because the requester is not authenticated.</key>
<key name="error_0308"><![CDATA[[ Error ] - The Job: [#!variable!job-uuid!#] appears to have passed malformed data. The raw data was: [#!variable!raw!#]. Expected 'as_machine=<host_type>,manifest_uuid=<manifest_uuid>,anvil_uuid=<anvil_uuid>'. Either the parse failed, or the data was somehow invalid.]]></key>
<!-- Files templates -->
<!-- NOTE: Translating these files requires an understanding of which lines are translatable -->
@ -806,7 +807,7 @@ Failure! The return code: [#!variable!return_code!#] was received ('0' was expec
<key name="job_0073">This machine will join an #!string!brand_0006!# as a node or DR host. The role and #!string!brand_0006!# will be determined by the associated Install Manifest UUID.</key>
<key name="job_0074">'Join #!string!brand_0002!#' job: [#!variable!job-uuid!#] picked up.</key>
<key name="job_0075">This will become: [#!variable!machine!#] using data from the install manifest UUID: [#!variable!manifest_uuid!#].</key>
<key name="job_0076">[ Error ] - Failed to load and parse the install manifest. Details will be found in the logs.</key>
<key name="job_0076">[ Error ] - Failed to load and parse the install manifest. Details will be found in the logs. Exiting, This is a fatal error.</key>
<key name="job_0077">The host name is already: [#!variable!host_name!#], no change needed.</key>
<key name="job_0078">Updating the network configuration for: [#!variable!interface!#].</key>
<key name="job_0079">Disconnected from all database(s). Will reconnect after the network configuration changes have taken effect.</key>
@ -2677,7 +2678,7 @@ Here we will inject 't_0006', which injects 't_0001' which has a variable: [#!st
<key name="warning_0043">[ Warning ] - The install manifest with the UUID: [#!variable!uuid!#] was not found.</key>
<key name="warning_0044">[ Warning ] - The install manifest: [#!variable!name!#] with the UUID: [#!variable!uuid!#] has already been deleted.</key>
<key name="warning_0045">[ Warning ] - The install manifest: [#!variable!name!#] with the UUID: [#!variable!uuid!#] was NOT deleted. The reason may be in the: [#!data!path::log::main!#] log file on this host.</key>
<key name="warning_0046">[ Warning ] - The Install Manifest with the UUID: [#!variable!uuid!#] was not found.</key>
<key name="warning_0046">[ Warning ] - The install manifest with the UUID: [#!variable!uuid!#] was not found.</key>
<key name="warning_0047">[ Warning ] - The password to set for this #!string!brand_0006!# was not set.</key>
<key name="warning_0048">[ Warning ] - The password verification was not set.</key>
<key name="warning_0049">[ Warning ] - The passwords do not match.</key>
@ -2793,6 +2794,7 @@ Read UUID: .... [#!variable!read_uuid!#]
<key name="warning_0118">[ Warning ] - The 'admin' group was created as a system group with the group ID: [#!variable!gid!#].</key>
<key name="warning_0119">[ Warning ] - The 'admin' user was created with the user ID: [#!variable!uid!#].</key>
<key name="warning_0120">[ Warning ] - Timed out waiting for the database: [#!variable!uuid!#] to become available.</key>
<key name="warning_0121">[ Warning ] - The Anvil! with the UUID: [#!variable!uuid!#] was not found. Exiting, will re-run the anvil-join-anvil job again in a few moments.</key>
<!-- The entries below here are not sequential, but use a key to find the entry. -->
<!-- Run 'striker-parse-os-list to find new entries. -->

@ -1912,19 +1912,20 @@ sub load_job
$machine = "" if not defined $machine;
$manifest_uuid = "" if not defined $manifest_uuid;
$anvil_uuid = "" if not defined $anvil_uuid;
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
machine => $machine,
manifest_uuid => $manifest_uuid,
anvil_uuid => $anvil_uuid,
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => {
's1:job_data' => $anvil->data->{jobs}{job_data},
's2:machine' => $machine,
's3:manifest_uuid' => $manifest_uuid,
's4:anvil_uuid' => $anvil_uuid,
}});
if ((not $machine) or
(not $manifest_uuid) or
(not $anvil_uuid))
{
# Terminate the job entirely, it's likely an unrecoverable problem.
update_progress($anvil, 100, "job_0092,!!job-uuid!".$anvil->data->{switches}{'job-uuid'}."!!,!!raw!".$anvil->data->{jobs}{job_data}."!!");
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "job_0092", variables => {
# This is occasionally hit, but then works when tried again.
update_progress($anvil, 100, "error_0308,!!job-uuid!".$anvil->data->{switches}{'job-uuid'}."!!,!!raw!".$anvil->data->{jobs}{job_data}."!!");
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0308", variables => {
'job-uuid' => $anvil->data->{switches}{'job-uuid'},
raw => $anvil->data->{jobs}{job_data},
}});
@ -1941,34 +1942,41 @@ sub load_job
"sys::anvil_uuid" => $anvil->data->{sys}{anvil_uuid},
}});
# Load in the host info and the manifest.
# Load in the host, manifest and anvil data.
$anvil->Database->get_hosts();
$anvil->Database->get_manifests();
$anvil->Database->get_anvils();
# Parse the manifest
if (not exists $anvil->data->{manifests}{manifest_uuid}{$manifest_uuid})
{
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "warning_0046", variables => { uuid => $anvil->data->{sys}{manifest_uuid} }});
update_progress($anvil, 0, "warning_0046,!!uuid!".$anvil->data->{sys}{manifest_uuid}."!!");
sleep 10;
$anvil->nice_exit({exit_code => 2});
}
# Parse the manifest
my $problem = $anvil->Striker->load_manifest({manifest_uuid => $anvil->data->{sys}{manifest_uuid}});
$anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => {
manifest_uuid => $anvil->data->{sys}{manifest_uuid},
problem => $problem,
}});
if ($problem)
{
# Something went wrong
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "warning_0046", variables => { uuid => $anvil->data->{sys}{manifest_uuid} }});
update_progress($anvil, 0, "job_0076");
sleep 2;
# Something went wrong, fatally. Abort the job.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "job_0076", variables => { uuid => $anvil->data->{sys}{manifest_uuid} }});
update_progress($anvil, 100, "job_0076,!!uuid!".$anvil->data->{sys}{manifest_uuid}."!!");
$anvil->nice_exit({exit_code => 2});
}
# Load the manifest and anvil data.
$anvil->Database->get_anvils();
if ((not exists $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}) or (not exists $anvil->data->{manifests}{manifest_uuid}{$manifest_uuid}))
# Make sure we have a valid Anvil!
if (not exists $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid})
{
# Terminate the job entirely, it's likely an unrecoverable problem.
update_progress($anvil, 100, "job_0092,!!job-uuid!".$anvil->data->{switches}{'job-uuid'}."!!,!!raw!".$anvil->data->{jobs}{job_data}."!!");
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 2, key => "job_0092", variables => {
'job-uuid' => $anvil->data->{switches}{'job-uuid'},
raw => $anvil->data->{jobs}{job_data},
}});
sleep 2;
# Odd. Error out, the Anvil! might not be loaded yet.
$anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "warning_0121", variables => { uuid => $anvil->data->{sys}{anvil_uuid} }});
update_progress($anvil, 0, "warning_0121,!!uuid!".$anvil->data->{sys}{anvil_uuid}."!!");
sleep 10;
$anvil->nice_exit({exit_code => 5});
}

Loading…
Cancel
Save