From fb70836126a8831e61ab488a1e1111dc6005a8c1 Mon Sep 17 00:00:00 2001 From: digimer Date: Wed, 12 Apr 2023 22:25:24 -0400 Subject: [PATCH 01/12] This moves the call of anvil-safe-start out of scancore and into a new, dedicated systemd unit that runs on boot only. Signed-off-by: digimer --- anvil.spec.in | 2 +- tools/scancore | 20 ++------------------ units/Makefile.am | 1 + units/anvil-safe-start.service | 11 +++++++++++ 4 files changed, 15 insertions(+), 19 deletions(-) create mode 100644 units/anvil-safe-start.service diff --git a/anvil.spec.in b/anvil.spec.in index 18d73818..4f701a1b 100644 --- a/anvil.spec.in +++ b/anvil.spec.in @@ -388,7 +388,7 @@ fi %{_usr}/lib/ocf/resource.d/alteeve/server %files dr -# +# %changelog diff --git a/tools/scancore b/tools/scancore index 69b8f55b..82ed293b 100755 --- a/tools/scancore +++ b/tools/scancore @@ -429,33 +429,17 @@ sub startup_tasks }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { variable_uuid => $variable_uuid }}); - # If this is a node and we've been up for less than ten minutes, call anvil-safe-start as a - # background process. It will exit if it is disabled. + # This used to call anvil-safe-start, which isn't done here anymore. 
my $host_type = $anvil->Get->host_type(); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { host_type => $host_type }}); if ($host_type eq "node") { - my $uptime = $anvil->Get->uptime; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { uptime => $uptime }}); - if ($uptime < 600) - { - # Run it as a background task - my $shell_call = $anvil->data->{path}{exe}{'anvil-safe-start'}.$anvil->Log->switches; - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0210", variables => { command => $shell_call }}); - $anvil->System->call({shell_call => $shell_call, background => 1}); - } - else - { - # Log that we've been up too long to auto-start the cluster. - my $say_uptime = $anvil->Convert->time({'time' => $uptime}); - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "log_0620", variables => { uptime => $say_uptime }}); - } + # For future use. } elsif ($host_type eq "striker") { # We're a striker, so we're going to check for / remove transient database records on tables # that always grow (temperature, power, etc) and whose data loses value as it ages. - } return(0); diff --git a/units/Makefile.am b/units/Makefile.am index 442de927..6c518615 100644 --- a/units/Makefile.am +++ b/units/Makefile.am @@ -3,5 +3,6 @@ MAINTAINERCLEANFILES = Makefile.in servicedir = $(SYSTEMD_UNIT_DIR) dist_service_DATA = \ anvil-daemon.service \ + anvil-safe-start.service \ scancore.service \ striker-ui-api.service diff --git a/units/anvil-safe-start.service b/units/anvil-safe-start.service new file mode 100644 index 00000000..f4eac0bc --- /dev/null +++ b/units/anvil-safe-start.service @@ -0,0 +1,11 @@ +[Unit] +Description=Anvil! 
IA Platform - This service sanity checks and, if all is well, auto-starts the cluster and servers +Wants=network.target + +[Service] +ExecStart=/usr/sbin/anvil-safe-start +RemainAfterExit=true +Type=oneshot + +[Install] +WantedBy=multi-user.target From 025c2a6f547ea0c96353bdc5d09669a3cc97a675 Mon Sep 17 00:00:00 2001 From: digimer Date: Thu, 13 Apr 2023 00:26:32 -0400 Subject: [PATCH 02/12] * Updated Email->get_next_server() to ignore DELETED mail servers, and it now loads mail servers if not yet in memory. This resolves issue #306. Signed-off-by: digimer --- Anvil/Tools/Email.pm | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Anvil/Tools/Email.pm b/Anvil/Tools/Email.pm index 434b781c..4ec97fcd 100644 --- a/Anvil/Tools/Email.pm +++ b/Anvil/Tools/Email.pm @@ -335,6 +335,7 @@ sub get_current_server return($newest_mail_server_uuid); } + =head2 get_next_server When two or more mail servers are configured, this will return the C<< mail_server_uuid >> of the mail server used in the most distant past. If two or more mail servers have never been used before, a random unused server is returned. @@ -352,6 +353,11 @@ sub get_next_server my $debug = defined $parameter->{debug} ? $parameter->{debug} : 3; $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => $debug, key => "log_0125", variables => { method => "Email->get_next_server()" }}); + if (not exists $anvil->data->{mail_servers}{mail_server}) + { + $anvil->Database->get_mail_servers({debug => $debug}); + } + # If configured/running, the number of messages in queue is checked. If '0', # 'mail_server::queue_empty' is updated with the current time. If 1 or more, the time since the queue # was last 0 is checked. 
If > 300, the mail server is reconfigured to use the mail server with the @@ -360,11 +366,15 @@ sub get_next_server my $oldest_mail_server_uuid = ""; foreach my $mail_server_uuid (keys %{$anvil->data->{mail_servers}{mail_server}}) { - my $last_used = $anvil->data->{mail_servers}{mail_server}{$mail_server_uuid}{last_used}; + # HELO domain is 'DELETED' if the mail server is not used anymore + my $last_used = $anvil->data->{mail_servers}{mail_server}{$mail_server_uuid}{last_used}; + my $helo_domain = $anvil->data->{mail_servers}{mail_server}{$mail_server_uuid}{mail_server_helo_domain}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { mail_server_uuid => $mail_server_uuid, last_used => $last_used, + helo_domain => $helo_domain, }}); + next if $helo_domain eq "DELETED"; if ($last_used < $oldest_mail_server_time) { From c956f75406f7369141b7459f55d931fa68416764 Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 14 Apr 2023 10:18:12 -0400 Subject: [PATCH 03/12] Enabled anvil-safe-start in '%post node'. Signed-off-by: digimer --- anvil.spec.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anvil.spec.in b/anvil.spec.in index 4f701a1b..b468744f 100644 --- a/anvil.spec.in +++ b/anvil.spec.in @@ -302,6 +302,8 @@ then rm -f /etc/anvil/type.dr fi touch /etc/anvil/type.node +echo "Enabling anvil-safe-start" +systemctl enable --now anvil-safe-start.service %pre dr From fdf49c696f3b36414694b65d15179d9c0992946e Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 14 Apr 2023 12:23:21 -0400 Subject: [PATCH 04/12] Updated anvil-report-usage to ignore deleted servers. Also added a check to ensure hosts are loaded if not. 
Signed-off-by: digimer --- tools/anvil-report-usage | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/tools/anvil-report-usage b/tools/anvil-report-usage index da21c6e5..e0767431 100755 --- a/tools/anvil-report-usage +++ b/tools/anvil-report-usage @@ -625,22 +625,30 @@ sub collect_server_data $anvil->data->{longest}{ip_address} = 0; foreach my $server_uuid (sort {$a cmp $b} keys %{$anvil->data->{servers}{server_uuid}}) { - my $server_name = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_name}; - my $anvil_uuid = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_anvil_uuid}; - my $anvil_name = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_name}; - my $server_definition = $anvil->data->{server_definitions}{server_definition_server_uuid}{$server_uuid}{server_definition_xml}; - my $server_ram = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_ram_in_use}; + my $server_name = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_name}; + my $anvil_uuid = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_anvil_uuid}; + my $anvil_name = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_name}; + my $server_ram = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_ram_in_use}; + my $server_state = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_state}; if ($anvil->data->{servers}{server_uuid}{$server_uuid}{server_configured_ram} > $server_ram) { $server_ram = $anvil->data->{servers}{server_uuid}{$server_uuid}{server_configured_ram}; } my $say_server_ram = $anvil->Convert->bytes_to_human_readable({'bytes' => $server_ram}); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - 's1:server_name' => $server_name, - 's2:anvil_name' => $anvil_name, - 's3:server_ram' => $anvil->Convert->add_commas({number => $server_ram})." 
(".$say_server_ram.")", + 's1:server_uuid' => $server_uuid, + 's2:server_name' => $server_name, + 's3:anvil_uuid' => $anvil_uuid, + 's4:anvil_name' => $anvil_name, + 's5:server_state' => $server_state, + 's6:server_ram' => $anvil->Convert->add_commas({number => $server_ram})." (".$say_server_ram.")", }}); + next if $server_state eq "DELETED"; + + my $server_definition = $anvil->data->{server_definitions}{server_definition_server_uuid}{$server_uuid}{server_definition_xml}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { server_definition => $server_definition }}); + my $target = $anvil->Get->short_host_name; my $source = "from_db"; $anvil->Server->parse_definition({ @@ -722,6 +730,15 @@ sub collect_server_data # have a matching node name. my $node1_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node1_host_uuid}; my $node2_host_uuid = $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_node2_host_uuid}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { + 's1:node1_host_uuid' => $node1_host_uuid, + 's2:node2_host_uuid' => $node2_host_uuid, + }}); + + if (not exists $anvil->data->{hosts}{host_uuid}{$node1_host_uuid}) + { + $anvil->Database->get_hosts({debug => 2}); + } # Get names. my $node1_host_name = $anvil->data->{hosts}{host_uuid}{$node1_host_uuid}{host_name}; From f086c1be39d09b7f9d746ed8f746b08eb2fd99e6 Mon Sep 17 00:00:00 2001 From: digimer Date: Fri, 14 Apr 2023 13:02:50 -0400 Subject: [PATCH 05/12] Fixed a bug where the total RAM was shown instead of the free RAM. 
Signed-off-by: digimer --- tools/anvil-report-usage | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/anvil-report-usage b/tools/anvil-report-usage index e0767431..291fcdae 100755 --- a/tools/anvil-report-usage +++ b/tools/anvil-report-usage @@ -338,7 +338,7 @@ sub show_anvils { $first_line .= " | ".sprintf("%-${longest_ram_used}s", $ram_used_string); } - $first_line .= " | ".sprintf("%-${longest_ram_free}s", $ram_used_string); + $first_line .= " | ".sprintf("%-${longest_ram_free}s", $ram_free_string); $first_line .= " | ".sprintf("%-${longest_bridge_string}s", $bridge_string); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { first_line => $first_line }}); From 89eae7098e3277e46204504157a1ced947042f79 Mon Sep 17 00:00:00 2001 From: digimer Date: Mon, 17 Apr 2023 20:43:28 -0400 Subject: [PATCH 06/12] NOTE: This updates the reserved RAM to 8 GiB from 4 GiB! * Adds support for 'anvil_resources::ram::reserved' that can be set to a number of MiB to override the default 8192. * Adds support for 'anvil::::resources::ram::reserved' to allow for per-Anvil! node override on the reserved RAM default, and over the 'anvil_resources::ram::reserved' option. Signed-off-by: digimer --- Anvil/Tools/Get.pm | 89 ++++++++++++++++++++++++++++++++++++++++------ anvil.conf | 10 ++++++ share/words.xml | 2 ++ 3 files changed, 91 insertions(+), 10 deletions(-) diff --git a/Anvil/Tools/Get.pm b/Anvil/Tools/Get.pm index a7d3c7d5..2b80a9a4 100644 --- a/Anvil/Tools/Get.pm +++ b/Anvil/Tools/Get.pm @@ -689,34 +689,103 @@ ORDER BY } # Check if the reserved RAM is overriden by the config + my $default_reserved = 8192; + if (not exists $anvil->data->{anvil_resources}{ram}{reserved}) + { + $anvil->data->{anvil_resources}{ram}{reserved} = $default_reserved; + } + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "anvil_resources::ram::reserved" => $anvil->data->{anvil_resources}{ram}{reserved}." 
(".$anvil->Convert->bytes_to_human_readable({'bytes' => $anvil->data->{anvil_resources}{ram}{reserved}}).")", + }}); + + $anvil->data->{anvil_resources}{ram}{reserved} =~ s/,//g; + $anvil->data->{anvil_resources}{ram}{reserved} =~ s/\s//g; + $anvil->data->{anvil_resources}{ram}{reserved} =~ s/MiB$//i; + $anvil->data->{anvil_resources}{ram}{reserved} =~ s/MB$//i; + $anvil->data->{anvil_resources}{ram}{reserved} =~ s/M$//i; + if ((not $anvil->data->{anvil_resources}{ram}{reserved}) or ($anvil->data->{anvil_resources}{ram}{reserved} =~ /\D/)) + { + # Invalid value. + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0151", variables => { + was => $anvil->data->{anvil_resources}{ram}{reserved}, + set => $default_reserved, + }}); + $anvil->data->{anvil_resources}{ram}{reserved} = $default_reserved; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "anvil_resources::ram::reserved" => $anvil->data->{anvil_resources}{ram}{reserved}, + }}); + } + + #anvil::::resources::ram::reserved + if (exists $anvil->data->{anvil}{$anvil_uuid}{resources}{ram}{reserved}) + { + $anvil->data->{anvil}{$anvil_uuid}{resources}{ram}{reserved} =~ s/,//g; + $anvil->data->{anvil}{$anvil_uuid}{resources}{ram}{reserved} =~ s/\s//g; + $anvil->data->{anvil}{$anvil_uuid}{resources}{ram}{reserved} =~ s/MiB$//i; + $anvil->data->{anvil}{$anvil_uuid}{resources}{ram}{reserved} =~ s/MB$//i; + $anvil->data->{anvil}{$anvil_uuid}{resources}{ram}{reserved} =~ s/M$//i; + if ((not $anvil->data->{anvil_resources}{ram}{reserved}) or ($anvil->data->{anvil_resources}{ram}{reserved} =~ /\D/)) + { + # Invalid value. 
+ my $anvil_name = $anvil->Get->anvil_name_from_uuid({anvil_uuid => $anvil_uuid}); + $anvil_name = $anvil_uuid if not $anvil_name; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, level => 0, priority => "alert", key => "warning_0152", variables => { + anvil => $anvil_name, + was => $anvil->data->{anvil}{$anvil_uuid}{resources}{ram}{reserved}, + set => $anvil->data->{anvil_resources}{ram}{reserved}, + }}); + $anvil->data->{anvil}{$anvil_uuid}{resources}{ram}{reserved} = $anvil->data->{anvil_resources}{ram}{reserved}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "anvil::${anvil_uuid}::resources::ram::reserved" => $anvil->data->{anvil}{$anvil_uuid}{resources}{ram}{reserved}, + }}); + } + + if ($anvil->data->{anvil}{$anvil_uuid}{resources}{ram}{reserved}) + { + $anvil->data->{anvil_resources}{ram}{reserved} = $anvil->data->{anvil}{$anvil_uuid}{resources}{ram}{reserved}; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "anvil_resources::ram::reserved" => $anvil->data->{anvil_resources}{ram}{reserved}, + }}); + } + } + my $ram_reserved = $anvil->Convert->human_readable_to_bytes({ base2 => 1, - size => $anvil->data->{anvil_resources}{ram}{reserved}, + size => $anvil->data->{anvil_resources}{ram}{reserved}." MiB", }); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + ram_reserved => $ram_reserved." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $ram_reserved}).")", + }}); if (($ram_reserved eq "!!error!!") or - (not $ram_reserved) or + (not $ram_reserved) or ($ram_reserved < (2**30)) or ($ram_reserved > $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{hardware})) { - # The reserved RAM is invalid, so reset it. 
- $ram_reserved = 0; + # The reserved RAM is invalid, so reset it to 8 GiB + $ram_reserved = 8589934592; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + ram_reserved => $ram_reserved." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $ram_reserved}).")", + }}); } - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { - "anvil_resources::ram::reserved" => $ram_reserved." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $ram_reserved}).")", - }}); - # Take 4 GiB or what was provided by the config off the available RAM for the host - $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{reserved} = $ram_reserved ? $ram_reserved : (4*(2**30)); # Reserve 4 GiB by default or what's set in the config file. + $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{reserved} = $ram_reserved; $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{available} -= $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{reserved}; $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{available} -= $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{allocated}; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { "anvil_resources::${anvil_uuid}::ram::allocated" => $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{allocated}." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{allocated}}).")", "anvil_resources::${anvil_uuid}::ram::reserved" => $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{reserved}." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{reserved}}).")", "anvil_resources::${anvil_uuid}::ram::available" => $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{available}." 
(".$anvil->Convert->bytes_to_human_readable({'bytes' => $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{available}}).")", }}); + if ($anvil->data->{anvil_resources}{$anvil_uuid}{ram}{available} < 0) + { + $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{available} = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { + "anvil_resources::${anvil_uuid}::ram::available" => $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{available}." (".$anvil->Convert->bytes_to_human_readable({'bytes' => $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{available}}).")", + }}); + } + # process bridges now foreach my $bridge_name (sort {$a cmp $b} keys %{$anvil->data->{anvil_resources}{$anvil_uuid}{bridges}}) { diff --git a/anvil.conf b/anvil.conf index 06f55fc6..dff3be6a 100644 --- a/anvil.conf +++ b/anvil.conf @@ -193,6 +193,16 @@ striker::repo::extra-packages = # Setting this to '0' will disable auto-management of the firewall. sys::manage::firewall = 1 +# By default, Anvil! nodes reserve 8 GiB of RAM for host OS use. That is to say, if a node (pair) has 128GiB +# of physical RAM, 120 GiB will be available for allocation to servers. Generally, this should NOT be reduced +# as doing so could trigger oom-killer (out of memory killer) to terminate servers. This is an integer +# representing the reserved RAM in MiB +# NOTE: Minimum is 1024, but really, never go below 4096 unless you _really_ know what you're doing. +#anvil_resources::ram::reserved = 8192 + +# If you wish to alter the amount of reserved RAM for a single Anvil! node, you can use: +#anvil::<anvil_uuid>::resources::ram::reserved = 8192 + ### Server related options # This is the "short list" of servers shown when provisioning a new server. 
To see the full list of options, diff --git a/share/words.xml b/share/words.xml index 3b9d0fe5..1dc8fd53 100644 --- a/share/words.xml +++ b/share/words.xml @@ -3584,6 +3584,8 @@ The error was: [ Warning ] - The IPMI stonith resource: [#!variable!resource!#] is in the role: [#!variable!role!#] (should be 'Started'). Will check the IPMI config now. [ Warning ] - Failed to find a valid IP address or password to be used to setup the DR host's IPMI. [ Warning ] - The test "fail file": [#!variable!fail_file!#] was found. So long as this file exists, the ocf:alteeve:server RA will return 'OCF_ERR_GENERIC' (exit code 1). Delete the file to resume normal operation. + [ Warning ] - The configured reserved RAM was set to: [#!variable!was!#], which appears invalid. It must be an integer value representing the amount of RAM to reserve, in MiB. The reserved RAM is being set to: [#!variable!was!#]. + [ Warning ] - The configured reserved RAM was set to: [#!variable!was!#], which appears invalid. It must be an integer value representing the amount of RAM to reserve, in MiB. The reserved RAM is being set to: [#!variable!was!#]. From 83a527f4fa9137eea6c0e1e0befb76bac1eb61e9 Mon Sep 17 00:00:00 2001 From: digimer Date: Tue, 18 Apr 2023 11:18:42 -0400 Subject: [PATCH 07/12] * Removed enabling anvil-safe-start out of the RPM and into anvil-join-anvil. Signed-off-by: digimer --- anvil.spec.in | 2 -- share/words.xml | 1 + tools/anvil-join-anvil | 9 +++++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/anvil.spec.in b/anvil.spec.in index b468744f..4f701a1b 100644 --- a/anvil.spec.in +++ b/anvil.spec.in @@ -302,8 +302,6 @@ then rm -f /etc/anvil/type.dr fi touch /etc/anvil/type.node -echo "Enabling anvil-safe-start" -systemctl enable --now anvil-safe-start.service %pre dr diff --git a/share/words.xml b/share/words.xml index 1dc8fd53..d44b7617 100644 --- a/share/words.xml +++ b/share/words.xml @@ -1531,6 +1531,7 @@ Note: This is a permanent action! 
If you protect this server again later, a full --install-media - File name or file UUID. Available discs are:]]> File name: [#!variable!name!#], file UUID: [#!variable!uuid!#], size: [#!variable!size!#]]]> + Enabling the anvil-safe-start daemon. Starting: [#!variable!program!#]. diff --git a/tools/anvil-join-anvil b/tools/anvil-join-anvil index 6f968345..c9e683cd 100755 --- a/tools/anvil-join-anvil +++ b/tools/anvil-join-anvil @@ -66,6 +66,15 @@ configure_pacemaker($anvil); # Configure DRBD configure_drbd($anvil); +# Enable anvil-safe-start +if (1) +{ + my ($return_code) = $anvil->System->enable_daemon({daemon => "anvil-safe-start.service"}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { return_code => $return_code }}); + update_progress($anvil, (99, "job_0094,!!daemon!anvil-safe-start.service!!"); + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0094", variables => { daemon => "anvil-safe-start.service" }}); +} + update_progress($anvil, 100, "job_0128"); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0128"}); From 8ba613952cf20a382d5b203ef44aaa8ad3b63508 Mon Sep 17 00:00:00 2001 From: digimer Date: Tue, 18 Apr 2023 12:32:52 -0400 Subject: [PATCH 08/12] Typo fix. 
Signed-off-by: digimer --- tools/anvil-join-anvil | 4 ++-- tools/anvil-manage-dr | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/anvil-join-anvil b/tools/anvil-join-anvil index c9e683cd..e3b5867b 100755 --- a/tools/anvil-join-anvil +++ b/tools/anvil-join-anvil @@ -70,8 +70,8 @@ configure_drbd($anvil); if (1) { my ($return_code) = $anvil->System->enable_daemon({daemon => "anvil-safe-start.service"}); - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 3, list => { return_code => $return_code }}); - update_progress($anvil, (99, "job_0094,!!daemon!anvil-safe-start.service!!"); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { return_code => $return_code }}); + update_progress($anvil, 99, "job_0094,!!daemon!anvil-safe-start.service!!"); $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 1, key => "job_0094", variables => { daemon => "anvil-safe-start.service" }}); } diff --git a/tools/anvil-manage-dr b/tools/anvil-manage-dr index b28d93dc..9a10b28b 100755 --- a/tools/anvil-manage-dr +++ b/tools/anvil-manage-dr @@ -226,6 +226,7 @@ sub handle_links dr_link_note => "user-created", }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { returned_dr_link_uuid => $returned_dr_link_uuid }}); + print "\n".$anvil->Words->string({key => "log_0734", variables => { host => $anvil->data->{hosts}{host_uuid}{$host_uuid}{short_host_name}, anvil => $anvil->data->{anvils}{anvil_uuid}{$anvil_uuid}{anvil_name}, From 0874ad571a294d1c300858153a4d0a513f46e0f9 Mon Sep 17 00:00:00 2001 From: digimer Date: Tue, 18 Apr 2023 14:33:58 -0400 Subject: [PATCH 09/12] Updated anvil-safe-start to not give up on starting corosync/pacemaker if it fails on the first try. 
Signed-off-by: digimer --- share/words.xml | 9 +++++++ tools/anvil-safe-start | 55 +++++++++++++++++++++++++++++++++--------- 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/share/words.xml b/share/words.xml index d44b7617..c93548d5 100644 --- a/share/words.xml +++ b/share/words.xml @@ -386,6 +386,7 @@ The attempt to start the servers appears to have failed. The return code '0' was ==== #!variable!output!# ==== +We're done waiting, exiting out. ' or '--server-uuid .]]> Could not find the server: [#!variable!server!#] on this Anvil! in the database. @@ -3587,6 +3588,14 @@ The error was: [ Warning ] - The test "fail file": [#!variable!fail_file!#] was found. So long as this file exists, the ocf:alteeve:server RA will return 'OCF_ERR_GENERIC' (exit code 1). Delete the file to resume normal operation. [ Warning ] - The configured reserved RAM was set to: [#!variable!was!#], which appears invalid. It must be an integer value representing the amount of RAM to reserve, in MiB. The reserved RAM is being set to: [#!variable!was!#]. [ Warning ] - The configured reserved RAM was set to: [#!variable!was!#], which appears invalid. It must be an integer value representing the amount of RAM to reserve, in MiB. The reserved RAM is being set to: [#!variable!was!#]. + +The attempt to start the servers appears to have failed. The return code '0' was expected, but: [#!variable!return_code!#] was received. The output was: +==== +#!variable!output!# +==== +We will wait: [#!variable!waiting!#] seconds and then try again. We'll give up if it keeps failing after: [#!variable!time_left!#] seconds. + + diff --git a/tools/anvil-safe-start b/tools/anvil-safe-start index cb4f50d2..d04e7ffe 100755 --- a/tools/anvil-safe-start +++ b/tools/anvil-safe-start @@ -284,22 +284,55 @@ sub start_pacemaker ### TODO: A lot more testing is needed for degraded single-node start later. ### Should we use --all, or wait for our peer? For now, we wait. 
- #my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start --all"; - my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); - my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + ### NOTE: This can be racy during initial setup, calling the start before /etc/hosts is + ### populated. So this watches for that corner case. + my $wait_until = time + 120; + my $waiting = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { - output => $output, - return_code => $return_code, + wait_until => $wait_until, + waiting => $waiting, }}); - if ($return_code) + while($waiting) { - # What?! Fail out, we're done. - $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0256", variables => { + #my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start --all"; + my $shell_call = $anvil->data->{path}{exe}{pcs}." cluster start"; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { shell_call => $shell_call }}); + my ($output, $return_code) = $anvil->System->call({shell_call => $shell_call}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { output => $output, return_code => $return_code, }}); - $anvil->nice_exit({exit_code => 1}); + if ($return_code) + { + # Are we done waiting? + if (time > $wait_until) + { + # We're done. 
+ $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "err", key => "error_0256", variables => { + output => $output, + return_code => $return_code, + }}); + $anvil->nice_exit({exit_code => 1}); + } + else + { + # Report the error and sleep + my $time_left = $wait_until - time; + $anvil->Log->entry({source => $THIS_FILE, line => __LINE__, 'print' => 1, level => 0, priority => "alert", key => "warning_0153", variables => { + output => $output, + return_code => $return_code, + time_left => $time_left, + waiting => 10, + }}); + sleep 10; + } + } + else + { + # Success! + $waiting = 0; + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { waiting => $waiting }}); + } } ### TODO: We may implement the logic to fence our peer (similar to cman's post_join_delay' @@ -309,7 +342,7 @@ sub start_pacemaker # the peer and, if the fence succeeds, unblock quorum. my $start_time = time; my $wait_for_peer = $start_time + 120; - my $waiting = 1; + $waiting = 1; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { start_time => $start_time, wait_for_peer => $wait_for_peer, From bd575c6a7d3a2f15b5cbad28625ae5e39c8d3202 Mon Sep 17 00:00:00 2001 From: digimer Date: Tue, 18 Apr 2023 19:02:51 -0400 Subject: [PATCH 10/12] Bumped logging for storage group management. 
Signed-off-by: digimer --- Anvil/Tools/Cluster.pm | 2 +- tools/anvil-provision-server | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/Anvil/Tools/Cluster.pm b/Anvil/Tools/Cluster.pm index 16c39af1..76c2cf03 100644 --- a/Anvil/Tools/Cluster.pm +++ b/Anvil/Tools/Cluster.pm @@ -502,7 +502,7 @@ ORDER BY } } my $storage_group_member_uuid = $anvil->Database->insert_or_update_storage_group_members({ - debug => $debug, + debug => 2, storage_group_member_storage_group_uuid => $storage_group_uuid, storage_group_member_host_uuid => $host_uuid, storage_group_member_vg_uuid => $storage_group_member_vg_uuid, diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index fce6076f..97dea740 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -2136,7 +2136,10 @@ sub interactive_ask_server_cpu { my $anvil_uuid = $anvil->data->{new_server}{anvil_uuid}; $anvil->Database->get_anvils(); - $anvil->Get->available_resources({anvil_uuid => $anvil_uuid}); + $anvil->Get->available_resources({ + debug => 2, + anvil_uuid => $anvil_uuid, + }); my $default_cpu = $anvil->data->{switches}{cpu}; if (not $default_cpu) @@ -2201,7 +2204,10 @@ sub interactive_ask_server_ram { my $anvil_uuid = $anvil->data->{new_server}{anvil_uuid}; $anvil->Database->get_anvils(); - $anvil->Get->available_resources({anvil_uuid => $anvil_uuid}); + $anvil->Get->available_resources({ + debug => 2, + anvil_uuid => $anvil_uuid, + }); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { "anvil_resources::${anvil_uuid}::ram::available" => $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{available}." 
(".$anvil->Convert->bytes_to_human_readable({'bytes' => $anvil->data->{anvil_resources}{$anvil_uuid}{ram}{available}}).")", }}); @@ -2299,7 +2305,10 @@ sub interactive_ask_server_storage_group { my $anvil_uuid = $anvil->data->{new_server}{anvil_uuid}; $anvil->Database->get_anvils(); - $anvil->Get->available_resources({debug => 2, anvil_uuid => $anvil_uuid}); + $anvil->Get->available_resources({ + debug => 2, + anvil_uuid => $anvil_uuid, + }); # I need a list of Storage groups, my $say_ram = $anvil->Convert->bytes_to_human_readable({"bytes" => $anvil->data->{new_server}{ram}}); @@ -2409,7 +2418,10 @@ sub interactive_ask_server_storage_size $anvil->Database->get_anvils(); $anvil->Database->get_storage_group_data(); - $anvil->Get->available_resources({debug => 2, anvil_uuid => $anvil_uuid}); + $anvil->Get->available_resources({ + debug => 2, + anvil_uuid => $anvil_uuid, + }); $vg_size = $anvil->data->{anvil_resources}{$anvil_uuid}{storage_group}{$storage_group_uuid}{vg_size}; $vg_free = $anvil->data->{anvil_resources}{$anvil_uuid}{storage_group}{$storage_group_uuid}{free_size}; @@ -2866,7 +2878,10 @@ sub interactive_ask_server_confirm my $anvil_uuid = $anvil->data->{new_server}{anvil_uuid}; $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => { anvil_uuid => $anvil_uuid }}); - $anvil->Get->available_resources({anvil_uuid => $anvil_uuid, debug => 2}); + $anvil->Get->available_resources({ + debug => 2, + anvil_uuid => $anvil_uuid, + }); $anvil->data->{new_server}{name} = ""; $anvil->data->{new_server}{uuid} = ""; From dc7b909bfca2548daf7d17cc9b9084992a87e541 Mon Sep 17 00:00:00 2001 From: digimer Date: Tue, 18 Apr 2023 19:14:59 -0400 Subject: [PATCH 11/12] More logging to debug storage group race condition Signed-off-by: digimer --- Anvil/Tools/Database.pm | 8 ++++---- notes | 2 ++ tools/anvil-manage-dr | 2 +- tools/anvil-manage-storage-groups | 2 +- tools/anvil-provision-server | 2 +- tools/anvil-report-usage | 5 ++++- 6 files changed, 13 
insertions(+), 8 deletions(-) diff --git a/Anvil/Tools/Database.pm b/Anvil/Tools/Database.pm index a05d6a43..1e842e1d 100644 --- a/Anvil/Tools/Database.pm +++ b/Anvil/Tools/Database.pm @@ -6051,11 +6051,11 @@ WHERE }}); my $query = "DELETE FROM history.storage_group_members WHERE storage_group_member_uuid = ".$anvil->Database->quote($storage_group_member_uuid).";"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { query => $query }}); $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__}); $query = "DELETE FROM storage_group_members WHERE storage_group_member_uuid = ".$anvil->Database->quote($storage_group_member_uuid).";"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { query => $query }}); $anvil->Database->write({query => $query, source => $THIS_FILE, line => __LINE__}); } } @@ -13948,11 +13948,11 @@ SET WHERE storage_group_member_uuid = ".$anvil->Database->quote($storage_group_member_uuid)." ;"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { query => $query }}); $anvil->Database->write({uuid => $uuid, query => $query, source => $file ? $file." -> ".$THIS_FILE : $THIS_FILE, line => $line ? $line." 
-> ".__LINE__ : __LINE__}); $query = "DELETE FROM storage_group_members WHERE storage_group_member_uuid = ".$anvil->Database->quote($storage_group_member_uuid).";"; - $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => $debug, list => { query => $query }}); + $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 1, list => { query => $query }}); $anvil->Database->write({uuid => $uuid, query => $query, source => $file ? $file." -> ".$THIS_FILE : $THIS_FILE, line => $line ? $line." -> ".__LINE__ : __LINE__}); } } diff --git a/notes b/notes index e302d858..628d8f1b 100644 --- a/notes +++ b/notes @@ -16,6 +16,8 @@ Common queries; * SELECT a.job_uuid, b.host_name, a.job_command, a.job_data, a.job_progress, a.job_status FROM jobs a, hosts b WHERE a.job_host_uuid = b.host_uuid AND a.job_progress != 100; * SELECT a.host_name, b.file_name, c.file_location_active FROM hosts a, files b, file_locations c WHERE a.host_uuid = c.file_location_host_uuid AND b.file_uuid = c.file_location_file_uuid ORDER BY b.file_name ASC, a.host_name ASC; * SELECT a.dr_link_uuid, b.host_name, c.anvil_name, a.dr_link_note FROM dr_links a, hosts b, anvils c WHERE a.dr_link_host_uuid = b.host_uuid AND a.dr_link_anvil_uuid = c.anvil_uuid ORDER BY c.anvil_name ASC, b.host_name ASC; +* SELECT a.storage_group_uuid, d.storage_group_member_uuid, b.anvil_name, a.storage_group_name, c.host_name, d.storage_group_member_vg_uuid, d.storage_group_member_note FROM storage_groups a, anvils b, hosts c, storage_group_members d WHERE a.storage_group_uuid = d.storage_group_member_storage_group_uuid AND a.storage_group_anvil_uuid = b.anvil_uuid AND c.host_uuid = d.storage_group_member_host_uuid ORDER BY a.storage_group_name ASC, c.host_name ASC; + # Fail a resource for testing purposes. 
crm_resource --fail --resource srv02-b -N vm-a01n01 diff --git a/tools/anvil-manage-dr b/tools/anvil-manage-dr index 9a10b28b..ec9c0ce2 100755 --- a/tools/anvil-manage-dr +++ b/tools/anvil-manage-dr @@ -534,7 +534,7 @@ sub sanity_check # Get the Anvil! details. $anvil->Database->get_hosts(); $anvil->Database->get_anvils(); - $anvil->Database->get_storage_group_data(); + $anvil->Database->get_storage_group_data({debug => 2}); $anvil->Database->get_dr_links({debug => 2}); # Does this Anvil! have at least one DR node? If there's only one, use it. If more than one, we need diff --git a/tools/anvil-manage-storage-groups b/tools/anvil-manage-storage-groups index 12dc6a34..62500ca2 100755 --- a/tools/anvil-manage-storage-groups +++ b/tools/anvil-manage-storage-groups @@ -60,7 +60,7 @@ if (not $anvil->data->{sys}{database}{connections}) $anvil->Database->get_hosts({include_deleted => 1}); $anvil->Database->get_anvils(); -$anvil->Database->get_storage_group_data(); +$anvil->Database->get_storage_group_data({debug => 2}); get_vg_data($anvil); get_storage_data($anvil); diff --git a/tools/anvil-provision-server b/tools/anvil-provision-server index 97dea740..3ccee7dc 100755 --- a/tools/anvil-provision-server +++ b/tools/anvil-provision-server @@ -2417,7 +2417,7 @@ sub interactive_ask_server_storage_size }}); $anvil->Database->get_anvils(); - $anvil->Database->get_storage_group_data(); + $anvil->Database->get_storage_group_data({debug => 2}); $anvil->Get->available_resources({ debug => 2, anvil_uuid => $anvil_uuid, diff --git a/tools/anvil-report-usage b/tools/anvil-report-usage index 291fcdae..d158a0b1 100755 --- a/tools/anvil-report-usage +++ b/tools/anvil-report-usage @@ -116,7 +116,10 @@ sub collect_anvil_data }}); } - $anvil->Get->available_resources({anvil_uuid => $anvil_uuid}); + $anvil->Get->available_resources({ + debug => 2, + anvil_uuid => $anvil_uuid, + }); my $cpu_cores = $anvil->data->{anvil_resources}{$anvil_uuid}{cpu}{cores}; my $cpu_threads = 
$anvil->data->{anvil_resources}{$anvil_uuid}{cpu}{threads}; my $say_cpu = $anvil->Words->string({key => "message_0289", variables => { From c11be1ad1a69b3dc8f43176c96f529e09d065319 Mon Sep 17 00:00:00 2001 From: digimer Date: Wed, 19 Apr 2023 12:36:05 -0400 Subject: [PATCH 12/12] Added a skip to ignore dot files when looking at new files. Signed-off-by: digimer --- tools/anvil-manage-files | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/anvil-manage-files b/tools/anvil-manage-files index f98fd8ea..81cf7d69 100755 --- a/tools/anvil-manage-files +++ b/tools/anvil-manage-files @@ -554,6 +554,9 @@ sub check_incoming full_path => $full_path, }}); + # Skip dot-files; they're usually files being uploaded. + next if $file_name =~ /^\./; + # Do I know about this file? If so, is the file the same size? If either is no, calculate the md5sum. my ($file_uuid, $recorded_size, $recorded_mtime, $recorded_md5sum) = get_file_db_info($anvil, "", $file_name); $anvil->Log->variables({source => $THIS_FILE, line => __LINE__, level => 2, list => {