* SELECT a.job_uuid, b.host_name, a.job_command, a.job_data, a.job_progress, a.job_status FROM jobs a, hosts b WHERE a.job_host_uuid = b.host_uuid AND a.job_progress != 100;
* SELECT a.dr_link_uuid, b.host_name, c.anvil_name, a.dr_link_note FROM dr_links a, hosts b, anvils c WHERE a.dr_link_host_uuid = b.host_uuid AND a.dr_link_anvil_uuid = c.anvil_uuid ORDER BY c.anvil_name ASC, b.host_name ASC;
* SELECT a.storage_group_uuid, d.storage_group_member_uuid, b.anvil_name, a.storage_group_name, c.host_name, d.storage_group_member_vg_uuid, d.storage_group_member_note FROM storage_groups a, anvils b, hosts c, storage_group_members d WHERE a.storage_group_uuid = d.storage_group_member_storage_group_uuid AND a.storage_group_anvil_uuid = b.anvil_uuid AND c.host_uuid = d.storage_group_member_host_uuid ORDER BY a.storage_group_name ASC, c.host_name ASC;
* SELECT a.scan_hardware_uuid, b.host_name, a.scan_hardware_cpu_cores AS cores, a.scan_hardware_cpu_threads AS threads, pg_size_pretty(a.scan_hardware_ram_total) AS ram_total, pg_size_pretty(a.scan_hardware_memory_total) AS memory_total, pg_size_pretty(a.scan_hardware_memory_free) AS memory_free FROM scan_hardware a, hosts b WHERE a.scan_hardware_host_uuid = b.host_uuid ORDER BY b.host_name ASC;
* SELECT a.scan_apc_ups_name AS name, a.scan_apc_ups_serial_number AS sn, a.scan_apc_ups_health AS health, a.scan_apc_ups_nmc_serial_number AS nmc_sn, a.scan_apc_ups_nmc_mac_address AS mac, a.scan_apc_ups_ip AS ip, b._percentage_charge AS charge, d.scan_apc_ups_battery_temperature AS btemp FROM scan_apc_upses a, scan_apc_ups_input b, scan_apc_ups_output c, scan_apc_ups_batteries d WHERE a.scan_apc_ups_uuid = b.scan_apc_ups_input_scan_apc_ups_uuid AND a.scan_apc_ups_uuid = c.scan_apc_ups_output_scan_apc_ups_uuid AND a.scan_apc_ups_uuid = d.scan_apc_ups_battery_scan_apc_ups_uuid ORDER BY name ASC;
* SELECT b.host_name, a.network_interface_uuid, a.network_interface_mac_address AS mac, a.network_interface_name AS name, a.network_interface_speed AS speed, a.network_interface_link_state AS link, a.network_interface_operational AS op, a.network_interface_duplex AS duplex, a.network_interface_medium AS medium, a.network_interface_bond_uuid AS bond_uuid, a.network_interface_bridge_uuid AS bridge_uuid FROM network_interfaces a, hosts b WHERE a.network_interface_host_uuid = b.host_uuid AND b.host_name LIKE 'an-a02%' AND a.network_interface_operational != 'DELETED' ORDER BY b.host_name ASC, a.network_interface_name ASC;
* SELECT b.host_name, a.bond_uuid, a.bond_name, a.bond_mode, a.bond_mtu AS mtu, a.bond_primary_interface AS primary, a.bond_active_interface AS active, a.bond_mac_address AS mac, a.bond_operational AS op, c.bridge_name, a.modified_date FROM bonds a, hosts b, bridges c WHERE a.bond_host_uuid = b.host_uuid AND a.bond_bridge_uuid = c.bridge_uuid AND (b.host_uuid = 'b4e46faf-0ebe-e211-a0d6-00262d0ca874' OR b.host_uuid = '4ba42b4e-9bf7-e311-a889-899427029de4') ORDER BY b.host_name ASC, a.bond_name ASC;
* SELECT b.host_name, a.bridge_uuid, a.bridge_name, a.bridge_id, a.bridge_mtu FROM bridges a, hosts b WHERE a.bridge_host_uuid = b.host_uuid AND b.host_name LIKE 'an-a02%' ORDER BY b.host_name ASC, a.bridge_name ASC;
* SELECT a.host_name, b.file_name, c.file_location_active AS active, c.file_location_ready AS ready FROM hosts a, files b, file_locations c WHERE a.host_uuid = c.file_location_host_uuid AND b.file_uuid = c.file_location_file_uuid ORDER BY b.file_name ASC, a.host_name ASC;
* SELECT b.host_name, a.health_agent_name, a.health_source_name, a.health_source_weight FROM health a, hosts b WHERE b.host_uuid = a.health_host_uuid AND b.host_name LIKE 'an-a02%' ORDER BY b.host_name ASC, a.health_agent_name ASC, a.health_source_weight ASC;
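Any of these can be run non-interactively with psql; a minimal sketch (the database name 'anvil' and user 'admin' are assumptions here; substitute whatever the local config actually uses):
* psql -U admin -d anvil -c "SELECT job_uuid, job_command, job_progress FROM jobs WHERE job_progress != 100;"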
# Remove all test 'deploy' LVs and their DRBD config files, then verify:
for lv in $(lvscan | grep deploy | awk '{print $2}' | sed "s/'//g"); do lvremove -y $lv; done; rm -f /etc/drbd.d/an-test-deploy*; lvscan; ls -lah /etc/drbd.d/
- How to write a NetworkManager dispatcher script to apply ethtool commands? (see the sketch after this list) - https://access.redhat.com/solutions/2841131
- Set up nodes to log to striker? - https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/networking_guide/sec-configuring_netconsole
- Pacemaker can be monitored via SNMP - https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/high_availability_add-on_reference/s1-snmpandpacemaker-HAAR
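A minimal sketch of a dispatcher script for the ethtool question above (the file name, interface and ring sizes are placeholder assumptions; the script must be owned by root and executable):

#!/bin/bash
# Example path: /etc/NetworkManager/dispatcher.d/30-ethtool
# NetworkManager calls dispatcher scripts with the interface as $1 and the
# action (up, down, etc.) as $2.
IFACE="$1"
ACTION="$2"
if [ "${ACTION}" = "up" ] && [ "${IFACE}" = "em1" ]; then
    # 'em1' and the ring sizes are examples only; adjust as needed.
    /usr/sbin/ethtool -G "${IFACE}" rx 4096 tx 4096
fi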
Changes made using tools such as nmcli do not require a reload but do require the associated interface to be put down and then up again. That can be done by using commands in the following format:
* nmcli dev disconnect interface-name
Followed by:
* nmcli con up interface-name
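For example, to bounce a hypothetical 'bcn1_link1' (assuming the device and its connection share the name):
* nmcli dev disconnect bcn1_link1
* nmcli con up bcn1_link1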
NOTE: RHEL doesn't support direct-cabled bonds - https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/networking_guide/ch-configure_network_bonding
ifcfg-X config notes - see /usr/share/doc/initscripts-*/sysconfig.txt (look for the sections describing the files /etc/sysconfig/network and /etc/sysconfig/network-scripts/ifcfg-<interface-name>).
* PREFIXx overrules NETMASKx. Use PREFIXx, not NETMASKx.
* The 'x' suffixes for PREFIX, NETMASK, etc. start at 0 and must count up by 1 at a time.
* ZONE will be useful for the firewall stuff later.
* ETHTOOL_OPTS is deprecated; it has been replaced by udev rules (see also the dispatcher-script link above).
* initscripts interpret PEERDNS=no to mean "never touch resolv.conf". NetworkManager interprets it to say "never add automatic (DHCP, PPP, VPN, etc.) nameservers to resolv.conf".
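A minimal ifcfg sketch pulling these notes together (the device name, addresses and zone are placeholder assumptions):

# Example: /etc/sysconfig/network-scripts/ifcfg-bcn1_link1
DEVICE=bcn1_link1
BOOTPROTO=none
ONBOOT=yes
# PREFIXx, not NETMASKx; the 'x' suffixes count up from 0 by 1.
IPADDR0=10.201.10.1
PREFIX0=16
IPADDR1=10.201.11.1
PREFIX1=16
# initscripts: never touch resolv.conf. NM: never add automatic nameservers.
PEERDNS=no
# For the firewall stuff later.
ZONE=BCN1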
Bond
* resend_igmp & num_unsol_na={1..255} may help if a switch is slow to notice that traffic has moved to the new interface. The default is 1. Each update is sent 200ms apart.
* Bridged interfaces should use BRIDGE_UUID="", _not_ BRIDGE="". The former causes the latter to be ignored; the latter is kept only for compatibility reasons.
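Sketch of the relevant ifcfg lines for a bridged bond (the names, values and UUID are placeholders):

# Example bond config fragment:
DEVICE=bcn1_bond1
BONDING_OPTS="mode=active-backup primary=bcn1_link1 resend_igmp=2"
# Point at the bridge connection's UUID, per the note above:
BRIDGE_UUID="<uuid-of-the-bridge-connection>"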
* Resource names can contain any US-ASCII character, except for spaces.
* A resource is a single replication stream for 1 or more volumes; max 65,535 volumes per resource.
* DRBD does, however, ship with an LVM integration facility that automates the creation of LVM snapshots immediately before synchronization. This ensures that a consistent copy of the data is always available on the peer, even while synchronization is running. See Using automated LVM snapshots during DRBD synchronization for details on using this facility.
* Checksum-based synchronization computes a block's hash on source and target and skips the block if they match, possibly making resync much faster for blocks rewritten with the same data, but at the cost of CPU. Make this a user-configurable option under the advanced tab (see the drbd.conf sketch after these notes).
* Suspended replication allows a congested replication link to suspend replication, leaving the peer in a consistent state but allowing the primary to "pull ahead". When the congestion passes, the delta is resynced. Make this a user-configurable option with scary warnings.
* Online verification can (should?) be run periodically on the server host (the verification source will overwrite deltas on the verification target). Perhaps schedule it to run once/month? Do resources sequentially, as this places a CPU load on the nodes.
* Replication traffic integrity checking uses one of the kernel's available crypto digests to verify data integrity on transmission to the peer. If a replicated block cannot be verified against the digest, the connection is dropped and immediately re-established; because of the bitmap, the typical result is a retransmission.
** Make an option in the advanced tab. Test to see how much overhead this adds. Choose the lowest-overhead algorithm (within reason).
* Support for disk flushes might be something we want to disable, as it seems to force write-through even with a functioning FBWC/BBU. Need to test.
* Note; "Inconsistent" is almost always useless. "Consistent" and "Outdated" are able to be used safely, just without whatever happened on the peer after.
* Truck based replication, also known as disk shipping, is a means of preseeding a remote site with data to be replicated, by physically shipping storage media to the remote site.
* Make sure that selinux doesn't block DRBD comms over the SN
* See "5.15.1. Growing on-line" for growing a DRBD resource
** Shrinking online is ONLY possible if the metadata is external. Worth creating *_md LVs? Shrinking offline requires backing up and restoring the metadata.
- I wonder if you'd have the same results if you could get vol1 into an UpToDate/UpToDate state using the drbdsetup equivalent of drbdadm new-current-uuid --clear-bitmap <res>
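A drbd.conf-style sketch collecting the tunables from the notes above (checksum-based sync, traffic integrity, pull-ahead, online verify, disk flushes). The resource name, sizes and digest choices are placeholder assumptions; check each option against the DRBD version in use:

resource r0 {
    net {
        # Checksum-based synchronization (CPU for bandwidth trade):
        csums-alg sha1;
        # Replication traffic integrity checking:
        data-integrity-alg sha1;
        # Suspended replication / "pull ahead" on congested links:
        on-congestion pull-ahead;
        congestion-fill 1G;
        congestion-extents 1000;
        # Required before online verification can run:
        verify-alg sha1;
    }
    disk {
        # Candidates to disable with a functioning FBWC/BBU; test first!
        disk-flushes no;
        md-flushes no;
    }
}

# Online verify (run per-resource, sequentially):
drbdadm verify r0
# Growing on-line: extend the backing LV, then resize the resource:
lvextend -L +10G /dev/vg0/lv_srv01   # placeholder VG/LV names
drbdadm resize r0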
* A resource agent receives all configuration information about the resource it manages via environment variables. The names of these environment variables are always the name of the resource parameter, prefixed with OCF_RESKEY_. For example, if the resource has an ip parameter set to 192.168.1.1, then the resource agent will have access to an environment variable OCF_RESKEY_ip holding that value.
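For example, a minimal sketch of an agent reading that parameter (most OCF agents are shell scripts):

#!/bin/sh
# The resource's 'ip' parameter arrives as OCF_RESKEY_ip; the default here
# is for illustration only.
ip="${OCF_RESKEY_ip:-192.168.1.1}"
echo "Managing address: ${ip}"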
14:08 < lge> anyways, you can also "on-fail: retry"
OK, set the stop timeout to 60, set 'on-fail: block' and set the failure-timeout to 60, and see how pacemaker reacts.
failure-timeout
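A sketch of setting all three with pcs (the resource name 'srv01' is a placeholder):
* pcs resource update srv01 op stop timeout=60 on-fail=block meta failure-timeout=60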
===
Migrate servers;
- Let ScanCore set 'node-health' attribute (http://clusterlabs.org/pacemaker/doc/en-US/Pacemaker/1.1/html-single/Pacemaker_Explained/index.html#s-node-health)
- Set 'migration-limit' to '1' to enforce serial live migration (http://clusterlabs.org/pacemaker/doc/en-US/Pacemaker/1.1/html-single/Pacemaker_Explained/index.html#s-cluster-options).
Migrate a single server by setting a location constraint against the node we want the VM off of.
- If anything goes wrong, the server will enter a blocked state in pacemaker.
- Recovery needs to be 'unmanage -> clean' to avoid a stop call.
11:57 <@kgaillot> for your design, that sounds right. between cleanup and manage, i'd make sure there was a PE run without any pending actions blocked by the unmanaging -- you can either look at the logs on the DC, run "crm_simulate -SL", or just check the status for a bit
11:58 <@kgaillot> you can play around with it by putting a higher preference on the to-be-cleaned node, to make sure it *does* move when you re-manage. that way you can see what logs/simulate/status look like
12:07 <@kgaillot> i'm thinking if you do crm_resource --reprobe instead of cleanup in the above sequence, that should prevent anything unexpected
12:07 <@kgaillot> unmanage -> adjust preferences if needed -> reprobe resource -> wait for probe results to come back in, and if status looks good -> re-manage
12:08 <@kgaillot> the reprobe will wipe the entire resource history and fail counts for the resource, causing pacemaker to recheck the current status on all nodes. if the status then shows the resource running where you expect/want it, with no errors, then it's not going to do anything further
12:09 <@kgaillot> (in 2.0, cleanup only erases the history where the resource has failed, while reprobe erases the history regardless)
12:13 <@kgaillot> if there are no failures in the resource history, there should be no risk of a full stop. if there is no resource history at all, then after reprobe, there should be no risk of any actions (assuming you've set up location preferences and stickiness how you want them)
Recover from a failed migration;
reset location to prefer current host -> unmanage resource -> cleanup resource -> manage resource
(running on node 2, so re-add location constraint - basically, make sure location constraint favours current host)
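The same recovery sequence as pcs commands (the resource and node names are placeholders):
* pcs constraint location srv01 prefers an-a01n02
* pcs resource unmanage srv01
* pcs resource cleanup srv01
* pcs resource manage srv01
(per the discussion above, 'crm_resource --reprobe' can stand in for the cleanup step)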
# Command section – Refer to Chapter 2 for a list of kickstart options. You must include the required options.
### NOTE: The %packages, %pre, %pre-install, %post, %onerror, and %traceback sections are all required to be closed with %end
# Section 2:
# The %packages section – Refer to Chapter 3 for details.
# Section 3:
# The %pre, %pre-install, %post, %onerror, and %traceback sections – These sections can be in any order and are not required. Refer to Chapter 4, Chapter 5, and Chapter 6 for details.
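A minimal skeleton showing the section order and the required %end closings (the package set and commands are placeholders):

# Command section:
lang en_CA.UTF-8
rootpw --plaintext replace-me
# Section 2, the %packages section:
%packages
@core
%end
# Section 3, optional scripts:
%post
echo "kickstart complete" > /root/ks.done
%end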
====
0 root@pulsar:/var/lib/libvirt/images# dev_PATH=$(udevadm info /dev/sdb | grep -e ID_PATH=)
0 root@pulsar:/var/lib/libvirt/images# if [[ $dev_PATH == *"usb"* ]]; then echo "USB drive"; elif [[ $dev_PATH == *"nvme"* ]]; then echo "NVMe drive"; elif [[ $dev_PATH == *"ata"* ]]; then echo "SATA drive"; elif [[ $dev_PATH == *"scsi"* ]]; then echo "SCSI drive"; fi
USB drive
0 root@pulsar:/var/lib/libvirt/images# dev_PATH=$(udevadm info /dev/sda | grep -e ID_PATH=)
0 root@pulsar:/var/lib/libvirt/images# if [[ $dev_PATH == *"usb"* ]]; then echo "USB drive"; elif [[ $dev_PATH == *"nvme"* ]]; then echo "NVMe drive"; elif [[ $dev_PATH == *"ata"* ]]; then echo "SATA drive"; elif [[ $dev_PATH == *"scsi"* ]]; then echo "SCSI drive"; fi
SATA drive
0 root@pulsar:/var/lib/libvirt/images# dev_PATH=$(udevadm info /dev/nvme0n1 | grep -e ID_PATH=)
0 root@pulsar:/var/lib/libvirt/images# if [[ $dev_PATH == *"usb"* ]]; then echo "USB drive"; elif [[ $dev_PATH == *"nvme"* ]]; then echo "NVMe drive"; elif [[ $dev_PATH == *"ata"* ]]; then echo "SATA drive"; elif [[ $dev_PATH == *"scsi"* ]]; then echo "SCSI drive"; fi
NVMe drive
[root@localhost ~]# dev_PATH=$(udevadm info /dev/sda | grep -e ID_PATH=)
[root@localhost ~]# if [[ $dev_PATH == *"usb"* ]]; then echo "USB drive"; elif [[ $dev_PATH == *"nvme"* ]]; then echo "NVMe drive"; elif [[ $dev_PATH == *"ata"* ]]; then echo "SATA drive"; elif [[ $dev_PATH == *"scsi"* ]]; then echo "SCSI drive"; fi
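The same test, wrapped as a reusable function (sketch):

# Report a drive's transport type based on its udev ID_PATH.
drive_type() {
    local dev_path
    dev_path=$(udevadm info "$1" | grep -e ID_PATH=)
    case "${dev_path}" in
        *usb*)  echo "USB drive"   ;;
        *nvme*) echo "NVMe drive"  ;;
        *ata*)  echo "SATA drive"  ;;
        *scsi*) echo "SCSI drive"  ;;
        *)      echo "unknown transport" ;;
    esac
}
drive_type /dev/sda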
OS10(conf-if-ma-1/1/1)# <165>1 2021-04-02T12:23:40.141901+00:00 OS10 dn_alm 652 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IP_ADDRESS_ADD: IP Address add is successful. IP 10.201.1.2/16 in VRF:default added successfully
OS10(conf-if-ma-1/1/1)# no shutdown
# Connected via SSH to confirm access:
OS10(config)#
<165>1 2021-04-02T12:25:49.956308+00:00 OS10 dn_alm 652 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %ALM_AUTH_EVENT: Authentication event was raised MESSAGE=pam_unix(sshd:session): session opened for user admin by (uid=0)
<86>1 2021-04-02T12:25:49.501860+00:00 OS10 sshd 6512 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) pam_unix(sshd:session): session opened for user admin by (uid=0)
<86>1 2021-04-02T12:25:51.795620+00:00 OS10 sshd 6527 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) pam_unix(sshd:session): session opened for user admin by (uid=0)
<165>1 2021-04-02T12:25:51.957630+00:00 OS10 dn_alm 652 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %ALM_AUTH_EVENT: Authentication event was raised MESSAGE=pam_unix(sshd:session): session opened for user admin by (uid=0)
<165>1 2021-04-02T12:31:11.702525+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_ASTATE_UP: Interface admin state up :port-channel1000
<165>1 2021-04-02T12:31:11.707501+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_DN: Interface operational state is down :port-channel1000
<165>1 2021-04-02T12:31:11.720332+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_UP: Interface operational state is up :port-channel1000
<165>1 2021-04-02T12:31:11.845070+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_UP: Interface operational state is up :vlan4094
<165>1 2021-04-02T12:31:11.863978+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %VLT_PEER_UP: VLT unit 1 is up
<165>1 2021-04-02T12:31:18.996716+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %VLT_ELECTION_ROLE: VLT unit 1 is elected as secondary
<165>1 2021-04-02T12:31:19.087695+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_UP: Interface operational state is up :vlan1
Download the firmware from the Dell Digital Locker. Once it has been downloaded and extracted and the checksum verified, copy the 'PKGS_OS10-Enterprise-xxx-installer-x86_64.bin' file to a striker's '/var/lib/tftpboot/' directory and start the dhcpd service.
### NOTE: Use the striker NOT connected to the switch being upgraded! If necessary, move the uplink interfaces off of the switch to be upgraded!
Once ready, connect to the switch over the serial port.
### WARNING: The firmware update completely resets the switch! Back up your config beforehand, if necessary. Also, watch for the loss of VLANs causing switch loops when 2+ uplinks are connected!
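The session below used ONIE's DHCP/TFTP auto-discovery; if discovery doesn't pick the image up on its own, it can be pointed at it manually from the ONIE prompt (the URL matches the session below):
* onie-nos-install tftp://10.201.4.1/PKGS_OS10-Enterprise-10.5.2.3.316stretch-installer-x86_64.bin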
Installing OS10 on primary volume ... OK <- This takes a long time, be patient
Setting up shared data ... OK
Synchronizing standby image ...
OS10 installation is complete.
Creating ext4 filesystem on sda4 ... OK
Installing GRUB-UEFI ... OK
Saving system information ... OK
Saving ONIE support information ... OK
ONIE: NOS install successful: tftp://10.201.4.1/PKGS_OS10-Enterprise-10.5.2.3.316stretch-installer-x86_64.bin
ONIE: Rebooting...
ONIE:/ # discover: installer mode detected.
Stopping: discover...start-stop-daemon: warning: killing process 2881: No such process
done.
Stopping: dropbear ssh daemon... done.
Stopping: telnetd... done.
Stopping: syslogd... done.
Info: Unmounting kernel filesystems
umount: can't umount /: Invalid argument
The system is going down NOW!
Sent SIGTERM to all processes
Sent SIGKILL tosd 4:0:0:0: [sda] Synchronizing SCSI cache
reboot: Restarting system
reboot: machine restart
### NOTE: The login prompt will appear before the system is ready to log in. The default username and password revert to 'admin' / 'admin', but this won't work for the first couple of minutes.
# Hosts added or updated by the Anvil! on: [2021/04/17 16:24:52]:
10.201.10.2 Could not get property: Refusing activation, D-Bus is shutting down Could not get property: Refusing activation, D-Bus is shutting down. Could not get property: Refusing activation, D-Bus is shutting down.bcn1
192.168.122.12 Could not get property: Refusing activation, D-Bus is shutting down.ifn1
10.101.10.2 Could not get property: Refusing activation, D-Bus is shutting down.sn1
May 02 13:35:21 an-a01n02.zennioptical.com setroubleshoot[5333]: SELinux is preventing /usr/sbin/drbdsetup from create access on the netlink_generic_socket labeled drbd_t. For complete SELinux messages run: sealert -l 4079c288-db4a-4f44-a588-94f1dbfff269
May 02 13:35:21 an-a01n02.zennioptical.com setroubleshoot[5333]: SELinux is preventing /usr/sbin/drbdsetup from create access on the netlink_generic_socket labeled drbd_t.
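If this turns out to be a missing policy rather than a labeling problem, the usual setroubleshoot-style fix is along these lines (the module name is arbitrary; review the generated .te file before installing):
* ausearch -c 'drbdsetup' --raw | audit2allow -M my-drbdsetup
* semodule -i my-drbdsetup.pp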