Tasks; 1. When provisioning a server; - First, check if either node is SyncSource, if so use that node. - Second, check which node has the most servers by RAM count, use that node. 2. Provision; - Create DRBD resource, force primary on install target - Create pacemaker resource in stopped state - Set location constraint to prefer target node - Boot server Create "Node status" which returns "degraded" if the peer is gone Common queries; * SELECT a.job_uuid, b.host_name, a.job_command, a.job_data, a.job_progress, a.job_status FROM jobs a, hosts b WHERE a.job_host_uuid = b.host_uuid AND a.job_progress != 100; * SELECT a.dr_link_uuid, b.host_name, c.anvil_name, a.dr_link_note FROM dr_links a, hosts b, anvils c WHERE a.dr_link_host_uuid = b.host_uuid AND a.dr_link_anvil_uuid = c.anvil_uuid ORDER BY c.anvil_name ASC, b.host_name ASC; * SELECT a.storage_group_uuid, d.storage_group_member_uuid, b.anvil_name, a.storage_group_name, c.host_name, d.storage_group_member_vg_uuid, d.storage_group_member_note FROM storage_groups a, anvils b, hosts c, storage_group_members d WHERE a.storage_group_uuid = d.storage_group_member_storage_group_uuid AND a.storage_group_anvil_uuid = b.anvil_uuid AND c.host_uuid = d.storage_group_member_host_uuid ORDER BY a.storage_group_name ASC, c.host_name ASC; * SELECT a.scan_hardware_uuid, b.host_name, a.scan_hardware_cpu_cores AS cores, a.scan_hardware_cpu_threads AS threads, pg_size_pretty(a.scan_hardware_ram_total) AS ram_total, pg_size_pretty(a.scan_hardware_memory_total) AS memory_total, pg_size_pretty(a.scan_hardware_memory_free) AS memory_free FROM scan_hardware a, hosts b WHERE a.scan_hardware_host_uuid = b.host_uuid ORDER BY b.host_name ASC; * SELECT a.scan_apc_ups_name AS name, a.scan_apc_ups_serial_number AS sn, a.scan_apc_ups_health AS health, a.scan_apc_ups_nmc_serial_number AS nmc_sn, a.scan_apc_ups_nmc_mac_address AS mac, a.scan_apc_ups_ip AS ip, b._percentage_charge AS charge, d.scan_apc_ups_battery_temperature AS btemp FROM 
scan_apc_upses a, scan_apc_ups_input b, scan_apc_ups_output c, scan_apc_ups_batteries d WHERE a.scan_apc_ups_uuid = b.scan_apc_ups_input_scan_apc_ups_uuid AND a.scan_apc_ups_uuid = c.scan_apc_ups_output_scan_apc_ups_uuid AND a.scan_apc_ups_uuid = d.scan_apc_ups_battery_scan_apc_ups_uuid ORDER BY name ASC; * SELECT b.host_name, a.network_interface_uuid, a.network_interface_mac_address AS mac, a.network_interface_name AS name, a.network_interface_speed AS speed, a.network_interface_link_state AS link, a.network_interface_operational AS op, a.network_interface_duplex AS duplex, a.network_interface_medium AS medium, a.network_interface_bond_uuid AS bond_uuid, a.network_interface_bridge_uuid AS bridge_uuid FROM network_interfaces a, hosts b WHERE a.network_interface_host_uuid = b.host_uuid AND b.host_name LIKE 'an-a02%' AND a.network_interface_operational != 'DELETED' ORDER BY b.host_name ASC, a.network_interface_name ASC; * SELECT b.host_name, a.bond_uuid, a.bond_name, a.bond_mode, a.bond_mtu AS mtu, a.bond_primary_interface AS primary, a.bond_active_interface AS active, a.bond_mac_address AS mac, a.bond_operational AS op, c.bridge_name, a.modified_date FROM bonds a, hosts b, bridges c WHERE a.bond_host_uuid = b.host_uuid AND a.bond_bridge_uuid = c.bridge_uuid AND (b.host_uuid = 'b4e46faf-0ebe-e211-a0d6-00262d0ca874' OR b.host_uuid = '4ba42b4e-9bf7-e311-a889-899427029de4') ORDER BY b.host_name ASC, a.bond_name ASC; * SELECT b.host_name, a.bridge_uuid, a.bridge_name, a.bridge_id, a.bridge_mtu FROM bridges a, hosts b WHERE a.bridge_host_uuid = b.host_uuid AND b.host_name LIKE 'an-a02%' ORDER BY b.host_name ASC, a.bridge_name ASC; * SELECT a.host_name, b.file_name, c.file_location_active AS active, c.file_location_ready AS ready FROM hosts a, files b, file_locations c WHERE a.host_uuid = c.file_location_host_uuid AND b.file_uuid = c.file_location_file_uuid ORDER BY b.file_name ASC, a.host_name ASC; * SELECT b.host_name, a.health_agent_name, a.health_source_name, 
a.health_source_weight FROM health a, hosts b WHERE b.host_uuid = a.health_host_uuid AND b.host_name LIKE 'an-a02%' order by b.host_name ASC, a.health_agent_name ASC, a.health_source_weight ASC; for lv in $(lvscan | grep deploy| awk '{print $2}' | sed s/\'//g); do lvremove -y $lv; done; rm -f /etc/drbd.d/an-test-deploy*; lvscan; ls -lah /etc/drbd.d/ # Fail a resource for testing purposes. crm_resource --fail --resource srv02-b -N vm-a01n01 # Recover without reboot crm_resource --resource srv01-a --refresh uname -r; grubby --default-kernel; lsinitrd -m /boot/initramfs-4.18.0-448.el8.x86_64.img | grep lvm; systemctl is-enabled scancore.service; dnf -y update; systemctl disable --now anvil-daemon; systemctl disable --now scancore When pairing Striker, make sure new config goes to all known nodes! dnf -y update && dnf -y install https://www.alteeve.com/an-repo/m3/anvil-release-latest.noarch.rpm && alteeve-repo-setup -y && dnf -y install anvil-striker --allowerasing dnf -y update && dnf -y install https://www.alteeve.com/an-repo/m3/anvil-release-latest.noarch.rpm && alteeve-repo-setup -y && dnf -y install anvil-node --allowerasing dnf -y update && dnf -y install https://www.alteeve.com/an-repo/m3/anvil-release-latest.noarch.rpm && alteeve-repo-setup -y && dnf -y install anvil-dr --allowerasing ### Currently set default zone; # Doesn't seem to matter - /etc/firewalld/firewalld.conf:6:DefaultZone=public firewall-cmd --get-default-zone # public firewall-cmd --permanent --set-default-zone=IFN1 firewall-cmd --permanent --new-zone="IFN1" firewall-cmd --permanent --zone=IFN1 --set-description="Internet-Facing Network 1" firewall-cmd --permanent --zone=IFN1 --set-short="IFN1" firewall-cmd --permanent --zone=IFN1 --add-interface=ifn1_bond1 firewall-cmd --permanent --zone=IFN1 --add-service=ssh firewall-cmd --permanent --zone=IFN1 --add-service=postgresql firewall-cmd --permanent --zone=IFN1 --add-port=22869/tcp firewall-cmd --reload # Configure APC PDUs and UPSes tcpip -i -s -g 
web -h enable web -s enable snmp -S enable -c1 private -a1 writeplus snmp -S enable -c2 public -a2 writeplus /root/ci-tools/ci-destroy-anvil-bm-vm /root/ci-tools/ci-setup-anvil-bm-vm rhel-8 ci ci watch 'echo "striker 1"; ssh root@an-striker01 "grep ^database /etc/anvil/anvil.conf | grep host"; echo "striker 2"; ssh root@an-striker02 "grep ^database /etc/anvil/anvil.conf | grep host"; echo "node 1"; ssh root@an-a01n01 "grep ^database /etc/anvil/anvil.conf | grep host"; echo "node 2"; ssh root@an-a01n02 "grep ^database /etc/anvil/anvil.conf | grep host"; echo "dr 1"; ssh root@an-a01dr01 "grep ^database /etc/anvil/anvil.conf | grep host";' Anvil! to Anvil! live migration; 1. Create LVs 2. Make sure /etc/hosts is populated 3. If DR is used, disconnect first to stay within the max-peers=3 3. Update DRBD config, A:1 -> A:2, A:1 -> B:1, B:1 -> B:2 (if both online and UpToDate, otherwise both from UpToDate) 4. Create drbd md on new Anvil! 5. drbdadm adjust on old nodes. 6. Wait for DRBD resource to sync to node 1 (it can sync to node 2 later) 7. Copy server's XML to new cluster 8. pcs resource unmanage srv01-cs8 9. Allow dual primary between A1:B1 (or A2:B1) - [root@an-a01n01 ~]# pcs resource disable srv01-cs8 Warning: 'srv01-cs8' is unmanaged [root@an-a01n01 ~]# pcs resource manage srv01-cs8 [root@an-a01n01 ~]# pcs resource delete srv01-cs8 Deleting Resource - srv01-cs8 10. 
============ # Dump su - postgres -c "pg_dump anvil > /var/lib/pgsql/anvil.out" su - postgres -c "pg_dump --schema-only anvil > /var/lib/pgsql/anvil_schema.out" su - postgres -c "dropdb anvil" && su - postgres -c "createdb --owner admin anvil" && su - postgres -c "psql anvil < /var/lib/pgsql/anvil.out" su postgres -c "psql anvil" ============ Jenkins; Initial setup: export NODE_NAME=anvil-ci-bm export python=python3 ./ci-update-yum ============ ==] UEFI Setup [====================================================================== ignoredisk --only-use=vdb,vda clearpart --none --initlabel part raid.312 --fstype="mdmember" --ondisk=vda --size=19966 part raid.293 --fstype="mdmember" --ondisk=vdb --size=512 part raid.319 --fstype="mdmember" --ondisk=vdb --size=19966 part raid.286 --fstype="mdmember" --ondisk=vda --size=512 raid pv.326 --device=pv1 --fstype="lvmpv" --level=RAID1 raid.312 raid.319 raid /boot/efi --device=efi --fstype="efi" --level=RAID1 --fsoptions="umask=0077,shortname=winnt" --label=efi raid.286 raid.293 volgroup striker_vg0 --pesize=4096 pv.326 logvol swap --fstype="swap" --size=4096 --name=lv_swap --vgname=striker_vg0 logvol / --fstype="xfs" --size=15852 --label="lv_root" --name=lv_root --vgname=striker_vg0 ====================================================================================== DOCS; - - Explanation of 'comps.xml' (package grouping) - https://pagure.io/fedora-comps - Firewalld - https://www.digitalocean.com/community/tutorials/how-to-set-up-a-firewall-using-firewalld-on-centos-7 - PXE; - https://docs.fedoraproject.org/en-US/fedora/f28/install-guide/advanced/Network_based_Installations/ - https://docs.fedoraproject.org/en-US/Fedora/26/html/Installation_Guide/chap-pxe-server-setup.html - UEFI PXE notes - https://www.syslinux.org/wiki/index.php?title=PXELINUX#UEFI - How to write a NetworkManager dispatcher script to apply ethtool commands? - https://access.redhat.com/solutions/2841131 - Setup nodes to log to striker? 
- https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/networking_guide/sec-configuring_netconsole - Pacemaker can be monitored via SNMP - https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/high_availability_add-on_reference/s1-snmpandpacemaker-HAAR - corosync.conf - https://access.redhat.com/articles/3185291 ==== Network planning; 10.x.y.z / 10.x.y.z / x = Network; - BCN = 200 + network ie: BCN1 = 10.201.y.z BCN2 = 10.202.y.z - SN = 100 + network ie: SN1 = 10.101.y.z SN2 = 10.102.y.z - MN = 199 (only 1, always back-to-back between nodes 1 and 2) ie: MN1 = 10.199.y.z y = Device Type. Foundation Pack; 1. Switches 2. PDUs 3. UPSes 4. Strikers 5. Striker IPMI (BCN only) Anvil! systems; 1st - 10 = Node IP 11 = Node IPMI 2nd - 12 = Node IP 13 = Node IPMI 3rd - 14 = Node IP 15 = Node IPMI N... z = Device Sequence - Foundation pack devices are simple sequence - Anvils; .1 = node 1, .2 = node 2, .3 = dr ==== RHEL 8 Firewall - https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8-beta/html/configuring_and_managing_networking/assembly_using-firewalls_configuring-networking-with-gnome-gui ============================= ### Nodes * BCN Ports TCP 22 sshd TCP 2224 pcsd It is crucial to open port 2224 in such a way that pcs from any node can talk to all nodes in the cluster, including itself. 
UDP 5404 corosync Required on corosync nodes if corosync is configured for multicast UDP UDP 5405 corosync Required on all corosync nodes (needed by corosync) TCP 5900+ vnc TCP 49152-49215 virsh live migration - migration_port_min and migration_port_max attributes in the /etc/libvirt/qemu.conf * SN Ports (Pull ports from DRBD resource config) TCP 7788+ drbd 1 port per resource * IFN Ports TCP 22 sshd MN Ports TCP 49152-49215 virsh mn live migration - migration_port_min and migration_port_max attributes in the /etc/libvirt/qemu.conf Ports we care about Porto Number Used by Nets Description NOTE: DHCP listens to raw sockets and ignores firewalld rules. We need to stop dhcpd directly - https://kb.isc.org/docs/aa-00378 * After all changes; firewall-cmd --zone=public --add-port=49152-49215/tcp --permanent firewall-cmd --reload - Paths If we want to create services or helpers later, look under - /usr/lib/firewalld/ Core firewalld configs, including defaults zones, etc - /etc/firewalld/ * Zones are meant to deal with dynamic environments and aren't that useful in mostly static server environments * Use 'firewall-cmd' WITHOUT '--permanent' for things like enabling the VNC port for a server. Use '--permanent' for everything else. 
==== Striker as PXE server ==== # Bootloader for BIOS OS="fedora28" mkdir /var/lib/tftpboot/ cp $(anvil source)/pxe/tftpboot/bios/* /var/lib/tftpboot/ chmod 755 /var/lib/tftpboot/* # Bootloader for UEFI cp $(anvil source)/pxe/tftpboot/uefi/* /var/lib/tftpboot/uefi/ chmod 755 /var/lib/tftpboot/uefi/* # Copy kernel images for tftpboot downloads mkdir -p /var/lib/tftpboot/${OS} cp /lib/modules/$(uname -r)/vmlinuz /var/lib/tftpboot/${OS}/ mkinitrd /var/lib/tftpboot/${OS}/initrd.img $(uname -r) # Configs from anvil source rsync -av pxe/tftpboot/pxelinux.cfg/default root@f28-striker01:/var/lib/tftpboot/pxelinux.cfg/ rsync -av pxe/tftpboot/pxelinux/uefi root@f28-striker01:/var/lib/tftpboot/pxelinux/ ==== DB stuff; Dump; su - postgres -c "pg_dump anvil" > /anvil.out Drop; su - postgres -c "dropdb anvil" && su - postgres -c "createdb --owner admin anvil" && su - postgres -c "psql anvil" Reload the DB; su - postgres -c "dropdb anvil" && su - postgres -c "createdb --owner admin anvil" && su - postgres -c "psql anvil < /anvil.out" && su - postgres -c "psql anvil" ### Load client data ## Workstation setup dnf -y install postgresql postgresql-server postgresql-plperl postgresql-setup --initdb --unit postgresql vim /var/lib/pgsql/data/postgresql.conf # Add around line 60: listen_addresses = '*' vim /var/lib/pgsql/data/pg_hba.conf # Add around line 84: host all all all md5 systemctl start postgresql.service su - postgres -c "createuser --no-superuser --createdb --no-createrole admin" su - postgres -c "psql template1 -c \"ALTER ROLE postgres WITH PASSWORD 'Initial1';\"" su - postgres -c "psql template1 -c \"ALTER ROLE admin WITH PASSWORD 'Initial1';\"" # If there was a previous DB su - postgres -c "dropdb client" # Copy and load cp /path/to/client_anvil.out /tmp/anvil.out su - postgres -c "createdb --owner admin client" && su - postgres -c "psql client < /tmp/anvil.out" && su - postgres -c "psql client" Changes made using tools such as nmcli do not require a reload but do require 
the associated interface to be put down and then up again. That can be done by using commands in the following format: * nmcli dev disconnect interface-name Followed by: * nmcli con up interface-name NOTE: RHEL doesn't support direct-cabled bonds - https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/networking_guide/ch-configure_network_bonding ifcfg-X config Notes - /usr/share/doc/initscripts-*/sysconfig.txt (Look for the sections describing files /etc/sysconfig/network and /etc/sysconfig/network-scripts/ifcfg-); - man 5 nm-settings-ifcfg-rh - https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/networking_guide/sec-Using_Channel_Bonding#s3-modules-bonding-directives - /usr/share/doc/kernel-doc-*/Documentation/networking/bonding.txt iface * PREFIXx overrules NETMASKx. Use PREFIXx, not NETMASKx. * The 'x' suffix for PREFIX, NETMASK, etc starts at 0 and must count up by 1 at a time. * ZONE will be useful for the firewall stuff later. * ETHTOOL_OPTS is deprecated, replaced by using udev rules * initscripts interpret PEERDNS=no to mean "never touch resolv.conf". NetworkManager interprets it to say "never add automatic (DHCP, PPP, VPN, etc.) nameservers to resolv.conf". Bond * resend_igmp & num_unsol_na={1~255} may help if a switch is slow to notice traffic has moved to the new interface. default is 1. Each update is sent 200ms apart. * Bridged interfaces should use BRIDGE_UUID="", _not_ BRIDGE="". The former causes the latter to be ignored and the latter is only used for possible compatibility reasons. Bridge * STP=no is default, we'll test 'yes'. 
* DOMAIN="" ======= virt-manager stores information in dconf-editor -> /org/virt-manager/virt-manager/connections ($HOME/.config/dconf/user) ==== dconf read /org/virt-manager/virt-manager/connections/uris ['qemu+ssh://root@localhost/system', 'qemu+ssh://root@wp-a01n02.remote/system', 'qemu+ssh://root@an-nas02.kw01.alteeve.ca/system', 'qemu+ssh://root@hb-a01n01.remote/system', 'qemu+ssh://root@hb-a01n02.remote/system', 'qemu:///system'] ==== dconf read /org/virt-manager/virt-manager/connections/autoconnect ['qemu+ssh://root@localhost/system'] ==== # Web - TODO: Setup to auto-use "Let's Encrypt", but make sure we have an offline fall-back # SELinux restorecon -rv /var/www ============================================================= * Network; ** {bc,if,s}nX_{link,bond,bridge}Y naming ** firewall; - https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/high_availability_add-on_reference/s1-firewalls-haar firewall-cmd --permanent --add-service=high-availability firewall-cmd --add-service=high-availability firewall-cmd --reload * Cluster Config; ==== Both nodes echo Initial1 | passwd hacluster --stdin systemctl start pcsd.service systemctl enable pcsd.service systemctl disable libvirtd.service systemctl stop libvirtd.service ==== One node pcs host auth el8-a01n01 el8-a01n02 -u hacluster -p "secret" ### VMs pcs cluster setup m3-anvil-01 el8-a01n01 el8-a01n02 pcs cluster start --all pcs stonith create virsh_node1 fence_virsh pcmk_host_list="el8-a01n01" ipaddr="" passwd="secret" login="root" delay="15" port="el8-a01n01" op monitor interval="60" pcs stonith create virsh_node2 fence_virsh pcmk_host_list="el8-a01n02" ipaddr="" passwd="secret" login="root" port="el8-a01n02" op monitor interval="60" ### Real iron. 
pcs stonith create ipmilan_node1 fence_ipmilan pcmk_host_list="mk-a02n01" ipaddr="" password="another secret p" username="admin" delay="15" op monitor interval="60" pcs stonith level add 1 mk-a02n01 ipmilan_node1 pcs stonith create ipmilan_node2 fence_ipmilan pcmk_host_list="mk-a02n02" ipaddr="" password="another secret p" username="admin" op monitor interval="60" pcs stonith level add 1 mk-a02n02 ipmilan_node2 pcs stonith create apc_snmp_node1_psu1 fence_apc_snmp pcmk_host_list="mk-a02n01" pcmk_off_action="reboot" ip="" port="3" power_wait="5" op monitor interval="60" pcs stonith create apc_snmp_node1_psu2 fence_apc_snmp pcmk_host_list="mk-a02n01" pcmk_off_action="reboot" ip="" port="3" power_wait="5" op monitor interval="60" pcs stonith level add 2 mk-a02n01 apc_snmp_node1_psu1,apc_snmp_node1_psu2 pcs stonith create apc_snmp_node2_psu1 fence_apc_snmp pcmk_host_list="mk-a02n02" pcmk_off_action="reboot" ip="" port="4" power_wait="5" op monitor interval="60" pcs stonith create apc_snmp_node2_psu2 fence_apc_snmp pcmk_host_list="mk-a02n02" pcmk_off_action="reboot" ip="" port="4" power_wait="5" op monitor interval="60" pcs stonith level add 2 mk-a02n02 apc_snmp_node2_psu1,apc_snmp_node2_psu2 pcs stonith create delay_node1 fence_delay pcmk_host_list="mk-a02n01" wait="60" op monitor interval="60" pcs stonith level add 3 mk-a02n01 delay_node1 pcs stonith create delay_node2 fence_delay pcmk_host_list="mk-a02n02" wait="60" op monitor interval="60" pcs stonith level add 3 mk-a02n02 delay_node2 # Either case pcs property set stonith-max-attempts=INFINITY pcs property set stonith-enabled=true ### TODO: Look into 'priority-fencing-delay' # Create a new server resource, stopped, create the location constraint (higher == preferred), then start. 
pcs resource create srv01-test ocf:alteeve:server name="srv01-test" meta allow-migrate="true" target-role="stopped" op monitor interval="60" start timeout="INFINITY" on-fail="block" stop timeout="INFINITY" on-fail="block" migrate_to timeout="INFINITY" pcs constraint location srv01-test prefers mk-a02n01=200 mk-a02n02=100 pcs resource enable srv01-test - or - pcs resource update srv01-test ocf:alteeve:server name="srv01-test" meta allow-migrate="true" target-role="stopped" op monitor interval="60" start timeout="INFINITY" on-fail="block" stop timeout="INFINITY" on-fail="block" migrate_to timeout="INFINITY" # Test stonith_admin --fence el8-a01n02 --verbose; crm_error $? pcs resource create srv01-test ocf:alteeve:server name="srv01-test" meta allow-migrate="true" target-role="started" op monitor interval="60" start timeout="INFINITY" on-fail="block" stop timeout="INFINITY" on-fail="block" migrate_to timeout="INFINITY" pcs constraint location srv01-test prefers el8-a01n01=200 el8-a01n02=100 stonith-max-attempts=INFINITY cluster-recheck-interval puts an upper bound on the "i give up" time ==== pcs resource create srv01-cs8 ocf:alteeve:server name="srv01-cs8" meta allow-migrate="true" target-role="stopped" op monitor interval="60" start timeout="INFINITY" on-fail="block" stop timeout="INFINITY" on-fail="block" migrate_to timeout="INFINITY" pcs constraint location srv01-cs8 prefers mk-a02n01=200 mk-a02n02=100 ==== DRBD notes * resources can contain an US-ASCII character, except for spaces * A resource is a single replication stream for 1 or more resources, max 65.535 vols per resource * DRBD does, however, ship with an LVM integration facility that automates the creation of LVM snapshots immediately before synchronization. This ensures that a consistent copy of the data is always available on the peer, even while synchronization is running. See Using automated LVM snapshots during DRBD synchronization for details on using this facility. 
** https://docs.linbit.com/docs/users-guide-9.0/#s-lvm-snapshots * Checksum-based synchronization computes a block's hash on source and target and skips if matching, possibly making resync much faster for blocks rewritten with the same data, but at the cost of CPU. Make this a user-configurable option under the advanced tab. * Suspended replication allows congested replication links to suspend replication, leaving the peer in a consistent state, but allowing the primary to "pull ahead". When the congestion passes, the delta resyncs. Make this a user-configurable option with scary warnings. * Online verification can (should?) be run periodically on the server host (verification source will overwrite deltas on the verification target). Perhaps schedule to run once/month? Do resources sequentially as this places a CPU load on the nodes. * Replication traffic integrity checking uses a given available kernel crypto to verify data integrity on transmission to the peer. If the replicated block can not be verified against the digest, the connection is dropped and immediately re-established; because of the bitmap the typical result is a retransmission. ** Make an option in the advanced tab. Test to see overhead this adds. Choose the lowest overhead algo (within reason) * Support for disk flushes might be something we want to disable, as it seems to force write-through even with a functional FBWC/BBU. Need to test. * Note; "Inconsistent" is almost always useless. "Consistent" and "Outdated" are able to be used safely, just without whatever happened on the peer after. * Truck based replication, also known as disk shipping, is a means of preseeding a remote site with data to be replicated, by physically shipping storage media to the remote site. * Make sure that selinux doesn't block DRBD comms over the SN * See "5.15.1. Growing on-line" for growing a DRBD resource ** Shrinking online is ONLY possible if the metadata is external. Worth creating *_md LVs? 
Offline requires backing up and restoring the MD Provisioning a server will need to: * Create the LVs * Open up the DRBD ports * Create the DRBD resource(s); Find the lowest free rX.res, create it locally and on the peer (if up), firewall-cmd --zone=public --permanent --add-port=7788-7790/tcp firewall-cmd --reload * Provision the server via virt-install * push the new XML to striker such that the peer's anvil daemon picks it up and writes it out. [root@el8-a01n01 drbd.d]# drbdsetup status r0 --verbose --statistics r0 node-id:1 role:Primary suspended:no write-ordering:flush volume:0 minor:0 disk:UpToDate quorum:yes size:10485404 read:9682852 written:0 al-writes:0 bm-writes:0 upper-pending:0 lower-pending:0 al-suspended:no blocked:no el8-a01n02.alteeve.com node-id:0 connection:Connected role:Secondary congested:no volume:0 replication:SyncSource peer-disk:Inconsistent done:92.29 resync-suspended:no received:0 sent:9679140 out-of-sync:808144 pending:6 unacked:3 [root@el8-a01n02 ~]# cat /sys/kernel/debug/drbd/resources/r0/connections/el8-a01n01.alteeve.com/0/proc_drbd 0: cs:SyncSource ro:Primary/Secondary ds:UpToDate/Inconsistent C r----- ns:24360 nr:10485404 dw:10485404 dr:25420 al:0 bm:0 lo:0 pe:[0;0] ua:0 ap:[0;0] ep:1 wo:2 oos:10461044 [>....................] 
sync'ed: 0.3% (10212/10236)M finish: 0:50:01 speed: 3,480 (5,020 -- 3,480) K/sec 99% sector pos: 20970808/20970808 resync: used:0/61 hits:557 misses:2 starving:0 locked:0 changed:1 act_log: used:0/1237 hits:0 misses:0 starving:0 locked:0 changed:0 blocked on activity log: 0 [root@el8-a01n02 ~]# drbdadm primary r0 r0: State change failed: (-1) Multiple primaries not allowed by config Command 'drbdsetup primary r0' terminated with exit code 11 [root@el8-a01n02 ~]# drbdadm net-options --allow-two-primaries=yes r0 [root@el8-a01n02 ~]# drbdadm net-options --allow-two-primaries=no r0 drbdsetup show all drbdsetup show all --show-defaults == virt-install stuff * Get a list of --os-variants: 'osinfo-query os' * virt-install --print-xml (or --transient) * Migate; # For all resources under the server; #drbdadm net-options r0 --allow-two-primaries=yes drbdsetup net-options srv01-c7_0 2 --_name=m3-a02n01.alteeve.com --csums-alg=md5 --data-integrity-alg=md5 --after-sb-0pri=discard-zero-changes --after-sb-1pri=discard-secondary --after-sb-2pri=disconnect --protocol=C --fencing=resource-and-stonith --allow-two-primaries=yes drbdsetup net-options srv01-c7_0 2 --_name=m3-a02n01.alteeve.com --csums-alg=md5 --data-integrity-alg=md5 --after-sb-0pri=discard-zero-changes --after-sb-1pri=discard-secondary --after-sb-2pri=disconnect --protocol=C --fencing=resource-and-stonith --allow-two-primaries=yes # Adding a second volume to a running resource; - I wonder if you'd have the same results if you could get vol1 into an UpToDate/UpToDate state using the drbdsetup equivalent of drbdadm new-current-uuid --clear-bitmap # Migrate: virsh -c qemu+ssh://root@m3-a02n02.alteeve.com/system list virsh migrate --unsafe --undefinesource --live srv01-c7 qemu+ssh://m3-a02n01.alteeve.com/system virsh -c qemu+ssh://root@m3-a02n02.alteeve.com/system migrate --undefinesource --live srv01-c7 qemu+ssh://m3-a02n01.alteeve.com/system # Again for all resource under the server; drbdadm net-options r0 
--allow-two-primaries=no drbdsetup net-options --_name= --allow-two-primaries=yes virsh migrate --undefinesource --live qemu+ssh:///system drbdsetup net-options --_name= --allow-two-primaries=no pcs constraint list --full Location Constraints: Resource: srv01-c7 Enabled on: m3-a02n02.alteeve.com (score:50) (id:location-srv01-c7-m3-a02n02.alteeve.com-50) pcs constraint remove location-srv01-c7-m3-a02n02.alteeve.com-50 Set to 90% of BCN bandwidth migrate-setspeed domain bandwidth Set the maximum migration bandwidth (in MiB/s) for a domain which is being migrated to another host. bandwidth is interpreted as an unsigned long long value. Specifying a negative value results in an essentially unlimited value being provided to the hypervisor. The hypervisor can choose whether to reject the value or convert it to the maximum value allowed. migrate-getspeed domain Get the maximum migration bandwidth (in MiB/s) for a domain. # Provision servers mkdir /mnt/anvil/{provision,files,archive,definitions} == Resource Agent; https://github.com/ClusterLabs/resource-agents/blob/master/doc/dev-guides/ra-dev-guide.asc * A resource agent receives all configuration information about the resource it manages via environment variables. The names of these environment variables are always the name of the resource parameter, prefixed with OCF_RESKEY_. For example, if the resource has an ip parameter set to, then the resource agent will have access to an environment variable OCF_RESKEY_ip holding that value. === When stopping a server; 14:03 < lge> "on-fail: block" 14:03 < lge> is per operation type. 14:08 < lge> anyways, you can also "on-fail: retry" OK, set the stop timeout to 60, set 'on-fail: block" and set the failure-timeout to 60 and see how pacemaker reacts. 
failure-timeout === Migrate servers; - Let ScanCore set 'node-health' attribute (http://clusterlabs.org/pacemaker/doc/en-US/Pacemaker/1.1/html-single/Pacemaker_Explained/index.html#s-node-health) - Set 'migration-limit' to '1' to enforce serial live migration (http://clusterlabs.org/pacemaker/doc/en-US/Pacemaker/1.1/html-single/Pacemaker_Explained/index.html#s-cluster-options). Migrate a single server by setting a location constraint against the node we want the VM off of. - If anything goes wrong, the server will enter a blocked state in pacemaker. - Recovery needs to be 'unmanage -> clean' to avoid a stop call. 11:57 <@kgaillot> for your design, that sounds right. between cleanup and manage, i'd make sure there was a PE run without any pending actions blocked by the unmanaging -- you can either look at the logs on the DC, run "crm_simulate -SL", or just check the status for a bit 11:58 <@kgaillot> you can play around with it by putting a higher preference on the to-be-cleaned node, to make sure it *does* move when you re-manage. that way you can see what logs/simulate/status look like 12:07 <@kgaillot> i'm thinking if you do crm_resource --reprobe instead of cleanup in the above sequence, that should prevent anything unexpected 12:07 <@kgaillot> unmanage -> adjust preferences if needed -> reprobe resource -> wait for probe results to come back in, and if status looks good -> re-manage 12:08 <@kgaillot> the reprobe will wipe the entire resource history and fail counts for the resource, causing pacemaker to recheck the current status on all nodes. if the status then shows the resource running where you expect/want it, with no errors, then it's not going to do anything further 12:09 <@kgaillot> (in 2.0, cleanup only erases the history where the resource has failed, while reprobe erases the history regardless) 12:13 <@kgaillot> if there are no failures in the resource history, there should be no risk of a full stop. 
if there is no resource history at all, then after reprobe, there should be no risk of any actions (assuming you've set up location preferences and stickiness how you want them) Recover from a failed migration; reset location to prefer current host -> unmanage resource -> cleanup resource -> manage resource (running on node 2, so re-add location constraint - basically, make sure location constraint favours current host) https://pykickstart.readthedocs.io/en/latest/kickstart-docs.html#chapter-1-introduction ==== Sample kickstart for Fedora28 netinstall #version=DEVEL ignoredisk --only-use=vda # Partition clearing information clearpart --none --initlabel # Use graphical install graphical # Use network installation url --url="" # Keyboard layouts keyboard --vckeymap=us --xlayouts='us' # System language lang en_CA.UTF-8 # Network information network --bootproto=dhcp --device=ens3 --ipv6=auto --activate network --hostname=localhost.localdomain # Root password rootpw --iscrypted $6$fyAht.3wBVlRGgqG$5dqIv2NrBD87uA51fxuoic/t2G93pXPUjVlh27Avg20ZGY409SK8cMVgABswF.krJSVIyoHfIChXNfpP/qTjI1 # Run the Setup Agent on first boot firstboot --enable # Do not configure the X Window System skipx # System services services --enabled="chronyd" # System timezone timezone Etc/GMT --isUtc # System bootloader configuration bootloader --location=mbr --boot-drive=vda %packages @^server-product-environment %end %addon com_redhat_kdump --disable --reserve-mb='128' %end %anaconda pwpolicy root --minlen=6 --minquality=1 --notstrict --nochanges --notempty pwpolicy user --minlen=6 --minquality=1 --notstrict --nochanges --emptyok pwpolicy luks --minlen=6 --minquality=1 --notstrict --nochanges --notempty %end ==== ==== M3 Striker Kickstart # Section 1 # Command section – Refer to Chapter 2 for a list of kickstart options. You must include the required options. 
### NOTE: The %packages, %pre, %pre-install, %post, %onerror, and %traceback sections are all required to be closed with %end # Section 2 # The %packages section – Refer to Chapter 3 for details. # Section 3: # The %pre, %pre-install, %post, %onerror, and %traceback sections – These sections can be in any order and are not required. Refer to Chapter 4, Chapter 5, and Chapter 6 for details. ==== 0 root@pulsar:/var/lib/libvirt/images# dev_PATH=$(udevadm info /dev/sdb | grep -e ID_PATH=) 0 root@pulsar:/var/lib/libvirt/images# if [[ $dev_PATH == *"usb"* ]]; then echo "USB drive"; elif [[ $dev_PATH == *"nvme"* ]]; then echo "NVMe drive"; elif [[ $dev_PATH == *"ata"* ]]; then echo "SATA drive"; elif [[ $dev_PATH == *"scsi"* ]]; then echo "SCSI drive"; fi USB drive 0 root@pulsar:/var/lib/libvirt/images# dev_PATH=$(udevadm info /dev/sda | grep -e ID_PATH=) 0 root@pulsar:/var/lib/libvirt/images# if [[ $dev_PATH == *"usb"* ]]; then echo "USB drive"; elif [[ $dev_PATH == *"nvme"* ]]; then echo "NVMe drive"; elif [[ $dev_PATH == *"ata"* ]]; then echo "SATA drive"; elif [[ $dev_PATH == *"scsi"* ]]; then echo "SCSI drive"; fi SATA drive 0 root@pulsar:/var/lib/libvirt/images# dev_PATH=$(udevadm info /dev/nvme0n1 | grep -e ID_PATH=) 0 root@pulsar:/var/lib/libvirt/images# if [[ $dev_PATH == *"usb"* ]]; then echo "USB drive"; elif [[ $dev_PATH == *"nvme"* ]]; then echo "NVMe drive"; elif [[ $dev_PATH == *"ata"* ]]; then echo "SATA drive"; elif [[ $dev_PATH == *"scsi"* ]]; then echo "SCSI drive"; fi NVMe drive [root@localhost ~]# dev_PATH=$(udevadm info /dev/sda | grep -e ID_PATH=) [root@localhost ~]# if [[ $dev_PATH == *"usb"* ]]; then echo "USB drive"; elif [[ $dev_PATH == *"nvme"* ]]; then echo "NVMe drive"; elif [[ $dev_PATH == *"ata"* ]]; then echo "SATA drive"; elif [[ $dev_PATH == *"scsi"* ]]; then echo "SCSI drive"; fi SCSI drive 0 root@pulsar:/var/lib/libvirt/images# cat /sys/class/block/sdb/device/model Flash Disk 0 root@pulsar:/var/lib/libvirt/images# cat 
/sys/class/block/sda/device/model SanDisk SDSSDXPS 0 root@pulsar:/var/lib/libvirt/images# cat /sys/class/block/nvme0n1/device/model INTEL SSDPEKKW512G7 ### Stuff only NVMe has 1 root@pulsar:/var/lib/libvirt/images# cat /sys/class/block/nvme0n1/device/subsysnqn nqn.2014.08.org.nvmexpress:80868086BTPY63650FPG512F INTEL SSDPEKKW512G7 0 root@pulsar:/var/lib/libvirt/images# cat /sys/class/block/sda/device/subsysnqn cat: /sys/class/block/sda/device/subsysnqn: No such file or directory 1 root@pulsar:/var/lib/libvirt/images# cat /sys/class/block/sdb/device/subsysnqn cat: /sys/class/block/sdb/device/subsysnqn: No such file or directory 0 root@pulsar:/var/lib/libvirt/images# cat /sys/class/block/nvme0n1/device/serial BTPY63650FPG512F 0 root@pulsar:/var/lib/libvirt/images# cat /sys/class/block/sda/device/serial cat: /sys/class/block/sda/device/serial: No such file or directory 1 root@pulsar:/var/lib/libvirt/images# cat /sys/class/block/sdb/device/serial cat: /sys/class/block/sdb/device/serial: No such file or directory 1 root@pulsar:/var/lib/libvirt/images# cat /sys/class/block/nvme0n1/device/transport pcie 0 root@pulsar:/var/lib/libvirt/images# cat /sys/class/block/sda/device/transport cat: /sys/class/block/sda/device/transport: No such file or directory 1 root@pulsar:/var/lib/libvirt/images# cat /sys/class/block/sdb/device/transport cat: /sys/class/block/sdb/device/transport: No such file or directory Disk size: /sys/class/block/sda/size * ===== man pages 1 Executable programs or shell commands 5 File formats and conventions eg /etc/passwd 7 Miscellaneous (including macro packages and conventions), e.g. man(7), groff(7) 8 System administration commands (usually only for root) A manual page consists of several sections. Conventional section names include NAME, SYNOPSIS, CONFIGURATION, DESCRIPTION, OPTIONS, EXIT STATUS, RETURN VALUE, ERRORS, ENVIRONMENT, FILES, VERSIONS, CONFORMING TO, NOTES, BUGS, EXAMPLE, AUTHORS, and SEE ALSO. 
The following conventions apply to the SYNOPSIS section and can be used as a guide in other sections. bold text type exactly as shown. italic text replace with appropriate argument. [-abc] any or all arguments within [ ] are optional. -a|-b options delimited by | cannot be used together. argument ... argument is repeatable. [expression] ... entire expression within [ ] is repeatable. ==== ======================================= chrissie's cluster script # Run these commands on all nodes: cp ../ifup-local /sbin /sbin/ifup-local pcs host auth -uhacluster -phacluster amy.chrissie.net anna.chrissie.net clara.chrissie.net fanny.chrissie.net if [ "$(hostname)" != "amy.chrissie.net" -a "$(hostname)" != "amy" ] then exit fi # and these on just one: pcs cluster setup taroxVMs amy.chrissie.net anna.chrissie.net clara.chrissie.net fanny.chrissie.net pcs cluster start --all sleep 30 pcs stonith create fence-virsh fence_virsh ipaddr= login=root passwd=christine pcmk_host_map="amy:rhel8-1;anna:rhel8-2;clara:rhel8-3;fanny:rhel8-4" if [ ! 
-e '/dev/an-a01n01_vg0/srv09-psql_0' ]; then /sbin/lvcreate -L 69GiB -n srv09-psql_0 an-a01n01_vg0 fi virt-install --connect qemu:///system \ --name srv09-psql \ --ram 4096 \ --arch x86_64 \ --vcpus 2 \ --cpu Nehalem,+fsgsbase \ --cdrom '/shared/files/Win2016_Server_64-bit_English.iso' \ --boot menu=on \ --disk path='/shared/files/virtio-win.iso',device=cdrom --force\ --os-variant win2k8 \ --network bridge=ifn_bridge1,model=virtio \ --disk path=/dev/an-a01n01_vg0/srv09-psql_0,bus=virtio,cache=writethrough \ --graphics spice \ --noautoconsole --wait -1 > /var/log/anvil-server_srv09-psql.log & # Migration; pcs constraint remove $(pcs constraint show --full | grep ban-srv07-el6 | perl -pe 's/^.*?id:(.*?)\)/$1/') # Attach a network interface: virsh attach-interface win2019_test bridge ifn_bridge1 --live --model virtio # Detach a network interface: virsh detach-interface win2019_test bridge --mac 52:54:00:ee:b5:1d # Attach disks virsh attach-disk srv34-nas /dev/drbd/by-res/srv34-nas/1 vdb --persistent --targetbus virtio --sourcetype block --subdriver raw # Change the MTU of a device; ip link set mtu 9000 # Change the MTU of an interface in windows (not tested yet - http://networking.nitecruzr.net/2007/11/setting-mtu-in-windows-vista.html) netsh interface ipv4 set subinterface "Local Area Connection" mtu=nnnn store=persistent yum install kernel-2.6.32-754.27.1.el6.x86_64 kernel-devel-2.6.32-754.27.1.el6.x86_64 kernel-headers-2.6.32-754.27.1.el6.x86_64 DRBD Notes; * c-max-rate is default 100M. Detect when the SN is 10Gbps and up to 750M * drbdsetup show --show-defaults <- shows defaults When a node comes online, and the peer and DR Host are both UpToDate, manually run the commands so that the restarting node connects to the DR first, so that it will sync from DR and minimize IO load on the peer node. The sequence of commands needed to up a resource are exposed by 'drbdadm up -d '. 
Ie: [root@el8-a01n02 ~]# drbdadm -d up srv02-hi drbdsetup new-resource srv02-hi 1 --auto-promote=yes drbdsetup new-minor srv02-hi 2 0 drbdsetup new-peer srv02-hi 0 --_name=el8-a01n01 --allow-two-primaries=no --after-sb-0pri=discard-zero-changes --after-sb-1pri=discard-secondary --after-sb-2pri=disconnect --timeout=100 --protocol=C --fencing=resource-and-stonith drbdsetup new-peer srv02-hi 2 --_name=el8-a01dr01 --allow-two-primaries=no --after-sb-0pri=discard-zero-changes --after-sb-1pri=discard-secondary --after-sb-2pri=disconnect --timeout=100 --protocol=A --fencing=dont-care drbdsetup new-path srv02-hi 0 ipv4: ipv4: drbdsetup new-path srv02-hi 2 ipv4: ipv4: drbdmeta 2 v09 /dev/node_8da3d2fe/srv02-hi_0 internal apply-al drbdsetup attach 2 /dev/node_8da3d2fe/srv02-hi_0 /dev/node_8da3d2fe/srv02-hi_0 internal --disk-flushes=no --md-flushes=no drbdsetup connect srv02-hi 0 drbdsetup connect srv02-hi 2 Manual split brain recovery; # Both nodes; drbdadm disconnect # Node to discard; drbdadm connect --discard-my-data # Node to save data on; drbdadm connect ================== # Server srv01-sql, example showing two disks in one VM. resource srv01-sql { on mk-a02n01 { node-id 0; volume 0 { device /dev/drbd_srv01-sql_0 minor 0; disk /dev/rhel/srv01-sql_0; meta-disk internal; } volume 1 { device /dev/drbd_srv01-sql_1 minor 1; disk /dev/rhel/srv01-sql_1; meta-disk internal; } } on mk-a02n02 { node-id 1; volume 0 { device /dev/drbd_srv01-sql_0 minor 0; disk /dev/rhel/srv01-sql_0; meta-disk internal; } volume 1 { device /dev/drbd_srv01-sql_1 minor 1; disk /dev/rhel/srv01-sql_1; meta-disk internal; } } on mk-a02dr01 { node-id 2; volume 0 { device /dev/drbd_srv01-sql_0 minor 0; disk /dev/rhel_new-dr/srv01-sql_0; meta-disk internal; } volume 1 { device /dev/drbd_srv01-sql_1 minor 1; disk /dev/rhel_new-dr/srv01-sql_1; meta-disk internal; } } ### NOTE: Remember to open the appropriate firewall port! 
# firewall-cmd --zone=SN1 --permanent --add-port=7788/tcp --permanent # firewall-cmd --zone=SN1 --permanent --add-port=7788/tcp connection { host mk-a02n01 address; host mk-a02n02 address; net { protocol C; fencing resource-and-stonith; } } connection { host mk-a02n01 address; host mk-a02dr01 address; net { protocol A; fencing dont-care; } } connection { host mk-a02n02 address; host mk-a02dr01 address; net { protocol A; fencing dont-care; } } } ================== 1. Battery, short = -, add + / - to cell icon mediawiki on EL8 install notes (starting from a minimal install); dnf module reset php dnf module enable php:7.4 # All dnf install httpd php php-gd php-xml php-mbstring php-json \ vim bash-completion wget tar rsync mlocate php-pecl-apcu \ memcached php-pear icu php-intl php-pgsql bzip2 mod_ssl ### PostgreSQL dnf install postgresql-server postgresql-plperl postgresql-setup --initdb systemctl start postgresql.service systemctl enable postgresql.service ### MariaDB dnf install httpd php php-gd php-xml php-mbstring php-json \ php-mysqlnd php-gd php-xml mariadb-server mariadb \ systemctl start mariadb mysql_secure_installation |Set root password? [Y/n] y |New password: |Re-enter new password: |Password updated successfully! |Remove anonymous users? [Y/n] y |Disallow root login remotely? [Y/n] y |Remove test database and access to it? [Y/n] y |Reload privilege tables now? 
[Y/n] y mysql -u root -p ### In mariadb MariaDB [(none)]> CREATE DATABASE an_wiki; MariaDB [(none)]> CREATE USER 'alteeve'@'localhost' IDENTIFIED BY 'experience tell mineral'; MariaDB [(none)]> GRANT ALL PRIVILEGES ON an_wiki.* TO 'alteeve'@'localhost'; MariaDB [(none)]> FLUSH PRIVILEGES; MariaDB [(none)]> SHOW DATABASES; +--------------------+ | Database | +--------------------+ | an_wiki | | information_schema | | mysql | | performance_schema | +--------------------+ MariaDB [(none)]> SHOW GRANTS FOR 'alteeve'@'localhost'; +----------------------------------------------------------------------------------------------------------------+ | Grants for digimer@localhost | +----------------------------------------------------------------------------------------------------------------+ | GRANT USAGE ON *.* TO `digimer`@`localhost` IDENTIFIED BY PASSWORD '*xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' | | GRANT ALL PRIVILEGES ON `an_wiki`.* TO `digimer`@`localhost` | +----------------------------------------------------------------------------------------------------------------+ MariaDB [(none)]> exit # Back to terminal systemctl enable mariadb # diff -u /var/lib/pgsql/data/pg_hba.conf.orig /var/lib/pgsql/data/pg_hba.conf ==== --- /var/lib/pgsql/data/pg_hba.conf.orig 2021-02-17 02:50:10.959000000 -0500 +++ /var/lib/pgsql/data/pg_hba.conf 2021-02-17 02:52:54.859000000 -0500 @@ -77,11 +77,12 @@ # TYPE DATABASE USER ADDRESS METHOD # "local" is for Unix domain socket connections only -local all all peer +local all postgres ident +local all all md5 # IPv4 local connections: -host all all ident +host all all md5 # IPv6 local connections: -host all all ::1/128 ident +host all all ::1/128 md5 # Allow replication connections from localhost, by a user with the # replication privilege. 
local replication all peer ==== # diff -u /etc/php.ini.orig /etc/php.ini ==== --- /etc/php.ini.orig 2021-02-17 02:56:32.293000000 -0500 +++ /etc/php.ini 2021-02-17 02:57:33.731000000 -0500 @@ -903,6 +903,7 @@ ; 'extension='php_.dll') is supported for legacy reasons and may be ; deprecated in a future PHP major version. So, when it is possible, please ; move to the new ('extension=) syntax. +extension=php_pgsql.so ;;;; ; Note: packaged extension modules are now loaded via the .ini files # Download and install cd /var/www/ wget https://releases.wikimedia.org/mediawiki/1.37/mediawiki-1.37.1.tar.gz tar -xvzf mediawiki-1.37.1.tar.gz cd /var/www/html ln -s ../mediawiki-1.37.1 ./w systemctl enable httpd.service systemctl enable memcached.service systemctl start httpd.service systemctl start memcached.service firewall-cmd --zone=public --add-service=http --permanent firewall-cmd --zone=public --add-service=https --permanent firewall-cmd --reload ### Certbot / Let's Encrypt # EPEL / snapd dnf config-manager --set-enabled powertools dnf install epel-release epel-next-release dnf install snapd systemctl enable --now snapd.socket ln -s /var/lib/snapd/snap /snap ### Setup vhost # httpd.conf ### Log out and back in to ensure snapd path # If the next step fails with "too early for operation, device not yet seeded or device model not acknowledged", restart snapd snap install core snap refresh core snap install --classic certbot # certbot certbot --apache # answer questions ==== Dell S4128T-ON Configuration # Terminal access using the serial port on the back of the switch (the front USB-B connector seems flaky) screen /dev/ttyUSB0 115200 u: admin p: admin # Interface numbering (management port on the rear is 'mgmt 1/1/1'); OS10# show interface status -------------------------------------------------------------------------------------------------- Port Description Status Speed Duplex Mode Vlan Tagged-Vlans
-------------------------------------------------------------------------------------------------- Eth 1/1/1 down 0 full A 1 - Eth 1/1/2 down 0 full A 1 - Eth 1/1/3 down 0 full A 1 - Eth 1/1/4 down 0 full A 1 - Eth 1/1/5 down 0 full A 1 - Eth 1/1/6 down 0 full A 1 - Eth 1/1/7 down 0 full A 1 - Eth 1/1/8 down 0 full A 1 - Eth 1/1/9 down 0 full A 1 - Eth 1/1/10 down 0 full A 1 - Eth 1/1/11 down 0 full A 1 - Eth 1/1/12 down 0 full A 1 - Eth 1/1/13 down 0 full A 1 - Eth 1/1/14 down 0 full A 1 - Eth 1/1/15 down 0 full A 1 - Eth 1/1/16 down 0 full A 1 - Eth 1/1/17 down 0 full A 1 - Eth 1/1/18 down 0 full A 1 - Eth 1/1/19 down 0 full A 1 - Eth 1/1/20 down 0 full A 1 - Eth 1/1/21 down 0 full A 1 - Eth 1/1/22 down 0 full A 1 - Eth 1/1/23 down 0 full A 1 - Eth 1/1/24 down 0 full A 1 - Eth 1/1/25 up 100G full A 1 - Eth 1/1/26 up 100G full A 1 - Eth 1/1/27 down 0 full A 1 - Eth 1/1/28 down 0 full A 1 - Eth 1/1/29 down 0 full A 1 - Eth 1/1/30 down 0 full A 1 - -------------------------------------------------------------------------------------------------- OS10# configure terminal OS10(config)# interface range ethernet 1/1/1-1/1/24 OS10(conf-range-eth1/1/1-1/1/24)# exit OS10(config)# interface range ethernet 1/1/1-1/1/24,1/1/27-1/1/30 OS10(conf-range-eth1/1/1-1/1/24,1/1/27-1/1/30)# exit # Configure management IP address OS10# configure terminal OS10(config)# interface mgmt 1/1/1 OS10(conf-if-ma-1/1/1)# no ip address dhcp OS10(conf-if-ma-1/1/1)# ip address OS10(conf-if-ma-1/1/1)# <165>1 2021-04-02T12:23:40.141901+00:00 OS10 dn_alm 652 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IP_ADDRESS_ADD: IP Address add is successful. 
IP in VRF:default added successfully OS10(conf-if-ma-1/1/1)# no shutdown # Connected via SSH to confirm access: OS10(config)# <165>1 2021-04-02T12:25:49.956308+00:00 OS10 dn_alm 652 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %ALM_AUTH_EVENT: Authentication event was raised MESSAGE=pam_unix(sshd:session): session opened for user admin by (uid=0) <86>1 2021-04-02T12:25:49.501860+00:00 OS10 sshd 6512 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) pam_unix(sshd:session): session opened for user admin by (uid=0) <86>1 2021-04-02T12:25:51.795620+00:00 OS10 sshd 6527 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) pam_unix(sshd:session): session opened for user admin by (uid=0) <165>1 2021-04-02T12:25:51.957630+00:00 OS10 dn_alm 652 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %ALM_AUTH_EVENT: Authentication event was raised MESSAGE=pam_unix(sshd:session): session opened for user admin by (uid=0) OS10(config)# exit OS10# write memory ### Set hostname: OS10# configure terminal OS10(config)# hostname an-switch02 an-switch02(config)# ======] VLT Config [======= ### Stacking is not a thing anymore, but VLT is its replacement. # On both switches; OS10# configure terminal OS10(config)# interface range ethernet 1/1/25-1/1/26 OS10(conf-range-eth1/1/25-1/1/26)# no shutdown OS10(conf-range-eth1/1/25-1/1/26)# no switchport <165>1 2021-04-02T11:47:05.731264+00:00 OS10 dn_alm 652 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %STP_ROOT_CHANGE: STP:Root Brg Chg RPVST root changed for vlan 1. 
<165>1 2021-04-02T11:47:05.739950+00:00 OS10 dn_alm 652 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %STP_ROOT_CHANGE: STP:Root Brg Chg My ID:f0d4.e250.cb8b OldRt:32769:f0d4.e250.c00b NewRt:32769:f0d4.e250.cb8b OS10(conf-range-eth1/1/25-1/1/26)# exit OS10(config)# vlt-domain 1 OS10(conf-vlt-1)# discovery-interface ethernet 1/1/25-1/1/26 # When this is done to the second switch; <165>1 2021-04-02T12:31:11.702525+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_ASTATE_UP: Interface admin state up :port-channel1000 <165>1 2021-04-02T12:31:11.707501+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_DN: Interface operational state is down :port-channel1000 <165>1 2021-04-02T12:31:11.720332+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_UP: Interface operational state is up :port-channel1000 <165>1 2021-04-02T12:31:11.845070+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_UP: Interface operational state is up :vlan4094 <165>1 2021-04-02T12:31:11.863978+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %VLT_PEER_UP: VLT unit 1 is up <165>1 2021-04-02T12:31:18.996716+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %VLT_ELECTION_ROLE: VLT unit 1 is elected as secondary <165>1 2021-04-02T12:31:19.087695+00:00 OS10 dn_alm 803 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_UP: Interface operational state is up :vlan1 # Configure the same MAC address to the VLT on both switches: OS10# configure terminal OS10(config)# vlt-domain 1 OS10(conf-vlt-1)# vlt-mac 00:00:00:00:00:02 # Set once per VLT domain, not per switch OS10# show vlt 1 mismatch (If no issues, VLT is OK) # See how I am and my role (* == switch you're on) an-switch02(config)# show vlt 1 role VLT Unit ID Role ------------------------ * 1 secondary 2 primary =====] VLAN Config [======== OS10# configure terminal OS10(config)# interface mgmt 1/1/1 
OS10(conf-if-ma-1/1/1)# no ip address dhcp OS10(conf-if-ma-1/1/1)# ip address OS10(conf-if-ma-1/1/1)# no shutdown OS10(conf-if-ma-1/1/1)# exit OS10(config)# write memory OS10(config)# hostname an-switch01 an-switch01(config)# interface vlan 100 an-switch01(conf-if-vl-100)# description BCN1 an-switch01(conf-if-vl-100)# exit an-switch01(config)# interface range ethernet 1/1/1-1/1/10 an-switch01(conf-range-eth1/1/1-1/1/10)# switchport access vlan 100 an-switch01(conf-range-eth1/1/1-1/1/10)# no shutdown an-switch01(conf-range-eth1/1/1-1/1/10)# exit an-switch01(config)# interface vlan 200 an-switch01(conf-if-vl-200)# description SN1 an-switch01(conf-if-vl-200)# exit an-switch01(config)# interface range ethernet 1/1/11-1/1/14 an-switch01(conf-range-eth1/1/11-1/1/14)# switchport access vlan 200 an-switch01(conf-range-eth1/1/11-1/1/14)# no shutdown an-switch01(conf-range-eth1/1/11-1/1/14)# exit an-switch01(config)# interface vlan 300 an-switch01(conf-if-vl-300)# description IFN1 an-switch01(conf-if-vl-300)# exit an-switch01(config)# interface range ethernet 1/1/15-1/1/24 an-switch01(conf-range-eth1/1/15-1/1/24)# switchport access vlan 300 an-switch01(conf-range-eth1/1/15-1/1/24)# no shutdown an-switch01(conf-range-eth1/1/15-1/1/24)# exit an-switch01(config)# show vlan Codes: * - Default VLAN, M - Management VLAN, R - Remote Port Mirroring VLANs, @ - Attached to Virtual Network, P - Primary, C - Community, I - Isolated Q: A - Access (Untagged), T - Tagged NUM Status Description Q Ports * 1 Active A Eth1/1/27-1/1/30 A Po1000 100 Active BCN1 T Po1000 A Eth1/1/1-1/1/10 200 Active SN1 T Po1000 A Eth1/1/11-1/1/14 300 Active IFN1 T Po1000 A Eth1/1/15-1/1/24 4094 Active T Po1000 an-switch01(config)# write memory ### Delete a VLAN: an-switch02(config)# no interface vlan 3400 an-switch02(config)# show vlan === Firmware Update === Download the firmware from the Dell digital locker.
Once downloaded, extracted and the sum verified, copy the 'PKGS_OS10-Enterprise-xxx-installer-x86_64.bin' file to the '/var/lib/tftpboot/' directory on one of the strikers and start the dhcpd service. ### NOTE: Use the striker NOT connected to the switch being upgraded! If necessary, move the uplink interfaces off of the switch to be upgraded! Once ready, connect to the switch over the serial port. ### WARNING: The firmware update completely resets the switch! Back up your config beforehand, if necessary. Also, watch for the loss of VLANs causing switch loops when 2+ uplinks are connected! # screen /dev/ttyUSB0 115200 # Reboot the switches; reload When the grub boot menu appears, choose 'ONIE: Install OS'; +--------------------------------------------------------+ |*ONIE: Install OS | | ONIE: Rescue | | ONIE: Uninstall OS | | ONIE: Update ONIE | | ONIE: Embed ONIE | | ONIE: Diag ONIE | +--------------------------------------------------------+ # This will start a constant scan of IP addresses and local storage looking for the firmware. To stop this, type: ONIE:/ # onie-discovery-stop # Set the IP address so that we can talk to the striker with the firmware. ONIE:/ # ifconfig eth0 netmask up ONIE:/ # ping PING ( 56 data bytes 64 bytes from seq=0 ttl=64 time=0.247 ms 64 bytes from seq=1 ttl=64 time=0.506 ms ONIE:/ # onie-nos-install tftp:// discover: installer mode detected. Stopping: discover... done. Info: Fetching tftp:// ... <- This step takes a while PKGS_OS10-Enterprise 100% |*******************************| 744M 0:00:00 ETA ONIE: Executing installer: tftp:// Initializing installer ... OK Verifying image checksum ... OK OS10 Installer: machine: dellemc_s4100_c2338/s4128t Fixing up partitions ... OK Deleting logical volume CONFIG ... OK Deleting logical volume SYSROOT1 ... OK Deleting logical volume SYSROOT2 ... OK Creating logical volume SYSROOT ... OK Creating ext4 filesystem on SYSROOT ... OK Extracting OS10 ... OK Installing OS10 on primary volume ...
OK <- This takes a long time, be patient Setting up shared data ... OK Synchronizing standby image ... OS10 installation is complete. Creating ext4 filesystem on sda4 ... OK Installing GRUB-UEFI ... OK Saving system information ... OK Saving ONIE support information ... OK ONIE: NOS install successful: tftp:// ONIE: Rebooting... ONIE:/ # discover: installer mode detected. Stopping: discover...start-stop-daemon: warning: killing process 2881: No such process done. Stopping: dropbear ssh daemon... done. Stopping: telnetd... done. Stopping: syslogd... done. Info: Unmounting kernel filesystems umount: can't umount /: Invalid argument The system is going down NOW! Sent SIGTERM to all processes Sent SIGKILL tosd 4:0:0:0: [sda] Synchronizing SCSI cache reboot: Restarting system reboot: machine restart ### NOTE: The login prompt will appear before the system is ready to log in. The default username and password revert to 'admin' / 'admin', but this won't work for the first couple of minutes. ## OLD an-switch02# show version Dell EMC Networking OS10 Enterprise Copyright (c) 1999-2020 by Dell Inc. All Rights Reserved. OS Version: Build Version: Build Time: 2020-01-30T21:08:56+0000 System Type: S4128T-ON Architecture: x86_64 Up Time: 22:49:57 an-switch02# ## New an-a01n01# show version Dell EMC Networking OS10 Enterprise Copyright (c) 1999-2021 by Dell Inc. All Rights Reserved. OS Version: Build Version: Build Time: 2021-02-26T20:03:25+0000 System Type: S4128T-ON Architecture: x86_64 ==================================== Configure a pair of Dell N-Series switches (OS6) # Using the micro-USB serial interface; screen /dev/ttyUSB0 115200 console>enable console# # Assemble the stack Port 1 -> 2, 2 -> 1 The stack auto-forms. # Set management IP address console#configure terminal console(config)#interface out-of-band console(config-if)#ip address console(config-if)#exit # Connect the Management interface of the stack ID 1 switch to one of the BCN ports.
console#ping Pinging with 0 bytes of data: Reply From icmp_seq = 0. time= 640 usec. Reply From icmp_seq = 1. time= 558 usec. Reply From icmp_seq = 2. time= 224 usec. Reply From icmp_seq = 3. time= 192 usec. ---- PING statistics---- 4 packets transmitted, 4 packets received, 0% packet loss round-trip (msec) min/avg/max = 0/0/0 # Set a web user and password (obviously, use a better password) console(config)#no passwords min-length console(config)#username admin password admin privilege 15 # You should now be able to connect to the web interface # Show interfaces console(config)#show interfaces configuration Port Description Duplex Speed Neg MTU Admin State --------- ------------------------------ ------ ------- ---- ----- ----- Gi1/0/1 N/A Unknown Auto 1518 Up Gi1/0/2 N/A Unknown Auto 1518 Up Gi1/0/3 Full 100 Auto 1518 Up Gi1/0/4 Full 100 Auto 1518 Up Gi1/0/5 N/A Unknown Auto 1518 Up Gi1/0/6 N/A Unknown Auto 1518 Up Gi1/0/7 N/A Unknown Auto 1518 Up Gi1/0/8 N/A Unknown Auto 1518 Up Gi1/0/9 N/A Unknown Auto 1518 Up Gi1/0/10 Full 1000 Auto 1518 Up Gi1/0/11 N/A Unknown Auto 1518 Up Gi1/0/12 N/A Unknown Auto 1518 Up Gi1/0/13 N/A Unknown Auto 1518 Up Gi1/0/14 N/A Unknown Auto 1518 Up Gi1/0/15 N/A Unknown Auto 1518 Up Gi1/0/16 N/A Unknown Auto 1518 Up Gi1/0/17 N/A Unknown Auto 1518 Up Gi1/0/18 N/A Unknown Auto 1518 Up Gi1/0/19 N/A Unknown Auto 1518 Up Gi1/0/20 N/A Unknown Auto 1518 Up Gi1/0/21 N/A Unknown Auto 1518 Up Gi1/0/22 N/A Unknown Auto 1518 Up Gi1/0/23 N/A Unknown Auto 1518 Up Gi1/0/24 Full 1000 Auto 1518 Up Tw1/0/1 Full 25000 Off 1518 Up Tw1/0/2 Full 25000 Off 1518 Up Tw1/0/3 Full 25000 Off 1518 Up Tw1/0/4 Full 25000 Off 1518 Up Fo1/0/1 Full 40000 Off 1518 Up Fo1/0/2 Full 40000 Off 1518 Up Gi2/0/1 N/A Unknown Auto 1518 Up Gi2/0/2 N/A Unknown Auto 1518 Up Gi2/0/3 N/A Unknown Auto 1518 Up Gi2/0/4 N/A Unknown Auto 1518 Up Gi2/0/5 Full 100 Auto 1518 Up Gi2/0/6 Full 100 Auto 1518 Up Gi2/0/7 Full 1000 Auto 1518 Up Gi2/0/8 Full 1000 Auto 1518 Up Gi2/0/9 N/A Unknown Auto 
1518 Up Gi2/0/10 N/A Unknown Auto 1518 Up Gi2/0/11 N/A Unknown Auto 1518 Up Gi2/0/12 N/A Unknown Auto 1518 Up Gi2/0/13 N/A Unknown Auto 1518 Up Gi2/0/14 N/A Unknown Auto 1518 Up Gi2/0/15 N/A Unknown Auto 1518 Up Gi2/0/16 N/A Unknown Auto 1518 Up Gi2/0/17 N/A Unknown Auto 1518 Up Gi2/0/18 N/A Unknown Auto 1518 Up Gi2/0/19 N/A Unknown Auto 1518 Up Gi2/0/20 N/A Unknown Auto 1518 Up Gi2/0/21 N/A Unknown Auto 1518 Up Gi2/0/22 N/A Unknown Auto 1518 Up Gi2/0/23 N/A Unknown Auto 1518 Up Gi2/0/24 N/A Unknown Auto 1518 Up Tw2/0/1 Full 25000 Off 1518 Up Tw2/0/2 Full 25000 Off 1518 Up Tw2/0/3 Full 25000 Off 1518 Up Tw2/0/4 Full 25000 Off 1518 Up Fo2/0/1 Full 40000 Off 1518 Up Fo2/0/2 Full 40000 Off 1518 Up Oob Type Admin State --- ------------------------------ ----- oob Out-Of-Band Up # Configure VLANs rs-striker03(config)#vlan 100 rs-striker03(config-vlan100)#name BCN1 rs-striker03(config-vlan100)#interface range gi1/0/1-14,gi2/0/1-14 rs-striker03(config-if)#switchport access vlan 100 rs-striker03(config-if)#no shutdown rs-striker03(config-if)#exit #### NOTE: Put IP on VID 1! rs-switch03(config)#show vlan VLAN Name Ports Type ----- --------------- ------------- -------------- 1 default Po1-128, Default Gi1/0/1-12, Te1/0/1-4, Gi2/0/1-12, Te2/0/1-4 300 IFN1 Gi1/0/13-24, Static Gi2/0/13-24 rs-switch03(config)#interface vlan 1 rs-switch03(config-if-vlan1)#ip address rs-switch03(config-if-vlan1)#exit rs-switch03(config)#exit rs-switch03#copy running-config startup-config ########################### rs-striker03#show vlan VLAN Name Ports Type ----- --------------- ------------- -------------- 1 default Po1-128, Default Te1/0/1-4, Te2/0/1-4 100 BCN1 Gi1/0/1-12, Static Gi2/0/1-12 300 IFN1 Gi1/0/13-24, Static Gi2/0/13-24 rs-striker03#copy running-config startup-config This operation may take few minutes. Management interfaces will not be available during this time. Are you sure you want to save? (y/n) y Configuration Saved! 
rs-striker03# # Firmware update # Copy .stk file to striker's /var/lib/tftpboot/ directory console#copy tftp:// backup ==== Transfer Mode.................................. TFTP Server IP Address.............................. Source File Path............................... ./ Source Filename................................ N2200v6.6.3.10.stk Data Type...................................... Code Destination Filename........................... backup Management access will be blocked for the duration of the transfer Are you sure you want to start? (y/n) y File transfer in progress. Management access will be blocked for the duration of the transfer. Please wait... TFTP Code transfer starting... 40602956 bytes transferred Attempting to send the STK file to other units in the stack... File transfer operation completed successfully. ==== console#show version Machine Description............... Dell EMC Networking Switch System Model ID................... N2224X-ON Machine Type...................... Dell EMC Networking N2224X-ON Serial Number..................... TH0X621WCET000A800GV Manufacturer...................... 0xbc00 Burned In MAC Address............. 8C47.BE75.9D0F System Object ID.................. SOC Version....................... BCM56172_B0 HW Version........................ 2 CPLD Version...................... 260 Image File........................ N2200v6.6.1.1 Software Capability............... Stack Limit = 12, VLAN Limit = 4093 unit active backup current-active next-active ---- ----------- ----------- -------------- -------------- 1 2 console#boot system backup Activating image backup .. console#reload Are you sure you want to reload the stack? (y/n) y ==================================== -=] Rename a resource (ex: srv09-few-tcpremote1 -> srv09-fea-tcpremote1) 1. 
pacemaker - 1.1 - Record current settings: # pcs resource config srv09-few-tcpremote1 Resource: srv09-few-tcpremote1 (class=ocf provider=alteeve type=server) Attributes: name=srv09-few-tcpremote1 Meta Attrs: allow-migrate=true target-role=Stopped Operations: migrate_from interval=0s timeout=600 (srv09-few-tcpremote1-migrate_from-interval-0s) migrate_to interval=0s timeout=86400 (srv09-few-tcpremote1-migrate_to-interval-0s) monitor interval=60 (srv09-few-tcpremote1-monitor-interval-60) notify interval=0s timeout=20 (srv09-few-tcpremote1-notify-interval-0s) start interval=0s on-fail=block timeout=300 (srv09-few-tcpremote1-start-interval-0s) stop interval=0s on-fail=block timeout=86400 (srv09-few-tcpremote1-stop-interval-0s) 1.2 - Delete the resource ==================================== $body = $cgi->param('POSTDATA') # gives you the body of the request as a string, which you can then process as JSON ==================================== # Hosts added or updated by the Anvil! on: [2021/04/17 16:24:52]: Could not get property: Refusing activation, D-Bus is shutting down Could not get property: Refusing activation, D-Bus is shutting down. Could not get property: Refusing activation, D-Bus is shutting down.bcn1 Could not get property: Refusing activation, D-Bus is shutting down.ifn1 Could not get property: Refusing activation, D-Bus is shutting down.sn1 ausearch -c 'drbdsetup' --raw | audit2allow -M my-drbdsetup && semodule -X 300 -i my-drbdsetup.pp May 02 13:35:21 an-a01n02.zennioptical.com setroubleshoot[5333]: SELinux is preventing /usr/sbin/drbdsetup from create access on the netlink_generic_socket labeled drbd_t. For complete SELinux messages run: sealert -l 4079c288-db4a-4f44-a588-94f1dbfff269 May 02 13:35:21 an-a01n02.zennioptical.com setroubleshoot[5333]: SELinux is preventing /usr/sbin/drbdsetup from create access on the netlink_generic_socket labeled drbd_t. 
If you believe that drbdsetup should be allowed create access on netlink_generic_socket labeled drbd_t by default. # ausearch -c 'drbdsetup' --raw | audit2allow -M my-drbdsetup # semodule -X 300 -i my-drbdsetup.pp If you believe that virsh should be allowed read access on the srv16-an-psql-qa.xml file by default. # ausearch -c 'virsh' --raw | audit2allow -M my-virsh # semodule -X 300 -i my-virsh.pp ==================================== ### Edit corosync.conf to add 'token' and the node names on one node # diff -u /root/corosync.conf /etc/corosync/corosync.conf --- /root/corosync.conf 2021-07-06 15:01:18.956703529 -0400 +++ /etc/corosync/corosync.conf 2021-07-06 15:15:23.438494607 -0400 @@ -4,17 +4,20 @@ transport: knet crypto_cipher: aes256 crypto_hash: sha256 + token: 10000 } nodelist { node { - ring0_addr: an-a02n01 + ring0_addr: an-a02n01.bcn1 + ring1_addr: an-a02n01.sn1 name: an-a02n01 nodeid: 1 } node { - ring0_addr: an-a02n02 + ring0_addr: an-a02n02.bcn1 + ring1_addr: an-a02n02.sn1 name: an-a02n02 nodeid: 2 } ### Update the peer with the new corosync.conf and then tell pacemaker to reload pcs cluster sync pcs cluster reload corosync ### Verify; corosync-cmapctl | grep -e totem.token -e knet runtime.config.totem.interface.0.knet_ping_interval (u32) = 2500 runtime.config.totem.interface.0.knet_ping_timeout (u32) = 5000 runtime.config.totem.interface.1.knet_ping_interval (u32) = 2500 runtime.config.totem.interface.1.knet_ping_timeout (u32) = 5000 runtime.config.totem.knet_compression_level (i32) = 0 runtime.config.totem.knet_compression_model (str) = none runtime.config.totem.knet_compression_threshold (u32) = 0 runtime.config.totem.knet_pmtud_interval (u32) = 30 runtime.config.totem.token (u32) = 10000 runtime.config.totem.token_retransmit (u32) = 2380 runtime.config.totem.token_retransmits_before_loss_const (u32) = 4 runtime.config.totem.token_warning (u32) = 75 totem.token (u32) = 10000 totem.transport (str) = knet ==================================== Gi1/0/24 
+ Gi2/0/24 Dell LACP Config (OS10 - https://www.dell.com/support/kbdoc/en-us/000102901/dell-emc-networking-os10-how-to-set-up-virtual-link-trunking-vlt) * On both switches; an-switch02# configure terminal * IFN Port channel is 3 an-switch02(config)# interface port-channel 3 an-switch02(conf-if-po-3)# <165>1 2021-10-19T04:58:56.022086+00:00 an-switch02 dn_alm 920 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_ASTATE_UP: Interface admin state up :port-channel3 <165>1 2021-10-19T04:58:56.022722+00:00 an-switch02 dn_alm 920 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_DN: Interface operational state is down :port-channel3 an-switch02(conf-if-po-3)# lacp fallback enable an-switch02(conf-if-po-3)# description IFN1 an-switch02(conf-if-po-3)# exit an-switch02(config)# exit an-switch02# show port-channel summary Flags: D - Down I - member up but inactive P - member up and active U - Up (port-channel) F - Fallback Activated -------------------------------------------------------------------------------- Group Port-Channel Type Protocol Member Ports -------------------------------------------------------------------------------- 3 port-channel3 (D) Eth STATIC 1000 port-channel1000 (U) Eth STATIC 1/1/25(P) 1/1/26(P) an-switch02# configure terminal an-switch02(config)# interface ethernet 1/1/24 an-switch02(conf-if-eth1/1/24)# channel-group 3 an-switch02(conf-if-eth1/1/24)# <165>1 2021-10-19T05:09:41.237808+00:00 an-switch02 dn_alm 920 - - Node.1-Unit.1:PRI [event], Dell EMC (OS10) %IFM_OSTATE_UP: Interface operational state is up :port-channel3 exit exit