From 7dfaf243b205cd94dcc4db7eca08fdf698172320 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Thu, 2 Apr 2026 13:33:40 +0100 Subject: [PATCH 01/15] Fix test so it is not affected by environment --- python/understack-workflows/tests/test_bmc.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/understack-workflows/tests/test_bmc.py b/python/understack-workflows/tests/test_bmc.py index 122432d50..6f237808f 100644 --- a/python/understack-workflows/tests/test_bmc.py +++ b/python/understack-workflows/tests/test_bmc.py @@ -12,9 +12,15 @@ def mock_creds(mocker): return mock -def test_bmc_for_ip_address(mock_creds): +def test_bmc_for_ip_address(mock_creds, monkeypatch): + # The credential function above is overridden by the environment variable so + # make sure it is removed, so we are using the correct test credential and + # not a production one. + monkeypatch.delenv("BMC_MASTER", raising=False) assert os.getenv("BMC_MASTER") is None + bmc = bmc_for_ip_address("1.2.3.4") + assert bmc.ip_address == "1.2.3.4" assert bmc.url() == "https://1.2.3.4" assert bmc.username == "root" From 33aafdaed6f93e8e31ba89e5bbfdf19f345b2077 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Thu, 2 Apr 2026 13:35:37 +0100 Subject: [PATCH 02/15] Simplify PXE NIC selection algorithm and allow for more edge cases User can pass a list of known switches to prefer. --- .../tests/test_pxe_port_heuristic.py | 74 ++++++++++-- .../pxe_port_heuristic.py | 110 +++++++++++------- 2 files changed, 135 insertions(+), 49 deletions(-) diff --git a/python/understack-workflows/tests/test_pxe_port_heuristic.py b/python/understack-workflows/tests/test_pxe_port_heuristic.py index fa2f06084..37bc96336 100644 --- a/python/understack-workflows/tests/test_pxe_port_heuristic.py +++ b/python/understack-workflows/tests/test_pxe_port_heuristic.py @@ -63,13 +63,73 @@ def test_connected_is_better(): memory_gib=0, cpu=x, interfaces=[ - InterfaceInfo("iDRAC", x, x, remote_switch_port_name=x), - InterfaceInfo("NIC.Embedded.1-1", x, x, remote_switch_port_name=x), - InterfaceInfo("NIC.Embedded.1-1", x, x, remote_switch_port_name=x), - InterfaceInfo("NIC.Integrated.1-1", x, x, remote_switch_port_name=None), - InterfaceInfo("NIC.Integrated.1-2", x, x, remote_switch_port_name=None), - InterfaceInfo("NIC.Slot.1-1", x, x, remote_switch_port_name=None), - InterfaceInfo("NIC.Slot.1-2", x, x, remote_switch_port_name=x), + InterfaceInfo("iDRAC", x, x, remote_switch_mac_address=x), + InterfaceInfo("NIC.Embedded.1-1", x, x, remote_switch_mac_address=x), + InterfaceInfo("NIC.Embedded.1-1", x, x, remote_switch_mac_address=x), + InterfaceInfo("NIC.Integrated.1-1", x, x, remote_switch_mac_address=None), + InterfaceInfo("NIC.Integrated.1-2", x, x, remote_switch_mac_address=None), + InterfaceInfo("NIC.Slot.1-1", x, x, remote_switch_mac_address=None), + InterfaceInfo("NIC.Slot.1-2", x, x, remote_switch_mac_address=x), ], ) assert guess_pxe_interface(device_info) == "NIC.Slot.1-2" + + +def test_connected_macs_picks_first(): + any_mac = "00:00:00:00:00:00" + pxe_mac = "11:22:33:44:55:66" + x = "test" + device_info = ChassisInfo( + manufacturer=x, + model_number=x, + serial_number=x, + bmc_ip_address=x, + bios_version=x, + power_on=False, + memory_gib=0, + cpu=x, + interfaces=[ + InterfaceInfo("iDRAC", x, x, remote_switch_mac_address=x), + InterfaceInfo("NIC.Embedded.1-1", x, x, remote_switch_mac_address=None), + InterfaceInfo("NIC.Embedded.1-1", x, x, remote_switch_mac_address=any_mac), + InterfaceInfo( + "NIC.Integrated.1-1", x, x, remote_switch_mac_address=any_mac + ), + InterfaceInfo( + "NIC.Integrated.1-2", x, x, remote_switch_mac_address=any_mac + ), + InterfaceInfo("NIC.Slot.1-1", x, x, remote_switch_mac_address=any_mac), + InterfaceInfo("NIC.Slot.1-2", x, x, remote_switch_mac_address=any_mac), + ], + ) + assert guess_pxe_interface(device_info, {pxe_mac}) == "NIC.Integrated.1-1" + + +def test_connected_to_known_pxe_is_best(): + any_mac = "00:00:00:00:00:00" + pxe_mac = "11:22:33:44:55:66" + x = "test" + device_info = ChassisInfo( + manufacturer=x, + model_number=x, + serial_number=x, + bmc_ip_address=x, + bios_version=x, + power_on=False, + memory_gib=0, + cpu=x, + interfaces=[ + InterfaceInfo("iDRAC", x, x, remote_switch_mac_address=x), + InterfaceInfo("NIC.Embedded.1-1", x, x, remote_switch_mac_address=None), + InterfaceInfo("NIC.Embedded.1-1", x, x, remote_switch_mac_address=any_mac), + InterfaceInfo( + "NIC.Integrated.1-1", x, x, remote_switch_mac_address=any_mac + ), + InterfaceInfo( + "NIC.Integrated.1-2", x, x, remote_switch_mac_address=any_mac + ), + InterfaceInfo("NIC.Slot.1-1", x, x, remote_switch_mac_address=pxe_mac), + InterfaceInfo("NIC.Slot.1-2", x, x, remote_switch_mac_address=pxe_mac), + ], + ) + assert guess_pxe_interface(device_info, {pxe_mac}) == "NIC.Slot.1-1" diff --git a/python/understack-workflows/understack_workflows/pxe_port_heuristic.py b/python/understack-workflows/understack_workflows/pxe_port_heuristic.py index 5d0047d69..02cffa997 100644 --- a/python/understack-workflows/understack_workflows/pxe_port_heuristic.py +++ b/python/understack-workflows/understack_workflows/pxe_port_heuristic.py @@ -1,55 +1,81 @@ +import logging + from understack_workflows.bmc_chassis_info import ChassisInfo -from understack_workflows.bmc_chassis_info import InterfaceInfo + +logger = logging.getLogger(__name__) # We try not choose interface whose name contains any of these: DISQUALIFIED = ["DRAC", "ILO", "NIC.EMBEDDED"] -# A higher number is more likely to be PXE interface: -NIC_PREFERENCE = { - "NIC.Integrated.1-1-1": 100, - "NIC.Integrated.1-1": 99, - "NIC.Slot.1-1-1": 98, - "NIC.Slot.1-1": 97, - "NIC.Integrated.1-2-1": 96, - "NIC.Integrated.1-2": 95, - "NIC.Slot.1-2-1": 94, - "NIC.Slot.1-2": 93, - "NIC.Slot.1-3-1": 92, - "NIC.Slot.1-3": 91, - "NIC.Slot.2-1-1": 90, - "NIC.Slot.2-1": 89, - "NIC.Integrated.2-1-1": 88, - "NIC.Integrated.2-1": 87, - "NIC.Slot.2-2-1": 86, - "NIC.Slot.2-2": 85, - "NIC.Integrated.2-2-1": 84, - "NIC.Integrated.2-2": 83, - "NIC.Slot.3-1-1": 82, - "NIC.Slot.3-1": 81, - "NIC.Slot.3-2-1": 80, - "NIC.Slot.3-2": 79, -} - - -def guess_pxe_interface(device_info: ChassisInfo) -> str: - """Determine most probable PXE interface for BMC.""" - interface = max(device_info.interfaces, key=_pxe_preference) - return interface.name +# Preferred interfaces, most likely first +NIC_PREFERENCE = [ + "NIC.Integrated.1-1-1", + "NIC.Integrated.1-1", + "NIC.Slot.1-1-1", + "NIC.Slot.1-1", + "NIC.Integrated.1-2-1", + "NIC.Integrated.1-2", + "NIC.Slot.1-2-1", + "NIC.Slot.1-2", + "NIC.Slot.1-3-1", + "NIC.Slot.1-3", + "NIC.Slot.2-1-1", + "NIC.Slot.2-1", + "NIC.Integrated.2-1-1", + "NIC.Integrated.2-1", + "NIC.Slot.2-2-1", + "NIC.Slot.2-2", + "NIC.Integrated.2-2-1", + "NIC.Integrated.2-2", + "NIC.Slot.3-1-1", + "NIC.Slot.3-1", + "NIC.Slot.3-2-1", + "NIC.Slot.3-2", +] -def _pxe_preference(interface: InterfaceInfo) -> list: - """Relative likelihood that interface is used for PXE. +def guess_pxe_interface( + device_info: ChassisInfo, pxe_switch_macs: set[str] | None = None +) -> str: + """Determine most probable PXE interface for BMC.""" + if pxe_switch_macs is None: + pxe_switch_macs = set() - Prefer names that are not disqualified. + candidate_interfaces = { + i + for i in device_info.interfaces + if not any(q in i.name.upper() for q in DISQUALIFIED) + } + candidate_interface_names = {i.name for i in candidate_interfaces} + connected_interface_names = { + i.name for i in candidate_interfaces if i.remote_switch_mac_address + } + pxe_connected_interface_names = { + i.name + for i in candidate_interfaces + if i.remote_switch_mac_address in pxe_switch_macs + } - After that, prefer interfaces that have an LLDP neighbor. + for name in NIC_PREFERENCE: + if name in pxe_connected_interface_names: + logger.info("PXE port is %s, connected to known pxe switch", name) + return name - Finally, score the interface name according to the list above. - """ - is_eligible = not any(x in interface.name.upper() for x in DISQUALIFIED) + for name in NIC_PREFERENCE: + if name in connected_interface_names: + logger.info("PXE port is %s, the first connected interface", name) + return name - link_detected = interface.remote_switch_port_name is not None + for name in NIC_PREFERENCE: + if name in candidate_interface_names: + logger.info("PXE port is %s, preferred eligible interface", name) + return name - name_preference = NIC_PREFERENCE.get(interface.name, 0) + if candidate_interface_names: + name = min(candidate_interface_names) + logger.info("PXE port is %s, first eligible interface", name) + return name - return [is_eligible, link_detected, name_preference] + name = device_info.interfaces[0].name + logger.info("PXE port is %s, chosen as last resort", name) + return name From eee9f853a5f29b07aedd7180430a5c8d967e5175 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Thu, 2 Apr 2026 13:38:50 +0100 Subject: [PATCH 03/15] Handle case where BIOS setting changes have already been staged --- .../tests/test_bmc_bios.py | 65 +++++++++++++++++++ .../understack_workflows/bmc_bios.py | 65 +++++++++++++++++-- 2 files changed, 124 insertions(+), 6 deletions(-) create mode 100644 python/understack-workflows/tests/test_bmc_bios.py diff --git a/python/understack-workflows/tests/test_bmc_bios.py b/python/understack-workflows/tests/test_bmc_bios.py new file mode 100644 index 000000000..7114c9a4f --- /dev/null +++ b/python/understack-workflows/tests/test_bmc_bios.py @@ -0,0 +1,65 @@ +from understack_workflows import bmc_bios + + +def test_update_dell_bios_settings_skips_patch_when_desired_values_are_pending(mocker): + bmc = mocker.Mock() + bmc.system_path = "/redfish/v1/Systems/System.Embedded.1" + bmc.redfish_request.side_effect = [ + { + "Attributes": { + "PxeDev1EnDis": "Disabled", + "PxeDev1Interface": "NIC.Slot.1-1", + "HttpDev1EnDis": "Disabled", + "HttpDev1Interface": "NIC.Slot.1-1", + "HttpDev1TlsMode": "TLS", + "TimeZone": "Local", + "OS-BMC.1.AdminState": "Enabled", + "IPMILan.1.Enable": "Enabled", + } + }, + {"Attributes": bmc_bios.required_bios_settings("NIC.Embedded.1-1-1")}, + ] + patch_bios_settings = mocker.patch.object(bmc_bios, "patch_bios_settings") + + result = bmc_bios.update_dell_bios_settings(bmc, "NIC.Embedded.1-1-1") + + assert result == {} + patch_bios_settings.assert_not_called() + + +def test_update_dell_bios_settings_only_patches_settings_not_already_pending(mocker): + bmc = mocker.Mock() + bmc.system_path = "/redfish/v1/Systems/System.Embedded.1" + bmc.redfish_request.side_effect = [ + { + "Attributes": { + "PxeDev1EnDis": "Enabled", + "PxeDev1Interface": "NIC.Slot.1-1", + "HttpDev1EnDis": "Disabled", + "HttpDev1Interface": "NIC.Slot.1-1", + "HttpDev1TlsMode": "TLS", + "TimeZone": "UTC", + "OS-BMC.1.AdminState": "Enabled", + "IPMILan.1.Enable": "Disabled", + } + }, + { + "Attributes": { + "PxeDev1EnDis": "Enabled", + "PxeDev1Interface": "NIC.Embedded.1-1-1", + "TimeZone": "UTC", + "IPMILan.1.Enable": "Disabled", + } + }, + ] + patch_bios_settings = mocker.patch.object(bmc_bios, "patch_bios_settings") + + result = bmc_bios.update_dell_bios_settings(bmc, "NIC.Embedded.1-1-1") + + assert result == { + "HttpDev1EnDis": "Enabled", + "HttpDev1Interface": "NIC.Embedded.1-1-1", + "HttpDev1TlsMode": "None", + "OS-BMC.1.AdminState": "Disabled", + } + patch_bios_settings.assert_called_once_with(bmc, result) diff --git a/python/understack-workflows/understack_workflows/bmc_bios.py b/python/understack-workflows/understack_workflows/bmc_bios.py index e087daa92..6dc1b0196 100644 --- a/python/understack-workflows/understack_workflows/bmc_bios.py +++ b/python/understack-workflows/understack_workflows/bmc_bios.py @@ -29,6 +29,50 @@ def required_bios_settings(pxe_interface: str) -> dict: } +def required_change_for_bios_setting( + key: str, + required_value: str, + current_settings: dict, + pending_settings: dict, +) -> str | None: + active_value = current_settings.get(key) + pending_value = pending_settings.get(key) + + if active_value is None: + logger.debug("X - BIOS has no %s setting", key) + return None + + if pending_value == required_value: + logger.debug( + "[✓] %s currently %r but already pending update to %r", + key, + active_value, + required_value, + ) + return None + + if pending_value is not None: + logger.debug( + "⚠ - %s should be set to %r but with pending update to %r, updating", + key, + required_value, + pending_value, + ) + return required_value + + if active_value == required_value: + logger.debug("✓ - %s already set to %r", key, required_value) + return None + + logger.debug( + "→ - %s is currently %r, setting to %r", + key, + active_value, + required_value, + ) + return required_value + + def update_dell_bios_settings(bmc: Bmc, pxe_interface: str) -> dict: """Check and update BIOS settings to standard as required. @@ -37,16 +81,25 @@ def update_dell_bios_settings(bmc: Bmc, pxe_interface: str) -> dict: Returns the changes that were made """ current_settings = bmc.redfish_request(bmc.system_path + "/Bios")["Attributes"] + pending_settings = bmc.redfish_request(bmc.system_path + "/Bios/Settings").get( + "Attributes", {} + ) required_settings = required_bios_settings(pxe_interface) - required_changes = { - k: v - for k, v in required_settings.items() - if (k in current_settings and current_settings[k] != v) - } + logger.info("%s Checking BIOS settings", bmc) + required_changes = {} + for key, required_value in required_settings.items(): + required_change = required_change_for_bios_setting( + key, + required_value, + current_settings, + pending_settings, + ) + if required_change is None: + continue + required_changes[key] = required_change if required_changes: - logger.info("%s Updating BIOS settings: %s", bmc, required_changes) patch_bios_settings(bmc, required_changes) logger.info("%s BIOS settings will be updated on next server boot.", bmc) else: From 11e1b2e3ce53907defc81c66be5e0236fc357542 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Thu, 2 Apr 2026 13:43:26 +0100 Subject: [PATCH 04/15] Move workflow decisions and actions into the enrol python script The python script is "beefed up" with extra capabilities and it now handles the transitioning of the node through the various stages, including configuring RAID, running inspection, etc. Logging and error handling is improved. Ability to cope with non-standard cabling is improved slightly - we used to rely completely on a cabling convention to determine which port should be used for PXE. We now detect that from the LLDP data reported by the BMC (when available - note that we have hardware where the LLDP feature does not work). This provides us with the chassis MAC address of the connected switch. Until we can find an easy way to look this up, we allow the user to pass one or more PXE switch MACs on the command line. --- .../tests/test_enrol_server.py | 411 ++++++++++++++++++ .../understack_workflows/bmc_chassis_info.py | 20 + .../understack_workflows/bmc_settings.py | 2 +- .../understack_workflows/ironic/client.py | 32 ++ .../understack_workflows/ironic_node.py | 200 +++++++-- .../main/enroll_server.py | 257 +++++++++-- .../workflowtemplates/enroll-server.yaml | 231 +--------- 7 files changed, 869 insertions(+), 284 deletions(-) create mode 100644 python/understack-workflows/tests/test_enrol_server.py diff --git a/python/understack-workflows/tests/test_enrol_server.py b/python/understack-workflows/tests/test_enrol_server.py new file mode 100644 index 000000000..27f473cdb --- /dev/null +++ b/python/understack-workflows/tests/test_enrol_server.py @@ -0,0 +1,411 @@ +from types import SimpleNamespace +from typing import cast +from unittest.mock import MagicMock +from unittest.mock import call + +from ironicclient.common.apiclient import exceptions as ironic_exceptions +from ironicclient.v1.node import Node + +from understack_workflows.bmc_chassis_info import ChassisInfo +from understack_workflows.bmc_chassis_info import InterfaceInfo +from understack_workflows.main import enroll_server + + +def make_device_info( + *, + power_on: bool, + connected_mac: str | None = None, + serial_number: str = "ABC123", +) -> ChassisInfo: + return ChassisInfo( + manufacturer="Dell", + model_number="R760", + serial_number=serial_number, + bmc_ip_address="10.0.0.10", + bios_version="1.0.0", + power_on=power_on, + memory_gib=64, + cpu="Xeon", + interfaces=[ + InterfaceInfo("iDRAC", "bmc", "00:00:00:00:00:01"), + InterfaceInfo( + "NIC.Integrated.1-1", + "PXE NIC", + "00:00:00:00:00:02", + remote_switch_mac_address=connected_mac, + remote_switch_port_name="Ethernet1/1" if connected_mac else None, + ), + ], + ) + + +def make_bmc(mocker, fake_sushy=None): + bmc = mocker.MagicMock() + bmc.ip_address = "10.0.0.10" + bmc.username = "root" + bmc.password = "calvin" + bmc.url.return_value = "https://10.0.0.10" + bmc.get_system_path.return_value = "/redfish/v1/Systems/System.Embedded.1" + if fake_sushy is not None: + bmc.sushy.return_value = fake_sushy + return bmc + + +def make_raid_hardware(): + controller = SimpleNamespace( + identity="RAID.Integrated.1-1", + drives=[SimpleNamespace(identity="Disk1"), SimpleNamespace(identity="Disk2")], + ) + system = SimpleNamespace(storage=SimpleNamespace(get_members=lambda: [controller])) + return SimpleNamespace( + get_system_collection=lambda: SimpleNamespace(get_members=lambda: [system]) + ) + + +def make_ironic_client( + *, + node_name: str, + node_uuid: str = "node-123", + existing_node=None, + inspect_interfaces: list[str] | None = None, + traits: list[str] | None = None, + runbook_ids: dict[str, str] | None = None, +): + if inspect_interfaces is None: + inspect_interfaces = ["idrac-redfish", "idrac-redfish"] + if traits is None: + traits = [] + if runbook_ids is None: + runbook_ids = {} + + created_node = SimpleNamespace( + uuid=node_uuid, + provision_state="enroll", + driver="idrac", + inspect_interface="idrac-redfish", + ) + inspect_interface_iter = iter(inspect_interfaces) + fake_client = MagicMock() + fake_client.node.api.runbook.get.side_effect = lambda trait: SimpleNamespace( + uuid=runbook_ids[trait] + ) + fake_client.node.create.return_value = created_node + fake_client.node.get_traits.return_value = traits + + def get_node(ident, fields=None): + if ident == node_name and fields is None: + if existing_node is None: + raise ironic_exceptions.NotFound() + return existing_node + if ident == node_uuid and fields == ["driver"]: + return SimpleNamespace(driver="idrac") + if ident == node_uuid and fields == ["inspect_interface"]: + return SimpleNamespace(inspect_interface=next(inspect_interface_iter)) + raise AssertionError( + f"Unexpected node lookup ident={ident!r} fields={fields!r}" + ) + + fake_client.node.get.side_effect = get_node + return fake_client, created_node + + +def test_enrol_happy_path_uses_real_ironic_workflow(mocker): + initial_info = make_device_info(power_on=False, connected_mac=None) + powered_info = make_device_info( + power_on=True, + connected_mac="AA:BB:CC:DD:EE:FF", + ) + fake_bmc = make_bmc(mocker, fake_sushy=make_raid_hardware()) + fake_ironic, created_node = make_ironic_client( + node_name="Dell-ABC123", + inspect_interfaces=["idrac-redfish", "idrac-redfish"], + ) + + mocker.patch.object(enroll_server, "bmc_for_ip_address", return_value=fake_bmc) + mocker.patch.object( + enroll_server, + "chassis_info", + side_effect=[initial_info, powered_info], + ) + set_bmc_password = mocker.patch.object(enroll_server, "set_bmc_password") + update_dell_drac_settings = mocker.patch.object( + enroll_server, "update_dell_drac_settings" + ) + bmc_set_hostname = mocker.patch.object(enroll_server, "bmc_set_hostname") + update_dell_bios_settings = mocker.patch.object( + enroll_server, "update_dell_bios_settings" + ) + sleep = mocker.patch.object(enroll_server.time, "sleep") + mocker.patch( + "understack_workflows.ironic.client.get_ironic_client", + return_value=fake_ironic, + ) + + enroll_server.enrol( + ip_address="10.0.0.10", + firmware_update=False, + raid_configure=True, + pxe_switch_macs={"AA:BB:CC:DD:EE:FF"}, + old_password="old-password", + external_cmdb_id="cmdb-1", + ) + + set_bmc_password.assert_called_once_with( + ip_address="10.0.0.10", + new_password="calvin", + old_password="old-password", + ) + update_dell_drac_settings.assert_called_once_with(fake_bmc) + bmc_set_hostname.assert_called_once_with(fake_bmc, "None", "Dell-ABC123") + update_dell_bios_settings.assert_called_once_with( + fake_bmc, + pxe_interface="NIC.Integrated.1-1", + ) + sleep.assert_called_once_with(120) + + fake_bmc.redfish_request.assert_called_once_with( + path="/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset", + payload={"ResetType": "On"}, + method="POST", + ) + fake_ironic.node.create.assert_called_once_with( + name="Dell-ABC123", + driver="idrac", + driver_info={ + "redfish_address": "https://10.0.0.10", + "redfish_verify_ca": False, + "redfish_username": "root", + "redfish_password": "calvin", + }, + boot_interface="http-ipxe", + inspect_interface="idrac-redfish", + extra={ + "external_cmdb_id": "cmdb-1", + "enrolled_pxe_port": "NIC.Integrated.1-1", + }, + ) + fake_ironic.node.set_target_raid_config.assert_called_once_with( + created_node.uuid, + { + "logical_disks": [ + { + "controller": "RAID.Integrated.1-1", + "is_root_volume": True, + "physical_disks": ["Disk1", "Disk2"], + "raid_level": "1", + "size_gb": "MAX", + } + ] + }, + ) + fake_ironic.node.set_provision_state.assert_has_calls( + [ + call( + created_node.uuid, + "manage", + cleansteps=None, + runbook=None, + disable_ramdisk=None, + ), + call( + created_node.uuid, + "clean", + cleansteps=[ + {"interface": "raid", "step": "delete_configuration"}, + {"interface": "raid", "step": "create_configuration"}, + ], + runbook=None, + disable_ramdisk=True, + ), + call( + created_node.uuid, + "inspect", + cleansteps=None, + runbook=None, + disable_ramdisk=None, + ), + call( + created_node.uuid, + "inspect", + cleansteps=None, + runbook=None, + disable_ramdisk=None, + ), + call( + created_node.uuid, + "provide", + cleansteps=None, + runbook=None, + disable_ramdisk=None, + ), + ] + ) + fake_ironic.node.update.assert_called_once() + patch = fake_ironic.node.update.call_args.args[1] + assert patch == [{"op": "add", "path": "/inspect_interface", "value": "agent"}] + + +def test_enrol_existing_failed_node_recovers_and_updates(mocker): + device_info = make_device_info( + power_on=True, + connected_mac="AA:BB:CC:DD:EE:FF", + ) + existing_node = SimpleNamespace( + uuid="node-999", + provision_state="inspect failed", + driver="idrac", + inspect_interface="idrac-redfish", + ) + fake_bmc = make_bmc(mocker) + fake_ironic, _ = make_ironic_client( + node_name="Dell-ABC123", + node_uuid="node-999", + existing_node=existing_node, + inspect_interfaces=["idrac-redfish", "idrac-redfish"], + ) + + mocker.patch.object(enroll_server, "bmc_for_ip_address", return_value=fake_bmc) + mocker.patch.object(enroll_server, "chassis_info", return_value=device_info) + mocker.patch.object(enroll_server, "set_bmc_password") + mocker.patch.object(enroll_server, "update_dell_drac_settings") + mocker.patch.object(enroll_server, "bmc_set_hostname") + mocker.patch.object(enroll_server, "update_dell_bios_settings") + mocker.patch( + "understack_workflows.ironic.client.get_ironic_client", + return_value=fake_ironic, + ) + + enroll_server.enrol( + ip_address="10.0.0.10", + firmware_update=False, + raid_configure=False, + pxe_switch_macs={"AA:BB:CC:DD:EE:FF"}, + old_password=None, + external_cmdb_id="cmdb-1", + ) + + fake_ironic.node.create.assert_not_called() + fake_ironic.node.set_target_raid_config.assert_not_called() + fake_ironic.node.set_provision_state.assert_has_calls( + [ + call( + existing_node.uuid, + "manage", + cleansteps=None, + runbook=None, + disable_ramdisk=None, + ), + call( + existing_node.uuid, + "inspect", + cleansteps=None, + runbook=None, + disable_ramdisk=None, + ), + call( + existing_node.uuid, + "inspect", + cleansteps=None, + runbook=None, + disable_ramdisk=None, + ), + call( + existing_node.uuid, + "provide", + cleansteps=None, + runbook=None, + disable_ramdisk=None, + ), + ] + ) + update_patch = fake_ironic.node.update.call_args_list[0].args[1] + assert { + "op": "add", + "path": "/extra/enrolled_pxe_port", + "value": "NIC.Integrated.1-1", + } in update_patch + + +def test_apply_firmware_updates_runs_traits_in_numeric_order(mocker): + fake_ironic, _ = make_ironic_client( + node_name="unused", + node_uuid="node-789", + traits=[ + "CUSTOM_FIRMWARE_UPDATE_20_BIOS", + "CUSTOM_FIRMWARE_UPDATE_3_NIC", + "CUSTOM_FIRMWARE_UPDATE_100_STORAGE", + "CUSTOM_UNRELATED", + ], + runbook_ids={ + "CUSTOM_FIRMWARE_UPDATE_3_NIC": "runbook-3", + "CUSTOM_FIRMWARE_UPDATE_20_BIOS": "runbook-20", + "CUSTOM_FIRMWARE_UPDATE_100_STORAGE": "runbook-100", + }, + ) + mocker.patch( + "understack_workflows.ironic.client.get_ironic_client", + return_value=fake_ironic, + ) + node = cast(Node, SimpleNamespace(uuid="node-789")) + + enroll_server.ironic_node.apply_firmware_updates(node) + + fake_ironic.node.api.runbook.get.assert_has_calls( + [ + call("CUSTOM_FIRMWARE_UPDATE_3_NIC"), + call("CUSTOM_FIRMWARE_UPDATE_20_BIOS"), + call("CUSTOM_FIRMWARE_UPDATE_100_STORAGE"), + ] + ) + fake_ironic.node.set_provision_state.assert_has_calls( + [ + call( + "node-789", + "clean", + cleansteps=None, + runbook="runbook-3", + disable_ramdisk=None, + ), + call( + "node-789", + "clean", + cleansteps=None, + runbook="runbook-20", + disable_ramdisk=None, + ), + call( + "node-789", + "clean", + cleansteps=None, + runbook="runbook-100", + disable_ramdisk=None, + ), + ] + ) + + +def test_guess_pxe_interface_unknown_name_avoids_bmc_interface(): + device_info = ChassisInfo( + manufacturer="Dell", + model_number="R760", + serial_number="ABC123", + bmc_ip_address="1.2.3.4", + bios_version="1.0.0", + power_on=False, + memory_gib=0, + cpu="cpu", + interfaces=[ + InterfaceInfo("iDRAC", "bmc", "00:00:00:00:00:01"), + InterfaceInfo( + "NIC.Custom.9-1", + "custom nic", + "00:00:00:00:00:02", + remote_switch_mac_address="AA:BB:CC:DD:EE:FF", + ), + ], + ) + + assert ( + enroll_server.guess_pxe_interface(device_info, {"AA:BB:CC:DD:EE:FF"}) + == "NIC.Custom.9-1" + ) diff --git a/python/understack-workflows/understack_workflows/bmc_chassis_info.py b/python/understack-workflows/understack_workflows/bmc_chassis_info.py index 6f773aef2..0dd5e4da3 100644 --- a/python/understack-workflows/understack_workflows/bmc_chassis_info.py +++ b/python/understack-workflows/understack_workflows/bmc_chassis_info.py @@ -61,6 +61,26 @@ def neighbors(self) -> set: if interface.remote_switch_mac_address } + @property + def dump(self) -> list[str]: + return [ + f"{self.manufacturer} {self.model_number} serial {self.serial_number}", + f"BIOS VERSION {self.bios_version}", + f"BMC IP Address {self.bmc_ip_address}", + *self.dump_interfaces, + f"Power on {self.power_on}", + ] + + @property + def dump_interfaces(self) -> list[str]: + return [ + ( + f"NIC {i.mac_address} {i.name} " + f"[LLDP: {i.remote_switch_mac_address} {i.remote_switch_port_name}]" + ) + for i in self.interfaces + ] + def chassis_info(bmc: Bmc) -> ChassisInfo: """Query DRAC for basic system info via redfish. diff --git a/python/understack-workflows/understack_workflows/bmc_settings.py b/python/understack-workflows/understack_workflows/bmc_settings.py index cfb3330c1..cbe44aba6 100644 --- a/python/understack-workflows/understack_workflows/bmc_settings.py +++ b/python/understack-workflows/understack_workflows/bmc_settings.py @@ -48,7 +48,7 @@ def update_dell_drac_settings(bmc: Bmc) -> dict: elif key not in current_values: logger.warning("%s has no BMC attribute '%s'", bmc, key) elif current_values[key] == required_value: - logger.warning("%s: '%s' already set to '%s'", bmc, key, required_value) + logger.info("%s: '%s' already set to '%s'", bmc, key, required_value) else: required_changes[key] = VALUES_TO_POST[key] diff --git a/python/understack-workflows/understack_workflows/ironic/client.py b/python/understack-workflows/understack_workflows/ironic/client.py index 9f61efcf8..9a7d60a22 100644 --- a/python/understack-workflows/understack_workflows/ironic/client.py +++ b/python/understack-workflows/understack_workflows/ironic/client.py @@ -29,6 +29,38 @@ def get_node(self, node_ident: str, fields: list[str] | None = None) -> Node: def update_node(self, node_id, patch): return self.client.node.update(node_id, patch) + def set_node_provision_state( + self, + node_id: str, + target: str, + clean_steps: list[dict] | None = None, + runbook: str | None = None, + disable_ramdisk: bool | None = None, + ) -> None: + self.client.node.set_provision_state( + node_id, + target, + cleansteps=clean_steps, + runbook=runbook, + disable_ramdisk=disable_ramdisk, + ) + + def wait_for_node_provision_state( + self, node_id: str, expected_state: str, timeout: int = 1800 + ) -> None: + self.client.node.wait_for_provision_state( + node_id, expected_state, timeout=timeout + ) + + def set_node_target_raid_config(self, node_id: str, raid_config: dict) -> None: + self.client.node.set_target_raid_config(node_id, raid_config) + + def get_node_traits(self, node_id: str) -> list[str]: + return cast(list[str], self.client.node.get_traits(node_id)) + + def get_runbook(self, runbook_id: str): + return self.client.node.api.runbook.get(runbook_id) + def get_node_inventory(self, node_ident: str) -> dict: """Fetch node inventory data from Ironic API. diff --git a/python/understack-workflows/understack_workflows/ironic_node.py b/python/understack-workflows/understack_workflows/ironic_node.py index 2f7726455..223a44da6 100644 --- a/python/understack-workflows/understack_workflows/ironic_node.py +++ b/python/understack-workflows/understack_workflows/ironic_node.py @@ -1,39 +1,85 @@ import logging +from collections.abc import Sequence import ironicclient.common.apiclient.exceptions from ironicclient.common.utils import args_array_to_patch +from ironicclient.v1.node import Node from understack_workflows.bmc import Bmc from understack_workflows.ironic.client import IronicClient -from understack_workflows.node_configuration import IronicNodeConfiguration -STATES_ALLOWING_UPDATES = ["enroll", "manageable"] +FIRMWARE_UPDATE_TRAIT_PREFIX = "CUSTOM_FIRMWARE_UPDATE_" +ENROLLABLE_STATES = {"clean failed", "inspect failed", "enroll", "manageable"} logger = logging.getLogger(__name__) def create_or_update( - bmc: Bmc, name: str, manufacturer: str, external_cmdb_id: str | None = None -) -> IronicNodeConfiguration: - """Note interfaces/ports are not synced here, that happens elsewhere.""" + bmc: Bmc, + name: str, + manufacturer: str, + external_cmdb_id: str | None = None, + enrolled_pxe_port: str | None = None, +) -> Node: + """Find-or-create Node by name, update attributes, set state to Manageable. + + If the node exists in a quiescent state like "clean failed" then we will set + it to "manageable" state before proceeding to update the node attributes. + + If the node exists in a "busy" state then we raise an exception. + + If the node does not already exist it is created and then transitioned from + enrol->manageable state. + + Note interfaces/ports are not synced here, that happens elsewhere. + """ client = IronicClient() driver, inspect_interface = _driver_for(manufacturer) try: - ironic_node = client.get_node(name) + node = client.get_node(name) logger.debug( - "Using existing baremetal node %s with name %s", ironic_node.uuid, name + "Baremetal node %s already exists with name %s in provision_state %s", + node.uuid, + name, + node.provision_state, ) + + if node.provision_state not in ENROLLABLE_STATES: + raise Exception( + f"Re-enrol cannot proceed unless node is in one of the states " + f"{sorted(ENROLLABLE_STATES)} but node {node.uuid} " + f"is in '{node.provision_state}'." + ) + + if node.provision_state != "manageable": + transition(node, target_state="manage", expected_state="manageable") + update_ironic_node( - client, bmc, ironic_node, name, driver, inspect_interface, external_cmdb_id + client, + bmc, + node, + name, + driver, + inspect_interface, + external_cmdb_id, + enrolled_pxe_port, ) - # Return node as IronicNodeConfiguration (duck typing - Node has same attrs) - return ironic_node # type: ignore[return-value] except ironicclient.common.apiclient.exceptions.NotFound: logger.debug("Baremetal Node with name %s not found in Ironic, creating.", name) - return create_ironic_node( - client, bmc, name, driver, inspect_interface, external_cmdb_id + node = create_ironic_node( + client, + bmc, + name, + driver, + inspect_interface, + external_cmdb_id, + enrolled_pxe_port, ) + # All newly-created nodes start out with "enrol" state: + transition(node, target_state="manage", expected_state="manageable") + + return node def update_ironic_node( @@ -44,15 +90,8 @@ def update_ironic_node( driver, inspect_interface, external_cmdb_id: str | None = None, + enrolled_pxe_port: str | None = None, ): - if ironic_node.provision_state not in STATES_ALLOWING_UPDATES: - logger.info( - "Baremetal node %s is in %s provision_state, so no updates are allowed", - ironic_node.uuid, - ironic_node.provision_state, - ) - return - updates = [ f"name={name}", f"driver={driver}", @@ -67,12 +106,14 @@ def update_ironic_node( # Update external_cmdb_id only when explicitly provided if external_cmdb_id: updates.append(f"extra/external_cmdb_id={external_cmdb_id}") + if enrolled_pxe_port: + updates.append(f"extra/enrolled_pxe_port={enrolled_pxe_port}") patches = args_array_to_patch("add", updates) logger.info("Updating Ironic node %s patches=%s", ironic_node.uuid, patches) - response = client.update_node(ironic_node.uuid, patches) - logger.info("Ironic node %s Updated: response=%s", ironic_node.uuid, response) + client.update_node(ironic_node.uuid, patches) + logger.debug("Ironic node %s Updated.") def create_ironic_node( @@ -82,7 +123,8 @@ def create_ironic_node( driver: str, inspect_interface: str, external_cmdb_id: str | None = None, -) -> IronicNodeConfiguration: + enrolled_pxe_port: str | None = None, +) -> Node: node_data = { "name": name, "driver": driver, @@ -95,10 +137,17 @@ def create_ironic_node( "boot_interface": "http-ipxe", "inspect_interface": inspect_interface, } + if external_cmdb_id or enrolled_pxe_port: + node_data["extra"] = {} if external_cmdb_id: - node_data["extra"] = {"external_cmdb_id": external_cmdb_id} - # Return node as IronicNodeConfiguration (duck typing - Node has same attrs) - return client.create_node(node_data) # type: ignore[return-value] + node_data["extra"]["external_cmdb_id"] = external_cmdb_id + if enrolled_pxe_port: + node_data["extra"]["enrolled_pxe_port"] = enrolled_pxe_port + + if driver == "fake-hardware": + node_data["resource_class"] = "fakehw" + + return client.create_node(node_data) def _driver_for(manufacturer: str) -> tuple[str, str]: @@ -107,3 +156,102 @@ def _driver_for(manufacturer: str) -> tuple[str, str]: return ("idrac", "idrac-redfish") else: return ("redfish", "redfish") + + +def transition( + node: Node, + target_state: str, + expected_state: str | None = None, + clean_steps: list[dict] | None = None, + runbook: str | None = None, + disable_ramdisk: bool | None = None, +) -> None: + client = IronicClient() + + logger.info("%s requesting provision state %s", node.uuid, target_state) + client.set_node_provision_state( + node.uuid, + target_state, + clean_steps=clean_steps, + runbook=runbook, + disable_ramdisk=disable_ramdisk, + ) + if expected_state: + logger.info( + "Waiting for node %s to enter provision state %s", + node.uuid, + expected_state, + ) + client.wait_for_node_provision_state(node.uuid, expected_state, timeout=1800) + + +def patch(node: Node, updates: Sequence[str]) -> None: + if not updates: + return + logger.info("%s updating node %s", node.uuid, list(updates)) + IronicClient().update_node(node.uuid, args_array_to_patch("add", list(updates))) + + +def set_target_raid_config(node: Node, raid_config: dict) -> None: + IronicClient().set_node_target_raid_config(node.uuid, raid_config) + + +def inspect_out_of_band(node: Node): + refreshed_node = refresh(node, fields=["driver"]) + if refreshed_node.driver == "idrac": + inspect_interface = "idrac-redfish" + elif refreshed_node.driver == "redfish": + inspect_interface = "redfish" + else: + logger.warning("No out-of-band inspection for driver %s", refreshed_node.driver) + return + + logger.info("%s Performing out-of-band inspection", node.uuid) + inspect(node, inspect_interface) + + +def inspect(node: Node, inspect_interface: str = "agent"): + """Set the node inspect interface, inspect and wait for success. + + We raise an error if anything failed. If we return then the everything has + succeeded and the node has returned to the manageable state. + """ + refreshed_node = refresh(node, fields=["inspect_interface"]) + if refreshed_node.inspect_interface != inspect_interface: + patch(node, [f"inspect_interface={inspect_interface}"]) + + transition(node, target_state="inspect", expected_state="manageable") + + +def apply_firmware_updates(node: Node) -> None: + client = IronicClient() + traits = client.get_node_traits(node.uuid) + update_traits = sorted( + (trait for trait in traits if trait.startswith(FIRMWARE_UPDATE_TRAIT_PREFIX)), + key=firmware_trait_sort_key, + ) + if not update_traits: + logger.info("%s No firmware update traits found", node.uuid) + return + + for trait in update_traits: + runbook = client.get_runbook(trait) + logger.info("%s Running firmware update trait %s", node.uuid, trait) + transition( + node, + "clean", + expected_state="manageable", + runbook=runbook.uuid, + ) + + +def firmware_trait_sort_key(trait: str) -> tuple[int, str]: + parts = trait.split("_") + order = parts[3] if len(parts) > 3 else "" + if order.isdigit(): + return (int(order), trait) + return (10**9, trait) + + +def refresh(node: Node, fields: list[str] | None = None) -> Node: + return IronicClient().get_node(node.uuid, fields=fields) diff --git a/python/understack-workflows/understack_workflows/main/enroll_server.py b/python/understack-workflows/understack_workflows/main/enroll_server.py index 1be24d8be..3b77cf7e1 100644 --- a/python/understack-workflows/understack_workflows/main/enroll_server.py +++ b/python/understack-workflows/understack_workflows/main/enroll_server.py @@ -3,21 +3,26 @@ import argparse import logging import os -from pprint import pformat +import time + +from ironicclient.v1.node import Node from understack_workflows import ironic_node from understack_workflows.bmc import Bmc from understack_workflows.bmc import bmc_for_ip_address +from understack_workflows.bmc_chassis_info import ChassisInfo from understack_workflows.bmc_chassis_info import chassis_info from understack_workflows.bmc_credentials import set_bmc_password from understack_workflows.bmc_hostname import bmc_set_hostname from understack_workflows.bmc_settings import update_dell_drac_settings from understack_workflows.helpers import setup_logger +from understack_workflows.main.bios_settings import update_dell_bios_settings +from understack_workflows.pxe_port_heuristic import guess_pxe_interface logger = logging.getLogger(__name__) -def main(): +def main() -> None: """On-board new or Refresh existing baremetal node. - connect to the BMC using standard password, if that fails then use @@ -25,18 +30,6 @@ def main(): - ensure standard BMC password is set - - if DHCP, set permanent IP address, netmask, default gw - - - TODO: create and install SSL certificate - - - TODO: set NTP Server IPs for DRAC - (NTP server IP addresses are different per region) - - - Using BMC, configure our standard BIOS settings - - set PXE boot device - - set timezone to UTC - - set the hostname - - from BMC, discover basic hardware info: - manufacturer, model number, serial number - CPU model(s), RAM size and local storage @@ -45,29 +38,92 @@ def main(): - MAC address - LLDP connections [{remote_mac, remote_interface_name}] + - Figure out which NIC to use for PXE + + - Using BMC, configure our standard BIOS settings + - set PXE boot device + - set timezone to UTC + - set the hostname + - Find or create this baremetal node in Ironic - set the name to "{manufacturer}-{servicetag}" - set the driver as appropriate + - Configure RAID + - Transition through enrol->manage->inspect->cleaning->provide """ setup_logger() args = argument_parser().parse_args() - bmc_ip_address = args.bmc_ip_address - logger.info("%s starting for bmc_ip_address=%s", __file__, bmc_ip_address) + enrol( + ip_address=args.ip_address, + old_password=args.old_password, + firmware_update=args.firmware_update, + pxe_switch_macs=parse_maclist(str(args.pxe_switch_macs)), + raid_configure=args.raid_configure, + external_cmdb_id=args.external_cmdb_id, + ) + - bmc = bmc_for_ip_address(bmc_ip_address) +def parse_maclist(maclist: str) -> set[str]: + return {mac.strip().upper() for mac in maclist.split(",")} - device_id = enroll_server(bmc, args.old_bmc_password, args.external_cmdb_id) - # argo workflows captures stdout as the results which we can use - # to return the device UUID - print(device_id) +def enrol( + ip_address: str, + firmware_update: bool, + raid_configure: bool, + pxe_switch_macs: set[str], + old_password: str | None, + external_cmdb_id: str | None = None, +) -> None: + logger.info("Starting enrol workflow for bmc_ip_address=%s", ip_address) + logger.info(" pxe_switch_macs=%s", pxe_switch_macs) + if external_cmdb_id: + logger.info(" external_cmdb_id=%s", external_cmdb_id) -def enroll_server( - bmc: Bmc, old_password: str | None, external_cmdb_id: str | None = None -) -> str: - """Enroll BMC to Undercloud Ironic.""" + bmc = bmc_for_ip_address(ip_address) + device_info = initialize_bmc(bmc, old_password) + + if insufficient_lldp_data(device_info): + device_info = power_on_and_wait(bmc, device_info) + + pxe_interface = guess_pxe_interface(device_info, pxe_switch_macs) + logger.info("Selected %s as PXE interface", pxe_interface) + + node = ironic_node.create_or_update( + bmc=bmc, + name=device_name(device_info), + manufacturer=device_info.manufacturer, + external_cmdb_id=external_cmdb_id, + enrolled_pxe_port=pxe_interface, + ) + + if raid_configure: + configure_raid(node, bmc) + else: + logger.info("%s RAID configuration was not requested", node.uuid) + + ironic_node.inspect_out_of_band(node) + + if firmware_update: + ironic_node.apply_firmware_updates(node) + + # Applying BIOS settings on Dell servers requires a reboot which we achieve + # by initiating agent inspection. Agent inspection requires BIOS settings + # (to set boot device). Therefore these two actions must go hand-in-hand. + apply_bios_settings(bmc, pxe_interface) + ironic_node.inspect(node) + + # After inspection, our node is left in "manageable" state. All being well, + # the "provide" action will transition manageable->cleaning->available. + ironic_node.transition(node, target_state="provide", expected_state="available") + + logger.info("Completed enrol workflow for bmc_ip_address=%s", ip_address) + + +def initialize_bmc(bmc: Bmc, old_password: str | None) -> ChassisInfo: + """Discover and configure BMC with Undercloud standard settings.""" set_bmc_password( ip_address=bmc.ip_address, new_password=bmc.password, @@ -75,39 +131,158 @@ def enroll_server( ) device_info = chassis_info(bmc) - logger.info("Discovered %s", pformat(device_info)) - device_name = f"{device_info.manufacturer}-{device_info.serial_number}" + for line in device_info.dump: + logger.info("Discovered %s", line) - update_dell_drac_settings(bmc) + if device_info.manufacturer == "Dell": + update_dell_drac_settings(bmc) - bmc_set_hostname(bmc, device_info.bmc_hostname, device_name) + bmc_set_hostname(bmc, device_info.bmc_hostname, device_name(device_info)) + return device_info - node = ironic_node.create_or_update( - bmc=bmc, - name=device_name, - manufacturer=device_info.manufacturer, - external_cmdb_id=external_cmdb_id, + +def insufficient_lldp_data(device_info: ChassisInfo) -> bool: + """Whether the device_info is populated with switch connections. + + We normally get LLDP data for the BMC's own (out of band) interface but that + is not relevant to our investigation so we exclude known BMC interface + names. + + If the server has been powered on, we will often get LLDP data for the other + ports as well. + """ + for i in device_info.interfaces: + if ( + "DRAC" not in i.name.upper() + and "ILO" not in i.name.upper() + and i.remote_switch_mac_address + ): + return False + + return True + + +def power_on_and_wait(bmc: Bmc, device_info: ChassisInfo) -> ChassisInfo: + """If power is off, switch on and wait a minute.""" + if device_info.power_on: + logger.debug("Power is on") + return device_info + + logger.info("Power is off. Switching on and waiting for links to stabilize") + # TODO: figure out how to do this with sushy + bmc.redfish_request( + path=f"{bmc.get_system_path()}/Actions/ComputerSystem.Reset", + payload={"ResetType": "On"}, + method="POST", + ) + time.sleep(120) + return chassis_info(bmc) + + +def device_name(device_info: ChassisInfo) -> str: + return f"{device_info.manufacturer}-{device_info.serial_number}" + + +def configure_raid(node: Node, bmc: Bmc): + raid_details = discover_controller_details(bmc) + if not raid_details: + logger.info("%s No RAID hardware found in node", node.uuid) + return + + logger.info("%s Applying RAID configuration", node.uuid) + raid_config = build_raid_config(**raid_details) + ironic_node.set_target_raid_config(node, raid_config) + ironic_node.transition( + node, + target_state="clean", + expected_state="manageable", + clean_steps=[ + {"interface": "raid", "step": "delete_configuration"}, + {"interface": "raid", "step": "create_configuration"}, + ], + disable_ramdisk=True, ) - logger.info("%s complete for %s", __file__, bmc.ip_address) - return node.uuid + +def discover_controller_details(bmc: Bmc) -> dict | None: + """Parse available RAID controller details for execution.""" + system_objects = bmc.sushy().get_system_collection().get_members() + system = system_objects[0] + for controller in system.storage.get_members(): + if "RAID" in controller.identity: + return { + "controller": controller.identity, + "physical_disks": [d.identity for d in controller.drives], + } + return None + + +def build_raid_config(controller: str, physical_disks: list[str]): + """Return a raid config supported by ironic for cleanup tasks.""" + if len(physical_disks) < 2: + raid_level = "0" + elif len(physical_disks) > 2: + raid_level = "5" + else: + raid_level = "1" + + result = { + "logical_disks": [ + { + "controller": controller, + "is_root_volume": True, + "physical_disks": physical_disks, + "raid_level": raid_level, + "size_gb": "MAX", + } + ] + } + return result + + +def apply_bios_settings(bmc: Bmc, pxe_interface: str): + update_dell_bios_settings(bmc, pxe_interface=pxe_interface) + + +def parse_bool(value: str) -> bool: + return value.lower() == "true" def argument_parser(): - """Parse runtime arguments.""" parser = argparse.ArgumentParser( - prog=os.path.basename(__file__), description="Ingest Baremetal Node" + prog=os.path.basename(__file__), + description="Run the server enrol workflow", ) - parser.add_argument("--bmc-ip-address", type=str, required=True, help="BMC IP") + parser.add_argument("--ip-address", required=True, help="IP Address of BMC") parser.add_argument( - "--old-bmc-password", type=str, required=False, help="Old Password" + "--old-password", + required=False, + help="Old (current) BMC password", + ) + parser.add_argument( + "--firmware-update", + type=parse_bool, + default=False, + help="Run firmware update runbooks after inspection", + ) + parser.add_argument( + "--raid-configure", + type=parse_bool, + default=True, + help="Configure RAID before inspection", ) parser.add_argument( "--external-cmdb-id", - type=str, required=False, + default="", help="External CMDB ID for RXDB integration", ) + parser.add_argument( + "--pxe-switch-macs", + required=False, + default="", + help="Chassis MAC address of switches providing PXE network", + ) return parser diff --git a/workflows/argo-events/workflowtemplates/enroll-server.yaml b/workflows/argo-events/workflowtemplates/enroll-server.yaml index ea58a6d2c..21bc8cddb 100644 --- a/workflows/argo-events/workflowtemplates/enroll-server.yaml +++ b/workflows/argo-events/workflowtemplates/enroll-server.yaml @@ -36,77 +36,19 @@ spec: steps: - - name: enroll-server template: enroll-server - - - name: server-enroll-state - template: openstack-state-cmd - arguments: - parameters: - - name: device_id - value: "{{steps.enroll-server.outputs.result}}" - - - name: manage-server - template: openstack-wait-cmd - arguments: - parameters: - - name: operation - value: "manage" - - name: device_id - value: "{{steps.enroll-server.outputs.result}}" - when: "{{steps.server-enroll-state.outputs.result}} == enroll" - - - name: get-raid-config - template: get-raid-config - when: "{{workflow.parameters.raid_configure}} == true" - - - name: openstack-set-baremetal-node-raid-config - template: openstack-set-baremetal-node-raid-config - arguments: - parameters: - - name: device_id - value: "{{steps.enroll-server.outputs.result}}" - - name: raid_config - value: "{{steps.get-raid-config.outputs.result}}" - when: "{{workflow.parameters.raid_configure}} == true" - - - name: server-manage-state - template: openstack-state-cmd - arguments: - parameters: - - name: device_id - value: "{{steps.enroll-server.outputs.result}}" - - - name: bios-settings - template: bios-settings - - - name: inspect-server - templateRef: - name: inspect-server - template: inspect-server - arguments: - parameters: - - name: node - value: "{{steps.enroll-server.outputs.result}}" - when: "{{steps.server-manage-state.outputs.result}} == manageable" - - - name: firmware-update - templateRef: - name: server-firmware-update - template: server-firmware-update - arguments: - parameters: - - name: node - value: "{{steps.enroll-server.outputs.result}}" - when: "{{workflow.parameters.firmware_update}} == true" - - - name: avail-server - template: openstack-wait-cmd - arguments: - parameters: - - name: operation - value: "provide" - - name: device_id - value: "{{steps.enroll-server.outputs.result}}" - when: "{{steps.server-manage-state.outputs.result}} == manageable" - name: enroll-server container: image: ghcr.io/rackerlabs/understack/ironic-nautobot-client:latest command: - - enroll-server + - enrol-server args: - - --bmc-ip-address + - --ip-address - "{{workflow.parameters.ip_address}}" - - --old-bmc-password + - --firmware-update + - "{{workflow.parameters.firmware_update}}" + - --raid-configure + - "{{workflow.parameters.raid_configure}}" + - --old-password - "{{workflow.parameters.old_password}}" - --external-cmdb-id - "{{workflow.parameters.external_cmdb_id}}" @@ -118,157 +60,14 @@ spec: name: bmc-master readOnly: true envFrom: - - configMapRef: - name: cluster-metadata - env: - - name: WF_NS - value: "{{workflow.namespace}}" - - name: WF_NAME - value: "{{workflow.name}}" - - name: WF_UID - value: "{{workflow.uid}}" - volumes: - - name: bmc-master - secret: - secretName: bmc-master - - name: baremetal-manage - secret: - secretName: baremetal-manage - items: - - key: clouds.yaml - path: clouds.yaml - - name: bios-settings - container: - image: ghcr.io/rackerlabs/understack/ironic-nautobot-client:latest - command: - - bios-settings - args: - - --bmc-ip-address - - "{{workflow.parameters.ip_address}}" - volumeMounts: - - mountPath: /etc/openstack - name: baremetal-manage - readOnly: true - - mountPath: /etc/bmc_master/ - name: bmc-master - readOnly: true - envFrom: - - configMapRef: - name: cluster-metadata - env: - - name: WF_NS - value: "{{workflow.namespace}}" - - name: WF_NAME - value: "{{workflow.name}}" - - name: WF_UID - value: "{{workflow.uid}}" - volumes: - - name: bmc-master - secret: - secretName: bmc-master - - name: openstack-wait-cmd - inputs: - parameters: - - name: operation - - name: device_id - container: - image: ghcr.io/rackerlabs/understack/openstack-client:2025.2 - command: - - openstack - args: - - baremetal - - node - - "{{inputs.parameters.operation}}" - - --wait - - "0" - - "{{inputs.parameters.device_id}}" + - configMapRef: + name: cluster-metadata env: + - name: WF_NS + value: "{{workflow.namespace}}" + - name: WF_NAME + value: "{{workflow.name}}" + - name: WF_UID + value: "{{workflow.uid}}" - name: OS_CLOUD value: understack - volumeMounts: - - mountPath: /etc/openstack - name: baremetal-manage - - name: get-raid-config - container: - image: ghcr.io/rackerlabs/understack/ironic-nautobot-client:latest - command: - - get-raid-devices - args: - - --ip-address - - "{{workflow.parameters.ip_address}}" - volumeMounts: - - mountPath: /etc/openstack - name: baremetal-manage - readOnly: true - - mountPath: /etc/bmc_master/ - name: bmc-master - readOnly: true - env: - - name: WF_NS - value: "{{workflow.namespace}}" - - name: WF_NAME - value: "{{workflow.name}}" - - name: WF_UID - value: "{{workflow.uid}}" - - name: openstack-set-baremetal-node-raid-config - inputs: - parameters: - - name: device_id - - name: raid_config - # https://rackerlabs.github.io/understack/user-guide/openstack-ironic/#setting-baremetal-node-flavor - script: - image: ghcr.io/rackerlabs/understack/openstack-client:2025.2 - command: [sh] - source: | - echo "setting RAID config for node: {{inputs.parameters.device_id}}" - # create the raid-config.json file. I find this easier to read - # than passing a big json string on command line - cat <<'EOF' >> raid-config.json - {{inputs.parameters.raid_config}} - EOF - # create the initial clean steps which will create a raid config - cat <<'EOF' >> raid-clean-steps.json - [ - { - "interface": "raid", - "step": "delete_configuration" - }, - { - "interface": "raid", - "step": "create_configuration" - } - ] - EOF - # apply the target raid config to the node - openstack baremetal node set {{inputs.parameters.device_id}} --target-raid-config raid-config.json - # create the raid config - openstack baremetal node clean --wait 0 --clean-steps raid-clean-steps.json --disable-ramdisk {{inputs.parameters.device_id}} - env: - - name: OS_CLOUD - value: understack - volumeMounts: - - mountPath: /etc/openstack - name: baremetal-manage - - name: openstack-state-cmd - inputs: - parameters: - - name: device_id - container: - image: ghcr.io/rackerlabs/understack/openstack-client:2025.2 - command: - - openstack - args: - - baremetal - - node - - show - - "-f" - - value - - "-c" - - provision_state - - "{{inputs.parameters.device_id}}" - env: - - name: OS_CLOUD - value: understack - volumeMounts: - - mountPath: /etc/openstack - name: baremetal-manage From 754fed56b7f2fb1c9d2584a436c69756e0a205bd Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Thu, 2 Apr 2026 22:33:37 +0100 Subject: [PATCH 05/15] Add code comments to document the enrol process --- .../understack_workflows/main/enroll_server.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/python/understack-workflows/understack_workflows/main/enroll_server.py b/python/understack-workflows/understack_workflows/main/enroll_server.py index 3b77cf7e1..a8fb25ba1 100644 --- a/python/understack-workflows/understack_workflows/main/enroll_server.py +++ b/python/understack-workflows/understack_workflows/main/enroll_server.py @@ -106,19 +106,30 @@ def enrol( ironic_node.inspect_out_of_band(node) + # Anecdotally, applying firmware updates can upset the next-boot HTTP, and + # potentially even upset the bios-settings configuration job in the iDRAC, + # so we do firmware first, which causes a reboot, and only then do we set + # the BIOS settings. + # + # Also, just maybe, the bios setting keys we are trying to set might not be + # available in the old version of the bios, in which case we need to boot + # the bios before redfish will allow us to set those settings. if firmware_update: ironic_node.apply_firmware_updates(node) # Applying BIOS settings on Dell servers requires a reboot which we achieve # by initiating agent inspection. Agent inspection requires BIOS settings # (to set boot device). Therefore these two actions must go hand-in-hand. + # + # Note that we may have already applied BIOS settings above. That is okay, + # it is idempotent. apply_bios_settings(bmc, pxe_interface) ironic_node.inspect(node) - # After inspection, our node is left in "manageable" state. All being well, - # the "provide" action will transition manageable->cleaning->available. + # After successful inspection, our node is left in "manageable" state. All + # being well, the "provide" action will transition manageable -> cleaning -> + # available. ironic_node.transition(node, target_state="provide", expected_state="available") - logger.info("Completed enrol workflow for bmc_ip_address=%s", ip_address) From 32067e8f0dc24bfc1ac178044a3620da7b61c985 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Thu, 2 Apr 2026 22:33:58 +0100 Subject: [PATCH 06/15] Add extra bios settings step to assist the raid configuration --- .../understack_workflows/main/enroll_server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/understack-workflows/understack_workflows/main/enroll_server.py b/python/understack-workflows/understack_workflows/main/enroll_server.py index a8fb25ba1..9fddde780 100644 --- a/python/understack-workflows/understack_workflows/main/enroll_server.py +++ b/python/understack-workflows/understack_workflows/main/enroll_server.py @@ -100,6 +100,10 @@ def enrol( ) if raid_configure: + # Raid configuration runs a clean step which does a PXE boot. That + # can't work unless we first apply_bios_settings. + # TODO: why isn't skip_ramdisk working here? + apply_bios_settings(bmc, pxe_interface) configure_raid(node, bmc) else: logger.info("%s RAID configuration was not requested", node.uuid) From 25b98ce7b9bbd1575c078539c4320e134273a216 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Thu, 2 Apr 2026 22:35:24 +0100 Subject: [PATCH 07/15] Disable PXE boot - we don't use it any more. --- python/understack-workflows/understack_workflows/bmc_bios.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/understack-workflows/understack_workflows/bmc_bios.py b/python/understack-workflows/understack_workflows/bmc_bios.py index 6dc1b0196..826963f19 100644 --- a/python/understack-workflows/understack_workflows/bmc_bios.py +++ b/python/understack-workflows/understack_workflows/bmc_bios.py @@ -9,8 +9,7 @@ def required_bios_settings(pxe_interface: str) -> dict: """Return adjusted Bios settings map for BMC.""" return { - "PxeDev1EnDis": "Enabled", - "PxeDev1Interface": pxe_interface, + "PxeDev1EnDis": "Disabled", "HttpDev1EnDis": "Enabled", "HttpDev1Interface": pxe_interface, # at this time ironic conductor returns http URLs From d365a482e7a257e17bc1ade3fc7b56d8652c7551 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Tue, 7 Apr 2026 10:48:39 +0100 Subject: [PATCH 08/15] Delete unused script - this is handled during enrol --- python/understack-workflows/pyproject.toml | 1 - .../main/bios_settings.py | 61 ------------------- .../main/enroll_server.py | 2 +- 3 files changed, 1 insertion(+), 63 deletions(-) delete mode 100644 python/understack-workflows/understack_workflows/main/bios_settings.py diff --git a/python/understack-workflows/pyproject.toml b/python/understack-workflows/pyproject.toml index 55fed61e4..a7461fe52 100644 --- a/python/understack-workflows/pyproject.toml +++ b/python/understack-workflows/pyproject.toml @@ -30,7 +30,6 @@ dependencies = [ ] [project.scripts] -bios-settings = "understack_workflows.main.bios_settings:main" bmc-kube-password = "understack_workflows.main.bmc_display_password:main" bmc-password = "understack_workflows.main.print_bmc_password:main" enroll-server = "understack_workflows.main.enroll_server:main" diff --git a/python/understack-workflows/understack_workflows/main/bios_settings.py b/python/understack-workflows/understack_workflows/main/bios_settings.py deleted file mode 100644 index b64a2021a..000000000 --- a/python/understack-workflows/understack_workflows/main/bios_settings.py +++ /dev/null @@ -1,61 +0,0 @@ -import argparse -import logging -import os - -from understack_workflows.bmc import bmc_for_ip_address -from understack_workflows.bmc_bios import update_dell_bios_settings -from understack_workflows.bmc_chassis_info import chassis_info -from understack_workflows.helpers import setup_logger -from understack_workflows.pxe_port_heuristic import guess_pxe_interface - -logger = logging.getLogger(__name__) - - -def main(): - """Update BIOS settings to undercloud standard for the given server. - - - Using BMC, configure our standard BIOS settings - - set PXE boot device - - set timezone to UTC - - set the hostname - - NOTE: take care with order of execution of these workflow steps: - - When asked to change BIOS settings, iDRAC enqueues a "job" that will execute - on next boot of the server. - - The assumption for this workflow is that this server will shortly be PXE - booting into the IPA image for cleaning or inspection. - - Therefore this workflow does not itself boot the server. - - Any subsequent iDRAC operations such as a code update or ironic validation - can clear our pending BIOS update job from the iDRAC job queue. Before we - perform any such operation, we should first do something that will cause a - reboot of the server. - """ - setup_logger() - args = argument_parser().parse_args() - - bmc_ip_address = args.bmc_ip_address - logger.info("%s starting for bmc_ip_address=%s", __file__, bmc_ip_address) - - bmc = bmc_for_ip_address(bmc_ip_address) - device_info = chassis_info(bmc) - pxe_interface = guess_pxe_interface(device_info) - logger.info("Selected %s as PXE interface", pxe_interface) - - update_dell_bios_settings(bmc, pxe_interface=pxe_interface) - - -def argument_parser(): - """Parse runtime arguments.""" - parser = argparse.ArgumentParser( - prog=os.path.basename(__file__), description="Update BIOS Settings" - ) - parser.add_argument("--bmc-ip-address", type=str, required=True, help="BMC IP") - return parser - - -if __name__ == "__main__": - main() diff --git a/python/understack-workflows/understack_workflows/main/enroll_server.py b/python/understack-workflows/understack_workflows/main/enroll_server.py index 9fddde780..81d258337 100644 --- a/python/understack-workflows/understack_workflows/main/enroll_server.py +++ b/python/understack-workflows/understack_workflows/main/enroll_server.py @@ -16,7 +16,7 @@ from understack_workflows.bmc_hostname import bmc_set_hostname from understack_workflows.bmc_settings import update_dell_drac_settings from understack_workflows.helpers import setup_logger -from understack_workflows.main.bios_settings import update_dell_bios_settings +from understack_workflows.bmc_bios import update_dell_bios_settings from understack_workflows.pxe_port_heuristic import guess_pxe_interface logger = logging.getLogger(__name__) From 45eff66e32a8c4e99c4200d852d4b92253555af5 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Tue, 7 Apr 2026 13:05:54 +0100 Subject: [PATCH 09/15] Enrol with a list of http-boot-interfaces instead of a single interface --- .../tests/test_bmc_bios.py | 7 +- .../tests/test_enrol_server.py | 18 +-- .../tests/test_pxe_port_heuristic.py | 37 +++++- .../understack_workflows/bmc_bios.py | 28 +++-- .../understack_workflows/ironic_node.py | 22 ++-- .../main/enroll_server.py | 33 +++-- .../pxe_port_heuristic.py | 117 ++++++++---------- 7 files changed, 142 insertions(+), 120 deletions(-) diff --git a/python/understack-workflows/tests/test_bmc_bios.py b/python/understack-workflows/tests/test_bmc_bios.py index 7114c9a4f..9dcd32c83 100644 --- a/python/understack-workflows/tests/test_bmc_bios.py +++ b/python/understack-workflows/tests/test_bmc_bios.py @@ -17,11 +17,11 @@ def test_update_dell_bios_settings_skips_patch_when_desired_values_are_pending(m "IPMILan.1.Enable": "Enabled", } }, - {"Attributes": bmc_bios.required_bios_settings("NIC.Embedded.1-1-1")}, + {"Attributes": bmc_bios.required_bios_settings(["NIC.Embedded.1-1-1"])}, ] patch_bios_settings = mocker.patch.object(bmc_bios, "patch_bios_settings") - result = bmc_bios.update_dell_bios_settings(bmc, "NIC.Embedded.1-1-1") + result = bmc_bios.update_dell_bios_settings(bmc, ["NIC.Embedded.1-1-1"]) assert result == {} patch_bios_settings.assert_not_called() @@ -54,12 +54,13 @@ def test_update_dell_bios_settings_only_patches_settings_not_already_pending(moc ] patch_bios_settings = mocker.patch.object(bmc_bios, "patch_bios_settings") - result = bmc_bios.update_dell_bios_settings(bmc, "NIC.Embedded.1-1-1") + result = bmc_bios.update_dell_bios_settings(bmc, ["NIC.Embedded.1-1-1"]) assert result == { "HttpDev1EnDis": "Enabled", "HttpDev1Interface": "NIC.Embedded.1-1-1", "HttpDev1TlsMode": "None", "OS-BMC.1.AdminState": "Disabled", + "PxeDev1EnDis": "Disabled", } patch_bios_settings.assert_called_once_with(bmc, result) diff --git a/python/understack-workflows/tests/test_enrol_server.py b/python/understack-workflows/tests/test_enrol_server.py index 27f473cdb..55e669217 100644 --- a/python/understack-workflows/tests/test_enrol_server.py +++ b/python/understack-workflows/tests/test_enrol_server.py @@ -157,9 +157,10 @@ def test_enrol_happy_path_uses_real_ironic_workflow(mocker): ) update_dell_drac_settings.assert_called_once_with(fake_bmc) bmc_set_hostname.assert_called_once_with(fake_bmc, "None", "Dell-ABC123") - update_dell_bios_settings.assert_called_once_with( + assert update_dell_bios_settings.call_count == 2 + update_dell_bios_settings.assert_any_call( fake_bmc, - pxe_interface="NIC.Integrated.1-1", + pxe_interfaces=["NIC.Integrated.1-1"], ) sleep.assert_called_once_with(120) @@ -181,7 +182,7 @@ def test_enrol_happy_path_uses_real_ironic_workflow(mocker): inspect_interface="idrac-redfish", extra={ "external_cmdb_id": "cmdb-1", - "enrolled_pxe_port": "NIC.Integrated.1-1", + "enrolled_pxe_ports": ["NIC.Integrated.1-1"], }, ) fake_ironic.node.set_target_raid_config.assert_called_once_with( @@ -321,8 +322,8 @@ def test_enrol_existing_failed_node_recovers_and_updates(mocker): update_patch = fake_ironic.node.update.call_args_list[0].args[1] assert { "op": "add", - "path": "/extra/enrolled_pxe_port", - "value": "NIC.Integrated.1-1", + "path": "/extra/enrolled_pxe_ports", + "value": ["NIC.Integrated.1-1"], } in update_patch @@ -405,7 +406,6 @@ def test_guess_pxe_interface_unknown_name_avoids_bmc_interface(): ], ) - assert ( - enroll_server.guess_pxe_interface(device_info, {"AA:BB:CC:DD:EE:FF"}) - == "NIC.Custom.9-1" - ) + assert enroll_server.guess_pxe_interfaces(device_info, {"AA:BB:CC:DD:EE:FF"}) == [ + "NIC.Custom.9-1" + ] diff --git a/python/understack-workflows/tests/test_pxe_port_heuristic.py b/python/understack-workflows/tests/test_pxe_port_heuristic.py index 37bc96336..6ab3ec83c 100644 --- a/python/understack-workflows/tests/test_pxe_port_heuristic.py +++ b/python/understack-workflows/tests/test_pxe_port_heuristic.py @@ -1,6 +1,6 @@ from understack_workflows.bmc_chassis_info import ChassisInfo from understack_workflows.bmc_chassis_info import InterfaceInfo -from understack_workflows.pxe_port_heuristic import guess_pxe_interface +from understack_workflows.pxe_port_heuristic import guess_pxe_interfaces def test_integrated_is_best(): @@ -24,7 +24,12 @@ def test_integrated_is_best(): InterfaceInfo("NIC.Slot.1-2", x, x), ], ) - assert guess_pxe_interface(device_info) == "NIC.Integrated.1-1" + assert guess_pxe_interfaces(device_info) == [ + "NIC.Integrated.1-1", + "NIC.Slot.1-1", + "NIC.Integrated.1-2", + "NIC.Slot.1-2", + ] def test_slot_is_second_best(): @@ -48,7 +53,12 @@ def test_slot_is_second_best(): InterfaceInfo("NIC.Slot.2-1", x, x), ], ) - assert guess_pxe_interface(device_info) == "NIC.Slot.1-1" + assert guess_pxe_interfaces(device_info) == [ + "NIC.Slot.1-1", + "NIC.Slot.2-1", + "NIC.Slot.1-2", + "NIC.Slot.2-2", + ] def test_connected_is_better(): @@ -72,7 +82,12 @@ def test_connected_is_better(): InterfaceInfo("NIC.Slot.1-2", x, x, remote_switch_mac_address=x), ], ) - assert guess_pxe_interface(device_info) == "NIC.Slot.1-2" + assert guess_pxe_interfaces(device_info) == [ + "NIC.Slot.1-2", + "NIC.Integrated.1-1", + "NIC.Slot.1-1", + "NIC.Integrated.1-2", + ] def test_connected_macs_picks_first(): @@ -102,7 +117,12 @@ def test_connected_macs_picks_first(): InterfaceInfo("NIC.Slot.1-2", x, x, remote_switch_mac_address=any_mac), ], ) - assert guess_pxe_interface(device_info, {pxe_mac}) == "NIC.Integrated.1-1" + assert guess_pxe_interfaces(device_info, {pxe_mac}) == [ + "NIC.Integrated.1-1", + "NIC.Slot.1-1", + "NIC.Integrated.1-2", + "NIC.Slot.1-2", + ] def test_connected_to_known_pxe_is_best(): @@ -132,4 +152,9 @@ def test_connected_to_known_pxe_is_best(): InterfaceInfo("NIC.Slot.1-2", x, x, remote_switch_mac_address=pxe_mac), ], ) - assert guess_pxe_interface(device_info, {pxe_mac}) == "NIC.Slot.1-1" + assert guess_pxe_interfaces(device_info, {pxe_mac}) == [ + "NIC.Slot.1-1", + "NIC.Slot.1-2", + "NIC.Integrated.1-1", + "NIC.Integrated.1-2", + ] diff --git a/python/understack-workflows/understack_workflows/bmc_bios.py b/python/understack-workflows/understack_workflows/bmc_bios.py index 826963f19..f8af6cc63 100644 --- a/python/understack-workflows/understack_workflows/bmc_bios.py +++ b/python/understack-workflows/understack_workflows/bmc_bios.py @@ -6,12 +6,17 @@ logger = logging.getLogger(__name__) -def required_bios_settings(pxe_interface: str) -> dict: - """Return adjusted Bios settings map for BMC.""" - return { +def required_bios_settings(pxe_interfaces: list[str]) -> dict[str, str]: + """Return Bios settings map for BMC. + + Note that we set values like HttpDev8Interface for each of the + pxe_interfaces, however dell BIOS has a setting for "number of pxe + interfaces" which we currently leave at the default of 4, so we'll never + actually attempt pxe on anything but the first 4 interfaces. + """ + settings = { + # PXE is enabled by default on DELL, but we don't use it: "PxeDev1EnDis": "Disabled", - "HttpDev1EnDis": "Enabled", - "HttpDev1Interface": pxe_interface, # at this time ironic conductor returns http URLs # when its serving data from its own http server "HttpDev1TlsMode": "None", @@ -27,6 +32,15 @@ def required_bios_settings(pxe_interface: str) -> dict: "IPMILan.1.Enable": "Disabled", } + for i in range(1, 8): + if len(pxe_interfaces) >= i: + settings[f"HttpDev{i}EnDis"] = "Enabled" + settings[f"HttpDev{i}Interface"] = pxe_interfaces[i - 1] + else: + settings[f"HttpDev{i}EnDis"] = "Disabled" + + return settings + def required_change_for_bios_setting( key: str, @@ -72,7 +86,7 @@ def required_change_for_bios_setting( return required_value -def update_dell_bios_settings(bmc: Bmc, pxe_interface: str) -> dict: +def update_dell_bios_settings(bmc: Bmc, pxe_interfaces: list[str]) -> dict: """Check and update BIOS settings to standard as required. Any changes take effect on next server reboot. @@ -83,7 +97,7 @@ def update_dell_bios_settings(bmc: Bmc, pxe_interface: str) -> dict: pending_settings = bmc.redfish_request(bmc.system_path + "/Bios/Settings").get( "Attributes", {} ) - required_settings = required_bios_settings(pxe_interface) + required_settings = required_bios_settings(pxe_interfaces) logger.info("%s Checking BIOS settings", bmc) required_changes = {} diff --git a/python/understack-workflows/understack_workflows/ironic_node.py b/python/understack-workflows/understack_workflows/ironic_node.py index 223a44da6..cf1eb02be 100644 --- a/python/understack-workflows/understack_workflows/ironic_node.py +++ b/python/understack-workflows/understack_workflows/ironic_node.py @@ -1,3 +1,4 @@ +import json import logging from collections.abc import Sequence @@ -19,7 +20,7 @@ def create_or_update( name: str, manufacturer: str, external_cmdb_id: str | None = None, - enrolled_pxe_port: str | None = None, + enrolled_pxe_ports: list[str] | None = None, ) -> Node: """Find-or-create Node by name, update attributes, set state to Manageable. @@ -63,7 +64,7 @@ def create_or_update( driver, inspect_interface, external_cmdb_id, - enrolled_pxe_port, + enrolled_pxe_ports, ) except ironicclient.common.apiclient.exceptions.NotFound: logger.debug("Baremetal Node with name %s not found in Ironic, creating.", name) @@ -74,7 +75,7 @@ def create_or_update( driver, inspect_interface, external_cmdb_id, - enrolled_pxe_port, + enrolled_pxe_ports, ) # All newly-created nodes start out with "enrol" state: transition(node, target_state="manage", expected_state="manageable") @@ -90,7 +91,7 @@ def update_ironic_node( driver, inspect_interface, external_cmdb_id: str | None = None, - enrolled_pxe_port: str | None = None, + enrolled_pxe_ports: list[str] | None = None, ): updates = [ f"name={name}", @@ -106,8 +107,9 @@ def update_ironic_node( # Update external_cmdb_id only when explicitly provided if external_cmdb_id: updates.append(f"extra/external_cmdb_id={external_cmdb_id}") - if enrolled_pxe_port: - updates.append(f"extra/enrolled_pxe_port={enrolled_pxe_port}") + if enrolled_pxe_ports is not None: + payload = json.dumps(enrolled_pxe_ports) + updates.append(f"extra/enrolled_pxe_ports={payload}") patches = args_array_to_patch("add", updates) logger.info("Updating Ironic node %s patches=%s", ironic_node.uuid, patches) @@ -123,7 +125,7 @@ def create_ironic_node( driver: str, inspect_interface: str, external_cmdb_id: str | None = None, - enrolled_pxe_port: str | None = None, + enrolled_pxe_ports: list[str] | None = None, ) -> Node: node_data = { "name": name, @@ -137,12 +139,12 @@ def create_ironic_node( "boot_interface": "http-ipxe", "inspect_interface": inspect_interface, } - if external_cmdb_id or enrolled_pxe_port: + if external_cmdb_id or enrolled_pxe_ports is not None: node_data["extra"] = {} if external_cmdb_id: node_data["extra"]["external_cmdb_id"] = external_cmdb_id - if enrolled_pxe_port: - node_data["extra"]["enrolled_pxe_port"] = enrolled_pxe_port + if enrolled_pxe_ports is not None: + node_data["extra"]["enrolled_pxe_ports"] = enrolled_pxe_ports if driver == "fake-hardware": node_data["resource_class"] = "fakehw" diff --git a/python/understack-workflows/understack_workflows/main/enroll_server.py b/python/understack-workflows/understack_workflows/main/enroll_server.py index 81d258337..22f0699ff 100644 --- a/python/understack-workflows/understack_workflows/main/enroll_server.py +++ b/python/understack-workflows/understack_workflows/main/enroll_server.py @@ -10,14 +10,14 @@ from understack_workflows import ironic_node from understack_workflows.bmc import Bmc from understack_workflows.bmc import bmc_for_ip_address +from understack_workflows.bmc_bios import update_dell_bios_settings from understack_workflows.bmc_chassis_info import ChassisInfo from understack_workflows.bmc_chassis_info import chassis_info from understack_workflows.bmc_credentials import set_bmc_password from understack_workflows.bmc_hostname import bmc_set_hostname from understack_workflows.bmc_settings import update_dell_drac_settings from understack_workflows.helpers import setup_logger -from understack_workflows.bmc_bios import update_dell_bios_settings -from understack_workflows.pxe_port_heuristic import guess_pxe_interface +from understack_workflows.pxe_port_heuristic import guess_pxe_interfaces logger = logging.getLogger(__name__) @@ -28,7 +28,7 @@ def main() -> None: - connect to the BMC using standard password, if that fails then use password supplied in --old-bmc-password option, or factory default - - ensure standard BMC password is set + - ensure standard BMC password is set, and other basic BMC settings - from BMC, discover basic hardware info: - manufacturer, model number, serial number @@ -38,16 +38,13 @@ def main() -> None: - MAC address - LLDP connections [{remote_mac, remote_interface_name}] - - Figure out which NIC to use for PXE + - Figure out which NICs to enable for PXE - - Using BMC, configure our standard BIOS settings - - set PXE boot device - - set timezone to UTC - - set the hostname + - Configure our standard BIOS settings including HTTP boot devices - Find or create this baremetal node in Ironic - - set the name to "{manufacturer}-{servicetag}" - - set the driver as appropriate + - Set the name to "{manufacturer}-{servicetag}" + - Set the driver as appropriate for the manufacturer/model - Configure RAID - Transition through enrol->manage->inspect->cleaning->provide """ @@ -88,22 +85,22 @@ def enrol( if insufficient_lldp_data(device_info): device_info = power_on_and_wait(bmc, device_info) - pxe_interface = guess_pxe_interface(device_info, pxe_switch_macs) - logger.info("Selected %s as PXE interface", pxe_interface) + pxe_interfaces = guess_pxe_interfaces(device_info, pxe_switch_macs) + logger.info("Selected %s as PXE interfaces", pxe_interfaces) node = ironic_node.create_or_update( bmc=bmc, name=device_name(device_info), manufacturer=device_info.manufacturer, external_cmdb_id=external_cmdb_id, - enrolled_pxe_port=pxe_interface, + enrolled_pxe_ports=pxe_interfaces, ) if raid_configure: # Raid configuration runs a clean step which does a PXE boot. That # can't work unless we first apply_bios_settings. # TODO: why isn't skip_ramdisk working here? - apply_bios_settings(bmc, pxe_interface) + apply_bios_settings(bmc, pxe_interfaces) configure_raid(node, bmc) else: logger.info("%s RAID configuration was not requested", node.uuid) @@ -124,10 +121,10 @@ def enrol( # Applying BIOS settings on Dell servers requires a reboot which we achieve # by initiating agent inspection. Agent inspection requires BIOS settings # (to set boot device). Therefore these two actions must go hand-in-hand. - # + # # Note that we may have already applied BIOS settings above. That is okay, # it is idempotent. - apply_bios_settings(bmc, pxe_interface) + apply_bios_settings(bmc, pxe_interfaces) ironic_node.inspect(node) # After successful inspection, our node is left in "manageable" state. All @@ -255,8 +252,8 @@ def build_raid_config(controller: str, physical_disks: list[str]): return result -def apply_bios_settings(bmc: Bmc, pxe_interface: str): - update_dell_bios_settings(bmc, pxe_interface=pxe_interface) +def apply_bios_settings(bmc: Bmc, pxe_interfaces: list[str]): + update_dell_bios_settings(bmc, pxe_interfaces=pxe_interfaces) def parse_bool(value: str) -> bool: diff --git a/python/understack-workflows/understack_workflows/pxe_port_heuristic.py b/python/understack-workflows/understack_workflows/pxe_port_heuristic.py index 02cffa997..f9295e125 100644 --- a/python/understack-workflows/understack_workflows/pxe_port_heuristic.py +++ b/python/understack-workflows/understack_workflows/pxe_port_heuristic.py @@ -1,81 +1,64 @@ import logging from understack_workflows.bmc_chassis_info import ChassisInfo +from understack_workflows.bmc_chassis_info import InterfaceInfo logger = logging.getLogger(__name__) # We try not choose interface whose name contains any of these: DISQUALIFIED = ["DRAC", "ILO", "NIC.EMBEDDED"] -# Preferred interfaces, most likely first -NIC_PREFERENCE = [ - "NIC.Integrated.1-1-1", - "NIC.Integrated.1-1", - "NIC.Slot.1-1-1", - "NIC.Slot.1-1", - "NIC.Integrated.1-2-1", - "NIC.Integrated.1-2", - "NIC.Slot.1-2-1", - "NIC.Slot.1-2", - "NIC.Slot.1-3-1", - "NIC.Slot.1-3", - "NIC.Slot.2-1-1", - "NIC.Slot.2-1", - "NIC.Integrated.2-1-1", - "NIC.Integrated.2-1", - "NIC.Slot.2-2-1", - "NIC.Slot.2-2", - "NIC.Integrated.2-2-1", - "NIC.Integrated.2-2", - "NIC.Slot.3-1-1", - "NIC.Slot.3-1", - "NIC.Slot.3-2-1", - "NIC.Slot.3-2", -] - - -def guess_pxe_interface( + +def guess_pxe_interfaces( device_info: ChassisInfo, pxe_switch_macs: set[str] | None = None -) -> str: - """Determine most probable PXE interface for BMC.""" +) -> list[str]: + """First 8 interface names, most probable PXE interfaces first.""" if pxe_switch_macs is None: pxe_switch_macs = set() - candidate_interfaces = { - i - for i in device_info.interfaces - if not any(q in i.name.upper() for q in DISQUALIFIED) - } - candidate_interface_names = {i.name for i in candidate_interfaces} - connected_interface_names = { - i.name for i in candidate_interfaces if i.remote_switch_mac_address - } - pxe_connected_interface_names = { + candidate_interfaces = {i for i in device_info.interfaces if not disqualified(i)} + + names = [ i.name - for i in candidate_interfaces - if i.remote_switch_mac_address in pxe_switch_macs - } - - for name in NIC_PREFERENCE: - if name in pxe_connected_interface_names: - logger.info("PXE port is %s, connected to known pxe switch", name) - return name - - for name in NIC_PREFERENCE: - if name in connected_interface_names: - logger.info("PXE port is %s, the first connected interface", name) - return name - - for name in NIC_PREFERENCE: - if name in candidate_interface_names: - logger.info("PXE port is %s, preferred eligible interface", name) - return name - - if candidate_interface_names: - name = min(candidate_interface_names) - logger.info("PXE port is %s, first eligible interface", name) - return name - - name = device_info.interfaces[0].name - logger.info("PXE port is %s, chosen as last resort", name) - return name + for i in sorted( + candidate_interfaces, + key=lambda i: likelihood(i, pxe_switch_macs), + ) + ] + + return names[0:7] + + +def disqualified(interface: InterfaceInfo) -> bool: + return any(x in interface.name.upper() for x in DISQUALIFIED) + + +def likelihood(interface: InterfaceInfo, pxe_switch_macs: set[str]) -> list[bool | str]: + """A value that is sortable. Lower is better.""" + return [ + # python sort order: false is "better" than true + not connected_to_known_pxe_switch(interface, pxe_switch_macs), + not connected_to_any_switch(interface), + nic_port_number(interface), + interface.name, # tiebreaker + ] + + +def connected_to_known_pxe_switch(interface: InterfaceInfo, pxe_switch_macs) -> bool: + return interface.remote_switch_mac_address in pxe_switch_macs + + +def connected_to_any_switch(interface: InterfaceInfo) -> bool: + return interface.remote_switch_mac_address is not None + + +def nic_port_number(interface: InterfaceInfo) -> str: + """Preference of interface name like NIC.Slot.1-2-1. + + Prefer Lowest partition, then + prefer Lowest port, then + prefer Lowest slot number, then + prefer Integrated over Slot. + """ + reversed_name = interface.name[::-1] + return reversed_name From 858f3ae3a4890027c8f6e1452778c68c01ede967 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Tue, 7 Apr 2026 13:35:30 +0100 Subject: [PATCH 10/15] Multi-port PXE behaviour for Redfish inspection port_bios_name_hook We no longer try to set a single solitary interface as "the" PXE interface. We now set a whole slew of interfaces - the server will try them all and use the one that works. So we want Ironic to support HTTP boot on any of the interfaces listed. At the time this hook runs, we don't know which interface will get used, so we set the "pxe" flag on all of them. --- .../ironic_understack/port_bios_name_hook.py | 84 ++++------------ .../tests/test_port_bios_name_hook.py | 98 ++++++++++--------- 2 files changed, 72 insertions(+), 110 deletions(-) diff --git a/python/ironic-understack/ironic_understack/port_bios_name_hook.py b/python/ironic-understack/ironic_understack/port_bios_name_hook.py index d631befbe..f9fdc73be 100644 --- a/python/ironic-understack/ironic_understack/port_bios_name_hook.py +++ b/python/ironic-understack/ironic_understack/port_bios_name_hook.py @@ -1,8 +1,6 @@ from typing import ClassVar -from ironic.common import exception from ironic.drivers.modules.inspector.hooks import base -from ironic.objects.bios import BIOSSetting from oslo_log import log as logging from ironic_understack.ironic_wrapper import ironic_ports_for_node @@ -10,18 +8,17 @@ LOG = logging.getLogger(__name__) PXE_BIOS_NAME_PREFIXES = ["NIC.Integrated", "NIC.Slot"] -BIOS_SETTING_NAME = "HttpDev1Interface" class PortBiosNameHook(base.InspectionHook): """Set bios_name, pxe_enabled, local_link_connection and physical_network. Populates extra.bios_name and port name from inspection inventory, then - determines the PXE port from the BIOS HttpDev1Interface setting (populated - during enrolment). Falls back to a naming-convention heuristic if the - BIOS setting is unavailable. + determines PXE-enabled ports from node.extra["enrolled_pxe_ports"] + (populated during enrolment). If that data is unavailable, all + NIC.Integrated.* and NIC.Slot.* ports are treated as PXE-enabled. - The PXE port gets pxe_enabled=True plus placeholder physical_network and + PXE ports get pxe_enabled=True plus placeholder physical_network and local_link_connection values that neutron requires. """ @@ -37,7 +34,7 @@ def __call__(self, task, inventory, plugin_data): i["mac_address"].upper(): i["name"] for i in inspected_interfaces } - pxe_nic = _bios_pxe_nic(task) + pxe_nics = _enrolled_pxe_nics(task) for baremetal_port in ironic_ports_for_node(task.context, task.node.id): mac = baremetal_port.address.upper() @@ -46,15 +43,10 @@ def __call__(self, task, inventory, plugin_data): _set_port_extra(baremetal_port, mac, bios_name) _set_port_name(baremetal_port, mac, bios_name, task.node.name) - if pxe_nic: - is_pxe = bios_name is not None and ( - pxe_nic.startswith(bios_name) or bios_name.startswith(pxe_nic) - ) - else: - # Fallback: heuristic based on naming convention - is_pxe = bios_name == _pxe_interface_name( - inspected_interfaces, task.node.uuid - ) + is_pxe = bios_name is not None and any( + pxe_nic.startswith(bios_name) or bios_name.startswith(pxe_nic) + for pxe_nic in pxe_nics + ) if baremetal_port.pxe_enabled != is_pxe: LOG.info( @@ -72,45 +64,24 @@ def __call__(self, task, inventory, plugin_data): _set_port_local_link_connection(baremetal_port, mac) -def _bios_pxe_nic(task): - """Read the BIOS PXE NIC FQDD, or return None if unavailable.""" - try: - task.driver.bios.cache_bios_settings(task) - except Exception: - LOG.warning( - "Cannot cache BIOS settings for node %s, " - "falling back to naming heuristic for PXE port.", - task.node.uuid, - ) - return None - - try: - setting = BIOSSetting.get(task.context, task.node.id, BIOS_SETTING_NAME) - except exception.BIOSSettingNotFound: - LOG.warning( - "BIOS setting %s not found for node %s, " - "falling back to naming heuristic for PXE port.", - BIOS_SETTING_NAME, - task.node.uuid, - ) - return None - - if not setting.value: +def _enrolled_pxe_nics(task) -> list[str]: + """Read enrolled PXE NIC names from node.extra, or use broad prefixes.""" + enrolled_pxe_nics = task.node.extra.get("enrolled_pxe_ports") + if enrolled_pxe_nics is None: LOG.warning( - "BIOS setting %s is empty for node %s, " - "falling back to naming heuristic for PXE port.", - BIOS_SETTING_NAME, + "Node %s extra.enrolled_pxe_ports is missing, " + "setting pxe flag on all interfaces starting %s.", task.node.uuid, + PXE_BIOS_NAME_PREFIXES, ) - return None + return PXE_BIOS_NAME_PREFIXES LOG.info( - "Node %s BIOS %s = %s", + "Set node %s pxe flag on interfaces from extra.enrolled_pxe_ports %s", task.node.uuid, - BIOS_SETTING_NAME, - setting.value, + enrolled_pxe_nics, ) - return setting.value + return enrolled_pxe_nics def _set_port_extra(baremetal_port, mac, required_bios_name): @@ -165,18 +136,3 @@ def _set_port_local_link_connection(baremetal_port, mac): baremetal_port.local_link_connection, ) baremetal_port.save() - - -def _pxe_interface_name(inspected_interfaces: list[dict], node_uuid) -> str | None: - """Use a heuristic to determine our default interface for PXE.""" - names = sorted(i["name"] for i in inspected_interfaces) - for prefix in PXE_BIOS_NAME_PREFIXES: - for name in names: - if name.startswith(prefix): - return name - LOG.error( - "No PXE interface found for node %s. Expected to find an " - "interface whose bios_name starts with one of %s", - node_uuid, - PXE_BIOS_NAME_PREFIXES, - ) diff --git a/python/ironic-understack/ironic_understack/tests/test_port_bios_name_hook.py b/python/ironic-understack/ironic_understack/tests/test_port_bios_name_hook.py index edd6b772f..c5966dcfc 100644 --- a/python/ironic-understack/ironic_understack/tests/test_port_bios_name_hook.py +++ b/python/ironic-understack/ironic_understack/tests/test_port_bios_name_hook.py @@ -1,6 +1,5 @@ import logging -from ironic.common import exception from oslo_utils import uuidutils from ironic_understack.port_bios_name_hook import PortBiosNameHook @@ -14,32 +13,12 @@ } -def _make_task(mocker, bios_value=None, bios_error=None, cache_error=None): - mock_node = mocker.Mock(id=1234, uuid=uuidutils.generate_uuid()) +def _make_task(mocker, enrolled_pxe_ports=None): + mock_node = mocker.Mock(id=1234, uuid=uuidutils.generate_uuid(), extra={}) mock_node.name = "Dell-CR1MB0" - task = mocker.Mock(node=mock_node, context=mocker.Mock()) - - if cache_error: - task.driver.bios.cache_bios_settings.side_effect = cache_error - elif bios_error: - mocker.patch( - "ironic_understack.port_bios_name_hook.BIOSSetting.get", - side_effect=bios_error, - ) - elif bios_value is not None: - setting = mocker.Mock(value=bios_value) - mocker.patch( - "ironic_understack.port_bios_name_hook.BIOSSetting.get", - return_value=setting, - ) - else: - setting = mocker.Mock(value=None) - mocker.patch( - "ironic_understack.port_bios_name_hook.BIOSSetting.get", - return_value=setting, - ) - - return task + if enrolled_pxe_ports is not None: + mock_node.extra["enrolled_pxe_ports"] = enrolled_pxe_ports + return mocker.Mock(node=mock_node, context=mocker.Mock()) def _make_port(mocker, mac, **kwargs): @@ -57,10 +36,10 @@ def _make_port(mocker, mac, **kwargs): return port -def test_pxe_from_bios_setting(mocker, caplog): - """BIOS HttpDev1Interface determines pxe_enabled.""" +def test_pxe_from_enrolled_pxe_ports(mocker, caplog): + """node.extra.enrolled_pxe_ports determines pxe_enabled.""" caplog.set_level(logging.DEBUG) - task = _make_task(mocker, bios_value="NIC.Integrated.1-1-1") + task = _make_task(mocker, enrolled_pxe_ports=["NIC.Integrated.1-1-1"]) port1 = _make_port(mocker, "11:11:11:11:11:11") port2 = _make_port(mocker, "22:22:22:22:22:22") @@ -79,10 +58,13 @@ def test_pxe_from_bios_setting(mocker, caplog): assert port1.physical_network == "enrol" -def test_pxe_fallback_when_cache_fails(mocker, caplog): - """Falls back to heuristic when BIOS cache fails.""" +def test_pxe_from_enrolled_pxe_ports_enables_multiple_ports(mocker, caplog): + """Every port in enrolled_pxe_ports gets pxe_enabled=True.""" caplog.set_level(logging.DEBUG) - task = _make_task(mocker, cache_error=Exception("no BIOS")) + task = _make_task( + mocker, + enrolled_pxe_ports=["NIC.Integrated.1-1-1", "NIC.Integrated.1-2-1"], + ) port1 = _make_port(mocker, "11:11:11:11:11:11") port2 = _make_port(mocker, "22:22:22:22:22:22") @@ -94,19 +76,14 @@ def test_pxe_fallback_when_cache_fails(mocker, caplog): PortBiosNameHook().__call__(task, _INVENTORY, {}) - # Heuristic picks NIC.Integrated.1-1 (first sorted match) assert port1.pxe_enabled is True - assert port2.pxe_enabled is False - assert "falling back to naming heuristic" in caplog.text + assert port2.pxe_enabled is True -def test_pxe_fallback_when_setting_missing(mocker, caplog): - """Falls back to heuristic when BIOS setting not found.""" +def test_pxe_fallback_when_enrolled_pxe_ports_missing(mocker, caplog): + """Missing enrolled_pxe_ports enables all matching PXE prefix ports.""" caplog.set_level(logging.DEBUG) - task = _make_task( - mocker, - bios_error=exception.BIOSSettingNotFound(node="fake", name="HttpDev1Interface"), - ) + task = _make_task(mocker) port1 = _make_port(mocker, "11:11:11:11:11:11") port2 = _make_port(mocker, "22:22:22:22:22:22") @@ -119,14 +96,43 @@ def test_pxe_fallback_when_setting_missing(mocker, caplog): PortBiosNameHook().__call__(task, _INVENTORY, {}) assert port1.pxe_enabled is True - assert port2.pxe_enabled is False - assert "falling back to naming heuristic" in caplog.text + assert port2.pxe_enabled is True + assert "setting pxe flag on all interfaces starting" in caplog.text + + +def test_missing_enrolled_pxe_ports_enables_slot_ports_too(mocker, caplog): + """Missing enrolled_pxe_ports enables all supported PXE port prefixes.""" + caplog.set_level(logging.DEBUG) + task = _make_task(mocker) + inventory = { + "memory": {"physical_mb": 98304}, + "interfaces": [ + {"mac_address": "11:11:11:11:11:11", "name": "NIC.Integrated.1-1"}, + {"mac_address": "22:22:22:22:22:22", "name": "NIC.Slot.1-2"}, + {"mac_address": "33:33:33:33:33:33", "name": "eno1"}, + ], + } + + port1 = _make_port(mocker, "11:11:11:11:11:11") + port2 = _make_port(mocker, "22:22:22:22:22:22") + port3 = _make_port(mocker, "33:33:33:33:33:33") + + mocker.patch( + "ironic_understack.port_bios_name_hook.ironic_ports_for_node", + return_value=[port1, port2, port3], + ) + + PortBiosNameHook().__call__(task, inventory, {}) + + assert port1.pxe_enabled is True + assert port2.pxe_enabled is True + assert port3.pxe_enabled is False def test_retaining_physical_network(mocker, caplog): """Existing physical_network and local_link_connection are preserved.""" caplog.set_level(logging.DEBUG) - task = _make_task(mocker, bios_value="NIC.Integrated.1-1-1") + task = _make_task(mocker, enrolled_pxe_ports=["NIC.Integrated.1-1-1"]) port = _make_port( mocker, @@ -153,7 +159,7 @@ def test_retaining_physical_network(mocker, caplog): def test_clears_pxe_on_previously_enabled_port(mocker, caplog): """Port that was pxe_enabled but no longer matches gets cleared.""" caplog.set_level(logging.DEBUG) - task = _make_task(mocker, bios_value="NIC.Integrated.1-2-1") + task = _make_task(mocker, enrolled_pxe_ports=["NIC.Integrated.1-2-1"]) port1 = _make_port(mocker, "11:11:11:11:11:11", pxe_enabled=True) port2 = _make_port(mocker, "22:22:22:22:22:22") @@ -172,7 +178,7 @@ def test_clears_pxe_on_previously_enabled_port(mocker, caplog): def test_removing_bios_name(mocker, caplog): """Port with unknown MAC gets bios_name removed.""" caplog.set_level(logging.DEBUG) - task = _make_task(mocker, bios_value="NIC.Integrated.1-1-1") + task = _make_task(mocker, enrolled_pxe_ports=["NIC.Integrated.1-1-1"]) port = _make_port( mocker, From 6d2e5109ca99c9ddf604efcecd3a86b9d6568d1e Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Tue, 7 Apr 2026 16:56:47 +0100 Subject: [PATCH 11/15] Retry redfish requests after powering on the server (takes a long time) We see HTTP 503 errors from redfish for a while after booting the server. --- .../tests/test_enrol_server.py | 32 +++++++++++++++++++ .../main/enroll_server.py | 31 ++++++++++++++++-- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/python/understack-workflows/tests/test_enrol_server.py b/python/understack-workflows/tests/test_enrol_server.py index 55e669217..f3112bce3 100644 --- a/python/understack-workflows/tests/test_enrol_server.py +++ b/python/understack-workflows/tests/test_enrol_server.py @@ -6,6 +6,7 @@ from ironicclient.common.apiclient import exceptions as ironic_exceptions from ironicclient.v1.node import Node +from understack_workflows.bmc import RedfishRequestError from understack_workflows.bmc_chassis_info import ChassisInfo from understack_workflows.bmc_chassis_info import InterfaceInfo from understack_workflows.main import enroll_server @@ -246,6 +247,37 @@ def test_enrol_happy_path_uses_real_ironic_workflow(mocker): assert patch == [{"op": "add", "path": "/inspect_interface", "value": "agent"}] +def test_power_on_and_wait_retries_temporary_redfish_503(mocker): + initial_info = make_device_info(power_on=False, connected_mac=None) + powered_info = make_device_info( + power_on=True, + connected_mac="AA:BB:CC:DD:EE:FF", + ) + fake_bmc = make_bmc(mocker) + sleep = mocker.patch.object(enroll_server.time, "sleep") + mocker.patch.object( + enroll_server, + "chassis_info", + side_effect=[ + RedfishRequestError( + "BMC communications failure HTTP 503 Service Unavailable " + "Base.1.12.ServiceTemporarilyUnavailable" + ), + powered_info, + ], + ) + + result = enroll_server.power_on_and_wait(fake_bmc, initial_info) + + assert result is powered_info + fake_bmc.redfish_request.assert_called_once_with( + path="/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset", + payload={"ResetType": "On"}, + method="POST", + ) + assert sleep.call_args_list == [call(120), call(30)] + + def test_enrol_existing_failed_node_recovers_and_updates(mocker): device_info = make_device_info( power_on=True, diff --git a/python/understack-workflows/understack_workflows/main/enroll_server.py b/python/understack-workflows/understack_workflows/main/enroll_server.py index 22f0699ff..3df7b0362 100644 --- a/python/understack-workflows/understack_workflows/main/enroll_server.py +++ b/python/understack-workflows/understack_workflows/main/enroll_server.py @@ -9,6 +9,7 @@ from understack_workflows import ironic_node from understack_workflows.bmc import Bmc +from understack_workflows.bmc import RedfishRequestError from understack_workflows.bmc import bmc_for_ip_address from understack_workflows.bmc_bios import update_dell_bios_settings from understack_workflows.bmc_chassis_info import ChassisInfo @@ -21,6 +22,10 @@ logger = logging.getLogger(__name__) +POST_POWER_ON_WAIT_SECONDS = 120 +POST_POWER_ON_RETRY_SECONDS = 30 +POST_POWER_ON_MAX_RETRIES = 8 + def main() -> None: """On-board new or Refresh existing baremetal node. @@ -187,8 +192,30 @@ def power_on_and_wait(bmc: Bmc, device_info: ChassisInfo) -> ChassisInfo: payload={"ResetType": "On"}, method="POST", ) - time.sleep(120) - return chassis_info(bmc) + time.sleep(POST_POWER_ON_WAIT_SECONDS) + + for attempt in range(POST_POWER_ON_MAX_RETRIES + 1): + try: + return chassis_info(bmc) + except RedfishRequestError as exc: + if not _is_temporary_redfish_unavailable(exc): + raise + if attempt == POST_POWER_ON_MAX_RETRIES: + raise + logger.warning( + "BMC Redfish data is temporarily unavailable after power on for %s. " + "Retrying in %s seconds.", + bmc.ip_address, + POST_POWER_ON_RETRY_SECONDS, + ) + time.sleep(POST_POWER_ON_RETRY_SECONDS) + + raise AssertionError("unreachable") + + +def _is_temporary_redfish_unavailable(exc: RedfishRequestError) -> bool: + message = str(exc) + return "HTTP 503" in message or "ServiceTemporarilyUnavailable" in message def device_name(device_info: ChassisInfo) -> str: From 0aaf48768ebe1d22f26afb55fbcb5b682ceb743d Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Thu, 9 Apr 2026 12:56:52 +0100 Subject: [PATCH 12/15] Use consistent spelling of enroll --- .../tests/{test_enrol_server.py => enroll_server.py} | 0 workflows/argo-events/workflowtemplates/enroll-server.yaml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename python/understack-workflows/tests/{test_enrol_server.py => enroll_server.py} (100%) diff --git a/python/understack-workflows/tests/test_enrol_server.py b/python/understack-workflows/tests/enroll_server.py similarity index 100% rename from python/understack-workflows/tests/test_enrol_server.py rename to python/understack-workflows/tests/enroll_server.py diff --git a/workflows/argo-events/workflowtemplates/enroll-server.yaml b/workflows/argo-events/workflowtemplates/enroll-server.yaml index 21bc8cddb..0001ee46a 100644 --- a/workflows/argo-events/workflowtemplates/enroll-server.yaml +++ b/workflows/argo-events/workflowtemplates/enroll-server.yaml @@ -40,7 +40,7 @@ spec: container: image: ghcr.io/rackerlabs/understack/ironic-nautobot-client:latest command: - - enrol-server + - enroll-server args: - --ip-address - "{{workflow.parameters.ip_address}}" From 9fa6122e9d470eb7e533b047587cb7e2ce318eb6 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Mon, 13 Apr 2026 16:39:37 +0100 Subject: [PATCH 13/15] Fix test file naming, was mistakenly created without "test" in filename --- .../tests/{enroll_server.py => test_enroll_server.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/understack-workflows/tests/{enroll_server.py => test_enroll_server.py} (100%) diff --git a/python/understack-workflows/tests/enroll_server.py b/python/understack-workflows/tests/test_enroll_server.py similarity index 100% rename from python/understack-workflows/tests/enroll_server.py rename to python/understack-workflows/tests/test_enroll_server.py From e0e7338e568ba64e1bd253247dbd50fde9e1ee82 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Mon, 13 Apr 2026 16:42:17 +0100 Subject: [PATCH 14/15] Fix off-by-one error with python array slice --- .../understack_workflows/pxe_port_heuristic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/understack-workflows/understack_workflows/pxe_port_heuristic.py b/python/understack-workflows/understack_workflows/pxe_port_heuristic.py index f9295e125..c1ab42081 100644 --- a/python/understack-workflows/understack_workflows/pxe_port_heuristic.py +++ b/python/understack-workflows/understack_workflows/pxe_port_heuristic.py @@ -26,7 +26,7 @@ def guess_pxe_interfaces( ) ] - return names[0:7] + return names[0:8] def disqualified(interface: InterfaceInfo) -> bool: From a988d7be2c8b7c04ba8480af99fc3138986483ce Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Mon, 13 Apr 2026 16:47:06 +0100 Subject: [PATCH 15/15] When pxe_interfaces is set to the empty list, use fallback behaviour --- .../ironic_understack/port_bios_name_hook.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/ironic-understack/ironic_understack/port_bios_name_hook.py b/python/ironic-understack/ironic_understack/port_bios_name_hook.py index f9fdc73be..2e969b538 100644 --- a/python/ironic-understack/ironic_understack/port_bios_name_hook.py +++ b/python/ironic-understack/ironic_understack/port_bios_name_hook.py @@ -67,7 +67,14 @@ def __call__(self, task, inventory, plugin_data): def _enrolled_pxe_nics(task) -> list[str]: """Read enrolled PXE NIC names from node.extra, or use broad prefixes.""" enrolled_pxe_nics = task.node.extra.get("enrolled_pxe_ports") - if enrolled_pxe_nics is None: + if enrolled_pxe_nics: + LOG.info( + "Set node %s pxe flag on interfaces from extra.enrolled_pxe_ports %s", + task.node.uuid, + enrolled_pxe_nics, + ) + return enrolled_pxe_nics + else: LOG.warning( "Node %s extra.enrolled_pxe_ports is missing, " "setting pxe flag on all interfaces starting %s.", @@ -76,13 +83,6 @@ def _enrolled_pxe_nics(task) -> list[str]: ) return PXE_BIOS_NAME_PREFIXES - LOG.info( - "Set node %s pxe flag on interfaces from extra.enrolled_pxe_ports %s", - task.node.uuid, - enrolled_pxe_nics, - ) - return enrolled_pxe_nics - def _set_port_extra(baremetal_port, mac, required_bios_name): extra = baremetal_port.extra