← Back to team overview

sts-sponsors team mailing list archive

[Merge] ~adam-collard/maas-ci/+git/system-tests:retry-boot into ~maas-committers/maas-ci/+git/system-tests:master

 

Adam Collard has proposed merging ~adam-collard/maas-ci/+git/system-tests:retry-boot into ~maas-committers/maas-ci/+git/system-tests:master.

Commit message:
[lxd vm] re-attempt boot (up to 3 tries) if the machine has not appeared in MAAS within 5 minutes; use ready_remote_maas instead of wait_for_ready_controllers


Requested reviews:
  MAAS Committers (maas-committers)

For more details, see:
https://code.launchpad.net/~adam-collard/maas-ci/+git/system-tests/+merge/436109
-- 
Your team MAAS Committers is requested to review the proposed merge of ~adam-collard/maas-ci/+git/system-tests:retry-boot into ~maas-committers/maas-ci/+git/system-tests:master.
diff --git a/systemtests/env_builder/test_basic.py b/systemtests/env_builder/test_basic.py
index 7ecfbd4..7405a05 100644
--- a/systemtests/env_builder/test_basic.py
+++ b/systemtests/env_builder/test_basic.py
@@ -9,6 +9,11 @@ from urllib.request import urlopen
 import pytest
 from retry import retry
 
+from systemtests.api import (
+    AuthenticatedAPIClient,
+    Machine,
+    UnauthenticatedMAASAPIClient,
+)
 from systemtests.lxd import Instance, get_lxd
 from systemtests.utils import (
     UnexpectedMachineStatus,
@@ -17,17 +22,59 @@ from systemtests.utils import (
     retries,
     wait_for_machine,
     wait_for_new_machine,
-    wait_for_ready_controllers,
 )
 
 if TYPE_CHECKING:
     from logging import Logger
 
-    from systemtests.api import AuthenticatedAPIClient, UnauthenticatedMAASAPIClient
     from systemtests.machine_config import MachineConfig
     from systemtests.region import MAASRegion
 
 
+@retry(tries=3)
+def _ensure_machine_enlisted(
+    maas_api_client: AuthenticatedAPIClient,
+    mac_address: str,
+    instance: Instance,
+) -> Machine:
+    instance_log = instance.logger.getChild(instance.name)
+    # Find the VM in MAAS by MAC
+    maybe_machine = maas_api_client.list_machines(mac_address=mac_address)
+    if maybe_machine:
+        # Yay, it exists
+        return maybe_machine[0]
+
+    # Machine not registered, let's boot it up
+    @retry(tries=5, delay=5, backoff=1.2, logger=instance_log)
+    def _boot_vm(vm: Instance) -> None:
+        status = instance.status()
+        if status == "RUNNING":
+            instance_log.debug("already running, restarting")
+            instance.restart()
+        elif status == "STOPPED":
+            instance_log.debug("is stopped, starting")
+            try:
+                instance.start()
+            except CalledProcessError:
+                debug_lxd_vm(instance.name, instance_log)
+                raise
+        else:
+            assert False, f"Don't know how to handle lxd_vm status: {status}"
+
+    _boot_vm(instance)
+    try:
+        vm_status = instance.status()
+    except ValueError:
+        vm_status = "not available"
+    instance_log.debug(f"is {vm_status}")
+
+    machine = wait_for_new_machine(
+        maas_api_client, mac_address, instance.name, timeout=(5 * 60, 20)
+    )
+    instance_log.debug(f"found machine {machine['hostname']}")
+    return machine
+
+
 class TestSetup:
     @pytest.mark.skip_if_installed_from_snap("Prometheus is installed in the snap")
     def test_setup_prometheus(
@@ -114,67 +161,40 @@ class TestSetup:
 
     def test_ensure_ready_vm_for_hardware_sync(
         self,
+        ready_remote_maas: None,
         instance_config: MachineConfig,
         maas_api_client: AuthenticatedAPIClient,
         testlog: Logger,
     ) -> None:
         """Ensure that we have a Ready VM at the end."""
-        lxd = get_lxd(logger=testlog)
         vm_name = instance_config.name
+        lxd = get_lxd(logger=testlog)
         instance = Instance(lxd, vm_name)
         if instance.exists():
             # Force delete the VM so we know we're starting clean
             instance.delete()
 
-        # Ensure that the Region Controller is ready
-        wait_for_ready_controllers(maas_api_client)
+        mac_address = instance_config.mac_address
         # Need to create a network device with a hwaddr
-        config: dict[str, str] = {"security.secureboot": "false"}
+        config: dict[str, str] = {
+            "security.secureboot": "false",
+            "volatile.eth0.hwaddr": mac_address,
+        }
         if instance_config.lxd_profile:
             config["profile"] = instance_config.lxd_profile
-        if instance_config.mac_address:
-            config["volatile.eth0.hwaddr"] = instance_config.mac_address
 
         instance = lxd.create_vm(vm_name, config)
 
-        mac_address = instance_config.mac_address
-
-        # Find the VM in MAAS by MAC
-        maybe_machine = maas_api_client.list_machines(mac_address=mac_address)
-        if maybe_machine:
-            # Yay, it exists
-            machine = maybe_machine[0]
-        else:
-            # Machine not registered, let's boot it up
-            @retry(tries=5, delay=5, backoff=1.2, logger=testlog)
-            def _boot_vm(vm: Instance) -> None:
-                status = instance.status()
-                if status == "RUNNING":
-                    testlog.debug(f"{instance.name} is already running, restarting")
-                    instance.restart()
-                elif status == "STOPPED":
-                    testlog.debug(f"{instance.name} is stopped, starting")
-                    try:
-                        instance.start()
-                    except CalledProcessError:
-                        debug_lxd_vm(vm_name, testlog)
-                        raise
-                else:
-                    assert False, f"Don't know how to handle lxd_vm status: {status}"
-
-            _boot_vm(instance)
-            try:
-                vm_status = instance.status()
-            except ValueError:
-                vm_status = "not available"
-            testlog.debug(f"{vm_name} is {vm_status}")
-
-            try:
-                machine = wait_for_new_machine(maas_api_client, mac_address, vm_name)
-            except UnexpectedMachineStatus as err:
-                # We know that this is a LXD VM - so debug it
-                err.debug_info.extend(debug_lxd_vm(vm_name, testlog))
-                raise
+        maas_api_client.logger = testlog.getChild(vm_name)
+        # Try 3 times to boot the LXD VM and get it enlisted
+        try:
+            machine = _ensure_machine_enlisted(maas_api_client, mac_address, instance)
+        except UnexpectedMachineStatus as err:
+            # We know that this is a LXD VM - so debug it
+            err.debug_info.extend(
+                debug_lxd_vm(instance.name, testlog.getChild(vm_name))
+            )
+            assert False, err
 
         # Make sure we have power parameters set
         if not machine["power_type"]:
diff --git a/systemtests/state.py b/systemtests/state.py
index 57f0f7c..38494d0 100644
--- a/systemtests/state.py
+++ b/systemtests/state.py
@@ -164,6 +164,7 @@ def configured_maas(
 def all_rack_controllers_commissioned(
     logger: Logger, admin: AuthenticatedAPIClient
 ) -> bool:
+    """Check if all rack controllers have passed commissioning."""
     for rack in get_rack_controllers(admin):
         status = rack["commissioning_status"]
         status_name = rack["commissioning_status_name"]
diff --git a/systemtests/utils.py b/systemtests/utils.py
index 4b0e63d..1edd11a 100644
--- a/systemtests/utils.py
+++ b/systemtests/utils.py
@@ -6,7 +6,6 @@ import random
 import re
 import string
 import time
-from collections import Counter
 from dataclasses import dataclass
 from logging import Logger
 from typing import Iterator, Optional, TypedDict, Union
@@ -153,28 +152,6 @@ def debug_last_events(
     return events
 
 
-def wait_for_ready_controllers(
-    api_client: api.AuthenticatedAPIClient, timeout: float = 10 * 60, delay: float = 30
-) -> None:
-    """Wait for all region controllers to have passed commissioning."""
-    quiet_client = api.QuietAuthenticatedAPIClient.from_api_client(api_client)
-    for retry_info in retries(timeout, delay):
-        region_controllers = quiet_client.list_region_controllers()
-        commissioning_statuses = Counter(
-            rc["commissioning_status_name"] for rc in region_controllers
-        )
-        passed_count = commissioning_statuses["Passed"]
-        total_count = sum(commissioning_statuses.values())
-        if passed_count == total_count:
-            api_client.logger.debug("All region controllers have passed commissioning!")
-            return
-        else:
-            api_client.logger.debug(
-                "Not all region controllers have passed commissioning ("
-                f"{passed_count}/{total_count}), sleeping for {delay} seconds"
-            )
-
-
 # XXX: Move to api.py
 def wait_for_machine(
     api_client: api.AuthenticatedAPIClient,
@@ -222,12 +199,15 @@ def debug_lxd_vm(machine_name: str, logger: Logger) -> list[str]:
 
 # XXX: Move to api.py
 def wait_for_new_machine(
-    api_client: api.AuthenticatedAPIClient, mac_address: str, machine_name: str
+    api_client: api.AuthenticatedAPIClient,
+    mac_address: str,
+    machine_name: str,
+    timeout: tuple[float, float] = (30 * 60, 30),
 ) -> api.Machine:
     """Blocks execution until a machine with the given mac_address appears as New."""
     __tracebackhide__ = True
     quiet_client = api.QuietAuthenticatedAPIClient.from_api_client(api_client)
-    for retry_info in retries(50 * 60, 30):
+    for retry_info in retries(*timeout):
         machines = quiet_client.list_machines(mac_address=mac_address, status="new")
         if machines:
             return machines[0]

Follow-ups