sts-sponsors team mailing list archive
-
sts-sponsors team
-
Mailing list archive
-
Message #04701
[Merge] ~adam-collard/maas-ci/+git/system-tests:retry-boot into ~maas-committers/maas-ci/+git/system-tests:master
Adam Collard has proposed merging ~adam-collard/maas-ci/+git/system-tests:retry-boot into ~maas-committers/maas-ci/+git/system-tests:master.
Commit message:
[lxd vm] re-attempt boot after 5 minutes of not seeing the machine; use ready_remote_maas
Requested reviews:
MAAS Committers (maas-committers)
For more details, see:
https://code.launchpad.net/~adam-collard/maas-ci/+git/system-tests/+merge/436109
--
Your team MAAS Committers is requested to review the proposed merge of ~adam-collard/maas-ci/+git/system-tests:retry-boot into ~maas-committers/maas-ci/+git/system-tests:master.
diff --git a/systemtests/env_builder/test_basic.py b/systemtests/env_builder/test_basic.py
index 7ecfbd4..7405a05 100644
--- a/systemtests/env_builder/test_basic.py
+++ b/systemtests/env_builder/test_basic.py
@@ -9,6 +9,11 @@ from urllib.request import urlopen
import pytest
from retry import retry
+from systemtests.api import (
+ AuthenticatedAPIClient,
+ Machine,
+ UnauthenticatedMAASAPIClient,
+)
from systemtests.lxd import Instance, get_lxd
from systemtests.utils import (
UnexpectedMachineStatus,
@@ -17,17 +22,59 @@ from systemtests.utils import (
retries,
wait_for_machine,
wait_for_new_machine,
- wait_for_ready_controllers,
)
if TYPE_CHECKING:
from logging import Logger
- from systemtests.api import AuthenticatedAPIClient, UnauthenticatedMAASAPIClient
from systemtests.machine_config import MachineConfig
from systemtests.region import MAASRegion
+@retry(tries=3)
+def _ensure_machine_enlisted(
+ maas_api_client: AuthenticatedAPIClient,
+ mac_address: str,
+ instance: Instance,
+) -> Machine:
+ instance_log = instance.logger.getChild(instance.name)
+ # Find the VM in MAAS by MAC
+ maybe_machine = maas_api_client.list_machines(mac_address=mac_address)
+ if maybe_machine:
+ # Yay, it exists
+ return maybe_machine[0]
+
+ # Machine not registered, let's boot it up
+ @retry(tries=5, delay=5, backoff=1.2, logger=instance_log)
+ def _boot_vm(vm: Instance) -> None:
+ status = instance.status()
+ if status == "RUNNING":
+ instance_log.debug("already running, restarting")
+ instance.restart()
+ elif status == "STOPPED":
+ instance_log.debug("is stopped, starting")
+ try:
+ instance.start()
+ except CalledProcessError:
+ debug_lxd_vm(instance.name, instance_log)
+ raise
+ else:
+ assert False, f"Don't know how to handle lxd_vm status: {status}"
+
+ _boot_vm(instance)
+ try:
+ vm_status = instance.status()
+ except ValueError:
+ vm_status = "not available"
+ instance_log.debug(f"is {vm_status}")
+
+ machine = wait_for_new_machine(
+ maas_api_client, mac_address, instance.name, timeout=(5 * 60, 20)
+ )
+ instance_log.debug(f"found machine {machine['hostname']}")
+ return machine
+
+
class TestSetup:
@pytest.mark.skip_if_installed_from_snap("Prometheus is installed in the snap")
def test_setup_prometheus(
@@ -114,67 +161,40 @@ class TestSetup:
def test_ensure_ready_vm_for_hardware_sync(
self,
+ ready_remote_maas: None,
instance_config: MachineConfig,
maas_api_client: AuthenticatedAPIClient,
testlog: Logger,
) -> None:
"""Ensure that we have a Ready VM at the end."""
- lxd = get_lxd(logger=testlog)
vm_name = instance_config.name
+ lxd = get_lxd(logger=testlog)
instance = Instance(lxd, vm_name)
if instance.exists():
# Force delete the VM so we know we're starting clean
instance.delete()
- # Ensure that the Region Controller is ready
- wait_for_ready_controllers(maas_api_client)
+ mac_address = instance_config.mac_address
# Need to create a network device with a hwaddr
- config: dict[str, str] = {"security.secureboot": "false"}
+ config: dict[str, str] = {
+ "security.secureboot": "false",
+ "volatile.eth0.hwaddr": mac_address,
+ }
if instance_config.lxd_profile:
config["profile"] = instance_config.lxd_profile
- if instance_config.mac_address:
- config["volatile.eth0.hwaddr"] = instance_config.mac_address
instance = lxd.create_vm(vm_name, config)
- mac_address = instance_config.mac_address
-
- # Find the VM in MAAS by MAC
- maybe_machine = maas_api_client.list_machines(mac_address=mac_address)
- if maybe_machine:
- # Yay, it exists
- machine = maybe_machine[0]
- else:
- # Machine not registered, let's boot it up
- @retry(tries=5, delay=5, backoff=1.2, logger=testlog)
- def _boot_vm(vm: Instance) -> None:
- status = instance.status()
- if status == "RUNNING":
- testlog.debug(f"{instance.name} is already running, restarting")
- instance.restart()
- elif status == "STOPPED":
- testlog.debug(f"{instance.name} is stopped, starting")
- try:
- instance.start()
- except CalledProcessError:
- debug_lxd_vm(vm_name, testlog)
- raise
- else:
- assert False, f"Don't know how to handle lxd_vm status: {status}"
-
- _boot_vm(instance)
- try:
- vm_status = instance.status()
- except ValueError:
- vm_status = "not available"
- testlog.debug(f"{vm_name} is {vm_status}")
-
- try:
- machine = wait_for_new_machine(maas_api_client, mac_address, vm_name)
- except UnexpectedMachineStatus as err:
- # We know that this is a LXD VM - so debug it
- err.debug_info.extend(debug_lxd_vm(vm_name, testlog))
- raise
+ maas_api_client.logger = testlog.getChild(vm_name)
+ # Try 3 times to boot the LXD VM and get it enlisted
+ try:
+ machine = _ensure_machine_enlisted(maas_api_client, mac_address, instance)
+ except UnexpectedMachineStatus as err:
+ # We know that this is a LXD VM - so debug it
+ err.debug_info.extend(
+ debug_lxd_vm(instance.name, testlog.getChild(vm_name))
+ )
+ assert False, err
# Make sure we have power parameters set
if not machine["power_type"]:
diff --git a/systemtests/state.py b/systemtests/state.py
index 57f0f7c..38494d0 100644
--- a/systemtests/state.py
+++ b/systemtests/state.py
@@ -164,6 +164,7 @@ def configured_maas(
def all_rack_controllers_commissioned(
logger: Logger, admin: AuthenticatedAPIClient
) -> bool:
+ """Check if all rack controllers have passed commissioning."""
for rack in get_rack_controllers(admin):
status = rack["commissioning_status"]
status_name = rack["commissioning_status_name"]
diff --git a/systemtests/utils.py b/systemtests/utils.py
index 4b0e63d..1edd11a 100644
--- a/systemtests/utils.py
+++ b/systemtests/utils.py
@@ -6,7 +6,6 @@ import random
import re
import string
import time
-from collections import Counter
from dataclasses import dataclass
from logging import Logger
from typing import Iterator, Optional, TypedDict, Union
@@ -153,28 +152,6 @@ def debug_last_events(
return events
-def wait_for_ready_controllers(
- api_client: api.AuthenticatedAPIClient, timeout: float = 10 * 60, delay: float = 30
-) -> None:
- """Wait for all region controllers to have passed commissioning."""
- quiet_client = api.QuietAuthenticatedAPIClient.from_api_client(api_client)
- for retry_info in retries(timeout, delay):
- region_controllers = quiet_client.list_region_controllers()
- commissioning_statuses = Counter(
- rc["commissioning_status_name"] for rc in region_controllers
- )
- passed_count = commissioning_statuses["Passed"]
- total_count = sum(commissioning_statuses.values())
- if passed_count == total_count:
- api_client.logger.debug("All region controllers have passed commissioning!")
- return
- else:
- api_client.logger.debug(
- "Not all region controllers have passed commissioning ("
- f"{passed_count}/{total_count}), sleeping for {delay} seconds"
- )
-
-
# XXX: Move to api.py
def wait_for_machine(
api_client: api.AuthenticatedAPIClient,
@@ -222,12 +199,15 @@ def debug_lxd_vm(machine_name: str, logger: Logger) -> list[str]:
# XXX: Move to api.py
def wait_for_new_machine(
- api_client: api.AuthenticatedAPIClient, mac_address: str, machine_name: str
+ api_client: api.AuthenticatedAPIClient,
+ mac_address: str,
+ machine_name: str,
+ timeout: tuple[float, float] = (30 * 60, 30),
) -> api.Machine:
"""Blocks execution until a machine with the given mac_address appears as New."""
__tracebackhide__ = True
quiet_client = api.QuietAuthenticatedAPIClient.from_api_client(api_client)
- for retry_info in retries(50 * 60, 30):
+ for retry_info in retries(*timeout):
machines = quiet_client.list_machines(mac_address=mac_address, status="new")
if machines:
return machines[0]
Follow-ups