← Back to team overview

sts-sponsors team mailing list archive

[Merge] ~igor-brovtsin/maas:dgx-kernel-commissioning into maas:master

 

Igor Brovtsin has proposed merging ~igor-brovtsin/maas:dgx-kernel-commissioning into maas:master.

Commit message:
Commissioning-time platform detection

Requested reviews:
  MAAS Maintainers (maas-maintainers)

For more details, see:
https://code.launchpad.net/~igor-brovtsin/maas/+git/maas/+merge/442664

This MP introduces a mechanism for setting a machine's platform during commissioning, as well as a commissioning script that uses it to detect NVIDIA DGX machines.

Note: this branch has gone through some serious changes over time, so please disregard the commit history.
-- 
Your team MAAS Committers is subscribed to branch maas:master.
diff --git a/src/metadataserver/builtin_scripts/__init__.py b/src/metadataserver/builtin_scripts/__init__.py
index 57ab6d1..d7c19b1 100644
--- a/src/metadataserver/builtin_scripts/__init__.py
+++ b/src/metadataserver/builtin_scripts/__init__.py
@@ -25,6 +25,7 @@ from provisioningserver.refresh.node_info_scripts import (
     LLDP_INSTALL_OUTPUT_NAME,
     LLDP_OUTPUT_NAME,
     LSHW_OUTPUT_NAME,
+    MACHINE_CONFIG_HINTS_NAME,
     NODE_INFO_SCRIPTS,
     RUN_MACHINE_RESOURCES,
     SERIAL_PORTS_OUTPUT_NAME,
@@ -83,6 +84,9 @@ BUILTIN_SCRIPTS = [
     BuiltinScript(name=BMC_DETECTION, filename="bmc_config.py"),
     BuiltinScript(name=RUN_MACHINE_RESOURCES, filename=RUN_MACHINE_RESOURCES),
     BuiltinScript(
+        name=MACHINE_CONFIG_HINTS_NAME, filename=MACHINE_CONFIG_HINTS_NAME
+    ),
+    BuiltinScript(
         name=COMMISSIONING_OUTPUT_NAME, filename=COMMISSIONING_OUTPUT_NAME
     ),
     BuiltinScript(
diff --git a/src/metadataserver/builtin_scripts/commissioning_scripts/40-maas-01-machine-config-hints b/src/metadataserver/builtin_scripts/commissioning_scripts/40-maas-01-machine-config-hints
new file mode 100644
index 0000000..e68907e
--- /dev/null
+++ b/src/metadataserver/builtin_scripts/commissioning_scripts/40-maas-01-machine-config-hints
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+#
+# machine-config-hints - Generate hints for machine configuration.
+# Currently provides subarchitecture for DGX systems, but the idea is
+# to allow updating a subset of machine configuration from the
+# commissioning scripts without modifying the region source code.
+#
+# Copyright (C) 2023 Canonical
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# --- Start MAAS 1.0 script metadata ---
+# name: 40-maas-01-machine-config-hints
+# title: Generate machine configuration hints
+# description: Generate machine configuration hints
+# script_type: commissioning
+# timeout: 60
+# --- End MAAS 1.0 script metadata ---
+import json
+import os
+import sys
+
+
+def read_json_file(path):
+    """Load UTF-8 JSON from `path`; exit with a message if that fails."""
+    try:
+        with open(path, encoding="utf-8") as fd:
+            return json.load(fd)
+    except (OSError, ValueError) as e:  # ValueError: bad JSON or bad UTF-8
+        action = "read" if isinstance(e, OSError) else "parse"
+        sys.exit(f"Failed to {action} {path}: {e}")
+
+
+def detect_nvidia_dgx(mr):
+    """Return whether machine-resources output suggests a DGX system.
+
+    Missing or null "resources"/"system"/"motherboard" entries, as well as
+    a missing or null vendor or product, mean the machine is not a DGX.
+    """
+    system = (mr.get("resources") or {}).get("system") or {}
+    motherboard = system.get("motherboard") or {}
+    vendor = (motherboard.get("vendor") or "").lower()
+    product = (motherboard.get("product") or "").lower()
+    # str.removeprefix() needs Python 3.9+, which the commissioning
+    # environment may not have, so strip the prefix manually.
+    if product.startswith("nvidia "):
+        product = product[len("nvidia ") :]
+    return vendor == "nvidia" and product == "dgx"
+
+
+PLATFORMS = {
+    "nvidia-dgx": detect_nvidia_dgx,  # name -> detector(machine_resources)
+}
+
+
+def detect_platform(machine_resources):
+    """Return the first matching PLATFORMS name, or "generic" if none match."""
+    matches = (
+        name for name, detect in PLATFORMS.items() if detect(machine_resources)
+    )
+    return next(matches, "generic")
+
+
+def provide_hints(resources):
+    """Build the hints dict: detected platform plus an empty tag list."""
+    return dict(platform=detect_platform(resources), tags=[])
+
+
+def main():
+    """Print the hints as JSON and, if a path is configured, save them."""
+    hints = provide_hints(read_json_file(os.environ["MAAS_RESOURCES_FILE"]))
+    serialized = json.dumps(hints, indent=4, sort_keys=True)
+    print(serialized)
+    # The hints file is optional: open(None) would raise a TypeError.
+    hints_path = os.environ.get("MAAS_MACHINE_CONFIG_HINTS_FILE")
+    if hints_path:
+        with open(hints_path, "w") as f:
+            f.write(serialized)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/metadataserver/builtin_scripts/hooks.py b/src/metadataserver/builtin_scripts/hooks.py
index 81aeb6b..b402735 100644
--- a/src/metadataserver/builtin_scripts/hooks.py
+++ b/src/metadataserver/builtin_scripts/hooks.py
@@ -658,6 +658,19 @@ def _process_lxd_resources(node, data):
     _link_dpu(node)
 
 
+def _process_machine_config_hints(node, hints):
+    """Apply commissioning-time hints (currently only `platform`) to `node`."""
+    if hints is None:
+        logger.warning(f"Machine configuration hints for `{node.system_id}` is None")
+        return
+    platform = hints.get("platform")
+    if platform:
+        # Only the subarchitecture changes; the CPU architecture is kept.
+        cpu_arch = node.architecture.split("/", 1)[0]
+        node.architecture = f"{cpu_arch}/{platform}"
+        node.save()
+
+
 def _parse_memory(memory, numa_nodes):
     total_memory = memory.get("total", 0)
     # currently LXD only supports default size for hugepages
@@ -1119,6 +1132,9 @@ def process_lxd_results(node, output, exit_status):
     try:
         _process_lxd_environment(node, data["environment"])
         _process_lxd_resources(node, data)
+        _process_machine_config_hints(
+            node, data.get("machine-config-hints", None)
+        )
     except Exception as e:
         log_failure_event(str(e))
         raise
diff --git a/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py b/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py
index 92cce67..e4baa8f 100755
--- a/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py
+++ b/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py
@@ -1062,6 +1062,7 @@ def run_script(script, scripts_dir, send_result=True):
         {
             "MAAS_BASE_URL": get_base_url(script["config"].metadata_url),
             "MAAS_RESOURCES_FILE": script["resources_file"],
+            "MAAS_MACHINE_CONFIG_HINTS_FILE": "/machine-config-hints.json",
             "MAAS_STORAGE_CONFIG_FILE": "/storage-config.json",
             "OUTPUT_COMBINED_PATH": script["combined_path"],
             "OUTPUT_STDOUT_PATH": script["stdout_path"],
diff --git a/src/provisioningserver/refresh/50-maas-01-commissioning b/src/provisioningserver/refresh/50-maas-01-commissioning
index 45ff285..44b114c 100755
--- a/src/provisioningserver/refresh/50-maas-01-commissioning
+++ b/src/provisioningserver/refresh/50-maas-01-commissioning
@@ -48,4 +48,10 @@ if storage_path and os.path.exists(storage_path):
     storage_config = read_json_file(storage_path)
     data["storage-extra"] = storage_config
 
+# add machine config hints if present
+hints_path = os.environ.get("MAAS_MACHINE_CONFIG_HINTS_FILE")
+if hints_path and os.path.exists(hints_path):
+    hints = read_json_file(hints_path)
+    data["machine-config-hints"] = hints
+
 print(json.dumps(data, indent=2))
diff --git a/src/provisioningserver/refresh/node_info_scripts.py b/src/provisioningserver/refresh/node_info_scripts.py
index b92c10e..a1ef7eb 100644
--- a/src/provisioningserver/refresh/node_info_scripts.py
+++ b/src/provisioningserver/refresh/node_info_scripts.py
@@ -22,6 +22,8 @@ DHCP_EXPLORE_OUTPUT_NAME = "20-maas-02-dhcp-unconfigured-ifaces"
 RUN_MACHINE_RESOURCES = "20-maas-03-machine-resources"
 # Run BMC config early as it will enlist new machines.
 BMC_DETECTION = "30-maas-01-bmc-config"
+# Collect machine configuration hints before commissioning output
+MACHINE_CONFIG_HINTS_NAME = "40-maas-01-machine-config-hints"
 COMMISSIONING_OUTPUT_NAME = "50-maas-01-commissioning"
 # The remaining scripts can run in parallel
 SUPPORT_INFO_OUTPUT_NAME = "maas-support-info"
@@ -80,6 +82,10 @@ NODE_INFO_SCRIPTS = OrderedDict(
             {"hook": null_hook, "run_on_controller": True},
         ),
         (
+            MACHINE_CONFIG_HINTS_NAME,
+            {"hook": null_hook, "run_on_controller": True},
+        ),
+        (
             COMMISSIONING_OUTPUT_NAME,
             {"hook": null_hook, "run_on_controller": True},
         ),

Follow ups