sts-sponsors team mailing list archive
-
sts-sponsors team
-
Mailing list archive
-
Message #08318
[Merge] ~igor-brovtsin/maas:dgx-kernel-commissioning into maas:master
Igor Brovtsin has proposed merging ~igor-brovtsin/maas:dgx-kernel-commissioning into maas:master.
Commit message:
Commissioning-time platform detection
Requested reviews:
MAAS Maintainers (maas-maintainers)
For more details, see:
https://code.launchpad.net/~igor-brovtsin/maas/+git/maas/+merge/442664
This MP introduces a mechanism to set the platform for the commissioned machines, as well as a commissioning script that leverages it to detect NVIDIA DGX machines.
Note: it has gone through some serious changes over time, so please don't mind the commit list.
--
Your team MAAS Committers is subscribed to branch maas:master.
diff --git a/src/metadataserver/builtin_scripts/__init__.py b/src/metadataserver/builtin_scripts/__init__.py
index 57ab6d1..d7c19b1 100644
--- a/src/metadataserver/builtin_scripts/__init__.py
+++ b/src/metadataserver/builtin_scripts/__init__.py
@@ -25,6 +25,7 @@ from provisioningserver.refresh.node_info_scripts import (
LLDP_INSTALL_OUTPUT_NAME,
LLDP_OUTPUT_NAME,
LSHW_OUTPUT_NAME,
+ MACHINE_CONFIG_HINTS_NAME,
NODE_INFO_SCRIPTS,
RUN_MACHINE_RESOURCES,
SERIAL_PORTS_OUTPUT_NAME,
@@ -83,6 +84,9 @@ BUILTIN_SCRIPTS = [
BuiltinScript(name=BMC_DETECTION, filename="bmc_config.py"),
BuiltinScript(name=RUN_MACHINE_RESOURCES, filename=RUN_MACHINE_RESOURCES),
BuiltinScript(
+ name=MACHINE_CONFIG_HINTS_NAME, filename=MACHINE_CONFIG_HINTS_NAME
+ ),
+ BuiltinScript(
name=COMMISSIONING_OUTPUT_NAME, filename=COMMISSIONING_OUTPUT_NAME
),
BuiltinScript(
diff --git a/src/metadataserver/builtin_scripts/commissioning_scripts/40-maas-01-machine-config-hints b/src/metadataserver/builtin_scripts/commissioning_scripts/40-maas-01-machine-config-hints
new file mode 100644
index 0000000..e68907e
--- /dev/null
+++ b/src/metadataserver/builtin_scripts/commissioning_scripts/40-maas-01-machine-config-hints
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+#
+# machine-config-hints - Generate hints for machine configuration.
+# Currently provides subarchitecture for DGX systems, but the idea is
+# to allow updating a subset of machine configuration from the
+# commissioning scripts without modifying the region source code.
+#
+# Copyright (C) 2023 Canonical
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# --- Start MAAS 1.0 script metadata ---
+# name: 40-maas-01-machine-config-hints
+# title: Generate machine configuration hints
+# description: Generate machine configuration hints
+# script_type: commissioning
+# timeout: 60
+# --- End MAAS 1.0 script metadata ---
+import json
+import os
+import sys
+
+
+def read_json_file(path):
+    """Load and return the JSON document stored at `path`.
+
+    The script is terminated through `sys.exit` with an explanatory
+    message when the file cannot be opened/read (OSError) or when its
+    content is not valid JSON (json.JSONDecodeError).
+    """
+    try:
+        with open(path) as stream:
+            content = json.load(stream)
+    except OSError as error:
+        sys.exit(f"Failed to read {path}: {error}")
+    except json.JSONDecodeError as error:
+        sys.exit(f"Failed to parse {path}: {error}")
+    return content
+
+
+def detect_nvidia_dgx(mr):
+    """Return whether machine-resources output suggests a DGX system.
+
+    `mr` is the parsed machine-resources JSON. The motherboard entry is
+    looked up under resources -> system -> motherboard; the machine is
+    considered a DGX when the vendor is "nvidia" and the product is
+    "dgx", optionally prefixed with "nvidia " (both compared
+    case-insensitively).
+
+    Returns False, rather than raising, when any level of that path is
+    missing or null, or when vendor/product are absent or not strings.
+    """
+    try:
+        motherboard = mr["resources"]["system"]["motherboard"]
+    except (KeyError, TypeError):
+        # A level is missing, or an intermediate value is null/not a mapping.
+        return False
+    if not isinstance(motherboard, dict):
+        return False
+
+    vendor = motherboard.get("vendor")
+    product = motherboard.get("product")
+    if not isinstance(vendor, str) or not isinstance(product, str):
+        # Missing or null fields used to crash on `.lower()`.
+        return False
+
+    # Strip the optional vendor prefix by hand: str.removeprefix() only
+    # exists on Python 3.9+, and this script runs on whatever interpreter
+    # the commissioning environment ships.
+    product = product.lower()
+    prefix = "nvidia "
+    if product.startswith(prefix):
+        product = product[len(prefix):]
+    return vendor.lower() == "nvidia" and product == "dgx"
+
+
+# Registry of platform detectors: maps a platform name to a callable that
+# takes the parsed machine-resources data and returns whether it matches.
+# detect_platform() walks the entries in order and returns the name of the
+# first detector that reports a match.
+PLATFORMS = {
+ "nvidia-dgx": detect_nvidia_dgx,
+}
+
+
+def detect_platform(machine_resources):
+    """Run the detectors registered in PLATFORMS and return a platform name.
+
+    Detectors are tried in the mapping's iteration order and evaluation
+    stops at the first one returning a truthy value; its key is the
+    result. "generic" is returned when no detector matches.
+    """
+    matching_names = (
+        name
+        for name, detector in PLATFORMS.items()
+        if detector(machine_resources)
+    )
+    return next(matching_names, "generic")
+
+
+def provide_hints(resources):
+    """Build the hints dictionary for the given machine resources.
+
+    The result carries the detected platform name under "platform" and
+    an empty list under "tags".
+    """
+    platform = detect_platform(resources)
+    return dict(platform=platform, tags=[])
+
+
+def main():
+    """Generate machine configuration hints and publish them.
+
+    Reads the machine-resources JSON named by the MAAS_RESOURCES_FILE
+    environment variable, derives the hints, prints them to stdout and,
+    when MAAS_MACHINE_CONFIG_HINTS_FILE is set, also writes them there.
+
+    Returns 0 on success. Exits with an error message when
+    MAAS_RESOURCES_FILE is unset, when the resources file cannot be
+    read or parsed, or when the hints file cannot be written.
+    """
+    resources_path = os.environ.get("MAAS_RESOURCES_FILE")
+    if not resources_path:
+        sys.exit("MAAS_RESOURCES_FILE is not set")
+    machine_resources = read_json_file(resources_path)
+
+    result = provide_hints(machine_resources)
+    serialized = json.dumps(result, indent=4, sort_keys=True)
+    print(serialized)
+
+    # The variable is optional: it is read with .get(), and passing None to
+    # open() would raise TypeError. Without it the hints are only printed.
+    hints_path = os.environ.get("MAAS_MACHINE_CONFIG_HINTS_FILE")
+    if hints_path:
+        try:
+            with open(hints_path, "w") as f:
+                f.write(serialized)
+        except OSError as e:
+            sys.exit(f"Failed to write {hints_path}: {e}")
+
+    return 0
+
+
+# Run only when executed as a script; main()'s return value becomes the
+# process exit status.
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/src/metadataserver/builtin_scripts/hooks.py b/src/metadataserver/builtin_scripts/hooks.py
index 81aeb6b..b402735 100644
--- a/src/metadataserver/builtin_scripts/hooks.py
+++ b/src/metadataserver/builtin_scripts/hooks.py
@@ -658,6 +658,19 @@ def _process_lxd_resources(node, data):
_link_dpu(node)
+def _process_machine_config_hints(node, hints):
+    """Apply commissioning-time machine configuration hints to `node`.
+
+    `hints` is the "machine-config-hints" entry of the commissioning
+    output, or None when the entry is absent. When it contains a
+    "platform" key, the part of `node.architecture` after the first "/"
+    is replaced with that platform and the node is saved.
+    """
+    if hints is None:
+        logger.warning(
+            f"Machine configuration hints for `{node.system_id}` is None"
+        )
+        return
+
+    if "platform" in hints:
+        # NOTE(review): the hints script reports "generic" when no platform
+        # is detected, so this overwrites whatever subarchitecture the node
+        # had on every commissioning run -- confirm that is intended.
+        arch = (node.architecture or "").partition("/")[0]
+        if arch:
+            node.architecture = f"{arch}/{hints['platform']}"
+            # Save only when the node was actually modified.
+            node.save()
+        else:
+            # Without a base architecture the result would be "/<platform>".
+            logger.warning(
+                "Cannot apply platform hint to `%s`: architecture is not set",
+                node.system_id,
+            )
+
+    # Was a leftover debugging statement emitted at ERROR level.
+    logger.debug(
+        "Processed machine config hints for `%s`: %s", node.system_id, hints
+    )
+
+
def _parse_memory(memory, numa_nodes):
total_memory = memory.get("total", 0)
# currently LXD only supports default size for hugepages
@@ -1119,6 +1132,9 @@ def process_lxd_results(node, output, exit_status):
try:
_process_lxd_environment(node, data["environment"])
_process_lxd_resources(node, data)
+ _process_machine_config_hints(
+ node, data.get("machine-config-hints", None)
+ )
except Exception as e:
log_failure_event(str(e))
raise
diff --git a/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py b/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py
index 92cce67..e4baa8f 100755
--- a/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py
+++ b/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py
@@ -1062,6 +1062,7 @@ def run_script(script, scripts_dir, send_result=True):
{
"MAAS_BASE_URL": get_base_url(script["config"].metadata_url),
"MAAS_RESOURCES_FILE": script["resources_file"],
+ "MAAS_MACHINE_CONFIG_HINTS_FILE": "/machine-config-hints.json",
"MAAS_STORAGE_CONFIG_FILE": "/storage-config.json",
"OUTPUT_COMBINED_PATH": script["combined_path"],
"OUTPUT_STDOUT_PATH": script["stdout_path"],
diff --git a/src/provisioningserver/refresh/50-maas-01-commissioning b/src/provisioningserver/refresh/50-maas-01-commissioning
index 45ff285..44b114c 100755
--- a/src/provisioningserver/refresh/50-maas-01-commissioning
+++ b/src/provisioningserver/refresh/50-maas-01-commissioning
@@ -48,4 +48,10 @@ if storage_path and os.path.exists(storage_path):
storage_config = read_json_file(storage_path)
data["storage-extra"] = storage_config
+# add machine config hints if present
+hints_path = os.environ.get("MAAS_MACHINE_CONFIG_HINTS_FILE")
+if hints_path and os.path.exists(hints_path):
+ hints = read_json_file(hints_path)
+ data["machine-config-hints"] = hints
+
print(json.dumps(data, indent=2))
diff --git a/src/provisioningserver/refresh/node_info_scripts.py b/src/provisioningserver/refresh/node_info_scripts.py
index b92c10e..a1ef7eb 100644
--- a/src/provisioningserver/refresh/node_info_scripts.py
+++ b/src/provisioningserver/refresh/node_info_scripts.py
@@ -22,6 +22,8 @@ DHCP_EXPLORE_OUTPUT_NAME = "20-maas-02-dhcp-unconfigured-ifaces"
RUN_MACHINE_RESOURCES = "20-maas-03-machine-resources"
# Run BMC config early as it will enlist new machines.
BMC_DETECTION = "30-maas-01-bmc-config"
+# Collect machine configuration hints before commissioning output
+MACHINE_CONFIG_HINTS_NAME = "40-maas-01-machine-config-hints"
COMMISSIONING_OUTPUT_NAME = "50-maas-01-commissioning"
# The remaining scripts can run in parallel
SUPPORT_INFO_OUTPUT_NAME = "maas-support-info"
@@ -80,6 +82,10 @@ NODE_INFO_SCRIPTS = OrderedDict(
{"hook": null_hook, "run_on_controller": True},
),
(
+ MACHINE_CONFIG_HINTS_NAME,
+ {"hook": null_hook, "run_on_controller": True},
+ ),
+ (
COMMISSIONING_OUTPUT_NAME,
{"hook": null_hook, "run_on_controller": True},
),
Follow ups