launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #29594
[Merge] ~cjwatson/launchpad-buildd:lxd-nvidia-nestable into launchpad-buildd:master
Colin Watson has proposed merging ~cjwatson/launchpad-buildd:lxd-nvidia-nestable into launchpad-buildd:master.
Commit message:
Fix use of nvidia* devices in nested containers
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~cjwatson/launchpad-buildd/+git/launchpad-buildd/+merge/436282
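Bind-mounting the nvidia* devices into the container means that they aren't visible to the lxd snap installed inside the container, because bind-mounts aren't propagated into snaps; as a result, nested containers can't use them. Create our own device nodes inside the container with mknod instead.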
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad-buildd:lxd-nvidia-nestable into launchpad-buildd:master.
diff --git a/debian/changelog b/debian/changelog
index 867c9f5..1e9ed07 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -2,6 +2,8 @@ launchpad-buildd (227) UNRELEASED; urgency=medium
* Tolerate receiving "builder_constraints": None.
* Check the appropriate server.key path for the LXD snap.
+ * Create nvidia* devices in such a way that they can be used by nested
+ containers.
-- Colin Watson <cjwatson@xxxxxxxxxx> Tue, 24 Jan 2023 13:13:27 +0000
diff --git a/lpbuildd/target/lxd.py b/lpbuildd/target/lxd.py
index 57f62a4..eaf4066 100644
--- a/lpbuildd/target/lxd.py
+++ b/lpbuildd/target/lxd.py
@@ -2,6 +2,7 @@
# GNU Affero General Public License version 3 (see the file LICENSE).
from contextlib import closing
+from functools import cached_property
import io
import json
import os
@@ -43,30 +44,6 @@ def get_device_mapper_major():
"Cannot determine major device number for device-mapper")
-def get_nvidia_container_paths():
- """Return the paths that need to be bind-mounted for NVIDIA CUDA support.
-
- LXD's security.privileged=true and nvidia.runtime=true options are
- unfortunately incompatible, but we can emulate the important bits of the
- latter with some tactical bind-mounts. There is no very good way to do
- this; this seems like the least unpleasant approach.
- """
- env = dict(os.environ)
- env["LD_LIBRARY_PATH"] = "/snap/lxd/current/lib"
- return subprocess.check_output(
- [
- "/snap/lxd/current/bin/nvidia-container-cli.real",
- "list",
- "--binaries",
- "--firmwares",
- "--ipcs",
- "--libraries",
- ],
- env=env,
- universal_newlines=True,
- ).splitlines()
-
-
fallback_hosts = dedent("""\
127.0.0.1\tlocalhost
::1\tlocalhost ip6-localhost ip6-loopback
@@ -312,6 +289,23 @@ class LXD(Backend):
os.unlink(self.dnsmasq_pid_file)
subprocess.call(["sudo", "ip", "link", "delete", self.bridge_name])
+ @cached_property
+ def _nvidia_container_paths(self):
+ """The paths that need to be bind-mounted for NVIDIA CUDA support.
+
+ LXD's security.privileged=true and nvidia.runtime=true options are
+ unfortunately incompatible, but we can emulate the important bits of
+ the latter with some tactical bind-mounts. There is no very good
+ way to do this; this seems like the least unpleasant approach.
+ """
+ env = dict(os.environ)
+ env["LD_LIBRARY_PATH"] = "/snap/lxd/current/lib"
+ return subprocess.check_output(
+ ["/snap/lxd/current/bin/nvidia-container-cli.real", "list"],
+ env=env,
+ universal_newlines=True,
+ ).splitlines()
+
def create_profile(self):
for addr in self.ipv4_network:
if addr not in (
@@ -381,13 +375,13 @@ class LXD(Backend):
"type": "disk",
}
if "gpu-nvidia" in self.constraints:
- devices["gpu"] = {"type": "gpu"}
- for i, path in enumerate(get_nvidia_container_paths()):
- devices[f"nvidia-{i}"] = {
- "path": path,
- "source": path,
- "type": "disk",
- }
+ for i, path in enumerate(self._nvidia_container_paths):
+ if not path.startswith("/dev/"):
+ devices[f"nvidia-{i}"] = {
+ "path": path,
+ "source": path,
+ "type": "disk",
+ }
self.client.profiles.create(self.profile_name, config, devices)
def start(self):
@@ -495,6 +489,20 @@ class LXD(Backend):
"b", str(major), str(minor)])
if "gpu-nvidia" in self.constraints:
+ # Create nvidia* devices. We have to do this here rather than
+ # bind-mounting them into the container, because bind-mounts
+ # aren't propagated into snaps (such as lxd) installed inside
+ # the container.
+ for path in self._nvidia_container_paths:
+ if path.startswith("/dev/"):
+ st = os.stat(path)
+ if stat.S_ISCHR(st.st_mode):
+ self.run(
+ ["mknod", "-m", "0%o" % stat.S_IMODE(st.st_mode),
+ path, "c",
+ str(os.major(st.st_rdev)),
+ str(os.minor(st.st_rdev))])
+
# We bind-mounted several libraries into the container, so run
# ldconfig to update the dynamic linker's cache.
self.run(["/sbin/ldconfig"])
diff --git a/lpbuildd/target/tests/test_lxd.py b/lpbuildd/target/tests/test_lxd.py
index dea7ad8..802d34c 100644
--- a/lpbuildd/target/tests/test_lxd.py
+++ b/lpbuildd/target/tests/test_lxd.py
@@ -100,14 +100,19 @@ class FakeFilesystem(_FakeFilesystem):
def _stat(self, real, path, *args, **kwargs):
r = super()._stat(real, path, *args, **kwargs)
if path in self._devices:
- r = os.stat_result(list(r), {"st_rdev": self._devices[path]})
+ flags, device = self._devices[path]
+ mode = stat.S_IMODE(r.st_mode) | flags
+ r = os.stat_result([mode] + list(r[1:]), {"st_rdev": device})
return r
def _mknod(self, real, path, mode=0o600, device=None):
- fd = os.open(path, os.O_CREAT | os.O_EXCL, mode & 0o777)
+ fd = os.open(path, os.O_CREAT | os.O_EXCL)
+ os.fchmod(fd, stat.S_IMODE(mode))
os.close(fd)
- if mode & (stat.S_IFBLK | stat.S_IFCHR):
- self._devices[path] = device
+ if stat.S_ISBLK(mode):
+ self._devices[path] = (stat.S_IFBLK, device)
+ elif stat.S_ISCHR(mode):
+ self._devices[path] = (stat.S_IFCHR, device)
class TestLXD(TestCase):
@@ -329,13 +334,13 @@ class TestLXD(TestCase):
"type": "disk",
}
if gpu_nvidia_paths:
- expected_devices["gpu"] = {"type": "gpu"}
for i, path in enumerate(gpu_nvidia_paths):
- expected_devices[f"nvidia-{i}"] = {
- "path": path,
- "source": path,
- "type": "disk",
- }
+ if not path.startswith("/dev/"):
+ expected_devices[f"nvidia-{i}"] = {
+ "path": path,
+ "source": path,
+ "type": "disk",
+ }
client.profiles.create.assert_called_once_with(
"lpbuildd", expected_config, expected_devices)
@@ -374,6 +379,7 @@ class TestLXD(TestCase):
client.profiles.get.side_effect = FakeLXDAPIException
client.host_info = {"environment": {"driver_version": "3.0"}}
gpu_nvidia_paths = [
+ "/dev/nvidiactl",
"/usr/bin/nvidia-smi",
"/usr/bin/nvidia-persistenced",
]
@@ -436,7 +442,13 @@ class TestLXD(TestCase):
processes_fixture.add(
FakeHostname("example", "example.buildd"), name="hostname")
if gpu_nvidia:
+ os.mknod(
+ "/dev/nvidia0", stat.S_IFCHR | 0o666, os.makedev(195, 0))
+ os.mknod(
+ "/dev/nvidiactl", stat.S_IFCHR | 0o666, os.makedev(195, 255))
gpu_nvidia_paths = [
+ "/dev/nvidia0",
+ "/dev/nvidiactl",
"/usr/bin/nvidia-smi",
"/usr/bin/nvidia-persistenced",
]
@@ -470,8 +482,7 @@ class TestLXD(TestCase):
expected_args.append(
Equals(
["/snap/lxd/current/bin/nvidia-container-cli.real",
- "list",
- "--binaries", "--firmwares", "--ipcs", "--libraries"]))
+ "list"]))
expected_args.extend([
Equals(ip + ["link", "add", "dev", "lpbuilddbr0",
"type", "bridge"]),
@@ -518,7 +529,17 @@ class TestLXD(TestCase):
["mknod", "-m", "0660", "/dev/dm-%d" % minor,
"b", str(DM_BLOCK_MAJOR), str(minor)]))
if gpu_nvidia:
- expected_args.append(Equals(lxc + ["/sbin/ldconfig"]))
+ expected_args.extend([
+ Equals(
+ lxc +
+ ["mknod", "-m", "0666", "/dev/nvidia0",
+ "c", "195", "0"]),
+ Equals(
+ lxc +
+ ["mknod", "-m", "0666", "/dev/nvidiactl",
+ "c", "195", "255"]),
+ Equals(lxc + ["/sbin/ldconfig"]),
+ ])
expected_args.extend([
Equals(
lxc + ["mkdir", "-p", "/etc/systemd/system/snapd.service.d"]),