← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~cjwatson/launchpad-buildd:lxd-gpu-nvidia into launchpad-buildd:master

 

Colin Watson has proposed merging ~cjwatson/launchpad-buildd:lxd-gpu-nvidia into launchpad-buildd:master.

Commit message:
Optionally pass through NVIDIA GPUs to LXD containers

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad-buildd/+git/launchpad-buildd/+merge/435898

The `nvidia-container-cli` invocation is nasty, but I don't see a better alternative in the absence of improved LXD support for privileged containers.  The rest is mostly just plumbing.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad-buildd:lxd-gpu-nvidia into launchpad-buildd:master.
diff --git a/debian/changelog b/debian/changelog
index 6af237d..6c2c622 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -5,6 +5,8 @@ launchpad-buildd (226) UNRELEASED; urgency=medium
     in launchpad-buildd 127.
   * Remove the old "distroseries_name" argument from the sourcepackagerecipe
     manager, which duplicated the common "series" argument.
+  * If the "gpu-nvidia" constraint is specified, then pass through an NVIDIA
+    GPU to the LXD container.
 
  -- Colin Watson <cjwatson@xxxxxxxxxx>  Wed, 04 Jan 2023 12:58:36 +0000
 
diff --git a/lpbuildd/builder.py b/lpbuildd/builder.py
index 332d060..a94f53a 100644
--- a/lpbuildd/builder.py
+++ b/lpbuildd/builder.py
@@ -180,8 +180,10 @@ class BuildManager:
             "--backend=%s" % self.backend_name,
             "--series=%s" % self.series,
             "--arch=%s" % self.arch_tag,
-            self._buildid,
             ]
+        for constraint in self.constraints:
+            base_args.append("--constraint=%s" % constraint)
+        base_args.append(self._buildid)
         self.runSubProcess(
             self._intargetpath, base_args + list(args), **kwargs)
 
@@ -256,6 +258,7 @@ class BuildManager:
         self.series = extra_args['series']
         self.arch_tag = extra_args.get('arch_tag', self._builder.getArch())
         self.fast_cleanup = extra_args.get('fast_cleanup', False)
+        self.constraints = extra_args.get('builder_constraints', [])
 
         # Check whether this is a build in a private archive and
         # whether the URLs in the buildlog file should be sanitized
@@ -266,7 +269,8 @@ class BuildManager:
 
         self.backend = make_backend(
             self.backend_name, self._buildid,
-            series=self.series, arch=self.arch_tag)
+            series=self.series, arch=self.arch_tag,
+            constraints=self.constraints)
 
         self.runSubProcess(self._preppath, ["builder-prep"])
 
diff --git a/lpbuildd/target/backend.py b/lpbuildd/target/backend.py
index a920cf1..28e3ef6 100644
--- a/lpbuildd/target/backend.py
+++ b/lpbuildd/target/backend.py
@@ -31,10 +31,11 @@ class Backend:
 
     supports_snapd = False
 
-    def __init__(self, build_id, series=None, arch=None):
+    def __init__(self, build_id, series=None, arch=None, constraints=None):
         self.build_id = build_id
         self.series = series
         self.arch = arch
+        self.constraints = constraints or []
         self.build_path = os.path.join(os.environ["HOME"], "build-" + build_id)
 
     def create(self, image_path, image_type):
@@ -215,7 +216,7 @@ class Backend:
             rmtree(tmp_dir)
 
 
-def make_backend(name, build_id, series=None, arch=None):
+def make_backend(name, build_id, series=None, arch=None, constraints=None):
     if name == "chroot":
         from lpbuildd.target.chroot import Chroot
         backend_factory = Chroot
@@ -232,4 +233,6 @@ def make_backend(name, build_id, series=None, arch=None):
         backend_factory = UncontainedBackend
     else:
         raise KeyError("Unknown backend: %s" % name)
-    return backend_factory(build_id, series=series, arch=arch)
+    return backend_factory(
+        build_id, series=series, arch=arch, constraints=constraints
+    )
diff --git a/lpbuildd/target/chroot.py b/lpbuildd/target/chroot.py
index 80ad1b3..2dad812 100644
--- a/lpbuildd/target/chroot.py
+++ b/lpbuildd/target/chroot.py
@@ -20,8 +20,8 @@ from lpbuildd.util import (
 class Chroot(Backend):
     """Sets up a chroot."""
 
-    def __init__(self, build_id, series=None, arch=None):
-        super().__init__(build_id, series=series, arch=arch)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.chroot_path = os.path.join(self.build_path, "chroot-autobuild")
 
     def create(self, image_path, image_type):
diff --git a/lpbuildd/target/lxd.py b/lpbuildd/target/lxd.py
index cd79030..d585911 100644
--- a/lpbuildd/target/lxd.py
+++ b/lpbuildd/target/lxd.py
@@ -43,6 +43,30 @@ def get_device_mapper_major():
                 "Cannot determine major device number for device-mapper")
 
 
+def get_nvidia_container_paths():
+    """Return the paths that need to be bind-mounted for NVIDIA CUDA support.
+
+    LXD's security.privileged=true and nvidia.runtime=true options are
+    unfortunately incompatible, but we can emulate the important bits of the
+    latter with some tactical bind-mounts.  There is no very good way to do
+    this; this seems like the least unpleasant approach.
+    """
+    env = dict(os.environ)
+    env["LD_LIBRARY_PATH"] = "/snap/lxd/current/lib"
+    return subprocess.check_output(
+        [
+            "/snap/lxd/current/bin/nvidia-container-cli.real",
+            "list",
+            "--binaries",
+            "--firmwares",
+            "--ipcs",
+            "--libraries",
+        ],
+        env=env,
+        universal_newlines=True,
+    ).splitlines()
+
+
 fallback_hosts = dedent("""\
     127.0.0.1\tlocalhost
     ::1\tlocalhost ip6-localhost ip6-loopback
@@ -353,6 +377,14 @@ class LXD(Backend):
                 "pool": "default",
                 "type": "disk",
                 }
+        if "gpu-nvidia" in self.constraints:
+            devices["gpu"] = {"type": "gpu"}
+            for i, path in enumerate(get_nvidia_container_paths()):
+                devices[f"nvidia-{i}"] = {
+                    "path": path,
+                    "source": path,
+                    "type": "disk",
+                    }
         self.client.profiles.create(self.profile_name, config, devices)
 
     def start(self):
@@ -459,6 +491,11 @@ class LXD(Backend):
                 ["mknod", "-m", "0660", "/dev/dm-%d" % minor,
                  "b", str(major), str(minor)])
 
+        if "gpu-nvidia" in self.constraints:
+            # We bind-mounted several libraries into the container, so run
+            # ldconfig to update the dynamic linker's cache.
+            self.run(["/sbin/ldconfig"])
+
         # XXX cjwatson 2017-09-07: With LXD < 2.2 we can't create the
         # directory until the container has started.  We can get away with
         # this for the time being because snapd isn't in the buildd chroots.
diff --git a/lpbuildd/target/operation.py b/lpbuildd/target/operation.py
index 63a3c64..724f8c0 100644
--- a/lpbuildd/target/operation.py
+++ b/lpbuildd/target/operation.py
@@ -22,13 +22,18 @@ class Operation:
         parser.add_argument(
             "--arch", metavar="ARCH", help="operate on architecture ARCH")
         parser.add_argument(
+            "--constraint", metavar="CONSTRAINT", action="append",
+            dest="constraints",
+            help="add builder resource tag for this build")
+        parser.add_argument(
             "build_id", metavar="ID", help="operate on build ID")
 
     def __init__(self, args, parser):
         self.args = args
         self.backend = make_backend(
             self.args.backend, self.args.build_id,
-            series=self.args.series, arch=self.args.arch)
+            series=self.args.series, arch=self.args.arch,
+            constraints=self.args.constraints)
 
     def run_build_command(self, args, env=None, **kwargs):
         """Run a build command in the target.
diff --git a/lpbuildd/target/tests/test_lxd.py b/lpbuildd/target/tests/test_lxd.py
index 39859c0..edb687a 100644
--- a/lpbuildd/target/tests/test_lxd.py
+++ b/lpbuildd/target/tests/test_lxd.py
@@ -274,7 +274,7 @@ class TestLXD(TestCase):
             "lp-xenial-amd64", "lp-xenial-amd64")
 
     def assert_correct_profile(self, extra_raw_lxc_config=None,
-                               driver_version="2.0"):
+                               driver_version="2.0", gpu_nvidia_paths=False):
         if extra_raw_lxc_config is None:
             extra_raw_lxc_config = []
 
@@ -328,6 +328,14 @@ class TestLXD(TestCase):
                 "pool": "default",
                 "type": "disk",
                 }
+        if gpu_nvidia_paths:
+            expected_devices["gpu"] = {"type": "gpu"}
+            for i, path in enumerate(gpu_nvidia_paths):
+                expected_devices[f"nvidia-{i}"] = {
+                    "path": path,
+                    "source": path,
+                    "type": "disk",
+                    }
         client.profiles.create.assert_called_once_with(
             "lpbuildd", expected_config, expected_devices)
 
@@ -359,6 +367,28 @@ class TestLXD(TestCase):
                         driver_version=driver_version or "3.0"
                         )
 
+    def test_create_profile_gpu_nvidia(self):
+        with MockPatch("pylxd.Client"):
+            client = pylxd.Client()
+            client.reset_mock()
+            client.profiles.get.side_effect = FakeLXDAPIException
+            client.host_info = {"environment": {"driver_version": "3.0"}}
+            gpu_nvidia_paths = [
+                "/usr/bin/nvidia-smi",
+                "/usr/bin/nvidia-persistenced",
+                ]
+            processes_fixture = self.useFixture(FakeProcesses())
+            processes_fixture.add(
+                lambda _: {
+                    "stdout": io.StringIO(
+                        "".join(f"{path}\n" for path in gpu_nvidia_paths)),
+                    },
+                name="/snap/lxd/current/bin/nvidia-container-cli.real")
+            backend = LXD("1", "xenial", "amd64", constraints=["gpu-nvidia"])
+            backend.create_profile()
+            self.assert_correct_profile(
+                driver_version="3.0", gpu_nvidia_paths=gpu_nvidia_paths)
+
     def fakeFS(self):
         fs_fixture = self.useFixture(FakeFilesystem())
         fs_fixture.add("/proc")
@@ -379,7 +409,8 @@ class TestLXD(TestCase):
 
     # XXX cjwatson 2022-08-25: Refactor this to use some more sensible kind
     # of test parameterization.
-    def test_start(self, arch="amd64", unmounts_cpuinfo=False):
+    def test_start(self, arch="amd64", unmounts_cpuinfo=False,
+                   gpu_nvidia=False):
         self.fakeFS()
         DM_BLOCK_MAJOR = random.randrange(128, 255)
         with open("/proc/devices", "w") as f:
@@ -404,15 +435,29 @@ class TestLXD(TestCase):
         processes_fixture.add(lambda _: {}, name="lxc")
         processes_fixture.add(
             FakeHostname("example", "example.buildd"), name="hostname")
+        if gpu_nvidia:
+            gpu_nvidia_paths = [
+                "/usr/bin/nvidia-smi",
+                "/usr/bin/nvidia-persistenced",
+                ]
+            processes_fixture.add(
+                lambda _: {
+                    "stdout": io.StringIO(
+                        "".join(f"{path}\n" for path in gpu_nvidia_paths)),
+                    },
+                name="/snap/lxd/current/bin/nvidia-container-cli.real")
+        else:
+            gpu_nvidia_paths = None
 
         with mock.patch.object(
             LXD,
             "path_exists",
             side_effect=lambda path: path in existing_files
         ):
-            LXD("1", "xenial", arch).start()
+            constraints = ["gpu-nvidia"] if gpu_nvidia else []
+            LXD("1", "xenial", arch, constraints=constraints).start()
 
-        self.assert_correct_profile()
+        self.assert_correct_profile(gpu_nvidia_paths=gpu_nvidia_paths)
 
         ip = ["sudo", "ip"]
         iptables = ["sudo", "iptables", "-w"]
@@ -420,7 +465,14 @@ class TestLXD(TestCase):
             "-m", "comment", "--comment", "managed by launchpad-buildd"]
         setarch_cmd = "linux64" if get_arch_bits(arch) == 64 else "linux32"
         lxc = ["lxc", "exec", f"lp-xenial-{arch}", "--", setarch_cmd]
-        expected_args = [
+        expected_args = []
+        if gpu_nvidia:
+            expected_args.append(
+                Equals(
+                    ["/snap/lxd/current/bin/nvidia-container-cli.real",
+                     "list",
+                     "--binaries", "--firmwares", "--ipcs", "--libraries"]))
+        expected_args.extend([
             Equals(ip + ["link", "add", "dev", "lpbuilddbr0",
                          "type", "bridge"]),
             Equals(ip + ["addr", "add", "10.10.10.1/24",
@@ -452,7 +504,7 @@ class TestLXD(TestCase):
                 lxc +
                 ["mknod", "-m", "0660", "/dev/loop-control",
                  "c", "10", "237"]),
-            ]
+            ])
         for minor in range(256):
             expected_args.append(
                 Equals(
@@ -465,6 +517,8 @@ class TestLXD(TestCase):
                     lxc +
                     ["mknod", "-m", "0660", "/dev/dm-%d" % minor,
                      "b", str(DM_BLOCK_MAJOR), str(minor)]))
+        if gpu_nvidia:
+            expected_args.append(Equals(lxc + ["/sbin/ldconfig"]))
         expected_args.extend([
             Equals(
                 lxc + ["mkdir", "-p", "/etc/systemd/system/snapd.service.d"]),
@@ -594,6 +648,9 @@ class TestLXD(TestCase):
     def test_start_armhf_unmounts_cpuinfo(self):
         self.test_start(arch="armhf", unmounts_cpuinfo=True)
 
+    def test_start_gpu_nvidia(self):
+        self.test_start(gpu_nvidia=True)
+
     def test_run(self):
         processes_fixture = self.useFixture(FakeProcesses())
         processes_fixture.add(lambda _: {}, name="lxc")
diff --git a/lpbuildd/tests/test_ci.py b/lpbuildd/tests/test_ci.py
index d8a9ef9..1ab1408 100644
--- a/lpbuildd/tests/test_ci.py
+++ b/lpbuildd/tests/test_ci.py
@@ -62,7 +62,7 @@ class TestCIBuildManagerIteration(TestCase):
         return self.buildmanager._state
 
     @defer.inlineCallbacks
-    def startBuild(self, args=None, options=None):
+    def startBuild(self, args=None, options=None, constraints=None):
         # The build manager's iterate() kicks off the consecutive states
         # after INIT.
         extra_args = {
@@ -86,8 +86,11 @@ class TestCIBuildManagerIteration(TestCase):
         self.assertEqual(CIBuildState.PREPARE, self.getState())
         expected_command = [
             "sharepath/bin/in-target", "in-target", "run-ci-prepare",
-            "--backend=lxd", "--series=focal", "--arch=amd64", self.buildid,
+            "--backend=lxd", "--series=focal", "--arch=amd64",
             ]
+        for constraint in constraints or []:
+            expected_command.append("--constraint=%s" % constraint)
+        expected_command.append(self.buildid)
         if options is not None:
             expected_command.extend(options)
         self.assertEqual(expected_command, self.buildmanager.commands[-1])
@@ -399,3 +402,19 @@ class TestCIBuildManagerIteration(TestCase):
             "--clamav-database-url", "http://clamav.example/";,
             ]
         yield self.startBuild(args, expected_prepare_options)
+
+    @defer.inlineCallbacks
+    def test_constraints(self):
+        # The build manager passes constraints to subprocesses.
+        args = {
+            "builder_constraints": ["one", "two"],
+            "git_repository": "https://git.launchpad.test/~example/+git/ci";,
+            "git_path": "main",
+            "jobs": [[("build", "0")], [("test", "0")]],
+        }
+        expected_prepare_options = [
+            "--git-repository", "https://git.launchpad.test/~example/+git/ci";,
+            "--git-path", "main",
+        ]
+        yield self.startBuild(
+            args, expected_prepare_options, constraints=["one", "two"])
diff --git a/lpbuildd/tests/test_debian.py b/lpbuildd/tests/test_debian.py
index 0217c1a..993260e 100644
--- a/lpbuildd/tests/test_debian.py
+++ b/lpbuildd/tests/test_debian.py
@@ -78,6 +78,20 @@ class TestDebianBuildManagerIteration(TestCase):
         """Retrieve build manager's state."""
         return self.buildmanager._state
 
+    def test_no_constraints(self):
+        # If no `builder_constraints` argument is passed, the backend is set
+        # up with no constraints.
+        self.buildmanager.initiate({}, 'chroot.tar.gz', {'series': 'xenial'})
+        self.assertEqual([], self.buildmanager.backend.constraints)
+
+    def test_constraints(self):
+        # If a `builder_constraints` argument is passed, it is used to set
+        # up the backend's constraints.
+        self.buildmanager.initiate(
+            {}, 'chroot.tar.gz',
+            {'builder_constraints': ['gpu'], 'series': 'xenial'})
+        self.assertEqual(['gpu'], self.buildmanager.backend.constraints)
+
     def startBuild(self, extra_args):
         # The build manager's iterate() kicks off the consecutive states
         # after INIT.