launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #29555
[Merge] ~cjwatson/launchpad-buildd:lxd-gpu-nvidia into launchpad-buildd:master
Colin Watson has proposed merging ~cjwatson/launchpad-buildd:lxd-gpu-nvidia into launchpad-buildd:master.
Commit message:
Optionally pass through NVIDIA GPUs to LXD containers
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~cjwatson/launchpad-buildd/+git/launchpad-buildd/+merge/435898
The `nvidia-container-cli` invocation is nasty, but I don't see a better alternative in the absence of improved LXD support for privileged containers. The rest is mostly just plumbing.
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad-buildd:lxd-gpu-nvidia into launchpad-buildd:master.
diff --git a/debian/changelog b/debian/changelog
index 6af237d..6c2c622 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -5,6 +5,8 @@ launchpad-buildd (226) UNRELEASED; urgency=medium
in launchpad-buildd 127.
* Remove the old "distroseries_name" argument from the sourcepackagerecipe
manager, which duplicated the common "series" argument.
+ * If the "gpu-nvidia" constraint is specified, then pass through an NVIDIA
+ GPU to the LXD container.
-- Colin Watson <cjwatson@xxxxxxxxxx> Wed, 04 Jan 2023 12:58:36 +0000
diff --git a/lpbuildd/builder.py b/lpbuildd/builder.py
index 332d060..a94f53a 100644
--- a/lpbuildd/builder.py
+++ b/lpbuildd/builder.py
@@ -180,8 +180,10 @@ class BuildManager:
"--backend=%s" % self.backend_name,
"--series=%s" % self.series,
"--arch=%s" % self.arch_tag,
- self._buildid,
]
+ for constraint in self.constraints:
+ base_args.append("--constraint=%s" % constraint)
+ base_args.append(self._buildid)
self.runSubProcess(
self._intargetpath, base_args + list(args), **kwargs)
@@ -256,6 +258,7 @@ class BuildManager:
self.series = extra_args['series']
self.arch_tag = extra_args.get('arch_tag', self._builder.getArch())
self.fast_cleanup = extra_args.get('fast_cleanup', False)
+ self.constraints = extra_args.get('builder_constraints', [])
# Check whether this is a build in a private archive and
# whether the URLs in the buildlog file should be sanitized
@@ -266,7 +269,8 @@ class BuildManager:
self.backend = make_backend(
self.backend_name, self._buildid,
- series=self.series, arch=self.arch_tag)
+ series=self.series, arch=self.arch_tag,
+ constraints=self.constraints)
self.runSubProcess(self._preppath, ["builder-prep"])
diff --git a/lpbuildd/target/backend.py b/lpbuildd/target/backend.py
index a920cf1..28e3ef6 100644
--- a/lpbuildd/target/backend.py
+++ b/lpbuildd/target/backend.py
@@ -31,10 +31,11 @@ class Backend:
supports_snapd = False
- def __init__(self, build_id, series=None, arch=None):
+ def __init__(self, build_id, series=None, arch=None, constraints=None):
self.build_id = build_id
self.series = series
self.arch = arch
+ self.constraints = constraints or []
self.build_path = os.path.join(os.environ["HOME"], "build-" + build_id)
def create(self, image_path, image_type):
@@ -215,7 +216,7 @@ class Backend:
rmtree(tmp_dir)
-def make_backend(name, build_id, series=None, arch=None):
+def make_backend(name, build_id, series=None, arch=None, constraints=None):
if name == "chroot":
from lpbuildd.target.chroot import Chroot
backend_factory = Chroot
@@ -232,4 +233,6 @@ def make_backend(name, build_id, series=None, arch=None):
backend_factory = UncontainedBackend
else:
raise KeyError("Unknown backend: %s" % name)
- return backend_factory(build_id, series=series, arch=arch)
+ return backend_factory(
+ build_id, series=series, arch=arch, constraints=constraints
+ )
diff --git a/lpbuildd/target/chroot.py b/lpbuildd/target/chroot.py
index 80ad1b3..2dad812 100644
--- a/lpbuildd/target/chroot.py
+++ b/lpbuildd/target/chroot.py
@@ -20,8 +20,8 @@ from lpbuildd.util import (
class Chroot(Backend):
"""Sets up a chroot."""
- def __init__(self, build_id, series=None, arch=None):
- super().__init__(build_id, series=series, arch=arch)
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
self.chroot_path = os.path.join(self.build_path, "chroot-autobuild")
def create(self, image_path, image_type):
diff --git a/lpbuildd/target/lxd.py b/lpbuildd/target/lxd.py
index cd79030..d585911 100644
--- a/lpbuildd/target/lxd.py
+++ b/lpbuildd/target/lxd.py
@@ -43,6 +43,30 @@ def get_device_mapper_major():
"Cannot determine major device number for device-mapper")
+def get_nvidia_container_paths():
+ """Return the paths that need to be bind-mounted for NVIDIA CUDA support.
+
+ LXD's security.privileged=true and nvidia.runtime=true options are
+ unfortunately incompatible, but we can emulate the important bits of the
+ latter with some tactical bind-mounts. There is no very good way to do
+ this; this seems like the least unpleasant approach.
+ """
+ env = dict(os.environ)
+ env["LD_LIBRARY_PATH"] = "/snap/lxd/current/lib"
+ return subprocess.check_output(
+ [
+ "/snap/lxd/current/bin/nvidia-container-cli.real",
+ "list",
+ "--binaries",
+ "--firmwares",
+ "--ipcs",
+ "--libraries",
+ ],
+ env=env,
+ universal_newlines=True,
+ ).splitlines()
+
+
fallback_hosts = dedent("""\
127.0.0.1\tlocalhost
::1\tlocalhost ip6-localhost ip6-loopback
@@ -353,6 +377,14 @@ class LXD(Backend):
"pool": "default",
"type": "disk",
}
+ if "gpu-nvidia" in self.constraints:
+ devices["gpu"] = {"type": "gpu"}
+ for i, path in enumerate(get_nvidia_container_paths()):
+ devices[f"nvidia-{i}"] = {
+ "path": path,
+ "source": path,
+ "type": "disk",
+ }
self.client.profiles.create(self.profile_name, config, devices)
def start(self):
@@ -459,6 +491,11 @@ class LXD(Backend):
["mknod", "-m", "0660", "/dev/dm-%d" % minor,
"b", str(major), str(minor)])
+ if "gpu-nvidia" in self.constraints:
+ # We bind-mounted several libraries into the container, so run
+ # ldconfig to update the dynamic linker's cache.
+ self.run(["/sbin/ldconfig"])
+
# XXX cjwatson 2017-09-07: With LXD < 2.2 we can't create the
# directory until the container has started. We can get away with
# this for the time being because snapd isn't in the buildd chroots.
diff --git a/lpbuildd/target/operation.py b/lpbuildd/target/operation.py
index 63a3c64..724f8c0 100644
--- a/lpbuildd/target/operation.py
+++ b/lpbuildd/target/operation.py
@@ -22,13 +22,18 @@ class Operation:
parser.add_argument(
"--arch", metavar="ARCH", help="operate on architecture ARCH")
parser.add_argument(
+ "--constraint", metavar="CONSTRAINT", action="append",
+ dest="constraints",
+ help="add builder resource tag for this build")
+ parser.add_argument(
"build_id", metavar="ID", help="operate on build ID")
def __init__(self, args, parser):
self.args = args
self.backend = make_backend(
self.args.backend, self.args.build_id,
- series=self.args.series, arch=self.args.arch)
+ series=self.args.series, arch=self.args.arch,
+ constraints=self.args.constraints)
def run_build_command(self, args, env=None, **kwargs):
"""Run a build command in the target.
diff --git a/lpbuildd/target/tests/test_lxd.py b/lpbuildd/target/tests/test_lxd.py
index 39859c0..edb687a 100644
--- a/lpbuildd/target/tests/test_lxd.py
+++ b/lpbuildd/target/tests/test_lxd.py
@@ -274,7 +274,7 @@ class TestLXD(TestCase):
"lp-xenial-amd64", "lp-xenial-amd64")
def assert_correct_profile(self, extra_raw_lxc_config=None,
- driver_version="2.0"):
+ driver_version="2.0", gpu_nvidia_paths=False):
if extra_raw_lxc_config is None:
extra_raw_lxc_config = []
@@ -328,6 +328,14 @@ class TestLXD(TestCase):
"pool": "default",
"type": "disk",
}
+ if gpu_nvidia_paths:
+ expected_devices["gpu"] = {"type": "gpu"}
+ for i, path in enumerate(gpu_nvidia_paths):
+ expected_devices[f"nvidia-{i}"] = {
+ "path": path,
+ "source": path,
+ "type": "disk",
+ }
client.profiles.create.assert_called_once_with(
"lpbuildd", expected_config, expected_devices)
@@ -359,6 +367,28 @@ class TestLXD(TestCase):
driver_version=driver_version or "3.0"
)
+ def test_create_profile_gpu_nvidia(self):
+ with MockPatch("pylxd.Client"):
+ client = pylxd.Client()
+ client.reset_mock()
+ client.profiles.get.side_effect = FakeLXDAPIException
+ client.host_info = {"environment": {"driver_version": "3.0"}}
+ gpu_nvidia_paths = [
+ "/usr/bin/nvidia-smi",
+ "/usr/bin/nvidia-persistenced",
+ ]
+ processes_fixture = self.useFixture(FakeProcesses())
+ processes_fixture.add(
+ lambda _: {
+ "stdout": io.StringIO(
+ "".join(f"{path}\n" for path in gpu_nvidia_paths)),
+ },
+ name="/snap/lxd/current/bin/nvidia-container-cli.real")
+ backend = LXD("1", "xenial", "amd64", constraints=["gpu-nvidia"])
+ backend.create_profile()
+ self.assert_correct_profile(
+ driver_version="3.0", gpu_nvidia_paths=gpu_nvidia_paths)
+
def fakeFS(self):
fs_fixture = self.useFixture(FakeFilesystem())
fs_fixture.add("/proc")
@@ -379,7 +409,8 @@ class TestLXD(TestCase):
# XXX cjwatson 2022-08-25: Refactor this to use some more sensible kind
# of test parameterization.
- def test_start(self, arch="amd64", unmounts_cpuinfo=False):
+ def test_start(self, arch="amd64", unmounts_cpuinfo=False,
+ gpu_nvidia=False):
self.fakeFS()
DM_BLOCK_MAJOR = random.randrange(128, 255)
with open("/proc/devices", "w") as f:
@@ -404,15 +435,29 @@ class TestLXD(TestCase):
processes_fixture.add(lambda _: {}, name="lxc")
processes_fixture.add(
FakeHostname("example", "example.buildd"), name="hostname")
+ if gpu_nvidia:
+ gpu_nvidia_paths = [
+ "/usr/bin/nvidia-smi",
+ "/usr/bin/nvidia-persistenced",
+ ]
+ processes_fixture.add(
+ lambda _: {
+ "stdout": io.StringIO(
+ "".join(f"{path}\n" for path in gpu_nvidia_paths)),
+ },
+ name="/snap/lxd/current/bin/nvidia-container-cli.real")
+ else:
+ gpu_nvidia_paths = None
with mock.patch.object(
LXD,
"path_exists",
side_effect=lambda path: path in existing_files
):
- LXD("1", "xenial", arch).start()
+ constraints = ["gpu-nvidia"] if gpu_nvidia else []
+ LXD("1", "xenial", arch, constraints=constraints).start()
- self.assert_correct_profile()
+ self.assert_correct_profile(gpu_nvidia_paths=gpu_nvidia_paths)
ip = ["sudo", "ip"]
iptables = ["sudo", "iptables", "-w"]
@@ -420,7 +465,14 @@ class TestLXD(TestCase):
"-m", "comment", "--comment", "managed by launchpad-buildd"]
setarch_cmd = "linux64" if get_arch_bits(arch) == 64 else "linux32"
lxc = ["lxc", "exec", f"lp-xenial-{arch}", "--", setarch_cmd]
- expected_args = [
+ expected_args = []
+ if gpu_nvidia:
+ expected_args.append(
+ Equals(
+ ["/snap/lxd/current/bin/nvidia-container-cli.real",
+ "list",
+ "--binaries", "--firmwares", "--ipcs", "--libraries"]))
+ expected_args.extend([
Equals(ip + ["link", "add", "dev", "lpbuilddbr0",
"type", "bridge"]),
Equals(ip + ["addr", "add", "10.10.10.1/24",
@@ -452,7 +504,7 @@ class TestLXD(TestCase):
lxc +
["mknod", "-m", "0660", "/dev/loop-control",
"c", "10", "237"]),
- ]
+ ])
for minor in range(256):
expected_args.append(
Equals(
@@ -465,6 +517,8 @@ class TestLXD(TestCase):
lxc +
["mknod", "-m", "0660", "/dev/dm-%d" % minor,
"b", str(DM_BLOCK_MAJOR), str(minor)]))
+ if gpu_nvidia:
+ expected_args.append(Equals(lxc + ["/sbin/ldconfig"]))
expected_args.extend([
Equals(
lxc + ["mkdir", "-p", "/etc/systemd/system/snapd.service.d"]),
@@ -594,6 +648,9 @@ class TestLXD(TestCase):
def test_start_armhf_unmounts_cpuinfo(self):
self.test_start(arch="armhf", unmounts_cpuinfo=True)
+ def test_start_gpu_nvidia(self):
+ self.test_start(gpu_nvidia=True)
+
def test_run(self):
processes_fixture = self.useFixture(FakeProcesses())
processes_fixture.add(lambda _: {}, name="lxc")
diff --git a/lpbuildd/tests/test_ci.py b/lpbuildd/tests/test_ci.py
index d8a9ef9..1ab1408 100644
--- a/lpbuildd/tests/test_ci.py
+++ b/lpbuildd/tests/test_ci.py
@@ -62,7 +62,7 @@ class TestCIBuildManagerIteration(TestCase):
return self.buildmanager._state
@defer.inlineCallbacks
- def startBuild(self, args=None, options=None):
+ def startBuild(self, args=None, options=None, constraints=None):
# The build manager's iterate() kicks off the consecutive states
# after INIT.
extra_args = {
@@ -86,8 +86,11 @@ class TestCIBuildManagerIteration(TestCase):
self.assertEqual(CIBuildState.PREPARE, self.getState())
expected_command = [
"sharepath/bin/in-target", "in-target", "run-ci-prepare",
- "--backend=lxd", "--series=focal", "--arch=amd64", self.buildid,
+ "--backend=lxd", "--series=focal", "--arch=amd64",
]
+ for constraint in constraints or []:
+ expected_command.append("--constraint=%s" % constraint)
+ expected_command.append(self.buildid)
if options is not None:
expected_command.extend(options)
self.assertEqual(expected_command, self.buildmanager.commands[-1])
@@ -399,3 +402,19 @@ class TestCIBuildManagerIteration(TestCase):
"--clamav-database-url", "http://clamav.example/",
]
yield self.startBuild(args, expected_prepare_options)
+
+ @defer.inlineCallbacks
+ def test_constraints(self):
+ # The build manager passes constraints to subprocesses.
+ args = {
+ "builder_constraints": ["one", "two"],
+ "git_repository": "https://git.launchpad.test/~example/+git/ci",
+ "git_path": "main",
+ "jobs": [[("build", "0")], [("test", "0")]],
+ }
+ expected_prepare_options = [
+ "--git-repository", "https://git.launchpad.test/~example/+git/ci",
+ "--git-path", "main",
+ ]
+ yield self.startBuild(
+ args, expected_prepare_options, constraints=["one", "two"])
diff --git a/lpbuildd/tests/test_debian.py b/lpbuildd/tests/test_debian.py
index 0217c1a..993260e 100644
--- a/lpbuildd/tests/test_debian.py
+++ b/lpbuildd/tests/test_debian.py
@@ -78,6 +78,20 @@ class TestDebianBuildManagerIteration(TestCase):
"""Retrieve build manager's state."""
return self.buildmanager._state
+ def test_no_constraints(self):
+ # If no `builder_constraints` argument is passed, the backend is set
+ # up with no constraints.
+ self.buildmanager.initiate({}, 'chroot.tar.gz', {'series': 'xenial'})
+ self.assertEqual([], self.buildmanager.backend.constraints)
+
+ def test_constraints(self):
+ # If a `builder_constraints` argument is passed, it is used to set
+ # up the backend's constraints.
+ self.buildmanager.initiate(
+ {}, 'chroot.tar.gz',
+ {'builder_constraints': ['gpu'], 'series': 'xenial'})
+ self.assertEqual(['gpu'], self.buildmanager.backend.constraints)
+
def startBuild(self, extra_args):
# The build manager's iterate() kicks off the consecutive states
# after INIT.