launchpad-reviewers team mailing list archive

Thread
Date

[Merge] ~cjwatson/lpcraft:gpu-nvidia into lpcraft:main

To: mp+435909@xxxxxxxxxxxxxxxxxx
From: Colin Watson <mp+435909@xxxxxxxxxxxxxxxxxx>
Date: Tue, 17 Jan 2023 22:46:19 -0000
Reply-to: mp+435909@xxxxxxxxxxxxxxxxxx
Sender: noreply@xxxxxxxxxxxxx

Colin Watson has proposed merging ~cjwatson/lpcraft:gpu-nvidia into lpcraft:main.

Commit message:
Add --gpu-nvidia option

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/lpcraft/+git/lpcraft/+merge/435909

This allows us to pass through an NVIDIA GPU from the host system to the container.  Jobs will need to install CUDA binaries of some kind in order for this to be useful.

`craft-providers` doesn't make it very straightforward to pass in additional container configuration, but I managed to make it work by adjusting the per-project profile before launching the container.

I've marked the option as experimental: we need to get this in place soon, but we might find that we need to change it later, in which case coordinated changes to `lpcraft` and `launchpad-buildd` may be required.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/lpcraft:gpu-nvidia into lpcraft:main.

diff --git a/NEWS.rst b/NEWS.rst
index 1bb0c17..4572f3a 100644
--- a/NEWS.rst
+++ b/NEWS.rst
@@ -2,6 +2,11 @@
 Version history
 ===============
 
+0.0.41 (unreleased)
+===================
+
+- Add experimental ``--gpu-nvidia`` option.
+
 0.0.40 (2023-01-13)
 ===================
 
diff --git a/docs/cli-interface.rst b/docs/cli-interface.rst
index afb6907..24a6bd6 100644
--- a/docs/cli-interface.rst
+++ b/docs/cli-interface.rst
@@ -49,6 +49,10 @@ lpcraft run optional arguments
 
   This option is repeatable.
 
+- ``--gpu-nvidia`` (experimental)
+
+  This option requires an NVIDIA GPU on the host system.
+
 lpcraft run-one
 ---------------
 
diff --git a/lpcraft/commands/run.py b/lpcraft/commands/run.py
index dcdb200..c0e00d9 100644
--- a/lpcraft/commands/run.py
+++ b/lpcraft/commands/run.py
@@ -469,6 +469,7 @@ def _run_job(
     env_from_cli: Optional[List[str]] = None,
     plugin_settings: Optional[List[str]] = None,
     secrets: Optional[Dict[str, str]] = None,
+    gpu_nvidia: bool = False,
 ) -> None:
     """Run a single job."""
     # XXX jugmac00 2022-04-27: we should create a configuration object to be
@@ -538,6 +539,7 @@ def _run_job(
         project_path=cwd,
         series=job.series,
         architecture=host_architecture,
+        gpu_nvidia=gpu_nvidia,
     ) as instance:
         snaps = list(itertools.chain(*pm.hook.lpcraft_install_snaps()))
         for snap in snaps:
@@ -681,6 +683,15 @@ class RunCommand(BaseCommand):
             dest="package_repositories",
             help="Provide an additional package repository.",
         )
+        parser.add_argument(
+            "--gpu-nvidia",
+            action="store_true",
+            default=False,
+            help=(
+                "Pass through an NVIDIA GPU from the host system.  "
+                "(Experimental option, subject to change.)"
+            ),
+        )
 
     def run(self, args: Namespace) -> int:
         """Run the command."""
@@ -730,6 +741,7 @@ class RunCommand(BaseCommand):
                             env_from_cli=args.set_env,
                             plugin_settings=args.plugin_setting,
                             secrets=secrets,
+                            gpu_nvidia=args.gpu_nvidia,
                         )
 
                 except CommandError as e:
@@ -836,6 +848,15 @@ class RunOneCommand(BaseCommand):
             dest="package_repositories",
             help="Provide an additional package repository.",
         )
+        parser.add_argument(
+            "--gpu-nvidia",
+            action="store_true",
+            default=False,
+            help=(
+                "Pass through an NVIDIA GPU from the host system.  "
+                "(Experimental option, subject to change.)"
+            ),
+        )
 
     def run(self, args: Namespace) -> int:
         """Run the command."""
@@ -885,6 +906,7 @@ class RunOneCommand(BaseCommand):
                 env_from_cli=args.set_env,
                 plugin_settings=args.plugin_setting,
                 secrets=secrets,
+                gpu_nvidia=args.gpu_nvidia,
             )
         finally:
             if args.clean:
diff --git a/lpcraft/commands/tests/test_run.py b/lpcraft/commands/tests/test_run.py
index aaa2c02..4466295 100644
--- a/lpcraft/commands/tests/test_run.py
+++ b/lpcraft/commands/tests/test_run.py
@@ -12,7 +12,7 @@ from typing import Any, AnyStr, Dict, List, Optional
 from unittest.mock import ANY, Mock, call, patch
 
 import responses
-from craft_providers.lxd import launch
+from craft_providers.lxd import LXC, launch
 from fixtures import TempDir
 from testtools.matchers import MatchesStructure
 
@@ -3075,6 +3075,88 @@ class TestRun(RunBaseTestCase):
             json.loads((job_output / "properties").read_text()),
         )
 
+    @patch("lpcraft.commands.run.get_provider")
+    @patch("lpcraft.commands.run.get_host_architecture", return_value="amd64")
+    def test_no_gpu_nvidia_option(
+        self, mock_get_host_architecture, mock_get_provider
+    ):
+        # Without --gpu-nvidia, containers are launched with a basic profile.
+        lxc = Mock(spec=LXC)
+        lxc.profile_show.return_value = {"config": {}, "devices": {}}
+        lxc.project_list.return_value = []
+        lxc.remote_list.return_value = {}
+        launcher = Mock(spec=launch)
+        provider = makeLXDProvider(lxc=lxc, lxd_launcher=launcher)
+        mock_get_provider.return_value = provider
+        execute_run = launcher.return_value.execute_run
+        execute_run.return_value = subprocess.CompletedProcess([], 0)
+        config = dedent(
+            """
+            pipeline:
+                - test
+
+            jobs:
+                test:
+                    series: focal
+                    architectures: amd64
+                    run: echo test
+            """
+        )
+        Path(".launchpad.yaml").write_text(config)
+
+        result = self.run_command("run")
+
+        self.assertEqual(0, result.exit_code)
+        lxc.profile_edit.assert_called_once_with(
+            profile="default",
+            config={"config": {}, "devices": {}},
+            project="test-project",
+            remote="test-remote",
+        )
+
+    @patch("lpcraft.commands.run.get_provider")
+    @patch("lpcraft.commands.run.get_host_architecture", return_value="amd64")
+    def test_gpu_nvidia_option(
+        self, mock_get_host_architecture, mock_get_provider
+    ):
+        # With --gpu-nvidia, containers are launched with a profile that
+        # enables GPU passthrough.
+        lxc = Mock(spec=LXC)
+        lxc.profile_show.return_value = {"config": {}, "devices": {}}
+        lxc.project_list.return_value = []
+        lxc.remote_list.return_value = {}
+        launcher = Mock(spec=launch)
+        provider = makeLXDProvider(lxc=lxc, lxd_launcher=launcher)
+        mock_get_provider.return_value = provider
+        execute_run = launcher.return_value.execute_run
+        execute_run.return_value = subprocess.CompletedProcess([], 0)
+        config = dedent(
+            """
+            pipeline:
+                - test
+
+            jobs:
+                test:
+                    series: focal
+                    architectures: amd64
+                    run: echo test
+            """
+        )
+        Path(".launchpad.yaml").write_text(config)
+
+        result = self.run_command("run", "--gpu-nvidia")
+
+        self.assertEqual(0, result.exit_code)
+        lxc.profile_edit.assert_called_once_with(
+            profile="default",
+            config={
+                "config": {"nvidia.runtime": "true"},
+                "devices": {"gpu": {"type": "gpu"}},
+            },
+            project="test-project",
+            remote="test-remote",
+        )
+
 
 class TestRunOne(RunBaseTestCase):
     def test_config_file_not_under_project_directory(self):
@@ -4214,3 +4296,85 @@ class TestRunOne(RunBaseTestCase):
             {"foo": "bar", "license": {"path": "LICENSE.txt", "spdx": None}},
             json.loads((job_output / "properties").read_text()),
         )
+
+    @patch("lpcraft.commands.run.get_provider")
+    @patch("lpcraft.commands.run.get_host_architecture", return_value="amd64")
+    def test_no_gpu_nvidia_option(
+        self, mock_get_host_architecture, mock_get_provider
+    ):
+        # Without --gpu-nvidia, containers are launched with a basic profile.
+        lxc = Mock(spec=LXC)
+        lxc.profile_show.return_value = {"config": {}, "devices": {}}
+        lxc.project_list.return_value = []
+        lxc.remote_list.return_value = {}
+        launcher = Mock(spec=launch)
+        provider = makeLXDProvider(lxc=lxc, lxd_launcher=launcher)
+        mock_get_provider.return_value = provider
+        execute_run = launcher.return_value.execute_run
+        execute_run.return_value = subprocess.CompletedProcess([], 0)
+        config = dedent(
+            """
+            pipeline:
+                - test
+
+            jobs:
+                test:
+                    series: focal
+                    architectures: amd64
+                    run: echo test
+            """
+        )
+        Path(".launchpad.yaml").write_text(config)
+
+        result = self.run_command("run-one", "test", "0")
+
+        self.assertEqual(0, result.exit_code)
+        lxc.profile_edit.assert_called_once_with(
+            profile="default",
+            config={"config": {}, "devices": {}},
+            project="test-project",
+            remote="test-remote",
+        )
+
+    @patch("lpcraft.commands.run.get_provider")
+    @patch("lpcraft.commands.run.get_host_architecture", return_value="amd64")
+    def test_gpu_nvidia_option(
+        self, mock_get_host_architecture, mock_get_provider
+    ):
+        # With --gpu-nvidia, containers are launched with a profile that
+        # enables GPU passthrough.
+        lxc = Mock(spec=LXC)
+        lxc.profile_show.return_value = {"config": {}, "devices": {}}
+        lxc.project_list.return_value = []
+        lxc.remote_list.return_value = {}
+        launcher = Mock(spec=launch)
+        provider = makeLXDProvider(lxc=lxc, lxd_launcher=launcher)
+        mock_get_provider.return_value = provider
+        execute_run = launcher.return_value.execute_run
+        execute_run.return_value = subprocess.CompletedProcess([], 0)
+        config = dedent(
+            """
+            pipeline:
+                - test
+
+            jobs:
+                test:
+                    series: focal
+                    architectures: amd64
+                    run: echo test
+            """
+        )
+        Path(".launchpad.yaml").write_text(config)
+
+        result = self.run_command("run-one", "--gpu-nvidia", "test", "0")
+
+        self.assertEqual(0, result.exit_code)
+        lxc.profile_edit.assert_called_once_with(
+            profile="default",
+            config={
+                "config": {"nvidia.runtime": "true"},
+                "devices": {"gpu": {"type": "gpu"}},
+            },
+            project="test-project",
+            remote="test-remote",
+        )
diff --git a/lpcraft/providers/_base.py b/lpcraft/providers/_base.py
index 6ca78e2..081837c 100644
--- a/lpcraft/providers/_base.py
+++ b/lpcraft/providers/_base.py
@@ -108,6 +108,7 @@ class Provider(ABC):
         project_path: Path,
         series: str,
         architecture: str,
+        gpu_nvidia: bool = False,
     ) -> Generator[lxd.LXDInstance, None, None]:
         """Launch environment for specified series and architecture.
 
@@ -115,4 +116,6 @@ class Provider(ABC):
         :param project_path: Path to project.
         :param series: Distribution series name.
         :param architecture: Targeted architecture name.
+        :param gpu_nvidia: If True, pass through an NVIDIA GPU from the host
+            to the environment.
         """
diff --git a/lpcraft/providers/_lxd.py b/lpcraft/providers/_lxd.py
index c69a72c..0d325a6 100644
--- a/lpcraft/providers/_lxd.py
+++ b/lpcraft/providers/_lxd.py
@@ -246,6 +246,7 @@ class LXDProvider(Provider):
         project_path: Path,
         series: str,
         architecture: str,
+        gpu_nvidia: bool = False,
     ) -> Generator[lxd.LXDInstance, None, None]:
         """Launch environment for specified series and architecture.
 
@@ -270,6 +271,31 @@ class LXDProvider(Provider):
             alias=alias, environment=environment, hostname=instance_name
         )
 
+        if self.lxd_project not in self.lxc.project_list(self.lxd_remote):
+            self.lxc.project_create(
+                project=self.lxd_project, remote=self.lxd_remote
+            )
+        # Copy the default profile from the default project and adjust it
+        # for our needs.  Unfortunately we have to edit the default profile
+        # in our project since there's no way to get craft-providers to use
+        # a different profile, but at least the profile is within the scope
+        # of the project so shouldn't affect other users of LXD.
+        profile = self.lxc.profile_show(
+            profile="default", project="default", remote=self.lxd_remote
+        )
+        if gpu_nvidia:
+            profile["config"]["nvidia.runtime"] = "true"
+            profile["devices"]["gpu"] = {"type": "gpu"}
+        else:
+            profile["config"].pop("nvidia.runtime", None)
+            profile["devices"].pop("gpu", None)
+        self.lxc.profile_edit(
+            profile="default",
+            config=profile,
+            project=self.lxd_project,
+            remote=self.lxd_remote,
+        )
+
         try:
             instance = self.lxd_launcher(
                 name=instance_name,
diff --git a/lpcraft/providers/tests/__init__.py b/lpcraft/providers/tests/__init__.py
index e3e9569..bb2254c 100644
--- a/lpcraft/providers/tests/__init__.py
+++ b/lpcraft/providers/tests/__init__.py
@@ -41,6 +41,8 @@ def makeLXDProvider(
     """Create a custom LXDProvider for tests."""
     if lxc is None:
         lxc = Mock(spec=LXC)
+        lxc.profile_show.return_value = {"config": {}, "devices": {}}
+        lxc.project_list.return_value = []
         lxc.remote_list.return_value = {}
     lxd_installer = FakeLXDInstaller(
         can_install=can_install,
diff --git a/lpcraft/providers/tests/test_lxd.py b/lpcraft/providers/tests/test_lxd.py
index f12767c..5df5fc5 100644
--- a/lpcraft/providers/tests/test_lxd.py
+++ b/lpcraft/providers/tests/test_lxd.py
@@ -414,6 +414,11 @@ class TestLXDProvider(TestCase):
     def test_launched_environment(self):
         expected_instance_name = "lpcraft-my-project-12345-focal-amd64"
         mock_lxc = Mock(spec=LXC)
+        mock_lxc.profile_show.return_value = {
+            "config": {"sentinel": "true"},
+            "devices": {"eth0": {}},
+        }
+        mock_lxc.project_list.return_value = []
         mock_lxc.remote_list.return_value = {}
         mock_launcher = Mock(spec=launch)
         provider = makeLXDProvider(lxc=mock_lxc, lxd_launcher=mock_launcher)
@@ -426,6 +431,22 @@ class TestLXDProvider(TestCase):
         ) as instance:
             self.assertIsNotNone(instance)
             mock_lxc.remote_add.assert_called_once()
+            mock_lxc.project_list.assert_called_once_with("test-remote")
+            mock_lxc.project_create.assert_called_once_with(
+                project="test-project", remote="test-remote"
+            )
+            mock_lxc.profile_show.assert_called_once_with(
+                profile="default", project="default", remote="test-remote"
+            )
+            mock_lxc.profile_edit.assert_called_once_with(
+                profile="default",
+                config={
+                    "config": {"sentinel": "true"},
+                    "devices": {"eth0": {}},
+                },
+                project="test-project",
+                remote="test-remote",
+            )
             self.assertEqual(
                 [
                     call(
@@ -694,3 +715,75 @@ class TestLXDProvider(TestCase):
                 pass  # pragma: no cover
 
         self.assertIs(error, raised.exception.__cause__)
+
+    def test_launched_environment_reuses_existing_profile(self):
+        mock_lxc = Mock(spec=LXC)
+        mock_lxc.profile_show.return_value = {"config": {}, "devices": {}}
+        mock_lxc.project_list.return_value = ["test-project"]
+        mock_lxc.remote_list.return_value = {"test-remote": {}}
+        mock_launcher = Mock(spec=launch)
+        provider = makeLXDProvider(lxc=mock_lxc, lxd_launcher=mock_launcher)
+
+        with provider.launched_environment(
+            project_name="my-project",
+            project_path=self.mock_path,
+            series="focal",
+            architecture="amd64",
+        ) as instance:
+            self.assertIsNotNone(instance)
+            mock_lxc.project_create.assert_not_called()
+
+    def test_launched_environment_removes_gpu_nvidia_configuration(self):
+        # With gpu_nvidia=False, launched_environment removes any existing
+        # NVIDIA GPU configuration from the default profile.
+        mock_lxc = Mock(spec=LXC)
+        mock_lxc.profile_show.return_value = {
+            "config": {"nvidia.runtime": "true"},
+            "devices": {"gpu": {"type": "gpu"}},
+        }
+        mock_lxc.project_list.return_value = []
+        mock_lxc.remote_list.return_value = {}
+        mock_launcher = Mock(spec=launch)
+        provider = makeLXDProvider(lxc=mock_lxc, lxd_launcher=mock_launcher)
+
+        with provider.launched_environment(
+            project_name="my-project",
+            project_path=self.mock_path,
+            series="focal",
+            architecture="amd64",
+        ) as instance:
+            self.assertIsNotNone(instance)
+            mock_lxc.profile_edit.assert_called_once_with(
+                profile="default",
+                config={"config": {}, "devices": {}},
+                project="test-project",
+                remote="test-remote",
+            )
+
+    def test_launched_environment_adds_gpu_nvidia_configuration(self):
+        # With gpu_nvidia=True, launched_environment adds NVIDIA GPU
+        # configuration to the default profile.
+        mock_lxc = Mock(spec=LXC)
+        mock_lxc.profile_show.return_value = {"config": {}, "devices": {}}
+        mock_lxc.project_list.return_value = []
+        mock_lxc.remote_list.return_value = {}
+        mock_launcher = Mock(spec=launch)
+        provider = makeLXDProvider(lxc=mock_lxc, lxd_launcher=mock_launcher)
+
+        with provider.launched_environment(
+            project_name="my-project",
+            project_path=self.mock_path,
+            series="focal",
+            architecture="amd64",
+            gpu_nvidia=True,
+        ) as instance:
+            self.assertIsNotNone(instance)
+            mock_lxc.profile_edit.assert_called_once_with(
+                profile="default",
+                config={
+                    "config": {"nvidia.runtime": "true"},
+                    "devices": {"gpu": {"type": "gpu"}},
+                },
+                project="test-project",
+                remote="test-remote",
+            )
diff --git a/setup.cfg b/setup.cfg
index a3d28ac..3fb0327 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = lpcraft
-version = 0.0.40
+version = 0.0.41.dev0
 description = Runner for Launchpad CI jobs
 long_description = file: README.rst
 long_description_content_type = text/x-rst