launchpad-reviewers team mailing list archive

Thread
Date
Re: [Merge] ~tushar5526/launchpad:add-support-for-parallel-publishing into launchpad:master

To: Tushar Gupta <tushar.gupta@xxxxxxxxxxxxx>
From: Guruprasad <mp+480435@xxxxxxxxxxxxxxxxxx>
Date: Mon, 03 Feb 2025 06:25:15 -0000
In-reply-to: <173855774112.1014596.12293980375064144073.launchpad@scripts-bzrsyncd.lp.internal>
Reply-to: mp+480435@xxxxxxxxxxxxxxxxxx
Sender: noreply@xxxxxxxxxxxxx

Diff comments:

> diff --git a/lib/lp/archivepublisher/scripts/base.py b/lib/lp/archivepublisher/scripts/base.py
> index f590791..1182032 100644
> --- a/lib/lp/archivepublisher/scripts/base.py
> +++ b/lib/lp/archivepublisher/scripts/base.py
> @@ -36,6 +37,34 @@ class PublisherScript(LaunchpadCronScript):
>              help="Publish all Ubuntu-derived distributions.",
>          )
>  
> +    def addParallelPublisherOptions(self):

Not all these options are exclusive to the parallel publisher setup. So is there a different, more appropriate name to use here?

> +        self.parser.add_option(
> +            "--archive",
> +            action="append",
> +            dest="archives",
> +            metavar="REFERENCE",
> +            default=[],
> +            help="Only run over the archives identified by this reference. "
> +            "You can specify multiple archives by repeating the option",
> +        )
> +
> +        self.parser.add_option(
> +            "--exclude",
> +            action="append",
> +            dest="excluded_archives",
> +            metavar="REFERENCE",
> +            default=[],
> +            help="Skip the archives identified by this reference in the "
> +            "publisher run. You can specify multiple archives by repeating "
> +            "the option",
> +        )
> +
> +        self.parser.add_option(
> +            "--lockfilename",
> +            dest="lockfilename",
> +            help="lockfilename to be used by the publisher run",
> +        )
> +
>      def findSelectedDistro(self):
>          """Find the `Distribution` named by the --distribution option.
>  
> @@ -62,3 +91,35 @@ class PublisherScript(LaunchpadCronScript):
>              return self.findDerivedDistros()
>          else:
>              return [self.findSelectedDistro()]
> +
> +    def findArchives(self, archive_references, distribution=None):
> +        """
> +        Retrieve a list of archives based on the provided references and
> +        optional distribution.
> +
> +        Args:
> +            archive_references (list): A list of archive references to
> +            retrieve.
> +            distribution (str, optional): The distribution to filter archives
> +            by. Defaults to None.
> +
> +        Returns:
> +            list: A list of archives that match the provided references and
> +            distribution.
> +        """
> +        if not archive_references:
> +            return []
> +
> +        archives = []
> +        for reference in archive_references:
> +            archive = getUtility(IArchiveSet).getByReference(reference)

Instead of doing 1 query per provided archive reference, can we do a bulk loading here using one query?

> +            if not archive:
> +                self.logger.warning(
> +                    "Cannot find the excluded archive with reference: '%s', "

I don't think this function is called and used only to determine the excluded archives. So this warning message needs to be reworded to accommodate that.

> +                    % reference
> +                )
> +                continue
> +            if distribution and archive.distribution != distribution:
> +                continue
> +            archives.append(archive)
> +        return archives
> diff --git a/lib/lp/archivepublisher/scripts/processaccepted.py b/lib/lp/archivepublisher/scripts/processaccepted.py
> index da27fb9..dc46718 100644
> --- a/lib/lp/archivepublisher/scripts/processaccepted.py
> +++ b/lib/lp/archivepublisher/scripts/processaccepted.py
> @@ -62,6 +63,18 @@ class ProcessAccepted(PublisherScript):
>              help="Run only over COPY archives.",
>          )
>  
> +    def countExclusiveOptions(self):
> +        """Return the number of exclusive "mode" options that were set.
> +
> +        In valid use, at most one of them should be set.

Is there a reason for using this instead of creating mutually exclusive `argparse` options (https://docs.python.org/3.8/library/argparse.html#argparse.ArgumentParser.add_mutually_exclusive_group)?

> +        """
> +        exclusive_options = [
> +            self.options.ppa,
> +            self.options.copy_archives,
> +            self.options.archives,
> +        ]
> +        return len(list(filter(None, exclusive_options)))
> +
>      def validateArguments(self):
>          """Validate command-line arguments."""
>          if self.options.ppa and self.options.copy_archives:
> diff --git a/lib/lp/archivepublisher/scripts/publishdistro.py b/lib/lp/archivepublisher/scripts/publishdistro.py
> index 38ee406..90c8116 100644
> --- a/lib/lp/archivepublisher/scripts/publishdistro.py
> +++ b/lib/lp/archivepublisher/scripts/publishdistro.py
> @@ -375,20 +371,32 @@ class PublishDistro(PublisherScript):
>  
>      def getTargetArchives(self, distribution):
>          """Find the archive(s) selected by the script's options."""
> -        if self.options.archive:
> -            archive = getUtility(IArchiveSet).getByReference(
> -                self.options.archive
> -            )
> -            if archive.distribution == distribution:
> -                return [archive]
> -            else:
> -                return []
> +        if self.options.archives:
> +            return self.findArchives(self.options.archives, distribution)
>          elif self.options.partner:
>              return [distribution.getArchiveByComponent("partner")]
>          elif self.options.ppa:
> -            return filter(is_ppa_public, self.getPPAs(distribution))
> +            return [
> +                archive
> +                for archive in filter(
> +                    is_ppa_public, self.getPPAs(distribution)
> +                )
> +                if archive
> +                not in self.findArchives(
> +                    self.options.excluded_archives, distribution
> +                )
> +            ]

Would this be simpler to write and/or easier to read and understand if we used set difference here?

>          elif self.options.private_ppa:
> -            return filter(is_ppa_private, self.getPPAs(distribution))
> +            return [
> +                archive
> +                for archive in filter(
> +                    is_ppa_private, self.getPPAs(distribution)
> +                )
> +                if archive
> +                not in self.findArchives(
> +                    self.options.excluded_archives, distribution
> +                )
> +            ]
>          elif self.options.copy_archive:
>              return self.getCopyArchives(distribution)
>          else:
> @@ -598,6 +606,10 @@ class PublishDistro(PublisherScript):
>                  Store.of(archive).reset()
>  
>      def rsyncOVALData(self):
> +        # rsync supports --exclude option to exclude specific folders to

`        # rsync supports --exclude option to specify folders to`

> +        # exclude. We can use that to exclude rsync for excluded PPAs and

`        # exclude. We can use that to skip rsync for excluded PPAs and` since there are too many excludes in this paragraph already? :)

> +        # then we have to add in additional functionality to run it
> +        # specifically for included archive.
>          # Ensure that the rsync paths have a trailing slash.
>          rsync_src = os.path.join(
>              config.archivepublisher.oval_data_rsync_endpoint, ""
> @@ -634,6 +646,9 @@ class PublishDistro(PublisherScript):
>          """Compare the published OVAL files with the incoming one."""
>          start_dir = Path(config.archivepublisher.oval_data_root)
>          archive_set = getUtility(IArchiveSet)
> +        excluded_archives = self.findArchives(
> +            self.options.excluded_archives, distribution
> +        )

We are making so many of these `findArchives()` calls to get the included and excluded archives. Is there a way to make them once and reuse the results everywhere?

>          for owner_path in start_dir.iterdir():
>              if not owner_path.name.startswith("~"):
>                  continue
> @@ -653,6 +673,15 @@ class PublishDistro(PublisherScript):
>                          archive_path.name,
>                      )
>                      continue
> +                if archive in excluded_archives:
> +                    self.logger.info(

I don't think it is necessary to log the skipping of OVAL data rsync for excluded archives. The log statement above serves to warn when an included PPA has no matching archive found but in this case, exclusion is an intentional action and hence, imho, this adds nothing useful.

> +                        "Skipping OVAL data for '~%s/%s/%s' "
> +                        "(archive excluded from the publisher run).",
> +                        owner_path.name[1:],
> +                        distribution.name,
> +                        archive_path.name,
> +                    )
> +                    continue
>                  for suite_path in archive_path.iterdir():
>                      try:
>                          series, pocket = distribution.getDistroSeriesAndPocket(
> diff --git a/lib/lp/archivepublisher/tests/test_processaccepted.py b/lib/lp/archivepublisher/tests/test_processaccepted.py
> index 34c459e..8c95f7a 100644
> --- a/lib/lp/archivepublisher/tests/test_processaccepted.py
> +++ b/lib/lp/archivepublisher/tests/test_processaccepted.py
> @@ -269,3 +270,104 @@ class TestProcessAccepted(TestCaseWithFactory):
>                  ]
>              ),
>          )
> +
> +    def test_countExclusiveOptions_is_zero_if_none_set(self):
> +        # If none of the exclusive options is set, countExclusiveOptions
> +        # counts zero.
> +        self.assertEqual(0, self.getScript().countExclusiveOptions())
> +
> +    def test_countExclusiveOptions_counts_ppa(self):
> +        # countExclusiveOptions includes the "ppa" option.
> +        self.assertEqual(
> +            1, self.getScript(test_args=["--ppa"]).countExclusiveOptions()
> +        )
> +
> +    def test_countExclusiveOptions_counts_copy_archives(self):
> +        # countExclusiveOptions includes the "copy-archive" option.
> +        self.assertEqual(
> +            1,
> +            self.getScript(
> +                test_args=["--copy-archives"]
> +            ).countExclusiveOptions(),
> +        )
> +
> +    def test_countExclusiveOptions_counts_archive(self):
> +        # countExclusiveOptions includes the "copy-archive" option.
> +        self.assertEqual(
> +            1, self.getScript(test_args=["--archive"]).countExclusiveOptions()
> +        )
> +
> +    def test_lockfilename_option_overrides_default_lock(self):
> +        script = self.getScript(test_args=["--lockfilename", "foo.lock"])
> +        assert script.lockfilepath == "/var/lock/foo.lock"

Is there a reason for switching from the unittest assert* methods to a plain `assert` here and in a few other places further down in this diff?

It is better not to hard-code `/var/lock/` since that is controlled by `LOCK_PATH` in `lib/lp/services/scripts/base.py`. Instead you can just the last segment of the path and assert on the file name or use `LOCK_PATH` to construct the non-hardcoded version of the full lock file path. The same feedback applies for all the subsequent instances of hardcoded `/var/lock`.

> +
> +    def test_default_lock(self):
> +        script = self.getScript()
> +        assert script.lockfilepath == "/var/lock/%s" % GLOBAL_PUBLISHER_LOCK

Can we use f-strings instead of the legacy `%`-style formatting?

> +
> +    def test_getTargetArchives_ignores_excluded_archives_for_ppa(self):
> +        # If the selected exclusive option is "ppa," getTargetArchives
> +        # leaves out excluded PPAs.
> +        distroseries = self.factory.makeDistroSeries(distribution=self.distro)
> +
> +        ppa = self.factory.makeArchive(self.distro, purpose=ArchivePurpose.PPA)
> +        excluded_ppa_1 = self.factory.makeArchive(
> +            self.distro, purpose=ArchivePurpose.PPA
> +        )
> +        excluded_ppa_2 = self.factory.makeArchive(
> +            self.distro, purpose=ArchivePurpose.PPA, private=True
> +        )
> +
> +        for archive in [ppa, excluded_ppa_1, excluded_ppa_2]:
> +            self.createWaitingAcceptancePackage(
> +                archive=archive, distroseries=distroseries
> +            )
> +        script = self.getScript(
> +            test_args=[
> +                "--ppa",
> +                "--exclude",
> +                excluded_ppa_1.reference,
> +                "--exclude",
> +                excluded_ppa_2.reference,
> +            ],
> +        )
> +        self.assertContentEqual([ppa], script.getTargetArchives(self.distro))
> +
> +    def test_getTargetArchives_gets_specific_archives(self):
> +        # If the selected exclusive option is "archive,"
> +        # getTargetArchives looks for the specified archives.
> +
> +        distroseries = self.factory.makeDistroSeries(distribution=self.distro)
> +
> +        ppa1 = self.factory.makeArchive(
> +            self.distro, purpose=ArchivePurpose.PPA
> +        )
> +        ppa2 = self.factory.makeArchive(
> +            self.distro, purpose=ArchivePurpose.PPA, private=True
> +        )
> +        ppa3 = self.factory.makeArchive(
> +            self.distro, purpose=ArchivePurpose.PPA
> +        )
> +        ppa4 = self.factory.makeArchive(
> +            self.distro, purpose=ArchivePurpose.PPA, private=True
> +        )
> +
> +        # create another random archive in the same distro
> +        self.factory.makeArchive(self.distro, purpose=ArchivePurpose.PPA)
> +
> +        for archive in [ppa1, ppa2, ppa3, ppa4]:
> +            self.createWaitingAcceptancePackage(
> +                archive=archive, distroseries=distroseries
> +            )
> +
> +        script = self.getScript(
> +            test_args=[
> +                "--archive",
> +                ppa1.reference,
> +                "--archive",
> +                ppa2.reference,
> +            ],
> +        )
> +        self.assertContentEqual(
> +            [ppa1, ppa2], script.getTargetArchives(self.distro)
> +        )
> diff --git a/lib/lp/archivepublisher/tests/test_publishdistro.py b/lib/lp/archivepublisher/tests/test_publishdistro.py
> index ad591fa..46e0d24 100644
> --- a/lib/lp/archivepublisher/tests/test_publishdistro.py
> +++ b/lib/lp/archivepublisher/tests/test_publishdistro.py
> @@ -1016,6 +1116,12 @@ class TestPublishDistroMethods(TestCaseWithFactory):
>              1, self.makeScript(args=["--copy-archive"]).countExclusiveOptions()
>          )
>  
> +    def test_countExclusiveOptions_counts_archive(self):

Can you rename the test to mention "copy archive"?

> +        # countExclusiveOptions includes the "copy-archive" option.
> +        self.assertEqual(
> +            1, self.makeScript(args=["--archive"]).countExclusiveOptions()
> +        )
> +
>      def test_countExclusiveOptions_detects_conflict(self):
>          # If more than one of the exclusive options has been set, that
>          # raises the result from countExclusiveOptions above 1.


-- 
https://code.launchpad.net/~tushar5526/launchpad/+git/launchpad/+merge/480435
Your team Launchpad code reviewers is requested to review the proposed merge of ~tushar5526/launchpad:add-support-for-parallel-publishing into launchpad:master.
References

[Merge] ~tushar5526/launchpad:add-support-for-parallel-publishing into launchpad:master
From: Tushar Gupta, 2025-02-03