launchpad-reviewers team mailing list archive

Re: [Merge] lp:~wgrant/launchpad/update-pkgcache-weight-loss into lp:launchpad

 

Review: Approve



Diff comments:

> 
> === modified file 'lib/lp/soyuz/model/distributionsourcepackagecache.py'
> --- lib/lp/soyuz/model/distributionsourcepackagecache.py	2016-05-17 13:35:03 +0000
> +++ lib/lp/soyuz/model/distributionsourcepackagecache.py	2016-05-18 08:41:52 +0000
> @@ -57,6 +60,18 @@
>              self.sourcepackagename)
>  
>      @classmethod
> +    def findCurrentSourcePackageNames(cls, archive):
> +        spn_ids = IStore(SourcePackagePublishingHistory).find(
> +            SourcePackagePublishingHistory.sourcepackagenameID,
> +            SourcePackagePublishingHistory.archive == archive,
> +            SourcePackagePublishingHistory.status.is_in((
> +                PackagePublishingStatus.PENDING,
> +                PackagePublishingStatus.PUBLISHED))).config(
> +                    distinct=True)
> +        return list(sorted(
> +            bulk.load(SourcePackageName, spn_ids), key=attrgetter('name')))

One of the callers puts the output straight into a set, so sorting is unnecessary work there.  Maybe pull the sorted() up to the caller that cares about it?
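
Untested sketch of what I mean, with a hypothetical caller:

    # In findCurrentSourcePackageNames, return the names unsorted:
    return bulk.load(SourcePackageName, spn_ids)

    # ... and only the caller that needs an ordering sorts (reusing
    # the existing attrgetter import):
    spns = sorted(
        DistributionSourcePackageCache.findCurrentSourcePackageNames(
            archive),
        key=attrgetter('name'))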

> +
> +    @classmethod
>      def _find(cls, distro, archive=None):
>          """The set of all source package info caches for this distribution.
>  
> @@ -115,78 +119,103 @@
>                  cache.destroySelf()
>  
>      @classmethod
> -    def _update(cls, distro, sourcepackagename, archive, log):
> -        """Update cached source package details.
> -
> -        Update cache details for a given ISourcePackageName, including
> -        generated binarypackage names, summary and description fti.
> +    def _update(cls, distro, sourcepackagenames, archive, log):
> +        """Update the package cache for a given set of `ISourcePackageName`s.
> +
> +        Cached details include generated binarypackage names, summary
> +        and description fti.
> +
>          'log' is required and only prints debug level information.
>          """
>  
>          # Get the set of published sourcepackage releases.
> -        sprs = list(SourcePackageRelease.select("""
> -            SourcePackageRelease.id =
> -                SourcePackagePublishingHistory.sourcepackagerelease AND
> -            SourcePackagePublishingHistory.sourcepackagename = %s AND
> -            SourcePackagePublishingHistory.distroseries =
> -                DistroSeries.id AND
> -            DistroSeries.distribution = %s AND
> -            SourcePackagePublishingHistory.archive = %s AND
> -            SourcePackagePublishingHistory.dateremoved is NULL
> -            """ % sqlvalues(sourcepackagename, distro, archive),
> -            orderBy='id',
> -            clauseTables=['SourcePackagePublishingHistory', 'DistroSeries'],
> -            distinct=True))
> -
> -        if len(sprs) == 0:
> +        all_sprs = list(IStore(SourcePackageRelease).find(
> +            (SourcePackageRelease.sourcepackagenameID,
> +             SourcePackageRelease.id, SourcePackageRelease.version),
> +            SourcePackageRelease.id ==
> +                SourcePackagePublishingHistory.sourcepackagereleaseID,
> +            SourcePackagePublishingHistory.sourcepackagenameID.is_in(
> +                [spn.id for spn in sourcepackagenames]),
> +            SourcePackagePublishingHistory.archive == archive,
> +            SourcePackagePublishingHistory.status.is_in((
> +                PackagePublishingStatus.PENDING,
> +                PackagePublishingStatus.PUBLISHED))
> +            ).config(distinct=True).order_by(SourcePackageRelease.id))

It took me a while to realise why it's OK to remove the distribution check here: namely, because it's implied by the archive.  Perhaps worth a comment?
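
Even a one-liner would do, e.g.:

    # No distribution filter needed: an archive belongs to exactly
    # one distribution, so the archive check implies it.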

> +        if len(all_sprs) == 0:
>              log.debug("No sources releases found.")
>              return
>  
> -        # Find or create the cache entry.
> -        cache = DistributionSourcePackageCache.selectOne("""
> -            distribution = %s AND
> -            archive = %s AND
> -            sourcepackagename = %s
> -            """ % sqlvalues(distro, archive, sourcepackagename))
> -        if cache is None:
> -            log.debug("Creating new source cache entry.")
> -            cache = DistributionSourcePackageCache(
> -                archive=archive,
> -                distribution=distro,
> -                sourcepackagename=sourcepackagename)
> -
> -        # Make sure the name is correct.
> -        cache.name = sourcepackagename.name
> -
> -        # Get the sets of binary package names, summaries, descriptions.
> -
> -        # XXX Julian 2007-04-03:
> -        # This bit of code needs fixing up, it is doing stuff that
> -        # really needs to be done in SQL, such as sorting and uniqueness.
> -        # This would also improve the performance.
> -        binpkgnames = set()
> -        binpkgsummaries = set()
> -        binpkgdescriptions = set()
> -        for spr in sprs:
> -            log.debug("Considering source version %s" % spr.version)
> -        binpkgs = IStore(BinaryPackageRelease).find(
> -            (BinaryPackageName.name, BinaryPackageRelease.summary,
> -             BinaryPackageRelease.description),
> -            BinaryPackageRelease.buildID == BinaryPackageBuild.id,
> +        spr_map = {}
> +        for spn_id, spr_id, spr_version in all_sprs:
> +            spn = IStore(SourcePackageName).get(SourcePackageName, spn_id)
> +            spr_map.setdefault(spn, [])

This feels like a good application for "spr_map = defaultdict(list)".
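
That is (behaviour unchanged, untested):

    from collections import defaultdict

    spr_map = defaultdict(list)
    for spn_id, spr_id, spr_version in all_sprs:
        spn = IStore(SourcePackageName).get(SourcePackageName, spn_id)
        spr_map[spn].append((spr_id, spr_version))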

> +            spr_map[spn].append((spr_id, spr_version))
> +
> +        all_caches = IStore(cls).find(
> +            cls, cls.distribution == distro, cls.archive == archive,
> +            cls.sourcepackagenameID.is_in(
> +                [spn.id for spn in sourcepackagenames]))
> +        cache_map = {cache.sourcepackagename: cache for cache in all_caches}
> +
> +        for spn in set(sourcepackagenames) - set(cache_map.keys()):
> +            cache_map[spn] = cls(
> +                archive=archive, distribution=distro,
> +                sourcepackagename=spn)
> +
> +        # Query BinaryPackageBuilds and their BinaryPackageReleases
> +        # separately, since the big and inconsistent intermediates can
> +        # confuse postgres into a seq scan over BPR, which never ends
> +        # well for anybody.
> +        #
> +        # Beware: the sets expand much faster than you might expect for
> +        # the primary archive; COPY archive builds are caught too, of
> +        # which there are dozens for most SPRs, and there's no easy way
> +        # to exclude them!

Can't we join on BinaryPackageBuild.archive and check the archive purpose?  Even if we can't deal with the general case due to copies between the primary archive and PPAs, we could at least reasonably say that COPY archive builds are never relevant to the package caches.
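
Rough, untested sketch of the extra filter (imports and attribute
names from memory, so treat as pseudocode):

    from lp.soyuz.enums import ArchivePurpose
    from lp.soyuz.model.archive import Archive

    all_builds = list(IStore(BinaryPackageBuild).find(
        (BinaryPackageBuild.source_package_release_id,
         BinaryPackageBuild.id),
        BinaryPackageBuild.source_package_release_id.is_in(
            [row[1] for row in all_sprs]),
        BinaryPackageBuild.archive_id == Archive.id,
        Archive.purpose != ArchivePurpose.COPY))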

> +        all_builds = list(IStore(BinaryPackageBuild).find(
> +            (BinaryPackageBuild.source_package_release_id,
> +             BinaryPackageBuild.id),
>              BinaryPackageBuild.source_package_release_id.is_in(
> -                [spr.id for spr in sprs]),
> -            BinaryPackageName.id == BinaryPackageRelease.binarypackagenameID)
> -        for name, summary, description in binpkgs:
> -            binpkgnames.add(name)
> -            binpkgsummaries.add(summary)
> -            binpkgdescriptions.add(description)
> -
> -        # Update the caches.
> -        cache.binpkgnames = ' '.join(sorted(binpkgnames))
> -        cache.binpkgsummaries = ' '.join(sorted(binpkgsummaries))
> -        cache.binpkgdescriptions = ' '.join(sorted(binpkgdescriptions))
> -        # Column due for deletion.
> -        cache.changelog = None
> +                [row[1] for row in all_sprs])))
> +        all_binaries = list(IStore(BinaryPackageRelease).find(
> +            (BinaryPackageRelease.buildID,
> +             BinaryPackageRelease.binarypackagenameID,
> +             BinaryPackageRelease.summary, BinaryPackageRelease.description),
> +            BinaryPackageRelease.buildID.is_in(
> +                [row[1] for row in all_builds])))
> +        sprs_by_build = {build_id: spr_id for spr_id, build_id in all_builds}
> +
> +        bulk.load(BinaryPackageName, [row[1] for row in all_binaries])
> +        binaries_by_spr = {}
> +        for bpb_id, bpn_id, summary, description in all_binaries:
> +            spr_id = sprs_by_build[bpb_id]
> +            binaries_by_spr.setdefault(spr_id, [])
> +            binaries_by_spr[spr_id].append((
> +                IStore(BinaryPackageName).get(BinaryPackageName, bpn_id),
> +                summary, description))
> +
> +        for spn in sourcepackagenames:
> +            cache = cache_map[spn]
> +            cache.name = spn.name
> +
> +            sprs = spr_map.get(spn, [])
> +
> +            binpkgnames = set()
> +            binpkgsummaries = set()
> +            binpkgdescriptions = set()
> +            for spr_id, spr_version in sprs:
> +                log.debug("Considering source %s %s", spn.name, spr_version)
> +                binpkgs = binaries_by_spr.get(spr_id, [])
> +                for bpn, summary, description in binpkgs:
> +                    binpkgnames.add(bpn.name)
> +                    binpkgsummaries.add(summary)
> +                    binpkgdescriptions.add(description)
> +
> +            # Update the caches.
> +            cache.binpkgnames = ' '.join(sorted(binpkgnames))
> +            cache.binpkgsummaries = ' '.join(sorted(binpkgsummaries))
> +            cache.binpkgdescriptions = ' '.join(sorted(binpkgdescriptions))
> +            # Column due for deletion.
> +            cache.changelog = None
>  
>      @classmethod
>      def updateAll(cls, distro, archive, log, ztm, commit_chunk=500):
> 
> === modified file 'lib/lp/soyuz/model/distroseriespackagecache.py'
> --- lib/lp/soyuz/model/distroseriespackagecache.py	2015-07-08 16:05:11 +0000
> +++ lib/lp/soyuz/model/distroseriespackagecache.py	2016-05-18 08:41:52 +0000
> @@ -50,6 +52,22 @@
>      descriptions = StringCol(notNull=False, default=None)
>  
>      @classmethod
> +    def findCurrentBinaryPackageNames(cls, archive, distroseries):
> +        bpn_ids = IStore(BinaryPackagePublishingHistory).find(
> +            BinaryPackagePublishingHistory.binarypackagenameID,
> +            BinaryPackagePublishingHistory.distroarchseriesID.is_in(
> +                Select(
> +                    DistroArchSeries.id, tables=[DistroArchSeries],
> +                    where=DistroArchSeries.distroseries == distroseries)),
> +            BinaryPackagePublishingHistory.archive == archive,
> +            BinaryPackagePublishingHistory.status.is_in((
> +                PackagePublishingStatus.PENDING,
> +                PackagePublishingStatus.PUBLISHED))).config(
> +                    distinct=True)
> +        return list(sorted(
> +            bulk.load(BinaryPackageName, bpn_ids), key=attrgetter('name')))

Same comment on sorting.

> +
> +    @classmethod
>      def _find(cls, distroseries, archive=None):
>          """All of the cached binary package records for this distroseries.
>  
> @@ -117,58 +121,70 @@
>          (in full batches of 100 elements)
>          """
>          # get the set of published binarypackagereleases
> -        details = list(IStore(BinaryPackageRelease).find(
> -            (BinaryPackageRelease.summary, BinaryPackageRelease.description,
> +        all_details = list(IStore(BinaryPackageRelease).find(
> +            (BinaryPackageRelease.binarypackagenameID,
> +             BinaryPackageRelease.summary, BinaryPackageRelease.description,
>               Max(BinaryPackageRelease.datecreated)),
>              BinaryPackageRelease.id ==
>                  BinaryPackagePublishingHistory.binarypackagereleaseID,
> -            BinaryPackagePublishingHistory.binarypackagename ==
> -                binarypackagename,
> -            BinaryPackagePublishingHistory.distroarchseriesID ==
> -                DistroArchSeries.id,
> -            DistroArchSeries.distroseries == distroseries,
> +            BinaryPackagePublishingHistory.binarypackagenameID.is_in(
> +                [bpn.id for bpn in binarypackagenames]),
> +            BinaryPackagePublishingHistory.distroarchseriesID.is_in(
> +                Select(
> +                    DistroArchSeries.id, tables=[DistroArchSeries],
> +                    where=DistroArchSeries.distroseries == distroseries)),
>              BinaryPackagePublishingHistory.archive == archive,
> -            BinaryPackagePublishingHistory.dateremoved == None
> +            BinaryPackagePublishingHistory.status.is_in((
> +                PackagePublishingStatus.PENDING,
> +                PackagePublishingStatus.PUBLISHED))
>              ).group_by(
> +                BinaryPackageRelease.binarypackagenameID,
>                  BinaryPackageRelease.summary,
>                  BinaryPackageRelease.description
>              ).order_by(
> +                BinaryPackageRelease.binarypackagenameID,
>                  Desc(Max(BinaryPackageRelease.datecreated))))
> -
> -        if not details:
> +        if not all_details:
>              log.debug("No binary releases found.")
>              return
>  
> -        # find or create the cache entry
> -        cache = cls.selectOne("""
> -            distroseries = %s AND
> -            archive = %s AND
> -            binarypackagename = %s
> -            """ % sqlvalues(distroseries, archive, binarypackagename))
> -        if cache is None:
> -            log.debug("Creating new binary cache entry.")
> -            cache = cls(
> -                archive=archive,
> -                distroseries=distroseries,
> -                binarypackagename=binarypackagename)
> -
> -        # make sure the cached name, summary and description are correct
> -        cache.name = binarypackagename.name
> -        cache.summary = details[0][0]
> -        cache.description = details[0][1]
> -
> -        # get the sets of binary package summaries, descriptions. there is
> -        # likely only one, but just in case...
> -
> -        summaries = set()
> -        descriptions = set()
> -        for summary, description, datecreated in details:
> -            summaries.add(summary)
> -            descriptions.add(description)
> -
> -        # and update the caches
> -        cache.summaries = ' '.join(sorted(summaries))
> -        cache.descriptions = ' '.join(sorted(descriptions))
> +        details_map = {}
> +        for (bpn_id, summary, description, datecreated) in all_details:
> +            bpn = IStore(BinaryPackageName).get(BinaryPackageName, bpn_id)
> +            details_map.setdefault(bpn, [])

Same suggestion about defaultdict.

> +            details_map[bpn].append((summary, description))
> +
> +        all_caches = IStore(cls).find(
> +            cls, cls.distroseries == distroseries, cls.archive == archive,
> +            cls.binarypackagenameID.is_in(
> +                [bpn.id for bpn in binarypackagenames]))
> +        cache_map = {cache.binarypackagename: cache for cache in all_caches}
> +
> +        for bpn in set(binarypackagenames) - set(cache_map.keys()):

".keys()" is unnecessary.
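
i.e. just:

    for bpn in set(binarypackagenames) - set(cache_map):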

> +            cache_map[bpn] = cls(
> +                archive=archive, distroseries=distroseries,
> +                binarypackagename=bpn)
> +
> +        for bpn in binarypackagenames:
> +            cache = cache_map[bpn]
> +            details = details_map[bpn]
> +            # make sure the cached name, summary and description are correct
> +            cache.name = bpn.name
> +            cache.summary = details[0][0]
> +            cache.description = details[0][1]
> +
> +            # get the sets of binary package summaries, descriptions. there is
> +            # likely only one, but just in case...
> +
> +            summaries = set()
> +            descriptions = set()
> +            for summary, description in details:
> +                summaries.add(summary)
> +                descriptions.add(description)
> +
> +            # and update the caches
> +            cache.summaries = ' '.join(sorted(summaries))
> +            cache.descriptions = ' '.join(sorted(descriptions))
>  
>      @classmethod
>      def updateAll(cls, distroseries, archive, log, ztm, commit_chunk=500):


-- 
https://code.launchpad.net/~wgrant/launchpad/update-pkgcache-weight-loss/+merge/295019
Your team Launchpad code reviewers is subscribed to branch lp:launchpad.

