Re: [Merge] lp:~wgrant/launchpad/update-pkgcache-weight-loss into lp:launchpad

Diff comments:

> 
> === modified file 'lib/lp/soyuz/model/distributionsourcepackagecache.py'
> --- lib/lp/soyuz/model/distributionsourcepackagecache.py	2016-05-17 13:35:03 +0000
> +++ lib/lp/soyuz/model/distributionsourcepackagecache.py	2016-05-18 08:41:52 +0000
> @@ -57,6 +60,18 @@
>              self.sourcepackagename)
>  
>      @classmethod
> +    def findCurrentSourcePackageNames(cls, archive):
> +        spn_ids = IStore(SourcePackagePublishingHistory).find(
> +            SourcePackagePublishingHistory.sourcepackagenameID,
> +            SourcePackagePublishingHistory.archive == archive,
> +            SourcePackagePublishingHistory.status.is_in((
> +                PackagePublishingStatus.PENDING,
> +                PackagePublishingStatus.PUBLISHED))).config(
> +                    distinct=True)
> +        return list(sorted(
> +            bulk.load(SourcePackageName, spn_ids), key=attrgetter('name')))

Sorting a few tens of thousands of items in Python isn't terrible, but fair point; the sort could just as easily be pushed down into the database.
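
Something like this untested sketch, reusing the query's existing idioms, would let postgres handle both the DISTINCT and the sort:

    # Selecting SourcePackageName directly means the database sorts
    # by name and we skip the bulk.load/attrgetter step entirely.
    spns = list(IStore(SourcePackagePublishingHistory).find(
        SourcePackageName,
        SourcePackageName.id ==
            SourcePackagePublishingHistory.sourcepackagenameID,
        SourcePackagePublishingHistory.archive == archive,
        SourcePackagePublishingHistory.status.is_in((
            PackagePublishingStatus.PENDING,
            PackagePublishingStatus.PUBLISHED))
        ).config(distinct=True).order_by(SourcePackageName.name))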

> +
> +    @classmethod
>      def _find(cls, distro, archive=None):
>          """The set of all source package info caches for this distribution.
>  
> @@ -115,78 +119,103 @@
>                  cache.destroySelf()
>  
>      @classmethod
> -    def _update(cls, distro, sourcepackagename, archive, log):
> -        """Update cached source package details.
> -
> -        Update cache details for a given ISourcePackageName, including
> -        generated binarypackage names, summary and description fti.
> +    def _update(cls, distro, sourcepackagenames, archive, log):
> +        """Update the package cache for a given set of `ISourcePackageName`s.
> +
> +        Cached details include generated binarypackage names, summary
> +        and description fti.
> +
>          'log' is required and only prints debug level information.
>          """
>  
>          # Get the set of published sourcepackage releases.
> -        sprs = list(SourcePackageRelease.select("""
> -            SourcePackageRelease.id =
> -                SourcePackagePublishingHistory.sourcepackagerelease AND
> -            SourcePackagePublishingHistory.sourcepackagename = %s AND
> -            SourcePackagePublishingHistory.distroseries =
> -                DistroSeries.id AND
> -            DistroSeries.distribution = %s AND
> -            SourcePackagePublishingHistory.archive = %s AND
> -            SourcePackagePublishingHistory.dateremoved is NULL
> -            """ % sqlvalues(sourcepackagename, distro, archive),
> -            orderBy='id',
> -            clauseTables=['SourcePackagePublishingHistory', 'DistroSeries'],
> -            distinct=True))
> -
> -        if len(sprs) == 0:
> +        all_sprs = list(IStore(SourcePackageRelease).find(
> +            (SourcePackageRelease.sourcepackagenameID,
> +             SourcePackageRelease.id, SourcePackageRelease.version),
> +            SourcePackageRelease.id ==
> +                SourcePackagePublishingHistory.sourcepackagereleaseID,
> +            SourcePackagePublishingHistory.sourcepackagenameID.is_in(
> +                [spn.id for spn in sourcepackagenames]),
> +            SourcePackagePublishingHistory.archive == archive,
> +            SourcePackagePublishingHistory.status.is_in((
> +                PackagePublishingStatus.PENDING,
> +                PackagePublishingStatus.PUBLISHED))
> +            ).config(distinct=True).order_by(SourcePackageRelease.id))
> +        if len(all_sprs) == 0:
>              log.debug("No sources releases found.")
>              return
>  
> -        # Find or create the cache entry.
> -        cache = DistributionSourcePackageCache.selectOne("""
> -            distribution = %s AND
> -            archive = %s AND
> -            sourcepackagename = %s
> -            """ % sqlvalues(distro, archive, sourcepackagename))
> -        if cache is None:
> -            log.debug("Creating new source cache entry.")
> -            cache = DistributionSourcePackageCache(
> -                archive=archive,
> -                distribution=distro,
> -                sourcepackagename=sourcepackagename)
> -
> -        # Make sure the name is correct.
> -        cache.name = sourcepackagename.name
> -
> -        # Get the sets of binary package names, summaries, descriptions.
> -
> -        # XXX Julian 2007-04-03:
> -        # This bit of code needs fixing up, it is doing stuff that
> -        # really needs to be done in SQL, such as sorting and uniqueness.
> -        # This would also improve the performance.
> -        binpkgnames = set()
> -        binpkgsummaries = set()
> -        binpkgdescriptions = set()
> -        for spr in sprs:
> -            log.debug("Considering source version %s" % spr.version)
> -        binpkgs = IStore(BinaryPackageRelease).find(
> -            (BinaryPackageName.name, BinaryPackageRelease.summary,
> -             BinaryPackageRelease.description),
> -            BinaryPackageRelease.buildID == BinaryPackageBuild.id,
> +        spr_map = {}
> +        for spn_id, spr_id, spr_version in all_sprs:
> +            spn = IStore(SourcePackageName).get(SourcePackageName, spn_id)
> +            spr_map.setdefault(spn, [])
> +            spr_map[spn].append((spr_id, spr_version))
> +
> +        all_caches = IStore(cls).find(
> +            cls, cls.distribution == distro, cls.archive == archive,
> +            cls.sourcepackagenameID.is_in(
> +                [spn.id for spn in sourcepackagenames]))
> +        cache_map = {cache.sourcepackagename: cache for cache in all_caches}
> +
> +        for spn in set(sourcepackagenames) - set(cache_map.keys()):
> +            cache_map[spn] = cls(
> +                archive=archive, distribution=distro,
> +                sourcepackagename=spn)
> +
> +        # Query BinaryPackageBuilds and their BinaryPackageReleases
> +        # separately, since the big and inconsistent intermediates can
> +        # confuse postgres into a seq scan over BPR, which never ends
> +        # well for anybody.
> +        #
> +        # Beware: the sets expand much faster than you might expect for
> +        # the primary archive; COPY archive builds are caught too, of
> +        # which there are dozens for most SPRs, and there's no easy way
> +        # to exclude them!

I considered a purpose filter, but it seemed weird and I wanted to get this landed without working out those implications: it's technically possible for a binary that's in the primary archive to have originated in a copy archive, though hopefully nobody has tried that.

It's not effective to constrain Archive.purpose directly, as that just tries to replace a big join with one that's almost as big. But we could identify the list of Archive IDs that are copy archives and explicitly exclude those BPBs.
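
If it ever becomes a problem, a rough sketch of that approach (untested; assumes BinaryPackageBuild.archive_id, storm.expr.Not and lp.soyuz.enums.ArchivePurpose):

    # Find the (hopefully small) set of copy archives once, then
    # exclude their builds by ID instead of joining on Archive.purpose.
    copy_archive_ids = list(IStore(Archive).find(
        Archive.id, Archive.purpose == ArchivePurpose.COPY))
    all_builds = list(IStore(BinaryPackageBuild).find(
        (BinaryPackageBuild.source_package_release_id,
         BinaryPackageBuild.id),
        BinaryPackageBuild.source_package_release_id.is_in(
            [row[1] for row in all_sprs]),
        Not(BinaryPackageBuild.archive_id.is_in(copy_archive_ids))))

That keeps the scan keyed on source_package_release_id and just filters out the copy-archive rows afterwards.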

> +        all_builds = list(IStore(BinaryPackageBuild).find(
> +            (BinaryPackageBuild.source_package_release_id,
> +             BinaryPackageBuild.id),
>              BinaryPackageBuild.source_package_release_id.is_in(
> -                [spr.id for spr in sprs]),
> -            BinaryPackageName.id == BinaryPackageRelease.binarypackagenameID)
> -        for name, summary, description in binpkgs:
> -            binpkgnames.add(name)
> -            binpkgsummaries.add(summary)
> -            binpkgdescriptions.add(description)
> -
> -        # Update the caches.
> -        cache.binpkgnames = ' '.join(sorted(binpkgnames))
> -        cache.binpkgsummaries = ' '.join(sorted(binpkgsummaries))
> -        cache.binpkgdescriptions = ' '.join(sorted(binpkgdescriptions))
> -        # Column due for deletion.
> -        cache.changelog = None
> +                [row[1] for row in all_sprs])))
> +        all_binaries = list(IStore(BinaryPackageRelease).find(
> +            (BinaryPackageRelease.buildID,
> +             BinaryPackageRelease.binarypackagenameID,
> +             BinaryPackageRelease.summary, BinaryPackageRelease.description),
> +            BinaryPackageRelease.buildID.is_in(
> +                [row[1] for row in all_builds])))
> +        sprs_by_build = {build_id: spr_id for spr_id, build_id in all_builds}
> +
> +        bulk.load(BinaryPackageName, [row[1] for row in all_binaries])
> +        binaries_by_spr = {}
> +        for bpb_id, bpn_id, summary, description in all_binaries:
> +            spr_id = sprs_by_build[bpb_id]
> +            binaries_by_spr.setdefault(spr_id, [])
> +            binaries_by_spr[spr_id].append((
> +                IStore(BinaryPackageName).get(BinaryPackageName, bpn_id),
> +                summary, description))
> +
> +        for spn in sourcepackagenames:
> +            cache = cache_map[spn]
> +            cache.name = spn.name
> +
> +            sprs = spr_map.get(spn, [])
> +
> +            binpkgnames = set()
> +            binpkgsummaries = set()
> +            binpkgdescriptions = set()
> +            for spr_id, spr_version in sprs:
> +                log.debug("Considering source %s %s", spn.name, spr_version)
> +                binpkgs = binaries_by_spr.get(spr_id, [])
> +                for bpn, summary, description in binpkgs:
> +                    binpkgnames.add(bpn.name)
> +                    binpkgsummaries.add(summary)
> +                    binpkgdescriptions.add(description)
> +
> +            # Update the caches.
> +            cache.binpkgnames = ' '.join(sorted(binpkgnames))
> +            cache.binpkgsummaries = ' '.join(sorted(binpkgsummaries))
> +            cache.binpkgdescriptions = ' '.join(sorted(binpkgdescriptions))
> +            # Column due for deletion.
> +            cache.changelog = None
>  
>      @classmethod
>      def updateAll(cls, distro, archive, log, ztm, commit_chunk=500):
> 
> === modified file 'lib/lp/soyuz/model/distroseriespackagecache.py'
> --- lib/lp/soyuz/model/distroseriespackagecache.py	2015-07-08 16:05:11 +0000
> +++ lib/lp/soyuz/model/distroseriespackagecache.py	2016-05-18 08:41:52 +0000
> @@ -117,58 +121,70 @@
>          (in full batches of 100 elements)
>          """
>          # get the set of published binarypackagereleases
> -        details = list(IStore(BinaryPackageRelease).find(
> -            (BinaryPackageRelease.summary, BinaryPackageRelease.description,
> +        all_details = list(IStore(BinaryPackageRelease).find(
> +            (BinaryPackageRelease.binarypackagenameID,
> +             BinaryPackageRelease.summary, BinaryPackageRelease.description,
>               Max(BinaryPackageRelease.datecreated)),
>              BinaryPackageRelease.id ==
>                  BinaryPackagePublishingHistory.binarypackagereleaseID,
> -            BinaryPackagePublishingHistory.binarypackagename ==
> -                binarypackagename,
> -            BinaryPackagePublishingHistory.distroarchseriesID ==
> -                DistroArchSeries.id,
> -            DistroArchSeries.distroseries == distroseries,
> +            BinaryPackagePublishingHistory.binarypackagenameID.is_in(
> +                [bpn.id for bpn in binarypackagenames]),
> +            BinaryPackagePublishingHistory.distroarchseriesID.is_in(
> +                Select(
> +                    DistroArchSeries.id, tables=[DistroArchSeries],
> +                    where=DistroArchSeries.distroseries == distroseries)),
>              BinaryPackagePublishingHistory.archive == archive,
> -            BinaryPackagePublishingHistory.dateremoved == None
> +            BinaryPackagePublishingHistory.status.is_in((
> +                PackagePublishingStatus.PENDING,
> +                PackagePublishingStatus.PUBLISHED))
>              ).group_by(
> +                BinaryPackageRelease.binarypackagenameID,
>                  BinaryPackageRelease.summary,
>                  BinaryPackageRelease.description
>              ).order_by(
> +                BinaryPackageRelease.binarypackagenameID,
>                  Desc(Max(BinaryPackageRelease.datecreated))))
> -
> -        if not details:
> +        if not all_details:
>              log.debug("No binary releases found.")
>              return
>  
> -        # find or create the cache entry
> -        cache = cls.selectOne("""
> -            distroseries = %s AND
> -            archive = %s AND
> -            binarypackagename = %s
> -            """ % sqlvalues(distroseries, archive, binarypackagename))
> -        if cache is None:
> -            log.debug("Creating new binary cache entry.")
> -            cache = cls(
> -                archive=archive,
> -                distroseries=distroseries,
> -                binarypackagename=binarypackagename)
> -
> -        # make sure the cached name, summary and description are correct
> -        cache.name = binarypackagename.name
> -        cache.summary = details[0][0]
> -        cache.description = details[0][1]
> -
> -        # get the sets of binary package summaries, descriptions. there is
> -        # likely only one, but just in case...
> -
> -        summaries = set()
> -        descriptions = set()
> -        for summary, description, datecreated in details:
> -            summaries.add(summary)
> -            descriptions.add(description)
> -
> -        # and update the caches
> -        cache.summaries = ' '.join(sorted(summaries))
> -        cache.descriptions = ' '.join(sorted(descriptions))
> +        details_map = {}
> +        for (bpn_id, summary, description, datecreated) in all_details:
> +            bpn = IStore(BinaryPackageName).get(BinaryPackageName, bpn_id)
> +            details_map.setdefault(bpn, [])
> +            details_map[bpn].append((summary, description))
> +
> +        all_caches = IStore(cls).find(
> +            cls, cls.distroseries == distroseries, cls.archive == archive,
> +            cls.binarypackagenameID.is_in(
> +                [bpn.id for bpn in binarypackagenames]))
> +        cache_map = {cache.binarypackagename: cache for cache in all_caches}
> +
> +        for bpn in set(binarypackagenames) - set(cache_map.keys()):

Derp de derp.

> +            cache_map[bpn] = cls(
> +                archive=archive, distroseries=distroseries,
> +                binarypackagename=bpn)
> +
> +        for bpn in binarypackagenames:
> +            cache = cache_map[bpn]
> +            details = details_map[bpn]
> +            # make sure the cached name, summary and description are correct
> +            cache.name = bpn.name
> +            cache.summary = details[0][0]
> +            cache.description = details[0][1]
> +
> +            # get the sets of binary package summaries, descriptions. there is
> +            # likely only one, but just in case...
> +
> +            summaries = set()
> +            descriptions = set()
> +            for summary, description in details:
> +                summaries.add(summary)
> +                descriptions.add(description)
> +
> +            # and update the caches
> +            cache.summaries = ' '.join(sorted(summaries))
> +            cache.descriptions = ' '.join(sorted(descriptions))
>  
>      @classmethod
>      def updateAll(cls, distroseries, archive, log, ztm, commit_chunk=500):


-- 
https://code.launchpad.net/~wgrant/launchpad/update-pkgcache-weight-loss/+merge/295019
Your team Launchpad code reviewers is subscribed to branch lp:launchpad.