Re: [Merge] lp:~wgrant/launchpad/update-pkgcache-weight-loss into lp:launchpad
Diff comments:
>
> === modified file 'lib/lp/soyuz/model/distributionsourcepackagecache.py'
> --- lib/lp/soyuz/model/distributionsourcepackagecache.py 2016-05-17 13:35:03 +0000
> +++ lib/lp/soyuz/model/distributionsourcepackagecache.py 2016-05-18 08:41:52 +0000
> @@ -57,6 +60,18 @@
> self.sourcepackagename)
>
> @classmethod
> + def findCurrentSourcePackageNames(cls, archive):
> + spn_ids = IStore(SourcePackagePublishingHistory).find(
> + SourcePackagePublishingHistory.sourcepackagenameID,
> + SourcePackagePublishingHistory.archive == archive,
> + SourcePackagePublishingHistory.status.is_in((
> + PackagePublishingStatus.PENDING,
> + PackagePublishingStatus.PUBLISHED))).config(
> + distinct=True)
> + return list(sorted(
> + bulk.load(SourcePackageName, spn_ids), key=attrgetter('name')))
Sorting a few tens of thousands of items isn't terrible, but fair point.
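For the record, the sort (and the DISTINCT) could also be pushed into the database by joining SourcePackageName directly. A rough, untested sketch, reusing the module's existing imports:

    @classmethod
    def findCurrentSourcePackageNames(cls, archive):
        # Join SourcePackageName so Postgres does the DISTINCT and
        # the sort, rather than bulk-loading and sorting in Python.
        return list(IStore(SourcePackageName).find(
            SourcePackageName,
            SourcePackageName.id ==
                SourcePackagePublishingHistory.sourcepackagenameID,
            SourcePackagePublishingHistory.archive == archive,
            SourcePackagePublishingHistory.status.is_in((
                PackagePublishingStatus.PENDING,
                PackagePublishingStatus.PUBLISHED))
            ).config(distinct=True).order_by(SourcePackageName.name))

Not worth blocking the branch on either way.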
> +
> + @classmethod
> def _find(cls, distro, archive=None):
> """The set of all source package info caches for this distribution.
>
> @@ -115,78 +119,103 @@
> cache.destroySelf()
>
> @classmethod
> - def _update(cls, distro, sourcepackagename, archive, log):
> - """Update cached source package details.
> -
> - Update cache details for a given ISourcePackageName, including
> - generated binarypackage names, summary and description fti.
> + def _update(cls, distro, sourcepackagenames, archive, log):
> + """Update the package cache for a given set of `ISourcePackageName`s.
> +
> + Cached details include generated binarypackage names, summary
> + and description fti.
> +
> 'log' is required and only prints debug level information.
> """
>
> # Get the set of published sourcepackage releases.
> - sprs = list(SourcePackageRelease.select("""
> - SourcePackageRelease.id =
> - SourcePackagePublishingHistory.sourcepackagerelease AND
> - SourcePackagePublishingHistory.sourcepackagename = %s AND
> - SourcePackagePublishingHistory.distroseries =
> - DistroSeries.id AND
> - DistroSeries.distribution = %s AND
> - SourcePackagePublishingHistory.archive = %s AND
> - SourcePackagePublishingHistory.dateremoved is NULL
> - """ % sqlvalues(sourcepackagename, distro, archive),
> - orderBy='id',
> - clauseTables=['SourcePackagePublishingHistory', 'DistroSeries'],
> - distinct=True))
> -
> - if len(sprs) == 0:
> + all_sprs = list(IStore(SourcePackageRelease).find(
> + (SourcePackageRelease.sourcepackagenameID,
> + SourcePackageRelease.id, SourcePackageRelease.version),
> + SourcePackageRelease.id ==
> + SourcePackagePublishingHistory.sourcepackagereleaseID,
> + SourcePackagePublishingHistory.sourcepackagenameID.is_in(
> + [spn.id for spn in sourcepackagenames]),
> + SourcePackagePublishingHistory.archive == archive,
> + SourcePackagePublishingHistory.status.is_in((
> + PackagePublishingStatus.PENDING,
> + PackagePublishingStatus.PUBLISHED))
> + ).config(distinct=True).order_by(SourcePackageRelease.id))
> + if len(all_sprs) == 0:
> log.debug("No sources releases found.")
> return
>
> - # Find or create the cache entry.
> - cache = DistributionSourcePackageCache.selectOne("""
> - distribution = %s AND
> - archive = %s AND
> - sourcepackagename = %s
> - """ % sqlvalues(distro, archive, sourcepackagename))
> - if cache is None:
> - log.debug("Creating new source cache entry.")
> - cache = DistributionSourcePackageCache(
> - archive=archive,
> - distribution=distro,
> - sourcepackagename=sourcepackagename)
> -
> - # Make sure the name is correct.
> - cache.name = sourcepackagename.name
> -
> - # Get the sets of binary package names, summaries, descriptions.
> -
> - # XXX Julian 2007-04-03:
> - # This bit of code needs fixing up, it is doing stuff that
> - # really needs to be done in SQL, such as sorting and uniqueness.
> - # This would also improve the performance.
> - binpkgnames = set()
> - binpkgsummaries = set()
> - binpkgdescriptions = set()
> - for spr in sprs:
> - log.debug("Considering source version %s" % spr.version)
> - binpkgs = IStore(BinaryPackageRelease).find(
> - (BinaryPackageName.name, BinaryPackageRelease.summary,
> - BinaryPackageRelease.description),
> - BinaryPackageRelease.buildID == BinaryPackageBuild.id,
> + spr_map = {}
> + for spn_id, spr_id, spr_version in all_sprs:
> + spn = IStore(SourcePackageName).get(SourcePackageName, spn_id)
> + spr_map.setdefault(spn, [])
> + spr_map[spn].append((spr_id, spr_version))
> +
> + all_caches = IStore(cls).find(
> + cls, cls.distribution == distro, cls.archive == archive,
> + cls.sourcepackagenameID.is_in(
> + [spn.id for spn in sourcepackagenames]))
> + cache_map = {cache.sourcepackagename: cache for cache in all_caches}
> +
> + for spn in set(sourcepackagenames) - set(cache_map.keys()):
> + cache_map[spn] = cls(
> + archive=archive, distribution=distro,
> + sourcepackagename=spn)
> +
> + # Query BinaryPackageBuilds and their BinaryPackageReleases
> + # separately, since the big and inconsistent intermediates can
> + # confuse postgres into a seq scan over BPR, which never ends
> + # well for anybody.
> + #
> + # Beware: the sets expand much faster than you might expect for
> + # the primary archive; COPY archive builds are caught too, of
> + # which there are dozens for most SPRs, and there's no easy way
> + # to exclude them!
I considered a purpose filter, but it seemed weird and I wanted to get this landed without working out the implications. It's technically possible for a binary that's in the primary archive to have originated in a copy archive, though hopefully nobody has tried that.
It's not effective to constrain Archive.purpose directly, as that just tries to replace a big join with one that's almost as big. But we could identify the list of Archive IDs that are copy archives and explicitly exclude those BPBs.
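Something along these lines, perhaps (untested sketch; assumes storm.expr.Not, a BinaryPackageBuild.archive_id column, and the usual Archive/ArchivePurpose imports):

    # Precompute the (small) set of copy archive IDs, then exclude
    # builds done in those archives from the BPB query.
    copy_archive_ids = list(IStore(Archive).find(
        Archive.id, Archive.purpose == ArchivePurpose.COPY))
    all_builds = list(IStore(BinaryPackageBuild).find(
        (BinaryPackageBuild.source_package_release_id,
         BinaryPackageBuild.id),
        BinaryPackageBuild.source_package_release_id.is_in(
            [row[1] for row in all_sprs]),
        Not(BinaryPackageBuild.archive_id.is_in(copy_archive_ids))))

The precomputed ID list should stay small enough for a simple = ANY(...) filter, which avoids reintroducing the big Archive join. Follow-up material for another branch, anyway.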
> + all_builds = list(IStore(BinaryPackageBuild).find(
> + (BinaryPackageBuild.source_package_release_id,
> + BinaryPackageBuild.id),
> BinaryPackageBuild.source_package_release_id.is_in(
> - [spr.id for spr in sprs]),
> - BinaryPackageName.id == BinaryPackageRelease.binarypackagenameID)
> - for name, summary, description in binpkgs:
> - binpkgnames.add(name)
> - binpkgsummaries.add(summary)
> - binpkgdescriptions.add(description)
> -
> - # Update the caches.
> - cache.binpkgnames = ' '.join(sorted(binpkgnames))
> - cache.binpkgsummaries = ' '.join(sorted(binpkgsummaries))
> - cache.binpkgdescriptions = ' '.join(sorted(binpkgdescriptions))
> - # Column due for deletion.
> - cache.changelog = None
> + [row[1] for row in all_sprs])))
> + all_binaries = list(IStore(BinaryPackageRelease).find(
> + (BinaryPackageRelease.buildID,
> + BinaryPackageRelease.binarypackagenameID,
> + BinaryPackageRelease.summary, BinaryPackageRelease.description),
> + BinaryPackageRelease.buildID.is_in(
> + [row[1] for row in all_builds])))
> + sprs_by_build = {build_id: spr_id for spr_id, build_id in all_builds}
> +
> + bulk.load(BinaryPackageName, [row[1] for row in all_binaries])
> + binaries_by_spr = {}
> + for bpb_id, bpn_id, summary, description in all_binaries:
> + spr_id = sprs_by_build[bpb_id]
> + binaries_by_spr.setdefault(spr_id, [])
> + binaries_by_spr[spr_id].append((
> + IStore(BinaryPackageName).get(BinaryPackageName, bpn_id),
> + summary, description))
> +
> + for spn in sourcepackagenames:
> + cache = cache_map[spn]
> + cache.name = spn.name
> +
> + sprs = spr_map.get(spn, [])
> +
> + binpkgnames = set()
> + binpkgsummaries = set()
> + binpkgdescriptions = set()
> + for spr_id, spr_version in sprs:
> + log.debug("Considering source %s %s", spn.name, spr_version)
> + binpkgs = binaries_by_spr.get(spr_id, [])
> + for bpn, summary, description in binpkgs:
> + binpkgnames.add(bpn.name)
> + binpkgsummaries.add(summary)
> + binpkgdescriptions.add(description)
> +
> + # Update the caches.
> + cache.binpkgnames = ' '.join(sorted(binpkgnames))
> + cache.binpkgsummaries = ' '.join(sorted(binpkgsummaries))
> + cache.binpkgdescriptions = ' '.join(sorted(binpkgdescriptions))
> + # Column due for deletion.
> + cache.changelog = None
>
> @classmethod
> def updateAll(cls, distro, archive, log, ztm, commit_chunk=500):
>
> === modified file 'lib/lp/soyuz/model/distroseriespackagecache.py'
> --- lib/lp/soyuz/model/distroseriespackagecache.py 2015-07-08 16:05:11 +0000
> +++ lib/lp/soyuz/model/distroseriespackagecache.py 2016-05-18 08:41:52 +0000
> @@ -117,58 +121,70 @@
> (in full batches of 100 elements)
> """
> # get the set of published binarypackagereleases
> - details = list(IStore(BinaryPackageRelease).find(
> - (BinaryPackageRelease.summary, BinaryPackageRelease.description,
> + all_details = list(IStore(BinaryPackageRelease).find(
> + (BinaryPackageRelease.binarypackagenameID,
> + BinaryPackageRelease.summary, BinaryPackageRelease.description,
> Max(BinaryPackageRelease.datecreated)),
> BinaryPackageRelease.id ==
> BinaryPackagePublishingHistory.binarypackagereleaseID,
> - BinaryPackagePublishingHistory.binarypackagename ==
> - binarypackagename,
> - BinaryPackagePublishingHistory.distroarchseriesID ==
> - DistroArchSeries.id,
> - DistroArchSeries.distroseries == distroseries,
> + BinaryPackagePublishingHistory.binarypackagenameID.is_in(
> + [bpn.id for bpn in binarypackagenames]),
> + BinaryPackagePublishingHistory.distroarchseriesID.is_in(
> + Select(
> + DistroArchSeries.id, tables=[DistroArchSeries],
> + where=DistroArchSeries.distroseries == distroseries)),
> BinaryPackagePublishingHistory.archive == archive,
> - BinaryPackagePublishingHistory.dateremoved == None
> + BinaryPackagePublishingHistory.status.is_in((
> + PackagePublishingStatus.PENDING,
> + PackagePublishingStatus.PUBLISHED))
> ).group_by(
> + BinaryPackageRelease.binarypackagenameID,
> BinaryPackageRelease.summary,
> BinaryPackageRelease.description
> ).order_by(
> + BinaryPackageRelease.binarypackagenameID,
> Desc(Max(BinaryPackageRelease.datecreated))))
> -
> - if not details:
> + if not all_details:
> log.debug("No binary releases found.")
> return
>
> - # find or create the cache entry
> - cache = cls.selectOne("""
> - distroseries = %s AND
> - archive = %s AND
> - binarypackagename = %s
> - """ % sqlvalues(distroseries, archive, binarypackagename))
> - if cache is None:
> - log.debug("Creating new binary cache entry.")
> - cache = cls(
> - archive=archive,
> - distroseries=distroseries,
> - binarypackagename=binarypackagename)
> -
> - # make sure the cached name, summary and description are correct
> - cache.name = binarypackagename.name
> - cache.summary = details[0][0]
> - cache.description = details[0][1]
> -
> - # get the sets of binary package summaries, descriptions. there is
> - # likely only one, but just in case...
> -
> - summaries = set()
> - descriptions = set()
> - for summary, description, datecreated in details:
> - summaries.add(summary)
> - descriptions.add(description)
> -
> - # and update the caches
> - cache.summaries = ' '.join(sorted(summaries))
> - cache.descriptions = ' '.join(sorted(descriptions))
> + details_map = {}
> + for (bpn_id, summary, description, datecreated) in all_details:
> + bpn = IStore(BinaryPackageName).get(BinaryPackageName, bpn_id)
> + details_map.setdefault(bpn, [])
> + details_map[bpn].append((summary, description))
> +
> + all_caches = IStore(cls).find(
> + cls, cls.distroseries == distroseries, cls.archive == archive,
> + cls.binarypackagenameID.is_in(
> + [bpn.id for bpn in binarypackagenames]))
> + cache_map = {cache.binarypackagename: cache for cache in all_caches}
> +
> + for bpn in set(binarypackagenames) - set(cache_map.keys()):
Derp de derp.
> + cache_map[bpn] = cls(
> + archive=archive, distroseries=distroseries,
> + binarypackagename=bpn)
> +
> + for bpn in binarypackagenames:
> + cache = cache_map[bpn]
> + details = details_map[bpn]
> + # make sure the cached name, summary and description are correct
> + cache.name = bpn.name
> + cache.summary = details[0][0]
> + cache.description = details[0][1]
> +
> + # get the sets of binary package summaries, descriptions. there is
> + # likely only one, but just in case...
> +
> + summaries = set()
> + descriptions = set()
> + for summary, description in details:
> + summaries.add(summary)
> + descriptions.add(description)
> +
> + # and update the caches
> + cache.summaries = ' '.join(sorted(summaries))
> + cache.descriptions = ' '.join(sorted(descriptions))
>
> @classmethod
> def updateAll(cls, distroseries, archive, log, ztm, commit_chunk=500):
--
https://code.launchpad.net/~wgrant/launchpad/update-pkgcache-weight-loss/+merge/295019
Your team Launchpad code reviewers is subscribed to branch lp:launchpad.