← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/update-pkgcache-weight-loss into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/update-pkgcache-weight-loss into lp:launchpad.

Commit message:
Rewrite and optimise the guts of update-pkgcache's DSPC and DSPC updaters.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/update-pkgcache-weight-loss/+merge/295019

Rewrite and optimise the guts of the DSPC and DSPC updaters.

Name selection is replaced with a quick index scan, and detail collection is bulked up bro. These rows should never be updated by anything other than update-pkgcache, so I haven't thrown in a DBLoopTuner just yet, but a later adaptation would be simple.

The behaviour is subtly changed, but it's probably more correct now: packages are now only included in the caches if their status is Pending or Published, rather than whenever their dateremoved is unset. The old dateremoved filter was confusing, slow, and only made sense before ArchiveRemovalRedesign.

The no-op case is now a good bit faster, slowed mostly by the huge number of test rebuilds clogging up the SPR -> BPB -> BPR queries. Each cache row modification still requires a separate query, but the vast majority of rows remain untouched in the vast majority of runs, so bulkifying changes wasn't a priority.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/update-pkgcache-weight-loss into lp:launchpad.
=== modified file 'lib/lp/soyuz/doc/package-cache.txt'
--- lib/lp/soyuz/doc/package-cache.txt	2014-07-09 15:11:03 +0000
+++ lib/lp/soyuz/doc/package-cache.txt	2016-05-18 08:41:52 +0000
@@ -162,12 +162,10 @@
 
     >>> updates = DistributionSourcePackageCache.updateAll(
     ...     ubuntu, archive=ubuntu.main_archive, ztm=transaction,
-    ...     log=FakeLogger())
-    DEBUG ...
-    DEBUG Considering source 'cdrkit'
-    DEBUG Creating new source cache entry.
+    ...     log=FakeLogger(), commit_chunk=3)
+    DEBUG Considering sources alsa-utils, cdrkit, cnews
     ...
-    DEBUG Considering source 'mozilla-firefox'
+    DEBUG Considering sources linux-source-2.6.15, mozilla-firefox, netapplet
     ...
 
     >>> print updates
@@ -246,16 +244,11 @@
 
     >>> updates = DistroSeriesPackageCache.updateAll(
     ...     warty, archive=ubuntu.main_archive, ztm=transaction,
-    ...     log=FakeLogger())
-    DEBUG Considering binary 'at'
-    DEBUG Considering binary 'cdrkit'
-    DEBUG Creating new binary cache entry.
-    DEBUG Considering binary 'linux-2.6.12'
-    DEBUG Considering binary 'mozilla-firefox'
-    DEBUG Considering binary 'mozilla-firefox-data'
-    DEBUG Creating new binary cache entry.
-    DEBUG Considering binary 'pmount'
-
+    ...     log=FakeLogger(), commit_chunk=3)
+    DEBUG Considering binaries at, cdrkit, linux-2.6.12
+    DEBUG Committing
+    DEBUG Considering binaries mozilla-firefox, mozilla-firefox-data, pmount
+    DEBUG Committing
 
     >>> print updates
     6
@@ -341,14 +334,13 @@
 
     >>> source_updates = DistributionSourcePackageCache.updateAll(
     ...     ubuntu, archive=cprov.archive, ztm=transaction, log=FakeLogger())
-    DEBUG ...
-    DEBUG Considering source 'pmount'
+    DEBUG Considering sources cdrkit, iceweasel, pmount
     ...
 
     >>> binary_updates = DistroSeriesPackageCache.updateAll(
     ...     warty, archive=cprov.archive, ztm=transaction,
     ...     log=FakeLogger())
-    DEBUG Considering binary 'mozilla-firefox'
+    DEBUG Considering binaries mozilla-firefox, pmount
     ...
 
     >>> cprov.archive.updateArchiveCache()

=== modified file 'lib/lp/soyuz/model/distributionsourcepackagecache.py'
--- lib/lp/soyuz/model/distributionsourcepackagecache.py	2016-05-17 13:35:03 +0000
+++ lib/lp/soyuz/model/distributionsourcepackagecache.py	2016-05-18 08:41:52 +0000
@@ -4,7 +4,10 @@
 __metaclass__ = type
 __all__ = ['DistributionSourcePackageCache', ]
 
-from operator import itemgetter
+from operator import (
+    attrgetter,
+    itemgetter,
+    )
 
 from sqlobject import (
     ForeignKey,
@@ -13,18 +16,18 @@
 from zope.interface import implementer
 
 from lp.registry.model.sourcepackagename import SourcePackageName
+from lp.services.database import bulk
 from lp.services.database.decoratedresultset import DecoratedResultSet
 from lp.services.database.interfaces import IStore
-from lp.services.database.sqlbase import (
-    SQLBase,
-    sqlvalues,
-    )
+from lp.services.database.sqlbase import SQLBase
+from lp.soyuz.enums import PackagePublishingStatus
 from lp.soyuz.interfaces.distributionsourcepackagecache import (
     IDistributionSourcePackageCache,
     )
 from lp.soyuz.model.binarypackagebuild import BinaryPackageBuild
 from lp.soyuz.model.binarypackagename import BinaryPackageName
 from lp.soyuz.model.binarypackagerelease import BinaryPackageRelease
+from lp.soyuz.model.publishing import SourcePackagePublishingHistory
 from lp.soyuz.model.sourcepackagerelease import SourcePackageRelease
 
 
@@ -57,6 +60,18 @@
             self.sourcepackagename)
 
     @classmethod
+    def findCurrentSourcePackageNames(cls, archive):
+        spn_ids = IStore(SourcePackagePublishingHistory).find(
+            SourcePackagePublishingHistory.sourcepackagenameID,
+            SourcePackagePublishingHistory.archive == archive,
+            SourcePackagePublishingHistory.status.is_in((
+                PackagePublishingStatus.PENDING,
+                PackagePublishingStatus.PUBLISHED))).config(
+                    distinct=True)
+        return list(sorted(
+            bulk.load(SourcePackageName, spn_ids), key=attrgetter('name')))
+
+    @classmethod
     def _find(cls, distro, archive=None):
         """The set of all source package info caches for this distribution.
 
@@ -89,22 +104,11 @@
         """
 
         # Get the set of source package names to deal with.
-        spns = set(SourcePackageName.select("""
-            SourcePackagePublishingHistory.distroseries =
-                DistroSeries.id AND
-            DistroSeries.distribution = %s AND
-            Archive.id = %s AND
-            SourcePackagePublishingHistory.archive = Archive.id AND
-            SourcePackagePublishingHistory.sourcepackagename =
-                SourcePackageName.id AND
-            SourcePackagePublishingHistory.dateremoved is NULL AND
-            Archive.enabled = TRUE
-            """ % sqlvalues(distro, archive),
-            distinct=True,
-            clauseTables=[
-                'Archive',
-                'DistroSeries',
-                'SourcePackagePublishingHistory']))
+        if not archive.enabled:
+            spns = set()
+        else:
+            spns = set(
+                cls.findCurrentSourcePackageNames(archive))
 
         # Remove the cache entries for packages we no longer publish.
         for cache in cls._find(distro, archive):
@@ -115,78 +119,103 @@
                 cache.destroySelf()
 
     @classmethod
-    def _update(cls, distro, sourcepackagename, archive, log):
-        """Update cached source package details.
-
-        Update cache details for a given ISourcePackageName, including
-        generated binarypackage names, summary and description fti.
+    def _update(cls, distro, sourcepackagenames, archive, log):
+        """Update the package cache for a given set of `ISourcePackageName`s.
+
+        Cached details include generated binarypackage names, summary
+        and description fti.
+
         'log' is required and only prints debug level information.
         """
 
         # Get the set of published sourcepackage releases.
-        sprs = list(SourcePackageRelease.select("""
-            SourcePackageRelease.id =
-                SourcePackagePublishingHistory.sourcepackagerelease AND
-            SourcePackagePublishingHistory.sourcepackagename = %s AND
-            SourcePackagePublishingHistory.distroseries =
-                DistroSeries.id AND
-            DistroSeries.distribution = %s AND
-            SourcePackagePublishingHistory.archive = %s AND
-            SourcePackagePublishingHistory.dateremoved is NULL
-            """ % sqlvalues(sourcepackagename, distro, archive),
-            orderBy='id',
-            clauseTables=['SourcePackagePublishingHistory', 'DistroSeries'],
-            distinct=True))
-
-        if len(sprs) == 0:
+        all_sprs = list(IStore(SourcePackageRelease).find(
+            (SourcePackageRelease.sourcepackagenameID,
+             SourcePackageRelease.id, SourcePackageRelease.version),
+            SourcePackageRelease.id ==
+                SourcePackagePublishingHistory.sourcepackagereleaseID,
+            SourcePackagePublishingHistory.sourcepackagenameID.is_in(
+                [spn.id for spn in sourcepackagenames]),
+            SourcePackagePublishingHistory.archive == archive,
+            SourcePackagePublishingHistory.status.is_in((
+                PackagePublishingStatus.PENDING,
+                PackagePublishingStatus.PUBLISHED))
+            ).config(distinct=True).order_by(SourcePackageRelease.id))
+        if len(all_sprs) == 0:
             log.debug("No sources releases found.")
             return
 
-        # Find or create the cache entry.
-        cache = DistributionSourcePackageCache.selectOne("""
-            distribution = %s AND
-            archive = %s AND
-            sourcepackagename = %s
-            """ % sqlvalues(distro, archive, sourcepackagename))
-        if cache is None:
-            log.debug("Creating new source cache entry.")
-            cache = DistributionSourcePackageCache(
-                archive=archive,
-                distribution=distro,
-                sourcepackagename=sourcepackagename)
-
-        # Make sure the name is correct.
-        cache.name = sourcepackagename.name
-
-        # Get the sets of binary package names, summaries, descriptions.
-
-        # XXX Julian 2007-04-03:
-        # This bit of code needs fixing up, it is doing stuff that
-        # really needs to be done in SQL, such as sorting and uniqueness.
-        # This would also improve the performance.
-        binpkgnames = set()
-        binpkgsummaries = set()
-        binpkgdescriptions = set()
-        for spr in sprs:
-            log.debug("Considering source version %s" % spr.version)
-        binpkgs = IStore(BinaryPackageRelease).find(
-            (BinaryPackageName.name, BinaryPackageRelease.summary,
-             BinaryPackageRelease.description),
-            BinaryPackageRelease.buildID == BinaryPackageBuild.id,
+        spr_map = {}
+        for spn_id, spr_id, spr_version in all_sprs:
+            spn = IStore(SourcePackageName).get(SourcePackageName, spn_id)
+            spr_map.setdefault(spn, [])
+            spr_map[spn].append((spr_id, spr_version))
+
+        all_caches = IStore(cls).find(
+            cls, cls.distribution == distro, cls.archive == archive,
+            cls.sourcepackagenameID.is_in(
+                [spn.id for spn in sourcepackagenames]))
+        cache_map = {cache.sourcepackagename: cache for cache in all_caches}
+
+        for spn in set(sourcepackagenames) - set(cache_map.keys()):
+            cache_map[spn] = cls(
+                archive=archive, distribution=distro,
+                sourcepackagename=spn)
+
+        # Query BinaryPackageBuilds and their BinaryPackageReleases
+        # separately, since the big and inconsistent intermediates can
+        # confuse postgres into a seq scan over BPR, which never ends
+        # well for anybody.
+        #
+        # Beware: the sets expand much faster than you might expect for
+        # the primary archive; COPY archive builds are caught too, of
+        # which there are dozens for most SPRs, and there's no easy way
+        # to exclude them!
+        all_builds = list(IStore(BinaryPackageBuild).find(
+            (BinaryPackageBuild.source_package_release_id,
+             BinaryPackageBuild.id),
             BinaryPackageBuild.source_package_release_id.is_in(
-                [spr.id for spr in sprs]),
-            BinaryPackageName.id == BinaryPackageRelease.binarypackagenameID)
-        for name, summary, description in binpkgs:
-            binpkgnames.add(name)
-            binpkgsummaries.add(summary)
-            binpkgdescriptions.add(description)
-
-        # Update the caches.
-        cache.binpkgnames = ' '.join(sorted(binpkgnames))
-        cache.binpkgsummaries = ' '.join(sorted(binpkgsummaries))
-        cache.binpkgdescriptions = ' '.join(sorted(binpkgdescriptions))
-        # Column due for deletion.
-        cache.changelog = None
+                [row[1] for row in all_sprs])))
+        all_binaries = list(IStore(BinaryPackageRelease).find(
+            (BinaryPackageRelease.buildID,
+             BinaryPackageRelease.binarypackagenameID,
+             BinaryPackageRelease.summary, BinaryPackageRelease.description),
+            BinaryPackageRelease.buildID.is_in(
+                [row[1] for row in all_builds])))
+        sprs_by_build = {build_id: spr_id for spr_id, build_id in all_builds}
+
+        bulk.load(BinaryPackageName, [row[1] for row in all_binaries])
+        binaries_by_spr = {}
+        for bpb_id, bpn_id, summary, description in all_binaries:
+            spr_id = sprs_by_build[bpb_id]
+            binaries_by_spr.setdefault(spr_id, [])
+            binaries_by_spr[spr_id].append((
+                IStore(BinaryPackageName).get(BinaryPackageName, bpn_id),
+                summary, description))
+
+        for spn in sourcepackagenames:
+            cache = cache_map[spn]
+            cache.name = spn.name
+
+            sprs = spr_map.get(spn, [])
+
+            binpkgnames = set()
+            binpkgsummaries = set()
+            binpkgdescriptions = set()
+            for spr_id, spr_version in sprs:
+                log.debug("Considering source %s %s", spn.name, spr_version)
+                binpkgs = binaries_by_spr.get(spr_id, [])
+                for bpn, summary, description in binpkgs:
+                    binpkgnames.add(bpn.name)
+                    binpkgsummaries.add(summary)
+                    binpkgdescriptions.add(description)
+
+            # Update the caches.
+            cache.binpkgnames = ' '.join(sorted(binpkgnames))
+            cache.binpkgsummaries = ' '.join(sorted(binpkgsummaries))
+            cache.binpkgdescriptions = ' '.join(sorted(binpkgdescriptions))
+            # Column due for deletion.
+            cache.changelog = None
 
     @classmethod
     def updateAll(cls, distro, archive, log, ztm, commit_chunk=500):
@@ -208,29 +237,26 @@
             return
 
         # Get the set of source package names to deal with.
-        spns = list(SourcePackageName.select("""
-            SourcePackagePublishingHistory.distroseries =
-                DistroSeries.id AND
-            DistroSeries.distribution = %s AND
-            SourcePackagePublishingHistory.archive = %s AND
-            SourcePackagePublishingHistory.sourcepackagename =
-                SourcePackageName.id AND
-            SourcePackagePublishingHistory.dateremoved is NULL
-            """ % sqlvalues(distro, archive),
-            distinct=True,
-            orderBy="name",
-            clauseTables=['SourcePackagePublishingHistory', 'DistroSeries']))
+        spns = cls.findCurrentSourcePackageNames(archive)
 
         number_of_updates = 0
-        chunk_size = 0
+        chunks = []
+        chunk = []
         for spn in spns:
-            log.debug("Considering source '%s'" % spn.name)
-            cls._update(distro, spn, archive, log)
-            chunk_size += 1
-            number_of_updates += 1
-            if chunk_size == commit_chunk:
-                chunk_size = 0
-                log.debug("Committing")
-                ztm.commit()
+            chunk.append(spn)
+            if len(chunk) == commit_chunk:
+                chunks.append(chunk)
+                chunk = []
+        if chunk:
+            chunks.append(chunk)
+        for chunk in chunks:
+            bulk.load(SourcePackageName, [spn.id for spn in chunk])
+            log.debug(
+                "Considering sources %s",
+                ', '.join([spn.name for spn in chunk]))
+            cls._update(distro, chunk, archive, log)
+            number_of_updates += len(chunk)
+            log.debug("Committing")
+            ztm.commit()
 
         return number_of_updates

=== modified file 'lib/lp/soyuz/model/distroseriespackagecache.py'
--- lib/lp/soyuz/model/distroseriespackagecache.py	2015-07-08 16:05:11 +0000
+++ lib/lp/soyuz/model/distroseriespackagecache.py	2016-05-18 08:41:52 +0000
@@ -1,4 +1,4 @@
-# Copyright 2009 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2016 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 __metaclass__ = type
@@ -6,22 +6,24 @@
     'DistroSeriesPackageCache',
     ]
 
+from operator import attrgetter
+
 from sqlobject import (
     ForeignKey,
     StringCol,
     )
-from storm.locals import (
+from storm.expr import (
     Desc,
     Max,
-    RawStr,
+    Select,
     )
+from storm.locals import RawStr
 from zope.interface import implementer
 
+from lp.services.database import bulk
 from lp.services.database.interfaces import IStore
-from lp.services.database.sqlbase import (
-    SQLBase,
-    sqlvalues,
-    )
+from lp.services.database.sqlbase import SQLBase
+from lp.soyuz.enums import PackagePublishingStatus
 from lp.soyuz.interfaces.distroseriespackagecache import (
     IDistroSeriesPackageCache,
     )
@@ -50,6 +52,22 @@
     descriptions = StringCol(notNull=False, default=None)
 
     @classmethod
+    def findCurrentBinaryPackageNames(cls, archive, distroseries):
+        bpn_ids = IStore(BinaryPackagePublishingHistory).find(
+            BinaryPackagePublishingHistory.binarypackagenameID,
+            BinaryPackagePublishingHistory.distroarchseriesID.is_in(
+                Select(
+                    DistroArchSeries.id, tables=[DistroArchSeries],
+                    where=DistroArchSeries.distroseries == distroseries)),
+            BinaryPackagePublishingHistory.archive == archive,
+            BinaryPackagePublishingHistory.status.is_in((
+                PackagePublishingStatus.PENDING,
+                PackagePublishingStatus.PUBLISHED))).config(
+                    distinct=True)
+        return list(sorted(
+            bulk.load(BinaryPackageName, bpn_ids), key=attrgetter('name')))
+
+    @classmethod
     def _find(cls, distroseries, archive=None):
         """All of the cached binary package records for this distroseries.
 
@@ -79,25 +97,11 @@
             messages.
         """
         # get the set of package names that should be there
-        bpns = set(BinaryPackageName.select("""
-            BinaryPackagePublishingHistory.distroarchseries =
-                DistroArchSeries.id AND
-            DistroArchSeries.distroseries = %s AND
-            Archive.id = %s AND
-            BinaryPackagePublishingHistory.archive = Archive.id AND
-            BinaryPackagePublishingHistory.binarypackagerelease =
-                BinaryPackageRelease.id AND
-            BinaryPackagePublishingHistory.binarypackagename =
-                BinaryPackageName.id AND
-            BinaryPackagePublishingHistory.dateremoved is NULL AND
-            Archive.enabled = TRUE
-            """ % sqlvalues(distroseries.id, archive.id),
-            distinct=True,
-            clauseTables=[
-                'Archive',
-                'DistroArchSeries',
-                'BinaryPackagePublishingHistory',
-                'BinaryPackageRelease']))
+        if not archive.enabled:
+            bpns = set()
+        else:
+            bpns = set(
+                cls.findCurrentBinaryPackageNames(archive, distroseries))
 
         # remove the cache entries for binary packages we no longer want
         for cache in cls._find(distroseries, archive):
@@ -108,8 +112,8 @@
                 cache.destroySelf()
 
     @classmethod
-    def _update(cls, distroseries, binarypackagename, archive, log):
-        """Update the package cache for a given IBinaryPackageName
+    def _update(cls, distroseries, binarypackagenames, archive, log):
+        """Update the package cache for a given set of `IBinaryPackageName`s.
 
         'log' is required, it should be a logger object able to print
         DEBUG level messages.
@@ -117,58 +121,70 @@
         (in full batches of 100 elements)
         """
         # get the set of published binarypackagereleases
-        details = list(IStore(BinaryPackageRelease).find(
-            (BinaryPackageRelease.summary, BinaryPackageRelease.description,
+        all_details = list(IStore(BinaryPackageRelease).find(
+            (BinaryPackageRelease.binarypackagenameID,
+             BinaryPackageRelease.summary, BinaryPackageRelease.description,
              Max(BinaryPackageRelease.datecreated)),
             BinaryPackageRelease.id ==
                 BinaryPackagePublishingHistory.binarypackagereleaseID,
-            BinaryPackagePublishingHistory.binarypackagename ==
-                binarypackagename,
-            BinaryPackagePublishingHistory.distroarchseriesID ==
-                DistroArchSeries.id,
-            DistroArchSeries.distroseries == distroseries,
+            BinaryPackagePublishingHistory.binarypackagenameID.is_in(
+                [bpn.id for bpn in binarypackagenames]),
+            BinaryPackagePublishingHistory.distroarchseriesID.is_in(
+                Select(
+                    DistroArchSeries.id, tables=[DistroArchSeries],
+                    where=DistroArchSeries.distroseries == distroseries)),
             BinaryPackagePublishingHistory.archive == archive,
-            BinaryPackagePublishingHistory.dateremoved == None
+            BinaryPackagePublishingHistory.status.is_in((
+                PackagePublishingStatus.PENDING,
+                PackagePublishingStatus.PUBLISHED))
             ).group_by(
+                BinaryPackageRelease.binarypackagenameID,
                 BinaryPackageRelease.summary,
                 BinaryPackageRelease.description
             ).order_by(
+                BinaryPackageRelease.binarypackagenameID,
                 Desc(Max(BinaryPackageRelease.datecreated))))
-
-        if not details:
+        if not all_details:
             log.debug("No binary releases found.")
             return
 
-        # find or create the cache entry
-        cache = cls.selectOne("""
-            distroseries = %s AND
-            archive = %s AND
-            binarypackagename = %s
-            """ % sqlvalues(distroseries, archive, binarypackagename))
-        if cache is None:
-            log.debug("Creating new binary cache entry.")
-            cache = cls(
-                archive=archive,
-                distroseries=distroseries,
-                binarypackagename=binarypackagename)
-
-        # make sure the cached name, summary and description are correct
-        cache.name = binarypackagename.name
-        cache.summary = details[0][0]
-        cache.description = details[0][1]
-
-        # get the sets of binary package summaries, descriptions. there is
-        # likely only one, but just in case...
-
-        summaries = set()
-        descriptions = set()
-        for summary, description, datecreated in details:
-            summaries.add(summary)
-            descriptions.add(description)
-
-        # and update the caches
-        cache.summaries = ' '.join(sorted(summaries))
-        cache.descriptions = ' '.join(sorted(descriptions))
+        details_map = {}
+        for (bpn_id, summary, description, datecreated) in all_details:
+            bpn = IStore(BinaryPackageName).get(BinaryPackageName, bpn_id)
+            details_map.setdefault(bpn, [])
+            details_map[bpn].append((summary, description))
+
+        all_caches = IStore(cls).find(
+            cls, cls.distroseries == distroseries, cls.archive == archive,
+            cls.binarypackagenameID.is_in(
+                [bpn.id for bpn in binarypackagenames]))
+        cache_map = {cache.binarypackagename: cache for cache in all_caches}
+
+        for bpn in set(binarypackagenames) - set(cache_map.keys()):
+            cache_map[bpn] = cls(
+                archive=archive, distroseries=distroseries,
+                binarypackagename=bpn)
+
+        for bpn in binarypackagenames:
+            cache = cache_map[bpn]
+            details = details_map[bpn]
+            # make sure the cached name, summary and description are correct
+            cache.name = bpn.name
+            cache.summary = details[0][0]
+            cache.description = details[0][1]
+
+            # get the sets of binary package summaries, descriptions. there is
+            # likely only one, but just in case...
+
+            summaries = set()
+            descriptions = set()
+            for summary, description in details:
+                summaries.add(summary)
+                descriptions.add(description)
+
+            # and update the caches
+            cache.summaries = ' '.join(sorted(summaries))
+            cache.descriptions = ' '.join(sorted(descriptions))
 
     @classmethod
     def updateAll(cls, distroseries, archive, log, ztm, commit_chunk=500):
@@ -190,27 +206,26 @@
             return
 
         # Get the set of package names to deal with.
-        bpns = IStore(BinaryPackageName).find(
-            BinaryPackageName,
-            DistroArchSeries.distroseries == distroseries,
-            BinaryPackagePublishingHistory.distroarchseriesID ==
-                DistroArchSeries.id,
-            BinaryPackagePublishingHistory.archive == archive,
-            BinaryPackagePublishingHistory.binarypackagename ==
-                BinaryPackageName.id,
-            BinaryPackagePublishingHistory.dateremoved == None).config(
-                distinct=True).order_by(BinaryPackageName.name)
+        bpns = cls.findCurrentBinaryPackageNames(archive, distroseries)
 
         number_of_updates = 0
-        chunk_size = 0
+        chunks = []
+        chunk = []
         for bpn in bpns:
-            log.debug("Considering binary '%s'" % bpn.name)
-            cls._update(distroseries, bpn, archive, log)
-            number_of_updates += 1
-            chunk_size += 1
-            if chunk_size == commit_chunk:
-                chunk_size = 0
-                log.debug("Committing")
-                ztm.commit()
+            chunk.append(bpn)
+            if len(chunk) == commit_chunk:
+                chunks.append(chunk)
+                chunk = []
+        if chunk:
+            chunks.append(chunk)
+        for chunk in chunks:
+            bulk.load(BinaryPackageName, [bpn.id for bpn in chunk])
+            log.debug(
+                "Considering binaries %s",
+                ', '.join([bpn.name for bpn in chunk]))
+            cls._update(distroseries, chunk, archive, log)
+            number_of_updates += len(chunk)
+            log.debug("Committing")
+            ztm.commit()
 
         return number_of_updates

=== modified file 'lib/lp/soyuz/tests/test_distroseriesbinarypackage.py'
--- lib/lp/soyuz/tests/test_distroseriesbinarypackage.py	2012-01-20 15:42:44 +0000
+++ lib/lp/soyuz/tests/test_distroseriesbinarypackage.py	2016-05-18 08:41:52 +0000
@@ -1,4 +1,4 @@
-# Copyright 2010 Canonical Ltd.  This software is licensed under the
+# Copyright 2010-2016 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Tests for `DistroSeriesBinaryPackage`."""
@@ -61,12 +61,12 @@
         logger = BufferLogger()
         with dbuser(config.statistician.dbuser):
             DistroSeriesPackageCache._update(
-                self.distroseries, self.binary_package_name, distro_archive_1,
-                logger)
+                self.distroseries, [self.binary_package_name],
+                distro_archive_1, logger)
 
             DistroSeriesPackageCache._update(
-                self.distroseries, self.binary_package_name, distro_archive_2,
-                logger)
+                self.distroseries, [self.binary_package_name],
+                distro_archive_2, logger)
 
         self.failUnlessEqual(
             'Foo is the best', self.distroseries_binary_package.summary)


Follow ups