← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/bug-718004-rewrite-targetnamecache into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/bug-718004-rewrite-targetnamecache into lp:launchpad.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  #198778 IBugTaskSet.dangerousGetAllTasks() shouldn't exist
  https://bugs.launchpad.net/bugs/198778
  #718004 update-bugtask-targetnamecaches is horrifyingly inefficient
  https://bugs.launchpad.net/bugs/718004

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/bug-718004-rewrite-targetnamecache/+merge/49612

This branch rewrites update-bugtask-targetnamecaches to be around 1000 times faster on the production dataset.

Previously it would retrieve every bugtask ever and call a method on each. In a tunableloop. Without precaching. It took around 18 hours to run on production.

This new version instead selects all distinct targets and their cached names, calculates each target's proper name, and does a mass-update on any outdated cached names. That alone takes it down to 8 minutes on mawson. Additionally precaching all target objects takes it down to 1 minute, a 100000% improvement over production.

Tests have been changed to cope with the new script's debug output, and I've dropped BugTaskSet.dangerousGetAllTasks which is now unused.
-- 
https://code.launchpad.net/~wgrant/launchpad/bug-718004-rewrite-targetnamecache/+merge/49612
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/bug-718004-rewrite-targetnamecache into lp:launchpad.
=== modified file 'lib/lp/bugs/doc/bugtask.txt'
--- lib/lp/bugs/doc/bugtask.txt	2011-02-10 01:18:39 +0000
+++ lib/lp/bugs/doc/bugtask.txt	2011-02-14 11:59:57 +0000
@@ -1113,8 +1113,12 @@
     >>> print err
     INFO Creating lockfile: /var/lock/launchpad-launchpad-targetnamecacheupdater.lock
     INFO Updating targetname cache of bugtasks.
-    INFO Updating 1 BugTasks (starting id: 2).
-    INFO Updating ...BugTasks...
+    INFO Calculating targets.
+    INFO Will check ... targets.
+    ...
+    INFO Updating (u'Mozilla Thunderbird',) to 'Mozilla Thunderbird NG'.
+    ...
+    INFO Updated 1 target names.
     INFO Finished updating targetname cache of bugtasks.
 
     >>> process.returncode
@@ -1192,8 +1196,11 @@
     >>> updater = BugTaskTargetNameCacheUpdater(transaction, logger)
     >>> updater.run()
     INFO Updating targetname cache of bugtasks.
-    INFO Updating 1 BugTasks (starting id: 2).
-    INFO Updating ...BugTasks...
+    INFO Calculating targets.
+    ...
+    INFO Updating (u'Mozilla Thunderbird NG',) to 'Mozilla Thunderbird'.
+    ...
+    INFO Updated 1 target names.
     INFO Finished updating targetname cache of bugtasks.
 
     >>> flush_database_caches()

=== modified file 'lib/lp/bugs/interfaces/bugtask.py'
--- lib/lp/bugs/interfaces/bugtask.py	2011-02-08 05:54:20 +0000
+++ lib/lp/bugs/interfaces/bugtask.py	2011-02-14 11:59:57 +0000
@@ -1512,19 +1512,6 @@
         If the col_name is unrecognized, a KeyError is raised.
         """
 
-    # XXX kiko 2006-03-23:
-    # get rid of this kludge when we have proper security for scripts.
-    def dangerousGetAllTasks():
-        """DO NOT USE THIS METHOD UNLESS YOU KNOW WHAT YOU ARE DOING
-
-        Returns ALL BugTasks. YES, THAT INCLUDES PRIVATE ONES. Do not
-        use this method. DO NOT USE IT. I REPEAT: DO NOT USE IT.
-
-        This method exists solely for the purpose of scripts that need
-        to do gardening over all bug tasks; the current example is
-        update-bugtask-targetnamecaches.
-        """
-
     def getBugCountsForPackages(user, packages):
         """Return open bug counts for the list of packages.
 

=== modified file 'lib/lp/bugs/model/bugtask.py'
--- lib/lp/bugs/model/bugtask.py	2011-02-14 10:24:56 +0000
+++ lib/lp/bugs/model/bugtask.py	2011-02-14 11:59:57 +0000
@@ -15,6 +15,7 @@
     'BugTaskSet',
     'NullBugTask',
     'bugtask_sort_key',
+    'determine_target',
     'get_bug_privacy_filter',
     'get_related_bugtasks_search_params',
     'search_value_to_where_condition',
@@ -146,7 +147,6 @@
 from lp.registry.interfaces.milestone import IProjectGroupMilestone
 from lp.registry.interfaces.person import (
     IPerson,
-    IPersonSet,
     validate_person,
     validate_public_person,
     )
@@ -268,6 +268,29 @@
     return search_params
 
 
+def determine_target(product, productseries, distribution, distroseries,
+                     sourcepackagename):
+    """Returns the IBugTarget defined by the given arguments."""
+    if product:
+        return product
+    elif productseries:
+        return productseries
+    elif distribution:
+        if sourcepackagename:
+            return distribution.getSourcePackage(
+                sourcepackagename)
+        else:
+            return distribution
+    elif distroseries:
+        if sourcepackagename:
+            return distroseries.getSourcePackage(
+                sourcepackagename)
+        else:
+            return distroseries
+    else:
+        raise AssertionError("Unable to determine bugtask target.")
+
+
 class BugTaskDelta:
     """See `IBugTaskDelta`."""
 
@@ -313,24 +336,9 @@
         # We explicitly reference attributes here (rather than, say,
         # IDistroBugTask.providedBy(self)), because we can't assume this
         # task has yet been marked with the correct interface.
-        if self.product:
-            return self.product
-        elif self.productseries:
-            return self.productseries
-        elif self.distribution:
-            if self.sourcepackagename:
-                return self.distribution.getSourcePackage(
-                    self.sourcepackagename)
-            else:
-                return self.distribution
-        elif self.distroseries:
-            if self.sourcepackagename:
-                return self.distroseries.getSourcePackage(
-                    self.sourcepackagename)
-            else:
-                return self.distroseries
-        else:
-            raise AssertionError("Unable to determine bugtask target.")
+        return determine_target(
+            self.product, self.productseries, self.distribution,
+            self.distroseries, self.sourcepackagename)
 
     @property
     def related_tasks(self):
@@ -2924,10 +2932,6 @@
 
         return tuple(orderby_arg)
 
-    def dangerousGetAllTasks(self):
-        """DO NOT USE THIS METHOD. For details, see `IBugTaskSet`"""
-        return BugTask.select(orderBy='id')
-
     def getBugCountsForPackages(self, user, packages):
         """See `IBugTaskSet`."""
         distributions = sorted(

=== modified file 'lib/lp/bugs/scripts/bugtasktargetnamecaches.py'
--- lib/lp/bugs/scripts/bugtasktargetnamecaches.py	2010-08-20 20:31:18 +0000
+++ lib/lp/bugs/scripts/bugtasktargetnamecaches.py	2011-02-14 11:59:57 +0000
@@ -6,12 +6,33 @@
 __metaclass__ = type
 __all__ = ['BugTaskTargetNameCacheUpdater']
 
-from zope.component import getUtility
+from collections import defaultdict
+
 from zope.interface import implements
 
+from canonical.launchpad.interfaces.lpstorm import (
+    IMasterStore,
+    ISlaveStore,
+    )
 from canonical.launchpad.interfaces.looptuner import ITunableLoop
 from canonical.launchpad.utilities.looptuner import LoopTuner
-from lp.bugs.interfaces.bugtask import IBugTaskSet
+from lp.bugs.model.bugtask import (
+    BugTask,
+    determine_target,
+    )
+from lp.registry.model.distribution import Distribution
+from lp.registry.model.distroseries import DistroSeries
+from lp.registry.model.product import Product
+from lp.registry.model.productseries import ProductSeries
+from lp.registry.model.sourcepackagename import SourcePackageName
+
+
+target_classes = (
+    Product, ProductSeries, Distribution, DistroSeries, SourcePackageName)
+target_columns = (
+    BugTask.productID, BugTask.productseriesID, BugTask.distributionID,
+    BugTask.distroseriesID, BugTask.sourcepackagenameID,
+    BugTask.targetnamecache)
 
 
 class BugTaskTargetNameCachesTunableLoop(object):
@@ -19,23 +40,38 @@
 
     implements(ITunableLoop)
 
-    total_updated = 0
-
     def __init__(self, transaction, logger, offset=0):
         self.transaction = transaction
         self.logger = logger
         self.offset = offset
         self.total_updated = 0
 
+        self.logger.info("Calculating targets.")
+        self.transaction.begin()
+        self.candidates = self.determineCandidates()
+        self.transaction.abort()
+        self.logger.info("Will check %i targets." % len(self.candidates))
+
+    def determineCandidates(self):
+        """Find all distinct BugTask targets with their cached names.
+
+        Returns a list of (target, set_of_cached_names) pairs, where target is
+        (Product ID, ProductSeries ID, Distribution ID, DistroSeries ID,
+        SourcePackageName ID).
+        """
+        store = ISlaveStore(BugTask)
+        candidate_set = store.find(target_columns).config(distinct=True)
+        candidates = defaultdict(set)
+        for p, ps, d, ds, spn, cache in candidate_set:
+            candidates[(p, ps, d, ds, spn)].add(cache)
+        return list(candidates.iteritems())
+
     def isDone(self):
         """See `ITunableLoop`."""
-        # When the main loop has no more BugTasks to process it sets
-        # offset to None. Until then, it always has a numerical
-        # value.
-        return self.offset is None
+        return self.offset >= len(self.candidates)
 
     def __call__(self, chunk_size):
-        """Retrieve a batch of BugTasks and update their targetname caches.
+        """Take a batch of targets and update their BugTasks' name caches.
 
         See `ITunableLoop`.
         """
@@ -49,30 +85,43 @@
         start = self.offset
         end = self.offset + chunk_size
 
+        chunk = self.candidates[start:end]
+
         self.transaction.begin()
-        # XXX: kiko 2006-03-23:
-        # We use a special API here, which is kinda klunky, but which
-        # allows us to return all bug tasks (even private ones); this should
-        # eventually be changed to a more elaborate permissions scheme,
-        # pending the infrastructure to do so. See bug #198778.
-        bugtasks = list(
-            getUtility(IBugTaskSet).dangerousGetAllTasks()[start:end])
-
-        self.offset = None
-        if bugtasks:
-            starting_id = bugtasks[0].id
-            self.logger.info("Updating %i BugTasks (starting id: %i)." %
-                (len(bugtasks), starting_id))
-
-        for bugtask in bugtasks:
-            # We set the starting point of the next batch to the BugTask
-            # id after the one we're looking at now. If there aren't any
-            # bugtasks this loop will run for 0 iterations and start_id
-            # will remain set to None.
-            start += 1
-            self.offset = start
-            bugtask.updateTargetNameCache()
-            self.total_updated += 1
+        store = IMasterStore(BugTask)
+
+        # Rotate the target rows into lists of object IDs to retrieve.
+        # This way we can issue just one query of each type for each batch.
+        ids_to_cache = zip(*(target for (target, names) in chunk))
+        for index, cls in enumerate(target_classes):
+            ids = (id for id in ids_to_cache[index] if id is not None)
+            list(store.find(cls, cls.id.is_in(ids)))
+
+        for target_bits, cached_names in chunk:
+            self.offset += 1
+            # Resolve the IDs to objects, and get the actual IBugTarget.
+            # If the ID is None, don't even try to get an object.
+            target_objects = (
+                (store.get(cls, id) if id is not None else None)
+                for cls, id in zip(target_classes, target_bits))
+            target = determine_target(*target_objects)
+            new_name = target.bugtargetdisplayname
+            cached_names.discard(new_name)
+            # If there are any outdated names cached, update them all in
+            # a single query.
+            if len(cached_names) > 0:
+                self.logger.info(
+                    "Updating %r to '%s'." % (tuple(cached_names), new_name))
+                self.total_updated += len(cached_names)
+                conditions = (
+                    col == id for col, id in zip(target_columns, target_bits))
+                to_update = store.find(
+                    BugTask,
+                    BugTask.targetnamecache.is_in(cached_names),
+                    *conditions)
+                to_update.set(targetnamecache=new_name)
+
+        self.logger.info("Checked %i targets." % len(chunk))
 
         self.transaction.commit()
 
@@ -96,7 +145,5 @@
         loop_tuner = LoopTuner(loop, 2)
         loop_tuner.run()
 
-        self.logger.info("Updated %i bugtask targetname caches." %
-            loop.total_updated)
+        self.logger.info("Updated %i target names." % loop.total_updated)
         self.logger.info("Finished updating targetname cache of bugtasks.")
-