← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~cjwatson/launchpad:optimize-scrub-pofiletranslator into launchpad:master

 

Colin Watson has proposed merging ~cjwatson/launchpad:optimize-scrub-pofiletranslator into launchpad:master.

Commit message:
Optimize ScrubPOFileTranslator

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/419220

This garbo job was taking over 16 hours on production, typically spending around eight of those hours sleeping while the nightly backup ran, and then timing out some time later.  A few simple database query optimizations (mostly bulk-loading data) reduce the runtime from around eight hours to around 16 minutes on dogfood, and should have a similar effect on production.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:optimize-scrub-pofiletranslator into launchpad:master.
diff --git a/lib/lp/translations/scripts/scrub_pofiletranslator.py b/lib/lp/translations/scripts/scrub_pofiletranslator.py
index 605b822..0cb095f 100644
--- a/lib/lp/translations/scripts/scrub_pofiletranslator.py
+++ b/lib/lp/translations/scripts/scrub_pofiletranslator.py
@@ -70,28 +70,29 @@ def summarize_pofiles(pofile_ids):
 def get_potmsgset_ids(potemplate_id):
     """Get the ids for each current `POTMsgSet` in a `POTemplate`."""
     store = IStore(POTemplate)
-    return store.find(
+    return set(store.find(
         TranslationTemplateItem.potmsgsetID,
         TranslationTemplateItem.potemplateID == potemplate_id,
-        TranslationTemplateItem.sequence > 0)
+        TranslationTemplateItem.sequence > 0))
 
 
-def summarize_contributors(potemplate_id, language_id, potmsgset_ids):
-    """Return the set of ids of persons who contributed to a `POFile`.
+def summarize_contributors(potemplate_id, language_ids, potmsgset_ids):
+    """Return per-language sets of person ids who contributed to a `POFile`.
 
     This is a limited version of `get_contributions` that is easier to
     compute.
     """
     store = IStore(POFile)
-    contribs = store.find(
-        TranslationMessage.submitterID,
-        TranslationMessage.potmsgsetID.is_in(potmsgset_ids),
-        TranslationMessage.languageID == language_id,
-        TranslationMessage.msgstr0 != None,
-        Coalesce(TranslationMessage.potemplateID, potemplate_id) ==
-            potemplate_id)
-    contribs.config(distinct=True)
-    return set(contribs)
+    contribs = {language_id: set() for language_id in language_ids}
+    for language_id, submitter_id in store.find(
+            (TranslationMessage.languageID, TranslationMessage.submitterID),
+            TranslationMessage.potmsgsetID.is_in(potmsgset_ids),
+            TranslationMessage.languageID.is_in(language_ids),
+            TranslationMessage.msgstr0 != None,
+            Coalesce(TranslationMessage.potemplateID, potemplate_id) ==
+                potemplate_id).config(distinct=True):
+        contribs[language_id].add(submitter_id)
+    return contribs
 
 
 def get_contributions(pofile, potmsgset_ids):
@@ -125,15 +126,18 @@ def get_contributions(pofile, potmsgset_ids):
     return dict(contribs)
 
 
-def get_pofiletranslators(pofile_id):
-    """Get `Person` ids from `POFileTranslator` entries for a `POFile`.
+def get_pofiletranslators(pofile_ids):
+    """Get `Person` ids from `POFileTranslator` entries for a set of `POFile`s.
 
-    Returns a `set` of `Person` ids.
+    Returns a mapping of `POFile` IDs to `set`s of `Person` ids.
     """
     store = IStore(POFileTranslator)
-    return set(store.find(
-        POFileTranslator.personID,
-        POFileTranslator.pofileID == pofile_id))
+    pofts = {pofile_id: set() for pofile_id in pofile_ids}
+    for pofile_id, person_id in store.find(
+            (POFileTranslator.pofileID, POFileTranslator.personID),
+            POFileTranslator.pofileID.is_in(pofile_ids)):
+        pofts[pofile_id].add(person_id)
+    return pofts
 
 
 def remove_pofiletranslators(logger, pofile, person_ids):
@@ -180,21 +184,6 @@ def fix_pofile(logger, pofile, potmsgset_ids, pofiletranslators):
         logger, pofile, pofiletranslators, contribs)
 
 
-def needs_fixing(template_id, language_id, potmsgset_ids, pofiletranslators):
-    """Does the `POFile` with given details need `POFileTranslator` changes?
-
-    :param template_id: id of the `POTemplate` for the `POFile`.
-    :param language_id: id of the `Language` the `POFile` translates to.
-    :param potmsgset_ids: ids of the `POTMsgSet` items participating in the
-        template.
-    :param pofiletranslators: `POFileTranslator` objects for the `POFile`.
-    :return: Bool: does the existing set of `POFileTranslator` need fixing?
-    """
-    contributors = summarize_contributors(
-        template_id, language_id, potmsgset_ids)
-    return pofiletranslators != set(contributors)
-
-
 # A tuple describing a POFile that needs its POFileTranslators fixed.
 WorkItem = namedtuple("WorkItem", [
     'pofile_id',
@@ -212,14 +201,24 @@ def gather_work_items(pofile_ids):
     """
     pofile_summaries = summarize_pofiles(pofile_ids)
     cached_potmsgsets = {}
+    cached_contributors = {}
+    cached_pofts = get_pofiletranslators(pofile_ids)
     work_items = []
     for pofile_id in pofile_ids:
         template_id, language_id = pofile_summaries[pofile_id]
         if template_id not in cached_potmsgsets:
             cached_potmsgsets[template_id] = get_potmsgset_ids(template_id)
         potmsgset_ids = cached_potmsgsets[template_id]
-        pofts = get_pofiletranslators(pofile_id)
-        if needs_fixing(template_id, language_id, potmsgset_ids, pofts):
+        if template_id not in cached_contributors:
+            all_language_ids = [
+                lang_id for temp_id, lang_id in pofile_summaries.values()
+                if temp_id == template_id]
+            cached_contributors[template_id] = summarize_contributors(
+                template_id, all_language_ids, potmsgset_ids)
+        contributor_ids = cached_contributors[template_id][language_id]
+        pofts = cached_pofts[pofile_id]
+        # Does this `POFile` need `POFileTranslator` changes?
+        if pofts != contributor_ids:
             work_items.append(WorkItem(pofile_id, potmsgset_ids, pofts))
 
     return work_items
@@ -255,7 +254,7 @@ def process_work_items(logger, work_items, pofiles):
 class ScrubPOFileTranslator(TunableLoop):
     """Tunable loop, meant for running from inside Garbo."""
 
-    maximum_chunk_size = 500
+    maximum_chunk_size = 5000
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
diff --git a/lib/lp/translations/scripts/tests/test_scrub_pofiletranslator.py b/lib/lp/translations/scripts/tests/test_scrub_pofiletranslator.py
index b36bac7..c42c33f 100644
--- a/lib/lp/translations/scripts/tests/test_scrub_pofiletranslator.py
+++ b/lib/lp/translations/scripts/tests/test_scrub_pofiletranslator.py
@@ -138,54 +138,72 @@ class TestScrubPOFileTranslator(TestCaseWithFactory):
         pofile = self.factory.makePOFile()
         potmsgset = self.factory.makePOTMsgSet(
             potemplate=pofile.potemplate, sequence=1)
-        self.assertContentEqual(
-            [potmsgset.id], get_potmsgset_ids(pofile.potemplate.id))
+        self.assertEqual(
+            {potmsgset.id}, get_potmsgset_ids(pofile.potemplate.id))
 
     def test_get_potmsgset_ids_ignores_inactive_messages(self):
         pofile = self.factory.makePOFile()
         self.factory.makePOTMsgSet(
             potemplate=pofile.potemplate, sequence=0)
-        self.assertContentEqual([], get_potmsgset_ids(pofile.potemplate.id))
+        self.assertEqual(set(), get_potmsgset_ids(pofile.potemplate.id))
 
     def test_summarize_contributors_gets_contributors(self):
-        pofile = self.factory.makePOFile()
-        tm = self.factory.makeSuggestion(pofile=pofile)
-        potmsgset_ids = get_potmsgset_ids(pofile.potemplate.id)
-        self.assertContentEqual(
-            [tm.submitter.id],
+        template = self.factory.makePOTemplate()
+        pofiles = [
+            self.factory.makePOFile(potemplate=template) for _ in range(2)]
+        tms = [
+            self.factory.makeSuggestion(pofile=pofile) for pofile in pofiles]
+        potmsgset_ids = get_potmsgset_ids(template.id)
+        self.assertEqual(
+            {pofile.language.id: {tm.submitter.id}
+             for pofile, tm in zip(pofiles, tms)},
             summarize_contributors(
-                pofile.potemplate.id, pofile.language.id, potmsgset_ids))
+                template.id, [pofile.language.id for pofile in pofiles],
+                potmsgset_ids))
 
     def test_summarize_contributors_ignores_inactive_potmsgsets(self):
-        pofile = self.factory.makePOFile()
-        potmsgset = self.factory.makePOTMsgSet(
-            potemplate=pofile.potemplate, sequence=0)
-        self.factory.makeSuggestion(pofile=pofile, potmsgset=potmsgset)
-        potmsgset_ids = get_potmsgset_ids(pofile.potemplate.id)
-        self.assertContentEqual(
-            [],
+        template = self.factory.makePOTemplate()
+        pofiles = [
+            self.factory.makePOFile(potemplate=template) for _ in range(2)]
+        potmsgset = self.factory.makePOTMsgSet(potemplate=template, sequence=0)
+        self.factory.makeSuggestion(pofile=pofiles[0], potmsgset=potmsgset)
+        potmsgset_ids = get_potmsgset_ids(template.id)
+        self.assertEqual(
+            {pofile.language.id: set() for pofile in pofiles},
             summarize_contributors(
-                pofile.potemplate.id, pofile.language.id, potmsgset_ids))
+                template.id, [pofile.language.id for pofile in pofiles],
+                potmsgset_ids))
 
     def test_summarize_contributors_includes_diverged_msgs_for_template(self):
-        pofile = self.factory.makePOFile()
-        tm = self.factory.makeSuggestion(pofile=pofile)
-        tm.potemplate = pofile.potemplate
-        potmsgset_ids = get_potmsgset_ids(pofile.potemplate.id)
-        self.assertContentEqual(
-            [tm.submitter.id],
+        template = self.factory.makePOTemplate()
+        pofiles = [
+            self.factory.makePOFile(potemplate=template) for _ in range(2)]
+        tms = [
+            self.factory.makeSuggestion(pofile=pofile) for pofile in pofiles]
+        for tm in tms:
+            tm.potemplate = template
+        potmsgset_ids = get_potmsgset_ids(template.id)
+        self.assertEqual(
+            {pofile.language.id: {tm.submitter.id}
+             for pofile, tm in zip(pofiles, tms)},
             summarize_contributors(
-                pofile.potemplate.id, pofile.language.id, potmsgset_ids))
+                template.id, [pofile.language.id for pofile in pofiles],
+                potmsgset_ids))
 
     def test_summarize_contributors_excludes_other_diverged_messages(self):
-        pofile = self.factory.makePOFile()
-        tm = self.factory.makeSuggestion(pofile=pofile)
-        tm.potemplate = self.factory.makePOTemplate()
-        potmsgset_ids = get_potmsgset_ids(pofile.potemplate.id)
-        self.assertContentEqual(
-            [],
+        template = self.factory.makePOTemplate()
+        pofiles = [
+            self.factory.makePOFile(potemplate=template) for _ in range(2)]
+        tms = [
+            self.factory.makeSuggestion(pofile=pofile) for pofile in pofiles]
+        for tm in tms:
+            tm.potemplate = self.factory.makePOTemplate()
+        potmsgset_ids = get_potmsgset_ids(template.id)
+        self.assertEqual(
+            {pofile.language.id: set() for pofile in pofiles},
             summarize_contributors(
-                pofile.potemplate.id, pofile.language.id, potmsgset_ids))
+                template.id, [pofile.language.id for pofile in pofiles],
+                potmsgset_ids))
 
     def test_get_contributions_gets_contributions(self):
         pofile = self.factory.makePOFile()
@@ -233,10 +251,15 @@ class TestScrubPOFileTranslator(TestCaseWithFactory):
         self.assertEqual({}, get_contributions(pofile, potmsgset_ids))
 
     def test_get_pofiletranslators_gets_translators_for_pofile(self):
-        pofile = self.factory.makePOFile()
-        tm = self.make_message_with_pofiletranslator(pofile)
-        self.assertContentEqual(
-            [tm.submitter.id], get_pofiletranslators(pofile.id))
+        template = self.factory.makePOTemplate()
+        pofiles = [
+            self.factory.makePOFile(potemplate=template) for _ in range(2)]
+        tms = [
+            self.make_message_with_pofiletranslator(pofile)
+            for pofile in pofiles]
+        self.assertEqual(
+            {pofile.id: {tm.submitter.id} for pofile, tm in zip(pofiles, tms)},
+            get_pofiletranslators([pofile.id for pofile in pofiles]))
 
     def test_fix_pofile_leaves_good_pofiletranslator_in_place(self):
         pofile = self.factory.makePOFile()