← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/bug-736005-trivialise into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/bug-736005-trivialise into lp:launchpad.

Commit message:
Rewrite the POFile:+translate text search query to perform adequately.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #736005 in Launchpad itself: "POFile:+translate timeouts"
  https://bugs.launchpad.net/launchpad/+bug/736005

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/bug-736005-trivialise/+merge/250447

Rewrite translations text search to use the simplest query that can work, which happens to optimise best nowadays. The old code is littered with comments describing how simpler queries degrade to seqscans of POMsgID and POTranslation, but that's not the case in 9.3 with a sensible simple query.

https://pastebin.canonical.com/126018/ compares the new and old queries. Further optimisation will come soon once POTMsgSet.msgid_(singular|plural) are denormalised to TranslationTemplateItem, and the COUNT(*) queries are eliminated from large template views.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/bug-736005-trivialise into lp:launchpad.
=== modified file 'lib/lp/translations/model/pofile.py'
--- lib/lp/translations/model/pofile.py	2013-06-20 05:50:00 +0000
+++ lib/lp/translations/model/pofile.py	2015-02-20 14:04:23 +0000
@@ -56,7 +56,6 @@
 from lp.services.database.sqlbase import (
     flush_database_updates,
     quote,
-    quote_like,
     SQLBase,
     sqlvalues,
     )
@@ -96,6 +95,7 @@
     credits_message_str,
     POTMsgSet,
     )
+from lp.translations.model.potranslation import POTranslation
 from lp.translations.model.translationimportqueue import collect_import_info
 from lp.translations.model.translationmessage import (
     make_plurals_sql_fragment,
@@ -165,102 +165,6 @@
         header.has_plural_forms = self.potemplate.hasPluralMessage()
         return header
 
-    def _getTranslationSearchQuery(self, pofile, plural_form, text):
-        """Query to find `text` in `plural_form` translations of a `pofile`.
-
-        This produces a list of clauses that can be used to search for
-        TranslationMessages containing `text` in their msgstr[`plural_form`].
-        Returned values are POTMsgSet ids containing them, expected to be
-        used in a UNION across all plural forms.
-        """
-        translation_match = """
-        -- Find translations containing `text`.
-        -- Like in findPOTMsgSetsContaining(), to avoid seqscans on
-        -- POTranslation table, we do ILIKE comparison on them in
-        -- a subselect which is first filtered by the POFile.
-          SELECT TranslationMessage.potmsgset
-            FROM TranslationMessage
-            JOIN TranslationTemplateItem
-              ON TranslationMessage.potmsgset
-                   = TranslationTemplateItem.potmsgset
-            WHERE
-              TranslationTemplateItem.potemplate = %(potemplate)s AND
-              TranslationMessage.language = %(language)s AND
-              TranslationMessage.msgstr%(plural_form)d IN (
-                SELECT POTranslation.id FROM POTranslation WHERE
-                  POTranslation.id IN (
-                    SELECT DISTINCT(msgstr%(plural_form)d)
-                      FROM TranslationMessage AS tm_ids
-                      JOIN TranslationTemplateItem
-                        ON tm_ids.potmsgset=TranslationTemplateItem.potmsgset
-                      WHERE
-                        TranslationTemplateItem.potemplate
-                          = %(potemplate)s AND
-                        TranslationTemplateItem.sequence > 0 AND
-                        tm_ids.language=%(language)s
-                  ) AND
-                  POTranslation.translation
-                    ILIKE '%%' || %(text)s || '%%')
-                    """ % dict(potemplate=quote(pofile.potemplate),
-                               language=quote(pofile.language),
-                               plural_form=plural_form,
-                               text=quote_like(text))
-        return translation_match
-
-    def _getTemplateSearchQuery(self, text):
-        """Query for finding `text` in msgids of this POFile.
-        """
-        english_match = """
-        -- Step 1a: get POTMsgSets where msgid_singular contains `text`
-        -- To avoid seqscans on POMsgID table (what LIKE usually
-        -- does), we do ILIKE comparison on them in a subselect first
-        -- filtered by this POTemplate.
-          SELECT POTMsgSet.id
-            FROM POTMsgSet
-            JOIN TranslationTemplateItem
-              ON TranslationTemplateItem.potmsgset=POTMsgSet.id AND
-                 TranslationTemplateItem.potemplate=%s
-            WHERE
-              (POTMsgSet.msgid_singular IS NOT NULL AND
-               POTMsgSet.msgid_singular IN (
-                 SELECT POMsgID.id FROM POMsgID
-                   WHERE id IN (
-                     SELECT DISTINCT(msgid_singular)
-                       FROM POTMsgSet
-                       JOIN TranslationTemplateItem
-                         ON TranslationTemplateItem.potmsgset = POTMsgSet.id
-                       WHERE
-                         TranslationTemplateItem.potemplate=%s AND
-                         TranslationTemplateItem.sequence > 0
-                   ) AND
-                   msgid ILIKE '%%' || %s || '%%'))
-          UNION
-        -- Step 1b: like above, just on msgid_plural.
-          SELECT POTMsgSet.id
-            FROM POTMsgSet
-            JOIN TranslationTemplateItem
-              ON TranslationTemplateItem.potmsgset=POTMsgSet.id AND
-                 TranslationTemplateItem.potemplate=%s
-            WHERE
-              (POTMsgSet.msgid_plural IS NOT NULL AND
-               POTMsgSet.msgid_plural IN (
-                 SELECT POMsgID.id FROM POMsgID
-                   WHERE id IN (
-                     SELECT DISTINCT(msgid_plural)
-                       FROM POTMsgSet
-                       JOIN TranslationTemplateItem
-                         ON TranslationTemplateItem.potmsgset = POTMsgSet.id
-                       WHERE
-                         TranslationTemplateItem.potemplate=%s AND
-                         TranslationTemplateItem.sequence > 0
-                   ) AND
-                   msgid ILIKE '%%' || %s || '%%'))
-            """ % (quote(self.potemplate), quote(self.potemplate),
-                   quote_like(text),
-                   quote(self.potemplate), quote(self.potemplate),
-                   quote_like(text))
-        return english_match
-
     def _getOrderedPOTMsgSets(self, origin_tables, query):
         """Find all POTMsgSets matching `query` from `origin_tables`.
 
@@ -271,41 +175,83 @@
             POTMsgSet, SQL(query))
         return results.order_by(TranslationTemplateItem.sequence)
 
+    def _getTranslationSearchBits(self, pofile):
+        ThisTM = ClassAlias(TranslationMessage)
+        origin = [
+            LeftJoin(
+                ThisTM,
+                And(
+                    ThisTM.potmsgsetID ==
+                        TranslationTemplateItem.potmsgsetID,
+                    ThisTM.languageID == pofile.languageID))
+            ]
+        search_options = []
+        for plural_form in range(pofile.plural_forms):
+            tm_col = getattr(
+                ThisTM, 'msgstr%dID' % plural_form)
+            FormStr = ClassAlias(POTranslation)
+            origin.append(LeftJoin(
+                FormStr, FormStr.id == tm_col))
+            search_options.append(FormStr.translation)
+        return origin, search_options
+
     def findPOTMsgSetsContaining(self, text):
         """See `IPOFile`."""
+        origin = [
+            POTMsgSet,
+            Join(
+                TranslationTemplateItem,
+                TranslationTemplateItem.potmsgsetID == POTMsgSet.id),
+            ]
         clauses = [
-            'TranslationTemplateItem.potemplate = %s' % sqlvalues(
-                self.potemplate),
-            'TranslationTemplateItem.potmsgset = POTMsgSet.id',
-            'TranslationTemplateItem.sequence > 0',
+            TranslationTemplateItem.potemplateID == self.potemplate.id,
+            TranslationTemplateItem.potmsgsetID == POTMsgSet.id,
+            TranslationTemplateItem.sequence > 0,
             ]
 
         if text is not None:
             assert len(text) > 1, (
                 "You can not search for strings shorter than 2 characters.")
 
+            # A list of string columns, of which at least one must match
+            # the search term.
+            search_options = []
+
+            # Search the English strings.
             if self.potemplate.uses_english_msgids:
-                english_match = self._getTemplateSearchQuery(text)
+                SingularID = ClassAlias(POMsgID)
+                PluralID = ClassAlias(POMsgID)
+                origin.extend([
+                    Join(
+                        SingularID,
+                        SingularID.id == POTMsgSet.msgid_singularID),
+                    LeftJoin(
+                        PluralID,
+                        PluralID.id == POTMsgSet.msgid_pluralID)
+                    ])
+                search_options.extend([SingularID.msgid, PluralID.msgid])
             else:
-                # If msgids are not in English, use English PO file
-                # to fetch original strings instead.
-                en_pofile = self.potemplate.getPOFileByLang('en')
-                english_match = self._getTranslationSearchQuery(
-                    en_pofile, 0, text)
+                en_po = self.potemplate.getPOFileByLang('en')
+                en_origin, en_options = self._getTranslationSearchBits(en_po)
+                origin.extend(en_origin)
+                search_options.extend(en_options)
 
-            # Do not look for translations in a DummyPOFile.
-            search_clauses = [english_match]
+            # Search the translations themselves, unless this is a
+            # DummyPOFile.
             if self.id is not None:
-                for plural_form in range(self.plural_forms):
-                    translation_match = self._getTranslationSearchQuery(
-                        self, plural_form, text)
-                    search_clauses.append(translation_match)
-
-            clauses.append(
-                "POTMsgSet.id IN (" + " UNION ".join(search_clauses) + ")")
-
-        return self._getOrderedPOTMsgSets(
-            [POTMsgSet, TranslationTemplateItem], ' AND '.join(clauses))
+                po_origin, po_options = self._getTranslationSearchBits(self)
+                origin.extend(po_origin)
+                search_options.extend(po_options)
+
+            # Case-insensitively substring-match the search term against
+            # each searchable column.
+            clauses.append(Or(*(
+                str.like('%%%s%%' % text, case_sensitive=False)
+                for str in search_options)))
+
+        return IMasterStore(POTMsgSet).using(*origin).find(
+            POTMsgSet, *clauses).config(distinct=True).order_by(
+            TranslationTemplateItem.sequence)
 
     def getFullLanguageCode(self):
         """See `IPOFile`."""


Follow ups