← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~lgp171188/launchpad:fix-textsearch-failures-postgres-14 into launchpad:master

 

Guruprasad has proposed merging ~lgp171188/launchpad:fix-textsearch-failures-postgres-14 into launchpad:master.

Commit message:
Fix full text search test failures with Postgres 14

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~lgp171188/launchpad/+git/launchpad/+merge/480018
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~lgp171188/launchpad:fix-textsearch-failures-postgres-14 into launchpad:master.
diff --git a/lib/lp/services/database/doc/textsearching.rst b/lib/lp/services/database/doc/textsearching.rst
index c374934..56ccbf1 100644
--- a/lib/lp/services/database/doc/textsearching.rst
+++ b/lib/lp/services/database/doc/textsearching.rst
@@ -249,19 +249,26 @@ Repeated '-' are simply ignored by to_tsquery().
     >>> ftq("---foo--- ---bar---")
     ---foo---&---bar--- <=> 'foo' & 'bar'
 
+
+XXX 2025-01-23 lgp171188: Many of the following doctests use '...' in place
+of query operators such as '&', '<->', and '<2>', because the operator that
+is emitted depends on the PostgreSQL version and a doctest cannot easily
+vary its expected output by version. The exact operators are checked in
+the unit tests in lp.services.database.tests.test_text_searching instead.
+
 Hyphens surrounded by two words are retained. This reflects the way
 how to_tsquery() and to_tsvector() handle such strings.
 
     >>> print(search_same("foo-bar"))
     FTI data: 'bar':3 'foo':2 'foo-bar':1
-    query: 'foo-bar' & 'foo' & 'bar'
+    query: 'foo-bar' ... 'foo' ... 'bar'
     match: True
 
 A '-' surrounded by numbers is treated as the sign of the right-hand number.
 
     >>> print(search_same("123-456"))
     FTI data: '-456':2 '123':1
-    query: '123' & '-456'
+    query: '123' ... '-456'
     match: True
 
 Punctuation is handled consistently. If a string containing punctuation
@@ -272,31 +279,31 @@ string finds the indexed text.
     >>> for symbol in punctuation:
     ...     print(repr(symbol), search_same("foo%sbar" % symbol))
     ...
-    "'" FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '"' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '#' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '$' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '%' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '*' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '+' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    ',' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
+    "'" FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '"' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '#' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '$' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '%' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '*' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '+' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    ',' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
     '.' FTI data: 'foo.bar':1 query: 'foo.bar' match: True
     '/' FTI data: 'foo/bar':1 query: 'foo/bar' match: True
-    ':' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    ';' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '<' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '=' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '>' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '?' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '@' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '[' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '\\' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    ']' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '^' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '`' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '{' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '}' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
-    '~' FTI data: 'foo':1 '~bar':2 query: 'foo' & '~bar' match: True
+    ':' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    ';' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '<' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '=' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '>' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '?' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '@' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '[' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '\\' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    ']' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '^' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '`' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '{' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '}' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+    '~' FTI data: 'foo':1 '~bar':2 query: 'foo' ... '~bar' match: True
 
     >>> for symbol in punctuation:
     ...     print(
@@ -399,14 +406,14 @@ Bug #44913 - Unicode characters in the wrong place.
 
     >>> print(search_same("abc-a\N{LATIN SMALL LETTER C WITH CEDILLA}"))
     FTI data: 'abc':2 'abc-aç':1 'aç':3
-    query: 'abc-aç' & 'abc' & 'aç'
+    query: 'abc-aç' ... 'abc' ... 'aç'
     match: True
 
 Cut & Paste of 'Smart' quotes. Note that the quotation mark is retained
 in the FTI.
 
     >>> print(search_same("a-a\N{RIGHT DOUBLE QUOTATION MARK}"))
-    FTI data: 'a-a”':1 'a”':3 query: 'a-a”' & 'a”' match: True
+    FTI data: 'a-a”':1 'a”':3 query: 'a-a”' ... 'a”' match: True
 
     >>> print(
     ...     search_same(
@@ -414,7 +421,7 @@ in the FTI.
     ...         "\N{RIGHT SINGLE QUOTATION MARK}"
     ...     )
     ... )
-    FTI data: 'a’':2 '‘a':1 query: '‘a' & 'a’' match: True
+    FTI data: 'a’':2 '‘a':1 query: '‘a' ... 'a’' match: True
 
 
 Bug #44913 - Nothing but stopwords in a query needing repair
@@ -543,7 +550,7 @@ or invalid leading operators
     Bug #160236
 
     >>> ftq("foo AND AND bar-baz")
-    foo&bar-baz <=> 'foo' & 'bar-baz' & 'bar' & 'baz'
+    foo&bar-baz <=> 'foo' ... 'bar-baz' ... 'bar' ... 'baz'
 
     >>> ftq("foo OR OR bar.baz")
     foo|bar.baz <=> 'foo' | 'bar.baz'
diff --git a/lib/lp/services/database/tests/test_text_searching.py b/lib/lp/services/database/tests/test_text_searching.py
new file mode 100644
index 0000000..b375deb
--- /dev/null
+++ b/lib/lp/services/database/tests/test_text_searching.py
@@ -0,0 +1,159 @@
+# Copyright 2025 Canonical Ltd.  This software is licensed under the
+# GNU Affero General Public License version 3 (see the file LICENSE).
+
+"""Test text searching functionality."""
+
+from testtools.matchers import Equals, MatchesAny
+from zope.component import getUtility
+
+from lp.services.database.interfaces import (
+    DEFAULT_FLAVOR,
+    MAIN_STORE,
+    IStoreSelector,
+)
+from lp.services.helpers import backslashreplace
+from lp.testing import TestCaseWithFactory
+from lp.testing.layers import DatabaseFunctionalLayer
+
+
+def get_store():
+    return getUtility(IStoreSelector).get(MAIN_STORE, DEFAULT_FLAVOR)
+
+
+def ftq(query):
+    store = get_store()
+    try:
+        result = store.execute("SELECT _ftq(%s), ftq(%s)", (query, query))
+        uncompiled, compiled = result.get_one()
+    except Exception:
+        store.rollback()
+        raise
+    if uncompiled is not None:
+        uncompiled = backslashreplace(uncompiled)
+        uncompiled = uncompiled.replace(" ", "")
+    if compiled is not None:
+        compiled = backslashreplace(compiled)
+    result = "%s <=> %s" % (uncompiled, compiled)
+    return result
+
+
+def search(text_to_search, search_phrase):
+    store = get_store()
+    result = store.execute("SELECT to_tsvector(%s)", (text_to_search,))
+    ts_vector = result.get_all()[0][0]
+    result = store.execute("SELECT ftq(%s)", (search_phrase,))
+    ts_query = result.get_all()[0][0]
+    result = store.execute(
+        "SELECT to_tsvector(%s) @@ ftq(%s)",
+        (text_to_search, search_phrase),
+    )
+    match = result.get_all()[0][0]
+    return "FTI data: %s query: %s match: %s" % (
+        ts_vector,
+        ts_query,
+        str(match),
+    )
+
+
+def search_same(text):
+    return search(text, text)
+
+
+class TestTextSearchingFTI(TestCaseWithFactory):
+    layer = DatabaseFunctionalLayer
+
+    def assert_result_matches(self, result, expected, placeholders_list):
+        matchers = [
+            Equals(expected.format(*placeholders))
+            for placeholders in placeholders_list
+        ]
+        self.assertThat(
+            result,
+            MatchesAny(
+                *matchers,
+            ),
+        )
+
+    def test_hyphens_surrounded_by_two_words_retained(self):
+        # Hyphens surrounded by two words are retained. This reflects how
+        # to_tsquery() and to_tsvector() handle such strings.
+        result = search_same("foo-bar")
+        expected = (
+            "FTI data: 'bar':3 'foo':2 'foo-bar':1 query: "
+            "'foo-bar' {} 'foo' {} 'bar' match: True"
+        )
+        self.assert_result_matches(result, expected, (["&"] * 3, ["<->"] * 3))
+
+    def test_hyphen_surrounded_by_numbers_sign_of_right_number(self):
+        # A '-' surrounded by numbers is treated as the sign of the
+        # right-hand number.
+        result = search_same("123-456")
+        expected = (
+            "FTI data: '-456':2 '123':1 query: '123' {} '-456' match: True"
+        )
+        self.assert_result_matches(result, expected, (["&"], ["<->"]))
+
+    def test_consistent_handling_of_punctuation(self):
+        # Punctuation is handled consistently. If a string containing
+        # punctuation appears in an FTI, it can also be passed to ftq(),
+        # and a search for this string finds the indexed text.
+        result = search_same("foo'bar")
+        expected = (
+            "FTI data: 'bar':2 'foo':1 query: 'foo' {} 'bar' match: True"
+        )
+        placeholders = (["&"], ["<->"])
+        punctuations = "'\"#$%*+,:;<=>?@[\\]^`{}`"
+        for symbol in punctuations:
+            result = search_same(f"foo{symbol}bar")
+            self.assert_result_matches(
+                result,
+                expected,
+                placeholders,
+            )
+        result = search_same("foo.bar")
+        expected = "FTI data: 'foo.bar':1 query: 'foo.bar' match: True"
+        self.assert_result_matches(
+            result,
+            expected,
+            ([], []),
+        )
+
+    def test_unicode_characters_in_the_wrong_place(self):
+        # Bug #44913 - Unicode characters in the wrong place.
+        result = search_same("abc-a\N{LATIN SMALL LETTER C WITH CEDILLA}")
+        expected = (
+            "FTI data: 'abc':2 'abc-aç':1 'aç':3 query: 'abc-aç' {} 'abc' "
+            "{} 'aç' match: True"
+        )
+        self.assert_result_matches(
+            result,
+            expected,
+            (["&"] * 2, ["<->"] * 2),
+        )
+
+    def test_cut_and_past_of_smart_quotes(self):
+        # Cut & Paste of 'Smart' quotes. Note that the quotation mark is
+        # retained in the FTI.
+        result = search_same("a-a\N{RIGHT DOUBLE QUOTATION MARK}")
+        expected = (
+            "FTI data: 'a-a”':1 'a”':3 query: 'a-a”' {} 'a”' match: True"
+        )
+        self.assert_result_matches(
+            result,
+            expected,
+            (["&"], ["<2>"]),
+        )
+        result = search_same(
+            "\N{LEFT SINGLE QUOTATION MARK}a.a"
+            "\N{RIGHT SINGLE QUOTATION MARK}"
+        )
+        expected = "FTI data: 'a’':2 '‘a':1 query: '‘a' {} 'a’' match: True"
+        self.assert_result_matches(result, expected, (["&"], ["<->"]))
+
+    def test_bug_160236_ftq(self):
+        # Filing a bug with summary "a&& a-a" OOPSes with an SQL syntax error.
+        result = ftq("foo AND AND bar-baz")
+        expected = "foo&bar-baz <=> 'foo' {} 'bar-baz' {} 'bar' {} 'baz'"
+        self.assert_result_matches(
+            result, expected, (["&"] * 3, ["&", "<->", "<->"])
+        )

Follow ups