launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #32136
[Merge] ~lgp171188/launchpad:fix-textsearch-failures-postgres-14 into launchpad:master
Guruprasad has proposed merging ~lgp171188/launchpad:fix-textsearch-failures-postgres-14 into launchpad:master.
Commit message:
Fix full text search test failures with Postgres 14
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~lgp171188/launchpad/+git/launchpad/+merge/480018
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~lgp171188/launchpad:fix-textsearch-failures-postgres-14 into launchpad:master.
diff --git a/lib/lp/services/database/doc/textsearching.rst b/lib/lp/services/database/doc/textsearching.rst
index c374934..56ccbf1 100644
--- a/lib/lp/services/database/doc/textsearching.rst
+++ b/lib/lp/services/database/doc/textsearching.rst
@@ -249,19 +249,26 @@ Repeated '-' are simply ignored by to_tsquery().
>>> ftq("---foo--- ---bar---")
---foo---&---bar--- <=> 'foo' & 'bar'
+
+XXX 2025-01-23 lgp171188: The following doctests use '...' placeholders in
+place of query operators such as '&', '<->', and '<2>', because it is not
+straightforward for a doctest to expect different output depending on the
+PostgreSQL version. The exact operators are checked instead by the unit
+tests in the lp.services.database.tests.test_text_searching module.
+
Hyphens surrounded by two words are retained. This reflects the way
how to_tsquery() and to_tsvector() handle such strings.
>>> print(search_same("foo-bar"))
FTI data: 'bar':3 'foo':2 'foo-bar':1
- query: 'foo-bar' & 'foo' & 'bar'
+ query: 'foo-bar' ... 'foo' ... 'bar'
match: True
A '-' surrounded by numbers is treated as the sign of the right-hand number.
>>> print(search_same("123-456"))
FTI data: '-456':2 '123':1
- query: '123' & '-456'
+ query: '123' ... '-456'
match: True
Punctuation is handled consistently. If a string containing punctuation
@@ -272,31 +279,31 @@ string finds the indexed text.
>>> for symbol in punctuation:
... print(repr(symbol), search_same("foo%sbar" % symbol))
...
- "'" FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '"' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '#' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '$' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '%' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '*' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '+' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- ',' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
+ "'" FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '"' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '#' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '$' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '%' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '*' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '+' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ ',' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
'.' FTI data: 'foo.bar':1 query: 'foo.bar' match: True
'/' FTI data: 'foo/bar':1 query: 'foo/bar' match: True
- ':' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- ';' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '<' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '=' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '>' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '?' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '@' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '[' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '\\' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- ']' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '^' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '`' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '{' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '}' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
- '~' FTI data: 'foo':1 '~bar':2 query: 'foo' & '~bar' match: True
+ ':' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ ';' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '<' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '=' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '>' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '?' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '@' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '[' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '\\' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ ']' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '^' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '`' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '{' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '}' FTI data: 'bar':2 'foo':1 query: 'foo' ... 'bar' match: True
+ '~' FTI data: 'foo':1 '~bar':2 query: 'foo' ... '~bar' match: True
>>> for symbol in punctuation:
... print(
@@ -399,14 +406,14 @@ Bug #44913 - Unicode characters in the wrong place.
>>> print(search_same("abc-a\N{LATIN SMALL LETTER C WITH CEDILLA}"))
FTI data: 'abc':2 'abc-aç':1 'aç':3
- query: 'abc-aç' & 'abc' & 'aç'
+ query: 'abc-aç' ... 'abc' ... 'aç'
match: True
Cut & Paste of 'Smart' quotes. Note that the quotation mark is retained
in the FTI.
>>> print(search_same("a-a\N{RIGHT DOUBLE QUOTATION MARK}"))
- FTI data: 'a-a”':1 'a”':3 query: 'a-a”' & 'a”' match: True
+ FTI data: 'a-a”':1 'a”':3 query: 'a-a”' ... 'a”' match: True
>>> print(
... search_same(
@@ -414,7 +421,7 @@ in the FTI.
... "\N{RIGHT SINGLE QUOTATION MARK}"
... )
... )
- FTI data: 'a’':2 '‘a':1 query: '‘a' & 'a’' match: True
+ FTI data: 'a’':2 '‘a':1 query: '‘a' ... 'a’' match: True
Bug #44913 - Nothing but stopwords in a query needing repair
@@ -543,7 +550,7 @@ or invalid leading operators
Bug #160236
>>> ftq("foo AND AND bar-baz")
- foo&bar-baz <=> 'foo' & 'bar-baz' & 'bar' & 'baz'
+ foo&bar-baz <=> 'foo' ... 'bar-baz' ... 'bar' ... 'baz'
>>> ftq("foo OR OR bar.baz")
foo|bar.baz <=> 'foo' | 'bar.baz'
diff --git a/lib/lp/services/database/tests/test_text_searching.py b/lib/lp/services/database/tests/test_text_searching.py
new file mode 100644
index 0000000..b375deb
--- /dev/null
+++ b/lib/lp/services/database/tests/test_text_searching.py
@@ -0,0 +1,159 @@
+# Copyright 2025 Canonical Ltd. This software is licensed under the
+# GNU Affero General Public License version 3 (see the file LICENSE).
+
+"""Test text searching functionality."""
+
+from testtools.matchers import Equals, MatchesAny
+from zope.component import getUtility
+
+from lp.services.database.interfaces import (
+ DEFAULT_FLAVOR,
+ MAIN_STORE,
+ IStoreSelector,
+)
+from lp.services.helpers import backslashreplace
+from lp.testing import TestCaseWithFactory
+from lp.testing.layers import DatabaseFunctionalLayer
+
+
+def get_store():
+ return getUtility(IStoreSelector).get(MAIN_STORE, DEFAULT_FLAVOR)
+
+
+def ftq(query):
+ store = get_store()
+ try:
+ result = store.execute("SELECT _ftq(%s), ftq(%s)", (query, query))
+ uncompiled, compiled = result.get_one()
+ except Exception:
+ store.rollback()
+ raise
+ if uncompiled is not None:
+ uncompiled = backslashreplace(uncompiled)
+ uncompiled = uncompiled.replace(" ", "")
+ if compiled is not None:
+ compiled = backslashreplace(compiled)
+ result = "%s <=> %s" % (uncompiled, compiled)
+ return result
+
+
+def search(text_to_search, search_phrase):
+ store = get_store()
+ result = store.execute("SELECT to_tsvector(%s)", (text_to_search,))
+ ts_vector = result.get_all()[0][0]
+ result = store.execute("SELECT ftq(%s)", (search_phrase,))
+ ts_query = result.get_all()[0][0]
+ result = store.execute(
+ "SELECT to_tsvector(%s) @@ ftq(%s)",
+ (text_to_search, search_phrase),
+ )
+ match = result.get_all()[0][0]
+ return "FTI data: %s query: %s match: %s" % (
+ ts_vector,
+ ts_query,
+ str(match),
+ )
+
+
+def search_same(text):
+ return search(text, text)
+
+
+class TestTextSearchingFTI(TestCaseWithFactory):
+ layer = DatabaseFunctionalLayer
+
+ def assert_result_matches(self, result, expected, placeholders_list):
+ matchers = [
+ Equals(expected.format(*placeholders))
+ for placeholders in placeholders_list
+ ]
+ self.assertThat(
+ result,
+ MatchesAny(
+ *matchers,
+ ),
+ )
+
+ def test_hyphens_surrounded_by_two_words_retained(self):
+ # Hyphens surrounded by two words are retained. This reflects how
+ # to_tsquery() and to_tsvector() handle such strings.
+ result = search_same("foo-bar")
+ expected = (
+ "FTI data: 'bar':3 'foo':2 'foo-bar':1 query: "
+ "'foo-bar' {} 'foo' {} 'bar' match: True"
+ )
+ self.assert_result_matches(result, expected, (["&"] * 3, ["<->"] * 3))
+
+ def test_hyphen_surrounded_by_numbers_sign_of_right_number(self):
+ # A '-' surrounded by numbers is treated as the sign of the
+ # right-hand number.
+ result = search_same("123-456")
+ expected = (
+ "FTI data: '-456':2 '123':1 query: '123' {} '-456' match: True"
+ )
+ self.assert_result_matches(result, expected, (["&"], ["<->"]))
+
+ def test_consistent_handling_of_punctuation(self):
+ # Punctuation is handled consistently. If a string containing
+ # punctuation appears in an FTI, it can also be passed to ftq(),
+ # and a search for this string finds the indexed text.
+ result = search_same("foo'bar")
+ expected = (
+ "FTI data: 'bar':2 'foo':1 query: 'foo' {} 'bar' match: True"
+ )
+ placeholders = (["&"], ["<->"])
+ punctuations = "'\"#$%*+,:;<=>?@[\\]^`{}`"
+ for symbol in punctuations:
+ result = search_same(f"foo{symbol}bar")
+ self.assert_result_matches(
+ result,
+ expected,
+ placeholders,
+ )
+ result = search_same("foo.bar")
+ expected = "FTI data: 'foo.bar':1 query: 'foo.bar' match: True"
+ self.assert_result_matches(
+ result,
+ expected,
+ ([], []),
+ )
+
+ def test_unicode_characters_in_the_wrong_place(self):
+ # Bug #44913 - Unicode characters in the wrong place.
+ result = search_same("abc-a\N{LATIN SMALL LETTER C WITH CEDILLA}")
+ expected = (
+ "FTI data: 'abc':2 'abc-aç':1 'aç':3 query: 'abc-aç' {} 'abc' "
+ "{} 'aç' match: True"
+ )
+ self.assert_result_matches(
+ result,
+ expected,
+ (["&"] * 2, ["<->"] * 2),
+ )
+
+ def test_cut_and_past_of_smart_quotes(self):
+ # Cut & Paste of 'Smart' quotes. Note that the quotation mark is
+ # retained in the FTI.
+ result = search_same("a-a\N{RIGHT DOUBLE QUOTATION MARK}")
+ expected = (
+ "FTI data: 'a-a”':1 'a”':3 query: 'a-a”' {} 'a”' match: True"
+ )
+ self.assert_result_matches(
+ result,
+ expected,
+ (["&"], ["<2>"]),
+ )
+ result = search_same(
+ "\N{LEFT SINGLE QUOTATION MARK}a.a"
+ "\N{RIGHT SINGLE QUOTATION MARK}"
+ )
+ expected = "FTI data: 'a’':2 '‘a':1 query: '‘a' {} 'a’' match: True"
+ self.assert_result_matches(result, expected, (["&"], ["<->"]))
+
+ def test_bug_160236_ftq(self):
+ # Filing a bug with summary "a&& a-a" OOPSes with an SQL syntax error.
+ result = ftq("foo AND AND bar-baz")
+ expected = "foo&bar-baz <=> 'foo' {} 'bar-baz' {} 'bar' {} 'baz'"
+ self.assert_result_matches(
+ result, expected, (["&"] * 3, ["&", "<->", "<->"])
+ )
Follow ups