zeitgeist team mailing list archive
-
zeitgeist team
-
Mailing list archive
-
Message #01893
[Merge] lp:~thekorn/zeitgeist/fix-641198-glob-use-index into lp:zeitgeist
Markus Korn has proposed merging lp:~thekorn/zeitgeist/fix-641198-glob-use-index into lp:zeitgeist.
Requested reviews:
Zeitgeist Framework Team (zeitgeist)
Related bugs:
#641198 Prefix search is not using an index
https://bugs.launchpad.net/bugs/641198
This branch is speed optimizing the GLOB query we are using in the prefix search as described in
http://www.sqlite.org/optoverview.html `4.0 The LIKE optimization`.
The most tricky part of this implementation is the unicode support, and I've added some tests for corner cases to
test/engine-test.py ZeitgeistEngineTest.testWildcardOptimization
I think the way it's tested is ok, unless I'm missing sth., If so please point me in the right direction.
--
https://code.launchpad.net/~thekorn/zeitgeist/fix-641198-glob-use-index/+merge/36105
Your team Zeitgeist Framework Team is requested to review the proposed merge of lp:~thekorn/zeitgeist/fix-641198-glob-use-index into lp:zeitgeist.
=== modified file '_zeitgeist/engine/sql.py'
--- _zeitgeist/engine/sql.py 2010-09-20 07:29:32 +0000
+++ _zeitgeist/engine/sql.py 2010-09-21 07:21:03 +0000
@@ -451,6 +451,22 @@
# Use this when fetching values which are supposed to be in the
# database already. Eg., in find_eventids.
return super(TableLookup, self).__getitem__(name)
+
+def get_right_boundary(text):
+ """ returns the smallest string which is greater than `text` """
+ if not text:
+ # if the search prefix is empty we query for the whole range
+ # of 'utf-8 'unicode chars
+ return unichr(0x10ffff)
+ if isinstance(text, str):
+ # we need to make sure the text is decoded as 'utf-8' unicode
+ text = unicode(text, "UTF-8")
+ charpoint = ord(text[-1])
+ if charpoint == 0x10ffff:
+ # if the last character is the biggest possible char we need to
+ # look at the second last
+ return get_right_boundary(text[:-1])
+ return text[:-1] + unichr(charpoint+1)
class WhereClause:
"""
@@ -471,6 +487,25 @@
OR = " OR "
NOT = "NOT "
+ @staticmethod
+ def optimize_glob(column, table, prefix):
+ """returns an optimized version of the GLOB statement as described
+ in http://www.sqlite.org/optoverview.html `4.0 The LIKE optimization`
+ """
+ if isinstance(prefix, str):
+ # we need to make sure the text is decoded as 'utf-8' unicode
+ prefix = unicode(prefix, "UTF-8")
+ if not prefix:
+ # empty prefix means 'select all', no way to optimize this
+ sql = "SELECT %s FROM %s" %(column, table)
+ return sql, ()
+ elif all([i == unichr(0x10ffff) for i in prefix]):
+ sql = "SELECT %s FROM %s WHERE value >= ?" %(column, table)
+ return sql, (prefix,)
+ else:
+ sql = "SELECT %s FROM %s WHERE (value >= ? AND value < ?)" %(column, table)
+ return sql, (prefix, get_right_boundary(prefix))
+
def __init__(self, relation, negation=False):
self._conditions = []
self.arguments = []
@@ -498,9 +533,8 @@
view_column = "%s_id" %column
else:
view_column = column
- sql = "%s %sIN (SELECT id FROM %s WHERE value GLOB ?)" \
- %(view_column, self.NOT if negation else "", TABLE_MAP.get(column, column))
- value += "*"
+ optimized_glob, value = self.optimize_glob("id", TABLE_MAP.get(column, column), value)
+ sql = "%s %sIN (%s)" %(view_column, self.NOT if negation else "", optimized_glob)
else:
sql = "%s %s= ?" %(column, "!" if negation else "")
if cache is not None:
=== modified file 'test/engine-test.py'
--- test/engine-test.py 2010-08-31 20:48:33 +0000
+++ test/engine-test.py 2010-09-21 07:21:03 +0000
@@ -9,6 +9,7 @@
import _zeitgeist.engine
from _zeitgeist.engine import constants
from _zeitgeist.engine import get_engine
+from _zeitgeist.engine.sql import WhereClause
from zeitgeist.datamodel import *
from testutils import import_events
@@ -932,6 +933,72 @@
[template,], StorageState.Any, 10, ResultType.MostRecentEvents
)
self.assertEquals(5, len(ids))
+
+ def testWildcardOptimization(self):
+ cursor = self.engine._cursor
+ strings = [
+ (u"hällö, I'm gürmen - Ã¥ge drikker øl - â bug",),
+ (u"ä â åø",),
+ (u"h" + unichr(0x10ffff),),
+ (unichr(0x10ffff),),
+ ("",),
+ (unichr(0x10ffff) + unichr(0x10ffff) + "aa",),
+ ]
+
+ # does it work for ascii chars?
+ cursor.executemany("INSERT INTO uri(value) VALUES(?)", strings)
+ stm = WhereClause.optimize_glob("value", "uri", u"h")
+ self.assertEquals(
+ cursor.execute(*stm).fetchall(),
+ cursor.execute("SELECT value FROM uri WHERE value GLOB ?", ("h*",)).fetchall()
+ )
+ self.assertEquals(len(cursor.execute(*stm).fetchall()), 2)
+
+ # bunch of unicode in the prefix
+ stm = WhereClause.optimize_glob("value", "uri", u"ä â Ã¥")
+ self.assertEquals(
+ cursor.execute(*stm).fetchall(),
+ cursor.execute("SELECT value FROM uri WHERE value GLOB ?", (u"ä â Ã¥*",)).fetchall()
+ )
+ self.assertEquals(len(cursor.execute(*stm).fetchall()), 1)
+
+ # bunch of unicode in the prefix, prefix is not 'utf-8' decoded
+ stm = WhereClause.optimize_glob("value", "uri", "ä â Ã¥")
+ self.assertEquals(
+ cursor.execute(*stm).fetchall(),
+ cursor.execute("SELECT value FROM uri WHERE value GLOB ?", ("ä â Ã¥*",)).fetchall()
+ )
+ self.assertEquals(len(cursor.execute(*stm).fetchall()), 1)
+
+ # select all
+ stm = WhereClause.optimize_glob("value", "uri", "")
+ self.assertEquals(
+ cursor.execute(*stm).fetchall(),
+ cursor.execute("SELECT value FROM uri WHERE value GLOB ?", ("*",)).fetchall()
+ )
+ self.assertEquals(len(cursor.execute(*stm).fetchall()), len(strings))
+
+ # what if the biggest char is the last character of the search prefix?
+ prefix = u"h" + unichr(0x10ffff)
+ stm = WhereClause.optimize_glob("value", "uri", prefix)
+ self.assertEquals(
+ cursor.execute(*stm).fetchall(),
+ cursor.execute(
+ "SELECT value FROM uri WHERE value GLOB ?", (u"%s*" %prefix,)
+ ).fetchall()
+ )
+ self.assertEquals(len(cursor.execute(*stm).fetchall()), 1)
+
+ # what if the search prefix only contains the biggest char
+ prefix = unichr(0x10ffff) + unichr(0x10ffff)
+ stm = WhereClause.optimize_glob("value", "uri", prefix)
+ self.assertEquals(
+ cursor.execute(*stm).fetchall(),
+ cursor.execute(
+ "SELECT value FROM uri WHERE value GLOB ?", (u"%s*" %prefix,)
+ ).fetchall()
+ )
+ self.assertEquals(len(cursor.execute(*stm).fetchall()), 1)
if __name__ == "__main__":
unittest.main()
=== modified file 'test/sql-test.py'
--- test/sql-test.py 2010-09-19 10:42:30 +0000
+++ test/sql-test.py 2010-09-21 07:21:03 +0000
@@ -81,27 +81,27 @@
where = WhereClause(WhereClause.AND)
where.add_text_condition("actor", "bar", like=True)
self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
- "(actor IN (SELECT id FROM actor WHERE value GLOB bar*))")
+ "(actor IN (SELECT id FROM actor WHERE (value >= bar AND value < bas)))")
where = WhereClause(WhereClause.AND)
where.add_text_condition("subj_mimetype", "bar", like=True)
self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
- "(subj_mimetype IN (SELECT id FROM mimetype WHERE value GLOB bar*))")
+ "(subj_mimetype IN (SELECT id FROM mimetype WHERE (value >= bar AND value < bas)))")
where = WhereClause(WhereClause.AND)
where.add_text_condition("subj_uri", "bar", like=True)
self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
- "(subj_uri_id IN (SELECT id FROM uri WHERE value GLOB bar*))")
+ "(subj_uri_id IN (SELECT id FROM uri WHERE (value >= bar AND value < bas)))")
where = WhereClause(WhereClause.AND)
where.add_text_condition("subj_origin", "bar", like=True)
self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
- "(subj_origin_id IN (SELECT id FROM uri WHERE value GLOB bar*))")
+ "(subj_origin_id IN (SELECT id FROM uri WHERE (value >= bar AND value < bas)))")
where = WhereClause(WhereClause.AND)
where.add_text_condition("actor", "bar", like=True, negation=True)
self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
- "(actor NOT IN (SELECT id FROM actor WHERE value GLOB bar*))")
+ "(actor NOT IN (SELECT id FROM actor WHERE (value >= bar AND value < bas)))")
if __name__ == "__main__":