zeitgeist team mailing list archive

Thread
Date

[Merge] lp:~thekorn/zeitgeist/fix-641198-glob-use-index into lp:zeitgeist

To: mp+36105@xxxxxxxxxxxxxxxxxx
From: Markus Korn <thekorn@xxxxxx>
Date: Tue, 21 Sep 2010 07:21:06 -0000
Reply-to: mp+36105@xxxxxxxxxxxxxxxxxx
Sender: bounces@xxxxxxxxxxxxx

Markus Korn has proposed merging lp:~thekorn/zeitgeist/fix-641198-glob-use-index into lp:zeitgeist.

Requested reviews:
  Zeitgeist Framework Team (zeitgeist)
Related bugs:
  #641198 Prefix search is not using an index
  https://bugs.launchpad.net/bugs/641198


This branch is speed optimizing the GLOB query we are using in the prefix search as described in
  http://www.sqlite.org/optoverview.html `4.0 The LIKE optimization`.

The most tricky part of this implementation is the unicode support, and I've added some tests for corner cases to 
  test/engine-test.py ZeitgeistEngineTest.testWildcardOptimization

I think the way it's tested is ok, unless I'm missing sth., If so please point me in the right direction.
-- 
https://code.launchpad.net/~thekorn/zeitgeist/fix-641198-glob-use-index/+merge/36105
Your team Zeitgeist Framework Team is requested to review the proposed merge of lp:~thekorn/zeitgeist/fix-641198-glob-use-index into lp:zeitgeist.

=== modified file '_zeitgeist/engine/sql.py'
--- _zeitgeist/engine/sql.py	2010-09-20 07:29:32 +0000
+++ _zeitgeist/engine/sql.py	2010-09-21 07:21:03 +0000
@@ -451,6 +451,22 @@
 		# Use this when fetching values which are supposed to be in the
 		# database already. Eg., in find_eventids.
 		return super(TableLookup, self).__getitem__(name)
+		
+def get_right_boundary(text):
+	""" returns the smallest string which is greater than `text` """
+	if not text:
+		# if the search prefix is empty we query for the whole range
+		# of 'utf-8 'unicode chars
+		return unichr(0x10ffff)
+	if isinstance(text, str):
+		# we need to make sure the text is decoded as 'utf-8' unicode
+		text = unicode(text, "UTF-8")
+	charpoint = ord(text[-1])
+	if charpoint == 0x10ffff:
+		# if the last character is the biggest possible char we need to
+		# look at the second last
+		return get_right_boundary(text[:-1])
+	return text[:-1] + unichr(charpoint+1)
 
 class WhereClause:
 	"""
@@ -471,6 +487,25 @@
 	OR = " OR "
 	NOT = "NOT "
 	
+	@staticmethod
+	def optimize_glob(column, table, prefix):
+		"""returns an optimized version of the GLOB statement as described
+		in http://www.sqlite.org/optoverview.html `4.0 The LIKE optimization`
+		"""
+		if isinstance(prefix, str):
+			# we need to make sure the text is decoded as 'utf-8' unicode
+			prefix = unicode(prefix, "UTF-8")
+		if not prefix:
+			# empty prefix means 'select all', no way to optimize this
+			sql = "SELECT %s FROM %s" %(column, table)
+			return sql, ()
+		elif all([i == unichr(0x10ffff) for i in prefix]):
+			sql = "SELECT %s FROM %s WHERE value >= ?" %(column, table)
+			return sql, (prefix,)
+		else:
+			sql = "SELECT %s FROM %s WHERE (value >= ? AND value < ?)" %(column, table)
+			return sql, (prefix, get_right_boundary(prefix))
+	
 	def __init__(self, relation, negation=False):
 		self._conditions = []
 		self.arguments = []
@@ -498,9 +533,8 @@
 				view_column = "%s_id" %column
 			else:
 				view_column = column
-			sql = "%s %sIN (SELECT id FROM %s WHERE value GLOB ?)" \
-					%(view_column, self.NOT if negation else "", TABLE_MAP.get(column, column))
-			value += "*"
+			optimized_glob, value = self.optimize_glob("id", TABLE_MAP.get(column, column), value)
+			sql = "%s %sIN (%s)" %(view_column, self.NOT if negation else "", optimized_glob)
 		else:
 			sql = "%s %s= ?" %(column, "!" if negation else "")
 			if cache is not None:

=== modified file 'test/engine-test.py'
--- test/engine-test.py	2010-08-31 20:48:33 +0000
+++ test/engine-test.py	2010-09-21 07:21:03 +0000
@@ -9,6 +9,7 @@
 import _zeitgeist.engine
 from _zeitgeist.engine import constants
 from _zeitgeist.engine import get_engine
+from _zeitgeist.engine.sql import WhereClause
 from zeitgeist.datamodel import *
 from testutils import import_events
 
@@ -932,6 +933,72 @@
 			[template,], StorageState.Any, 10, ResultType.MostRecentEvents
 		)
 		self.assertEquals(5, len(ids))
+		
+	def testWildcardOptimization(self):
+		cursor = self.engine._cursor
+		strings = [
+			(u"hÃ¤llÃ¶, I'm gÃ¼rmen - Ã¥ge drikker Ã¸l - â˜  bug",),
+			(u"Ã¤ â˜  Ã¥Ã¸",),
+			(u"h" + unichr(0x10ffff),),
+			(unichr(0x10ffff),),
+			("",),
+			(unichr(0x10ffff) + unichr(0x10ffff) + "aa",),
+		]
+		
+		# does it work for ascii chars?
+		cursor.executemany("INSERT INTO uri(value) VALUES(?)", strings)
+		stm = WhereClause.optimize_glob("value", "uri", u"h")
+		self.assertEquals(
+			cursor.execute(*stm).fetchall(),
+			cursor.execute("SELECT value FROM uri WHERE value GLOB ?", ("h*",)).fetchall()
+		)
+		self.assertEquals(len(cursor.execute(*stm).fetchall()), 2)
+		
+		# bunch of unicode in the prefix
+		stm = WhereClause.optimize_glob("value", "uri", u"Ã¤ â˜  Ã¥")
+		self.assertEquals(
+			cursor.execute(*stm).fetchall(),
+			cursor.execute("SELECT value FROM uri WHERE value GLOB ?", (u"Ã¤ â˜  Ã¥*",)).fetchall()
+		)
+		self.assertEquals(len(cursor.execute(*stm).fetchall()), 1)
+		
+		# bunch of unicode in the prefix, prefix is not 'utf-8' decoded
+		stm = WhereClause.optimize_glob("value", "uri", "Ã¤ â˜  Ã¥")
+		self.assertEquals(
+			cursor.execute(*stm).fetchall(),
+			cursor.execute("SELECT value FROM uri WHERE value GLOB ?", ("Ã¤ â˜  Ã¥*",)).fetchall()
+		)
+		self.assertEquals(len(cursor.execute(*stm).fetchall()), 1)
+		
+		# select all
+		stm = WhereClause.optimize_glob("value", "uri", "")
+		self.assertEquals(
+			cursor.execute(*stm).fetchall(),
+			cursor.execute("SELECT value FROM uri WHERE value GLOB ?", ("*",)).fetchall()
+		)
+		self.assertEquals(len(cursor.execute(*stm).fetchall()), len(strings))
+		
+		# what if the biggest char is the last character of the search prefix?
+		prefix = u"h" + unichr(0x10ffff)
+		stm = WhereClause.optimize_glob("value", "uri", prefix)
+		self.assertEquals(
+			cursor.execute(*stm).fetchall(),
+			cursor.execute(
+				"SELECT value FROM uri WHERE value GLOB ?", (u"%s*" %prefix,)
+			).fetchall()
+		)
+		self.assertEquals(len(cursor.execute(*stm).fetchall()), 1)
+		
+		# what if the search prefix only contains the biggest char
+		prefix = unichr(0x10ffff) + unichr(0x10ffff)
+		stm = WhereClause.optimize_glob("value", "uri", prefix)
+		self.assertEquals(
+			cursor.execute(*stm).fetchall(),
+			cursor.execute(
+				"SELECT value FROM uri WHERE value GLOB ?", (u"%s*" %prefix,)
+			).fetchall()
+		)
+		self.assertEquals(len(cursor.execute(*stm).fetchall()), 1)
 
 if __name__ == "__main__":
 	unittest.main()

=== modified file 'test/sql-test.py'
--- test/sql-test.py	2010-09-19 10:42:30 +0000
+++ test/sql-test.py	2010-09-21 07:21:03 +0000
@@ -81,27 +81,27 @@
 		where = WhereClause(WhereClause.AND)
 		where.add_text_condition("actor", "bar", like=True)
 		self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
-			"(actor IN (SELECT id FROM actor WHERE value GLOB bar*))")
+			"(actor IN (SELECT id FROM actor WHERE (value >= bar AND value < bas)))")
 			
 		where = WhereClause(WhereClause.AND)
 		where.add_text_condition("subj_mimetype", "bar", like=True)
 		self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
-			"(subj_mimetype IN (SELECT id FROM mimetype WHERE value GLOB bar*))")
+			"(subj_mimetype IN (SELECT id FROM mimetype WHERE (value >= bar AND value < bas)))")
 			
 		where = WhereClause(WhereClause.AND)
 		where.add_text_condition("subj_uri", "bar", like=True)
 		self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
-			"(subj_uri_id IN (SELECT id FROM uri WHERE value GLOB bar*))")
+			"(subj_uri_id IN (SELECT id FROM uri WHERE (value >= bar AND value < bas)))")
 			
 		where = WhereClause(WhereClause.AND)
 		where.add_text_condition("subj_origin", "bar", like=True)
 		self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
-			"(subj_origin_id IN (SELECT id FROM uri WHERE value GLOB bar*))")
+			"(subj_origin_id IN (SELECT id FROM uri WHERE (value >= bar AND value < bas)))")
 			
 		where = WhereClause(WhereClause.AND)
 		where.add_text_condition("actor", "bar", like=True, negation=True)
 		self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
-			"(actor NOT IN (SELECT id FROM actor WHERE value GLOB bar*))")
+			"(actor NOT IN (SELECT id FROM actor WHERE (value >= bar AND value < bas)))")
 		
 
 if __name__ == "__main__":