← Back to team overview

zeitgeist team mailing list archive

[Merge] lp:~thekorn/zeitgeist/wildcard_support into lp:zeitgeist

 

Markus Korn has proposed merging lp:~thekorn/zeitgeist/wildcard_support into lp:zeitgeist.

Requested reviews:
  Mikkel Kamstrup Erlandsen (kamstrup)
  Zeitgeist Framework Team (zeitgeist)


This branch adds wildcard support to some template fields and completes the fix for bug 485966.
-- 
https://code.launchpad.net/~thekorn/zeitgeist/wildcard_support/+merge/25345
Your team Zeitgeist Framework Team is requested to review the proposed merge of lp:~thekorn/zeitgeist/wildcard_support into lp:zeitgeist.
=== modified file '_zeitgeist/engine/main.py'
--- _zeitgeist/engine/main.py	2010-05-14 11:54:52 +0000
+++ _zeitgeist/engine/main.py	2010-05-14 17:20:42 +0000
@@ -32,7 +32,7 @@
 from collections import defaultdict
 
 from zeitgeist.datamodel import Event as OrigEvent, StorageState, TimeRange, \
-	ResultType, get_timestamp_for_now, Interpretation, Symbol, NEGATION_OPERATOR
+	ResultType, get_timestamp_for_now, Interpretation, Symbol, NEGATION_OPERATOR, WILDCARD
 from _zeitgeist.engine.datamodel import Event, Subject
 from _zeitgeist.engine.extension import ExtensionsCollection, load_class
 from _zeitgeist.engine import constants
@@ -44,6 +44,12 @@
 logging.basicConfig(level=logging.DEBUG)
 log = logging.getLogger("zeitgeist.engine")
 
+class NegationNotSupported(ValueError):
+	pass
+
+class WildcardNotSupported(ValueError):
+	pass
+
 def parse_negation(kind, field, value, parse_negation=True):
 	"""checks if value starts with the negation operator,
 	if value starts with the negation operator but the field does
@@ -55,8 +61,42 @@
 		negation = True
 		value = value[len(NEGATION_OPERATOR):]
 	if negation and field not in kind.SUPPORTS_NEGATION:
-		raise ValueError("This field does not support negation")
+		raise NegationNotSupported("This field does not support negation")
 	return value, negation
+	
+def parse_wildcard(kind, field, value):
+	"""checks if value ends with a wildcard,
+	if value ends with a wildcard but the field does not support wildcards
+	a ValueError is raised.
+	This function returns a (value_without_wildcard, wildcard)-tuple
+	"""
+	wildcard = False
+	if value.endswith(WILDCARD):
+		wildcard = True
+		value = value[:-len(WILDCARD)]
+	if wildcard and field not in kind.SUPPORTS_WILDCARDS:
+		raise WildcardNotSupported("This field does not support wildcards")
+	return value, wildcard
+	
+def parse_operators(kind, field, value):
+	"""runs both (parse_negation and parse_wildcard) parser functions
+	on query values, and handles the special case of Subject.Text correctly.
+	returns a (value_without_negation_and_wildcard, negation, wildcard)-tuple
+	"""
+	try:
+		value, negation = parse_negation(kind, field, value)
+	except ValueError:
+		if kind is Subject and field == Subject.Text:
+			# we do not support negation of the text field,
+			# the text field starts with the NEGATION_OPERATOR
+			# so we handle this string as the content instead
+			# of an operator
+			negation = False
+		else:
+			raise
+	value, wildcard = parse_wildcard(kind, field, value)
+	return value, negation, wildcard
+
 
 class ZeitgeistEngine:
 	
@@ -186,58 +226,57 @@
 				subwhere.add("id = ?", event_template.id)
 			
 			try:
-				value, negation = parse_negation(Event, Event.Interpretation, event_template.interpretation)
+				value, negation, wildcard = parse_operators(Event, Event.Interpretation, event_template.interpretation)
 				# Expand event interpretation children
 				event_interp_where = WhereClause(WhereClause.OR, negation)
 				for child_interp in (Symbol.find_child_uris_extended(value)):
 					if child_interp:
-						event_interp_where.add("interpretation = ?",
-						                       self._interpretation[child_interp])
+						event_interp_where.add_text_condition("interpretation",
+						                       child_interp, like=wildcard, cache=self._interpretation)
 				if event_interp_where:
 					subwhere.extend(event_interp_where)
 				
-				value, negation = parse_negation(Event, Event.Manifestation, event_template.manifestation)
+				value, negation, wildcard = parse_operators(Event, Event.Manifestation, event_template.manifestation)
 				# Expand event manifestation children
 				event_manif_where = WhereClause(WhereClause.OR, negation)
 				for child_manif in (Symbol.find_child_uris_extended(value)):
 					if child_manif:
-						event_manif_where.add("manifestation = ?",
-						                      self._manifestation[child_manif])
+						event_manif_where.add_text_condition("manifestation",
+						                      child_manif, like=wildcard, cache=self._manifestation)
 				if event_manif_where:
 					subwhere.extend(event_manif_where)
 				
-				value, negation = parse_negation(Subject, Subject.Interpretation, subject_template.interpretation)
+				value, negation, wildcard = parse_operators(Subject, Subject.Interpretation, subject_template.interpretation)
 				# Expand subject interpretation children
 				su_interp_where = WhereClause(WhereClause.OR, negation)
 				for child_interp in (Symbol.find_child_uris_extended(value)):
 					if child_interp:
-						su_interp_where.add("subj_interpretation = ?",
-						                    self._interpretation[child_interp])
+						su_interp_where.add_text_condition("subj_interpretation",
+						                    child_interp, like=wildcard, cache=self._interpretation)
 				if su_interp_where:
 					subwhere.extend(su_interp_where)
 				
-				value, negation = parse_negation(Subject, Subject.Manifestation, subject_template.manifestation)
+				value, negation, wildcard = parse_operators(Subject, Subject.Manifestation, subject_template.manifestation)
 				# Expand subject manifestation children
 				su_manif_where = WhereClause(WhereClause.OR, negation)
 				for child_manif in (Symbol.find_child_uris_extended(value)):
 					if child_manif:
-						su_manif_where.add("subj_manifestation = ?",
-						                   self._manifestation[child_manif])
+						su_manif_where.add_text_condition("subj_manifestation",
+						                   child_manif, like=wildcard, cache=self._manifestation)
 				if su_manif_where:
 					subwhere.extend(su_manif_where)
 				
 				# FIXME: Expand mime children as well.
 				# Right now we only do exact matching for mimetypes
 				# thekorn: this will be fixed when wildcards are supported
-				value, negation = parse_negation(Subject, Subject.Mimetype, subject_template.mimetype)
+				value, negation, wildcard = parse_operators(Subject, Subject.Mimetype, subject_template.mimetype)
 				if value:
-					subwhere.add("subj_mimetype %s= ?" %(NEGATION_OPERATOR if negation else ""),
-					             self._mimetype[value])
+					subwhere.add_text_condition("subj_mimetype",
+					             value, wildcard, negation, cache=self._mimetype)
 				
-				value, negation = parse_negation(Event, Event.Actor, event_template.actor)
+				value, negation, wildcard = parse_operators(Event, Event.Actor, event_template.actor)
 				if value:
-					subwhere.add("actor %s= ?" %(NEGATION_OPERATOR if negation else ""),
-					             self._actor[value])
+					subwhere.add_text_condition("actor", value, wildcard, negation, cache=self._actor)
 			except KeyError, e:
 				# Value not in DB
 				log.debug("Unknown entity in query: %s" % e)
@@ -247,18 +286,8 @@
 			for key in ("uri", "origin", "text"):
 				value = getattr(subject_template, key)
 				if value:
-					try:
-						value, negation = parse_negation(Subject, getattr(Subject, key.title()), value)
-					except ValueError:
-						if key == "text":
-							# we do not support negation of the text field,
-							# the text field starts with the NEGATION_OPERATOR
-							# so we handle this string as the content instead
-							# of an operator
-							negation = False
-						else:
-							raise
-					subwhere.add("subj_%s %s= ?" %(key, NEGATION_OPERATOR if negation else ""), value)
+					value, negation, wildcard = parse_operators(Subject, getattr(Subject, key.title()), value)
+					subwhere.add_text_condition("subj_%s" %key, value, wildcard, negation)
 			where_or.extend(subwhere)
 		
 		return where_or

=== modified file '_zeitgeist/engine/sql.py'
--- _zeitgeist/engine/sql.py	2010-05-13 11:46:31 +0000
+++ _zeitgeist/engine/sql.py	2010-05-14 17:20:42 +0000
@@ -28,6 +28,12 @@
 logging.basicConfig(level=logging.DEBUG)
 log = logging.getLogger("zeitgeist.sql")
 
+TABLE_MAP = {
+	"subj_mimetype": "mimetype",
+	"subj_origin": "uri",
+	"subj_uri": "uri",
+}
+
 class UnicodeCursor(sqlite3.Cursor):
 	
 	@staticmethod
@@ -366,6 +372,31 @@
 			self.arguments.append(arguments)
 		else:
 			self.arguments.extend(arguments)
+			
+	def add_text_condition(self, column, value, like=False, negation=False, cache=None):
+		if like:
+			# thekorn: unfortunately the data in event_view is a bit inconsistent
+			# e.g.:
+			# subj_uri and subj_origin are presented as string-values
+			# actor and subj_mimetype are ids
+			# (LP: #580601)
+			if column in ("subj_uri", "subj_origin"):
+				value_type = "value"
+			elif column in ("actor", "subj_mimetype"):
+				value_type = "id"
+			else:
+				raise AssertionError("We don't know how to handle this type of data")
+			# thekorn: this is a first (unoptimized) version
+			# see http://www.sqlite.org/optoverview.html '4.0 The LIKE optimization'
+			# for how this will look in the future
+			sql = "%s %sIN (SELECT %s FROM %s WHERE value GLOB ?)" \
+					%(column, self.NOT if negation else "", value_type, TABLE_MAP.get(column, column))
+			value += "*"
+		else:
+			sql = "%s %s= ?" %(column, "!" if negation else "")
+			if cache is not None:
+				value = cache[value]
+		self.add(sql, value)
 	
 	def extend(self, where):
 		self.add(where.sql, where.arguments)

=== modified file 'test/datamodel-test.py'
--- test/datamodel-test.py	2010-05-14 11:54:52 +0000
+++ test/datamodel-test.py	2010-05-14 17:20:42 +0000
@@ -304,6 +304,42 @@
 		event = Event.new_for_values(timestamp=1000, subject_storage="sometext")
 		template = Event.new_for_values(subject_storage="xxxx")
 		self.assertRaises(ValueError, template.matches_event, event)
+		
+	def testWildcardTemplateMatching(self):
+		event = Event.new_for_values(actor="boo bar")
+		
+		template = Event.new_for_values(actor="boo*")
+		self.assertTrue(event.matches_template(template))
+		
+		# wildcards are not supported in interpretation,
+		# so they are handled as content
+		event = Event.new_for_values(interpretation="boo bar")
+		
+		template = Event.new_for_values(interpretation="boo*")
+		self.assertFalse(event.matches_template(template))
+		
+		event = Event.new_for_values(subject_uri="boo bar")
+		
+		template = Event.new_for_values(subject_uri="boo*")
+		self.assertTrue(event.matches_template(template))
+		
+		event = Event.new_for_values(subject_origin="boo bar")
+		
+		template = Event.new_for_values(subject_origin="boo*")
+		self.assertTrue(event.matches_template(template))
+		
+		event = Event.new_for_values(subject_mimetype="boo bar")
+		
+		template = Event.new_for_values(subject_mimetype="boo*")
+		self.assertTrue(event.matches_template(template))
+		
+	def testNegationWildcardTemplateMatching(self):
+		event = Event.new_for_values(actor="boo bar")
+		
+		template = Event.new_for_values(actor="!boo*")
+		self.assertFalse(event.matches_template(template))
+		template = Event.new_for_values(actor="!test*")
+		self.assertTrue(event.matches_template(template))
 
 
 class TimeRangeTest (unittest.TestCase):

=== modified file 'test/engine-test.py'
--- test/engine-test.py	2010-05-14 11:54:52 +0000
+++ test/engine-test.py	2010-05-14 17:20:42 +0000
@@ -763,6 +763,51 @@
 			TimeRange.always(), [template], StorageState.Any, 10,
 			ResultType.MostRecentEvents
 		)
+		
+	def testWildcard(self):
+		import_events("test/data/five_events.js", self.engine)
+
+		template = Event.new_for_values(
+			actor = "ge*"
+		)
+		ids = self.engine.find_eventids(TimeRange.always(),
+			[template,], StorageState.Any, 10, ResultType.MostRecentEvents
+		)
+		self.assertEquals(2, len(ids))
+		
+		template = Event.new_for_values(
+			actor = "!ge*"
+		)
+		ids = self.engine.find_eventids(TimeRange.always(),
+			[template,], StorageState.Any, 10, ResultType.MostRecentEvents
+		)
+		self.assertEquals(3, len(ids))
+		
+		template = Event.new_for_values(
+			subject_mimetype = "text/*"
+		)
+		ids = self.engine.find_eventids(TimeRange.always(),
+			[template,], StorageState.Any, 10, ResultType.MostRecentEvents
+		)
+		self.assertEquals(5, len(ids))
+		
+		template = Event.new_for_values(
+			subject_uri = "http://*"
+		)
+		
+		ids = self.engine.find_eventids(TimeRange.always(),
+			[template,], StorageState.Any, 10, ResultType.MostRecentEvents
+		)
+		self.assertEquals(1, len(ids))
+		
+		template = Event.new_for_values(
+			subject_origin = "file://*"
+		)
+		
+		ids = self.engine.find_eventids(TimeRange.always(),
+			[template,], StorageState.Any, 10, ResultType.MostRecentEvents
+		)
+		self.assertEquals(5, len(ids))
 
 if __name__ == "__main__":
 	unittest.main()

=== modified file 'test/test-sql.py'
--- test/test-sql.py	2010-05-13 13:12:12 +0000
+++ test/test-sql.py	2010-05-14 17:20:42 +0000
@@ -66,6 +66,27 @@
 		
 		self.assertEquals(where.sql % tuple(where.arguments),
 		                  "(foo = 10 AND NOT (subfoo = 68 OR subbar = 69) AND bar = 11)")
+		                  
+	def testAddTextCondition(self):
+		where = WhereClause(WhereClause.AND)
+		where.add_text_condition("boo", "bar")
+		self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
+			"(boo = bar)")
+			
+		where = WhereClause(WhereClause.AND)
+		where.add_text_condition("boo", "bar", negation=True)
+		self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
+			"(boo != bar)")
+			
+		where = WhereClause(WhereClause.AND)
+		where.add_text_condition("boo", "bar", like=True)
+		self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
+			"(boo IN (SELECT id FROM boo WHERE value GLOB bar*))")
+			
+		where = WhereClause(WhereClause.AND)
+		where.add_text_condition("boo", "bar", like=True, negation=True)
+		self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
+			"(boo NOT IN (SELECT id FROM boo WHERE value GLOB bar*))")
 		
 
 if __name__ == "__main__":

=== modified file 'zeitgeist/datamodel.py'
--- zeitgeist/datamodel.py	2010-05-14 11:54:52 +0000
+++ zeitgeist/datamodel.py	2010-05-14 17:20:42 +0000
@@ -40,10 +40,15 @@
 ]
 
 NEGATION_OPERATOR = "!"
+WILDCARD = "*"
 
 def EQUAL(x, y):
 	"""checks if both given arguments are equal"""
 	return x == y
+	
+def STARTSWITH(x, y):
+	"""checks if 'x' startswith 'y'"""
+	return x.startswith(y)
 
 # next() function is python >= 2.6
 try:
@@ -436,6 +441,7 @@
 		Storage) = range(7)
 		
 	SUPPORTS_NEGATION = (Uri, Interpretation, Manifestation, Origin, Mimetype)
+	SUPPORTS_WILDCARDS = (Uri, Origin, Mimetype)
 	
 	def __init__(self, data=None):
 		super(Subject, self).__init__([""]*len(Subject.Fields))
@@ -560,6 +566,10 @@
 		if field_id in self.SUPPORTS_NEGATION \
 				and expression.startswith(NEGATION_OPERATOR):
 			return not self._check_field_match(field_id, expression[len(NEGATION_OPERATOR):], comp)
+		elif field_id in self.SUPPORTS_WILDCARDS \
+				and expression.endswith(WILDCARD):
+			assert comp == EQUAL, "wildcards only work for pure text fields"
+			return self._check_field_match(field_id, expression[:-len(WILDCARD)], STARTSWITH)
 		else:
 			return comp(self[field_id], expression)
 
@@ -585,6 +595,7 @@
 		Actor) = range(5)
 		
 	SUPPORTS_NEGATION = (Interpretation, Manifestation, Actor)
+	SUPPORTS_WILDCARDS = (Actor,)
 	
 	def __init__(self, struct = None):
 		"""
@@ -833,6 +844,10 @@
 		if field_id in self.SUPPORTS_NEGATION \
 				and expression.startswith(NEGATION_OPERATOR):
 			return not self._check_field_match(field_id, expression[len(NEGATION_OPERATOR):], comp)
+		elif field_id in self.SUPPORTS_WILDCARDS \
+				and expression.endswith(WILDCARD):
+			assert comp == EQUAL, "wildcards only work for pure text fields"
+			return self._check_field_match(field_id, expression[:-len(WILDCARD)], STARTSWITH)
 		else:
 			return comp(self[0][field_id], expression)
 	


Follow ups