← Back to team overview

zeitgeist team mailing list archive

[Merge] lp:~kamstrup/zeitgeist/query-expansion into lp:zeitgeist

 

Mikkel Kamstrup Erlandsen has proposed merging lp:~kamstrup/zeitgeist/query-expansion into lp:zeitgeist.

Requested reviews:
  Zeitgeist Framework Team (zeitgeist)


Huzzah! Smackeroo! I have query expansion fully working now, with all unit tests passing, both on the SQL level and on our template matching level.

So what does "query expansion" mean? Consider a query for subjects with the interpretation nfo:Media. Right now that would only match stuff that has been explicitly identified as nfo:Media (which is not much, since we can usually identify whether stuff is Audio, Image, or Video data).

With query expansion we'll also match any children of nfo:Media, i.e. also nfo:Image, nfo:Audio, and nfo:Video, as well as recursively matching children of these, like nfo:RasterImage and nfo:VectorImage.

The way it's implemented is really simple. We simply expand the tree of children and compile a big OR query with everything.
-- 
https://code.launchpad.net/~kamstrup/zeitgeist/query-expansion/+merge/25000
Your team Zeitgeist Framework Team is requested to review the proposed merge of lp:~kamstrup/zeitgeist/query-expansion into lp:zeitgeist.
=== modified file '_zeitgeist/engine/main.py'
--- _zeitgeist/engine/main.py	2010-05-01 22:18:55 +0000
+++ _zeitgeist/engine/main.py	2010-05-10 14:47:20 +0000
@@ -32,7 +32,7 @@
 from collections import defaultdict
 
 from zeitgeist.datamodel import Event as OrigEvent, StorageState, TimeRange, \
-	ResultType, get_timestamp_for_now, Interpretation
+	ResultType, get_timestamp_for_now, Interpretation, Symbol
 from _zeitgeist.engine.datamodel import Event, Subject	
 from _zeitgeist.engine.extension import ExtensionsCollection, load_class
 from _zeitgeist.engine import constants
@@ -163,16 +163,51 @@
 		for (event_template, subject_template) in self._build_templates(templates):
 			subwhere = WhereClause(WhereClause.AND)
 			try:
-				for key in ("interpretation", "manifestation", "actor"):
-					value = getattr(event_template, key)
-					if value:
-						subwhere.add("%s = ?" % key,
-							getattr(self, "_" + key).id(value))
-				for key in ("interpretation", "manifestation", "mimetype"):
-					value = getattr(subject_template, key)
-					if value:
-						subwhere.add("subj_%s = ?" % key,
-							getattr(self, "_" + key).id(value))
+				# Expand event interpretation children
+				event_interp_where = WhereClause(WhereClause.OR)
+				for child_interp in (Symbol.find_child_uris_extended(event_template.interpretation)):
+					if child_interp:
+						event_interp_where.add("interpretation = ?",
+						                       self._interpretation.id(child_interp))
+				if event_interp_where:
+					subwhere.extend(event_interp_where)
+				
+				# Expand event manifestation children
+				event_manif_where = WhereClause(WhereClause.OR)
+				for child_manif in (Symbol.find_child_uris_extended(event_template.manifestation)):
+					if child_manif:
+						event_manif_where.add("manifestation = ?",
+						                      self._manifestation.id(child_manif))
+				if event_manif_where:
+					subwhere.extend(event_manif_where)
+				
+				# Expand subject interpretation children
+				su_interp_where = WhereClause(WhereClause.OR)
+				for child_interp in (Symbol.find_child_uris_extended(subject_template.interpretation)):
+					if child_interp:
+						su_interp_where.add("subj_interpretation = ?",
+						                    self._interpretation.id(child_interp))
+				if su_interp_where:
+					subwhere.extend(su_interp_where)
+				
+				# Expand subject manifestation children
+				su_manif_where = WhereClause(WhereClause.OR)
+				for child_manif in (Symbol.find_child_uris_extended(subject_template.manifestation)):
+					if child_manif:
+						su_manif_where.add("subj_manifestation = ?",
+						                   self._manifestation.id(child_manif))
+				if su_manif_where:
+					subwhere.extend(su_manif_where)
+				
+				# FIXME: Expand mime children as well.
+				# Right now we only do exact matching for mimetypes
+				if subject_template.mimetype:
+					subwhere.add("subj_mimetype = ?",
+					             self._mimetype.id(subject_tempalte.mimetype))
+				
+				if event_template.actor:
+					subwhere.add("actor = ?",
+					             self._actor.id(event_template.actor))
 			except KeyError:
 				# Value not in DB
 				where_or.register_no_result()
@@ -183,6 +218,7 @@
 					subwhere.add("subj_%s = ?" % key, value)
 			where_or.extend(subwhere)
 		
+		print "SQL:  ", where_or.sql, where_or.arguments
 		return where_or
 	
 	def _build_sql_event_filter(self, time_range, templates, storage_state):

=== modified file 'test/datamodel-test.py'
--- test/datamodel-test.py	2010-04-26 19:42:07 +0000
+++ test/datamodel-test.py	2010-05-10 14:47:20 +0000
@@ -51,6 +51,47 @@
 		self.assertTrue(f.display_name != None)
 		self.assertTrue(f.doc != None)
 
+class RelationshipTest (unittest.TestCase):
+	"""
+	Tests for parent/child relationships in the loaded ontologies
+	"""
+	
+	def testDirectParents (self):
+		"""
+		Tests relationship tracking for immediate parents
+		"""
+		self.assertTrue(Interpretation.AUDIO.is_a(Interpretation.MEDIA))
+	
+	def testSecondLevelParents (self):
+		"""
+		Tests relationship tracking for second level parents
+		"""
+		self.assertTrue(Interpretation.VECTOR_IMAGE.is_a(Interpretation.MEDIA))
+		self.assertTrue(Interpretation.VECTOR_IMAGE.is_a(Interpretation.IMAGE))
+	
+	def testRootParents (self):
+		"""
+		Tests relationship tracking for root nodes, ie Interpretation
+		and Manifestation
+		"""
+		self.assertTrue(Interpretation.VECTOR_IMAGE.is_a(Interpretation))
+		self.assertTrue(Manifestation.FILE_DATA_OBJECT.is_a(Manifestation))
+		self.assertTrue(Manifestation.USER_ACTIVITY.is_a(Manifestation))
+	
+	def testReflecsive (self):
+		"""
+		Assert that a symbol is a child of itself
+		"""
+		self.assertTrue(Manifestation.USER_ACTIVITY.is_a(Manifestation.USER_ACTIVITY))
+	
+	def testFindExtendedChildren (self):
+		self.assertEquals(["foo://bar"], Symbol.find_child_uris_extended("foo://bar"))
+		self.assertEquals(["http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Icon";,
+		                   "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#VectorImage";,
+		                   "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Cursor";,
+		                   "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#RasterImage";,
+		                   "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Image";],
+		                  Symbol.find_child_uris_extended("http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Image";))
 
 class EventTest (unittest.TestCase):
 	def setUp(self):
@@ -116,6 +157,17 @@
 		e.manifestation="ILLEGAL SNAFU"
 		self.assertFalse(e.matches_template(template))
 	
+	def testTemplateParentMatching(self):
+		template = Event.new_for_values(
+					manifestation=Manifestation.EVENT_MANIFESTATION,
+					subject_interpretation=Interpretation)
+
+		e = Event.new_for_values(
+					manifestation=Manifestation.USER_ACTIVITY,
+					subject_interpretation=Interpretation.TEXT_DOCUMENT,
+					subject_text="Foo")
+		self.assertTrue(e.matches_template(template))		
+	
 	def testTemplateFiltering(self):
 		template = Event.new_for_values(interpretation="stfu:OpenEvent")
 		events = parse_events("test/data/five_events.js")

=== added file 'test/test-sql.py'
--- test/test-sql.py	1970-01-01 00:00:00 +0000
+++ test/test-sql.py	2010-05-10 14:47:20 +0000
@@ -0,0 +1,51 @@
+#! /usr/bin/python
+# -.- coding: utf-8 -.-
+
+# Zeitgeist
+#
+# Copyright © 2010 Mikkel Kamstrup Erlandsen <mikkel.kamstrup@xxxxxxxxx>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import sys, os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import unittest
+from _zeitgeist.engine.sql import *
+
+class SQLTest (unittest.TestCase):
+	
+	def testFlat (self):
+		where = WhereClause(WhereClause.AND)
+		where.add ("foo = %s", 10)
+		where.add ("bar = %s", 27)
+		self.assertEquals(where.sql % tuple(where.arguments),
+		                  "(foo = 10 AND bar = 27)")
+	
+	def testNested (self):
+		where = WhereClause(WhereClause.AND)
+		where.add ("foo = %s", 10)
+		
+		subwhere = WhereClause(WhereClause.OR)
+		subwhere.add ("subfoo = %s", 68)
+		subwhere.add ("subbar = %s", 69)
+		where.extend(subwhere)
+		where.add ("bar = %s", 11)
+		
+		self.assertEquals(where.sql % tuple(where.arguments),
+		                  "(foo = 10 AND (subfoo = 68 OR subbar = 69) AND bar = 11)")
+
+if __name__ == "__main__":
+	unittest.main()

=== modified file 'zeitgeist/datamodel.py'
--- zeitgeist/datamodel.py	2010-04-29 08:28:44 +0000
+++ zeitgeist/datamodel.py	2010-05-10 14:47:20 +0000
@@ -185,6 +185,22 @@
 		dikt[self.name] = self
 		for child in self._children.itervalues():
 			child._visit(dikt) 
+	
+	@staticmethod
+	def find_child_uris_extended (uri):
+		"""
+		Creates a list of all known child URIs of `uri`, including
+		`uri` itself in the list. Hence the "extended". If `uri`
+		is unknown a list containing only `uri` is returned.
+		"""
+		try:
+			symbol = _SYMBOLS_BY_URI[uri]
+			children = [child.uri for child in symbol.get_all_children()]
+			children.append(uri)
+			return children
+		except KeyError, e:
+			return [uri]
+		
 
 	@property
 	def uri(self):
@@ -236,7 +252,51 @@
 		Returns a list of immediate parent symbols
 		"""
 		return frozenset(self._parents.itervalues())
-		
+	
+	def is_a (self, parent):
+		"""
+		Returns True if this symbol is a child of `parent`.
+		"""
+		if not isinstance (parent, Symbol):
+			try:
+				parent = _SYMBOLS_BY_URI[parent]
+			except KeyError, e:
+				# Parent is not a known URI
+				print 11111111111, self.uri, parent
+				return self.uri == parent
+		
+		# Invariant: parent is a Symbol
+		if self.uri == parent.uri : return True
+		
+		parent._ensure_all_children()
+		
+		# FIXME: We should really check that child.uri is in there,
+		#        but that is not fast with the current code layout
+		return self.name in parent._all_children
+	
+	@staticmethod
+	def uri_is_a (child, parent):
+		"""
+		Returns True if `child` is a child of `parent`. Both `child`
+		and `parent` arguments must be any combination of
+		:class:`Symbol` and/or string.
+		"""
+		if isinstance (child, basestring):
+			try:
+				child = _SYMBOLS_BY_URI[child]
+			except KeyError, e:
+				# Child is not a know URI
+				if isinstance (parent, basestring):
+					return child == parent
+				elif isinstance (parent, Symbol):
+					return child == parent.uri
+				else:
+					return False
+		
+		if not isinstance (child, Symbol):
+			raise ValueError("Child argument must be a Symbol or string. Got %s" % type(child))
+		
+		return child.is_a(parent)
 		
 class TimeRange(list):
 	"""
@@ -463,11 +523,13 @@
 		"""
 		Return True if this Subject matches *subject_template*. Empty
 		fields in the template are treated as wildcards.
+		Interpretations and manifestations are also matched if they are
+		children of the types specified in `subject_template`. 
 		
 		See also :meth:`Event.matches_template`
 		"""
 		for m in Subject.Fields:
-			if subject_template[m] and subject_template[m] != self[m] :
+			if subject_template[m] and not Symbol.uri_is_a (self[m], subject_template[m]):
 				return False
 		return True
 
@@ -693,7 +755,9 @@
 		"""
 		Return True if this event matches *event_template*. The
 		matching is done where unset fields in the template is
-		interpreted as wild cards. If the template has more than one
+		interpreted as wild cards. Interpretations and manifestations
+		are also matched if they are children of the types specified
+		in `event_template`. If the template has more than one
 		subject, this event matches if at least one of the subjects
 		on this event matches any single one of the subjects on the
 		template.
@@ -707,7 +771,7 @@
 		tdata = event_template[0]
 		for m in Event.Fields:
 			if m == Event.Timestamp : continue
-			if tdata[m] and tdata[m] != data[m] : return False
+			if tdata[m] and not Symbol.uri_is_a (data[m], tdata[m]) : return False
 		
 		# If template has no subjects we have a match
 		if len(event_template[1]) == 0 : return True


Follow ups