← Back to team overview

zeitgeist team mailing list archive

[Merge] lp:~seif/zeitgeist/rewrite-find-related-uris into lp:zeitgeist

 

Seif Lotfy has proposed merging lp:~seif/zeitgeist/rewrite-find-related-uris into lp:zeitgeist.

Requested reviews:
  Zeitgeist Framework Team (zeitgeist)


I took the time to rewrite the find_related_uris to use a more or less graph structuring before pushing everything into one pot and going through a 1-step aproiri.
The results are the same except for one test case where [i1, i3, i5] is expected but [i3, i1, i5] is sent. However this results is also right since i3 and i2 both have the same count = 2 but were sorted differently whereas i5 count = 1.

The new code is documented and is much shorter allowing me to get rid of many custom helpers that were written for find_related_uris.

If this branch is ok with you i will change the last test case and merge it.
Cheers
Seif
-- 
https://code.launchpad.net/~seif/zeitgeist/rewrite-find-related-uris/+merge/38746
Your team Zeitgeist Framework Team is requested to review the proposed merge of lp:~seif/zeitgeist/rewrite-find-related-uris into lp:zeitgeist.
=== modified file '_zeitgeist/engine/main.py'
--- _zeitgeist/engine/main.py	2010-10-12 18:37:10 +0000
+++ _zeitgeist/engine/main.py	2010-10-18 18:09:42 +0000
@@ -36,8 +36,6 @@
 from _zeitgeist.engine import constants
 from _zeitgeist.engine.sql import get_default_cursor, unset_cursor, \
 	TableLookup, WhereClause
-	
-WINDOW_SIZE = 7
 
 logging.basicConfig(level=logging.DEBUG)
 log = logging.getLogger("zeitgeist.engine")
@@ -325,8 +323,8 @@
 		Return modes:
 		 - 0: IDs.
 		 - 1: Events.
-		 - 2: (Timestamp, SubjectUri)
 		"""
+		print "*********", time_range
 		t = time.time()
 		
 		where = self._build_sql_event_filter(time_range, event_templates,
@@ -339,8 +337,6 @@
 			sql = "SELECT DISTINCT id FROM event_view"
 		elif return_mode == 1:
 			sql = "SELECT id FROM event_view"
-		elif return_mode == 2:
-			sql = "SELECT subj_uri, timestamp FROM event_view"
 		else:
 			raise NotImplementedError, "Unsupported return_mode."
 		
@@ -388,10 +384,7 @@
 			result = [row[0] for row in result]
 		elif return_mode == 1:
 			log.debug("Found %d events IDs in %fs" % (len(result), time.time()- t))
-			result = self.get_events(ids=[row[0] for row in result], sender=sender)			
-		elif return_mode == 2:
-			log.debug("Found %d (uri,timestamp) tuples in %fs" % (len(result), time.time()- t))
-			result = map(lambda row: (row[0], row[1]), result)			
+			result = self.get_events(ids=[row[0] for row in result], sender=sender)	
 		else:
 			raise Exception("%d" % return_mode)
 		
@@ -403,12 +396,6 @@
 	def find_events(self, *args):
 		return self._find_events(1, *args)
 	
-	def __add_window(self, _set, assoc, landmarks, windows):
-		if _set & landmarks: # intersection != 0
-			windows.append(_set)
-			for i in _set.difference(landmarks):
-				assoc[i] += 1
-	
 	def find_related_uris(self, timerange, event_templates, result_event_templates,
 		result_storage_state, num_results, result_type):
 		"""
@@ -421,79 +408,58 @@
 		"""
 		
 		if result_type == 0 or result_type == 1:
-			
+			# Instead of using sliding windows we will be using a graph representation
+
 			t1 = time.time()
 			
-			if len(result_event_templates) == 0:
-				uris = self._find_events(2, timerange, result_event_templates,
-					result_storage_state, 0, ResultType.LeastRecentEvents)
-			else:
-				uris = self._find_events(2, timerange, result_event_templates + event_templates,
-					result_storage_state, 0, ResultType.LeastRecentEvents)
-			
-			assoc = defaultdict(int)
-			
-			landmarks = self._find_events(2, timerange, event_templates,
-					result_storage_state, 0, 4)
-			landmarks = set([unicode(event[0]) for event in landmarks])
-			
-			latest_uris = dict(uris)
-			events = [unicode(u[0]) for u in uris]
-
-			furis = filter(lambda x: x[0] in landmarks, uris)
-			if len(furis) == 0:
-				return []
-			
-			_min = min(furis, key=operator.itemgetter(1))
-			_max = max(furis, key=operator.itemgetter(1))
-			min_index = uris.index(_min) - WINDOW_SIZE
-			max_index = uris.index(_max) + WINDOW_SIZE
-			_min = _min[1]
-			_max = _max[1]
-			
-			if min_index < 0:
-				min_index = 0
-			if max_index > len(events):
-				max_index = -1
-				
-			func = self.__add_window
-			
-			if len(events) == 0 or len(landmarks) == 0:
-				return []
-			
-			windows = []
-	
-			if len(events) <= WINDOW_SIZE:
-				#TODO bug! windows is not defined, seems the algorithm never touches these loop
-				func(events, assoc, landmarks, windows)
-			else:
-				events = events[min_index:max_index]
-				offset = WINDOW_SIZE/2
-				
-				for i in xrange(offset):
-					func(set(events[0: offset - i]), assoc, landmarks, 
-						windows)
-					func(set(events[len(events) - offset + i: len(events)]),
-						assoc, landmarks, windows)
-					
-				it = iter(events)
-				result = tuple(islice(it, WINDOW_SIZE))
-				for elem in it:
-					result = result[1:] + (elem,)
-					func(set(result), assoc, landmarks, windows)
-					
-				
+			# We pick out the ids for relational event so we can set them as roots
+			# the ids are taken from the events that match the events_templates
+			ids = self._find_events(0, timerange, event_templates,
+					result_storage_state, 0, ResultType.LeastRecentEvents)
+			
+			# Pick out the result_ids for the filtered results we would like to take into account
+			# the ids are taken from the events that match the result_event_templates
+			# if no result_event_templates are set we consider all results as allowed
+			result_ids = []
+			if len(result_event_templates) > 0:
+				result_ids = self._find_events(0, timerange, result_event_templates,
+						result_storage_state, 0, ResultType.LeastRecentEvents)
+			
+			# From here we create several graphs with the maximum height depth of 2
+			# and push all the nodes and vertices (events) in one pot together
+			# FIXME: the depth should be adaptable 
+			pot = []
+			for id in ids:
+				for x in xrange(id-2,id+3):
+					if len(result_ids) == 0 or x in result_ids:
+						pot.append(x)
+			
+			# Out of the pot we get all respected events and count which uris occur most
+			events = self.get_events(pot)
+			subject_uri_counter = {}
+			latest_uris = {}
+			for event in events:
+				if event and not event.id in ids:
+					subj = event.subjects[0]
+					if not subject_uri_counter.has_key(subj.uri):
+						subject_uri_counter[subj.uri] = 0
+					subject_uri_counter[subj.uri] += 1
+					latest_uris[subj.uri] = event
+							
 			log.debug("FindRelatedUris: Finished sliding windows in %fs." % \
 				(time.time()-t1))
 			
 			if result_type == 0:
-				sets = [[v, k] for k, v in assoc.iteritems()]
+				sets = [[v, k] for k, v in subject_uri_counter.iteritems()]
 			elif result_type == 1:
-				sets = [[latest_uris[k], k] for k in assoc]
+				sets = [[latest_uris[k], k] for k in subject_uri_counter]
 				
 			sets.sort(reverse = True)
+			print sets
 			sets = map(lambda result: result[1], sets[:num_results])
 			
+			print "####################"
+			print subject_uri_counter
 			return sets
 		else:
 			raise NotImplementedError, "Unsupported ResultType."