zeitgeist team mailing list archive
-
zeitgeist team
-
Mailing list archive
-
Message #02122
[Merge] lp:~seif/zeitgeist/rewrite-find-related-uris into lp:zeitgeist
Seif Lotfy has proposed merging lp:~seif/zeitgeist/rewrite-find-related-uris into lp:zeitgeist.
Requested reviews:
Zeitgeist Framework Team (zeitgeist)
I took the time to rewrite the find_related_uris to use a more or less graph structuring before pushing everything into one pot and going through a 1-step aproiri.
The results are the same except for one test case where [i1, i3, i5] is expected but [i3, i1, i5] is sent. However this results is also right since i3 and i2 both have the same count = 2 but were sorted differently whereas i5 count = 1.
The new code is documented and is much shorter allowing me to get rid of many custom helpers that were written for find_related_uris.
If this branch is ok with you i will change the last test case and merge it.
Cheers
Seif
--
https://code.launchpad.net/~seif/zeitgeist/rewrite-find-related-uris/+merge/38746
Your team Zeitgeist Framework Team is requested to review the proposed merge of lp:~seif/zeitgeist/rewrite-find-related-uris into lp:zeitgeist.
=== modified file '_zeitgeist/engine/main.py'
--- _zeitgeist/engine/main.py 2010-10-12 18:37:10 +0000
+++ _zeitgeist/engine/main.py 2010-10-18 18:09:42 +0000
@@ -36,8 +36,6 @@
from _zeitgeist.engine import constants
from _zeitgeist.engine.sql import get_default_cursor, unset_cursor, \
TableLookup, WhereClause
-
-WINDOW_SIZE = 7
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger("zeitgeist.engine")
@@ -325,8 +323,8 @@
Return modes:
- 0: IDs.
- 1: Events.
- - 2: (Timestamp, SubjectUri)
"""
+ print "*********", time_range
t = time.time()
where = self._build_sql_event_filter(time_range, event_templates,
@@ -339,8 +337,6 @@
sql = "SELECT DISTINCT id FROM event_view"
elif return_mode == 1:
sql = "SELECT id FROM event_view"
- elif return_mode == 2:
- sql = "SELECT subj_uri, timestamp FROM event_view"
else:
raise NotImplementedError, "Unsupported return_mode."
@@ -388,10 +384,7 @@
result = [row[0] for row in result]
elif return_mode == 1:
log.debug("Found %d events IDs in %fs" % (len(result), time.time()- t))
- result = self.get_events(ids=[row[0] for row in result], sender=sender)
- elif return_mode == 2:
- log.debug("Found %d (uri,timestamp) tuples in %fs" % (len(result), time.time()- t))
- result = map(lambda row: (row[0], row[1]), result)
+ result = self.get_events(ids=[row[0] for row in result], sender=sender)
else:
raise Exception("%d" % return_mode)
@@ -403,12 +396,6 @@
def find_events(self, *args):
return self._find_events(1, *args)
- def __add_window(self, _set, assoc, landmarks, windows):
- if _set & landmarks: # intersection != 0
- windows.append(_set)
- for i in _set.difference(landmarks):
- assoc[i] += 1
-
def find_related_uris(self, timerange, event_templates, result_event_templates,
result_storage_state, num_results, result_type):
"""
@@ -421,79 +408,58 @@
"""
if result_type == 0 or result_type == 1:
-
+ # Instead of using sliding windows we will be using a graph representation
+
t1 = time.time()
- if len(result_event_templates) == 0:
- uris = self._find_events(2, timerange, result_event_templates,
- result_storage_state, 0, ResultType.LeastRecentEvents)
- else:
- uris = self._find_events(2, timerange, result_event_templates + event_templates,
- result_storage_state, 0, ResultType.LeastRecentEvents)
-
- assoc = defaultdict(int)
-
- landmarks = self._find_events(2, timerange, event_templates,
- result_storage_state, 0, 4)
- landmarks = set([unicode(event[0]) for event in landmarks])
-
- latest_uris = dict(uris)
- events = [unicode(u[0]) for u in uris]
-
- furis = filter(lambda x: x[0] in landmarks, uris)
- if len(furis) == 0:
- return []
-
- _min = min(furis, key=operator.itemgetter(1))
- _max = max(furis, key=operator.itemgetter(1))
- min_index = uris.index(_min) - WINDOW_SIZE
- max_index = uris.index(_max) + WINDOW_SIZE
- _min = _min[1]
- _max = _max[1]
-
- if min_index < 0:
- min_index = 0
- if max_index > len(events):
- max_index = -1
-
- func = self.__add_window
-
- if len(events) == 0 or len(landmarks) == 0:
- return []
-
- windows = []
-
- if len(events) <= WINDOW_SIZE:
- #TODO bug! windows is not defined, seems the algorithm never touches these loop
- func(events, assoc, landmarks, windows)
- else:
- events = events[min_index:max_index]
- offset = WINDOW_SIZE/2
-
- for i in xrange(offset):
- func(set(events[0: offset - i]), assoc, landmarks,
- windows)
- func(set(events[len(events) - offset + i: len(events)]),
- assoc, landmarks, windows)
-
- it = iter(events)
- result = tuple(islice(it, WINDOW_SIZE))
- for elem in it:
- result = result[1:] + (elem,)
- func(set(result), assoc, landmarks, windows)
-
-
+ # We pick out the ids for relational event so we can set them as roots
+ # the ids are taken from the events that match the events_templates
+ ids = self._find_events(0, timerange, event_templates,
+ result_storage_state, 0, ResultType.LeastRecentEvents)
+
+ # Pick out the result_ids for the filtered results we would like to take into account
+ # the ids are taken from the events that match the result_event_templates
+ # if no result_event_templates are set we consider all results as allowed
+ result_ids = []
+ if len(result_event_templates) > 0:
+ result_ids = self._find_events(0, timerange, result_event_templates,
+ result_storage_state, 0, ResultType.LeastRecentEvents)
+
+ # From here we create several graphs with the maximum height depth of 2
+ # and push all the nodes and vertices (events) in one pot together
+ # FIXME: the depth should be adaptable
+ pot = []
+ for id in ids:
+ for x in xrange(id-2,id+3):
+ if len(result_ids) == 0 or x in result_ids:
+ pot.append(x)
+
+ # Out of the pot we get all respected events and count which uris occur most
+ events = self.get_events(pot)
+ subject_uri_counter = {}
+ latest_uris = {}
+ for event in events:
+ if event and not event.id in ids:
+ subj = event.subjects[0]
+ if not subject_uri_counter.has_key(subj.uri):
+ subject_uri_counter[subj.uri] = 0
+ subject_uri_counter[subj.uri] += 1
+ latest_uris[subj.uri] = event
+
log.debug("FindRelatedUris: Finished sliding windows in %fs." % \
(time.time()-t1))
if result_type == 0:
- sets = [[v, k] for k, v in assoc.iteritems()]
+ sets = [[v, k] for k, v in subject_uri_counter.iteritems()]
elif result_type == 1:
- sets = [[latest_uris[k], k] for k in assoc]
+ sets = [[latest_uris[k], k] for k in subject_uri_counter]
sets.sort(reverse = True)
+ print sets
sets = map(lambda result: result[1], sets[:num_results])
+ print "####################"
+ print subject_uri_counter
return sets
else:
raise NotImplementedError, "Unsupported ResultType."