zeitgeist team mailing list archive
-
zeitgeist team
-
Mailing list archive
-
Message #03984
[Merge] lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length into lp:zeitgeist-extensions
Mikkel Kamstrup Erlandsen has proposed merging lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length into lp:zeitgeist-extensions.
Requested reviews:
Zeitgeist Extensions (zeitgeist-extensions)
Related bugs:
Bug #843668 in Zeitgeist Extensions: "Blowing Xapian max term length corrupts index"
https://bugs.launchpad.net/zeitgeist-extensions/+bug/843668
For more details, see:
https://code.launchpad.net/~kamstrup/zeitgeist-extensions/fts-cap-term-length/+merge/74362
See attached bug
--
https://code.launchpad.net/~kamstrup/zeitgeist-extensions/fts-cap-term-length/+merge/74362
Your team Zeitgeist Extensions is requested to review the proposed merge of lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length into lp:zeitgeist-extensions.
=== modified file 'fts/_tests.py'
--- fts/_tests.py 2011-09-01 13:46:30 +0000
+++ fts/_tests.py 2011-09-07 08:45:23 +0000
@@ -104,3 +104,4 @@
assert u"漢字" in results[0].subjects[0].text, results[0].subjects[0].uri
+
=== modified file 'fts/fts.py'
--- fts/fts.py 2011-09-06 10:03:23 +0000
+++ fts/fts.py 2011-09-07 08:45:23 +0000
@@ -50,6 +50,7 @@
import threading
from urllib import quote as url_escape, unquote as url_unescape
import gobject, gio
+from cStringIO import StringIO
from zeitgeist.datamodel import Symbol, StorageState, ResultType, TimeRange, NULL_EVENT, NEGATION_OPERATOR
from _zeitgeist.engine.datamodel import Event, Subject
@@ -93,6 +94,10 @@
ResultType.LeastPopularActor,
]
+# Xapian has a maximum term length of 245 bytes and Bad Things(TM) happen
+# if you bust that. We use the cap_string() function to control this.
+MAX_TERM_LENGTH = 245
+
def synchronized(lock):
""" Synchronization decorator. """
@@ -197,6 +202,31 @@
result += c
return result
+def cap_string (s, nbytes=MAX_TERM_LENGTH):
+ """
+ If s has more than nbytes bytes (not characters) then cap it off
+ after nbytes bytes in a way still producing a valid utf-8 string.
+
+ Assumes that s is a utf-8 string.
+
+ This function is useful for working with Xapian terms because Xapian has
+ a max term length of 245 (which is not very well documented, but see
+ http://xapian.org/docs/omega/termprefixes.html).
+ """
+ # Check if we can fast-path this string
+ if (len(s.encode("utf-8")) <= nbytes):
+ return s
+
+ # We use a StringIO here to avoid memory thrashing via naive
+ # string concatenation. See e.g. http://www.skymind.com/~ocrow/python_string/
+ buf = StringIO()
+ for char in s :
+ if buf.tell() >= nbytes - 1 :
+ return buf.getvalue()
+ buf.write(char.encode("utf-8"))
+
+ return unicode(buf.getvalue().decode("utf-8"))
+
def expand_type (type_prefix, uri):
"""
Return a string with a Xapian query matching all child types of 'uri'
@@ -564,7 +594,7 @@
doc = self._tokenizer.get_document()
for cat in desktop.getCategories():
- doc.add_boolean_term(FILTER_PREFIX_XDG_CATEGORY+cat.lower())
+ doc.add_boolean_term(cap_string(FILTER_PREFIX_XDG_CATEGORY+cat.lower()))
else:
log.debug("Unable to look up app info for %s" % actor)
@@ -649,25 +679,25 @@
"""Adds the filtering rules to the doc. Filtering rules will
not affect the relevancy ranking of the event/doc"""
if event.interpretation:
- doc.add_boolean_term (FILTER_PREFIX_EVENT_INTERPRETATION+event.interpretation)
+ doc.add_boolean_term (cap_string(FILTER_PREFIX_EVENT_INTERPRETATION+event.interpretation))
if event.manifestation:
- doc.add_boolean_term (FILTER_PREFIX_EVENT_MANIFESTATION+event.manifestation)
+ doc.add_boolean_term (cap_string(FILTER_PREFIX_EVENT_MANIFESTATION+event.manifestation))
if event.actor:
- doc.add_boolean_term (FILTER_PREFIX_ACTOR+mangle_uri(event.actor))
+ doc.add_boolean_term (cap_string(FILTER_PREFIX_ACTOR+mangle_uri(event.actor)))
for su in event.subjects:
if su.uri:
- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_URI+mangle_uri(su.uri))
+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_URI+mangle_uri(su.uri)))
if su.interpretation:
- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_INTERPRETATION+su.interpretation)
+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_INTERPRETATION+su.interpretation))
if su.manifestation:
- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_MANIFESTATION+su.manifestation)
+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_MANIFESTATION+su.manifestation))
if su.origin:
- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_ORIGIN+mangle_uri(su.origin))
+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_ORIGIN+mangle_uri(su.origin)))
if su.mimetype:
- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_MIMETYPE+su.mimetype)
+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_MIMETYPE+su.mimetype))
if su.storage:
- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_STORAGE+su.storage)
+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_STORAGE+su.storage))
@synchronized (INDEX_LOCK)
def _index_event_real (self, event):
@@ -766,6 +796,3 @@
return "%s..%sms" % (time_range.begin, time_range.end)
-if __name__ == "__main__":
- indexer = Indexer(None)
- print indexer._compile_filter_query([Event.new_for_values(subject_interpretation="http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Document")])
Follow ups