← Back to team overview

zeitgeist team mailing list archive

[Merge] lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length into lp:zeitgeist-extensions

 

Mikkel Kamstrup Erlandsen has proposed merging lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length into lp:zeitgeist-extensions.

Requested reviews:
  Zeitgeist Extensions (zeitgeist-extensions)
Related bugs:
  Bug #843668 in Zeitgeist Extensions: "Blowing Xapian max term length corrupts index"
  https://bugs.launchpad.net/zeitgeist-extensions/+bug/843668

For more details, see:
https://code.launchpad.net/~kamstrup/zeitgeist-extensions/fts-cap-term-length/+merge/74362

See attached bug
-- 
https://code.launchpad.net/~kamstrup/zeitgeist-extensions/fts-cap-term-length/+merge/74362
Your team Zeitgeist Extensions is requested to review the proposed merge of lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length into lp:zeitgeist-extensions.
=== modified file 'fts/_tests.py'
--- fts/_tests.py	2011-09-01 13:46:30 +0000
+++ fts/_tests.py	2011-09-07 08:45:23 +0000
@@ -104,3 +104,4 @@
 assert u"漢字" in results[0].subjects[0].text, results[0].subjects[0].uri
 
 
+

=== modified file 'fts/fts.py'
--- fts/fts.py	2011-09-06 10:03:23 +0000
+++ fts/fts.py	2011-09-07 08:45:23 +0000
@@ -50,6 +50,7 @@
 import threading
 from urllib import quote as url_escape, unquote as url_unescape
 import gobject, gio
+from cStringIO import StringIO
 
 from zeitgeist.datamodel import Symbol, StorageState, ResultType, TimeRange, NULL_EVENT, NEGATION_OPERATOR
 from _zeitgeist.engine.datamodel import Event, Subject
@@ -93,6 +94,10 @@
 	ResultType.LeastPopularActor,
 ]
 
+# Xapian has a maximum term length of 245 bytes and Bad Things(TM) happen
+# if you bust that. We use the cap_string() function to control this.
+MAX_TERM_LENGTH = 245
+
 def synchronized(lock):
     """ Synchronization decorator. """
 
@@ -197,6 +202,31 @@
 			result += c
 	return result
 
+def cap_string (s, nbytes=MAX_TERM_LENGTH):
+	"""
+	If s has more than nbytes bytes (not characters) then cap it off
+	after at most nbytes bytes in a way still producing valid utf-8.
+	
+	s may be a unicode object or a utf-8 encoded byte string; the
+	result has the same type as s.
+	
+	This function is useful for working with Xapian terms because Xapian
+	has a max term length of 245 (which is not very well documented, but
+	see http://xapian.org/docs/omega/termprefixes.html).
+	"""
+	is_unicode = isinstance(s, unicode)
+	raw = s.encode("utf-8") if is_unicode else s
+	# Check if we can fast-path this string
+	if len(raw) <= nbytes:
+		return s
+	
+	# Cut at nbytes and then back up over utf-8 continuation bytes
+	# (10xxxxxx) so that we never split a multi-byte character.
+	end = max(nbytes, 0)
+	while end > 0 and (ord(raw[end]) & 0xC0) == 0x80:
+		end -= 1
+	return raw[:end].decode("utf-8") if is_unicode else raw[:end]
+
 def expand_type (type_prefix, uri):
 	"""
 	Return a string with a Xapian query matching all child types of 'uri'
@@ -564,7 +594,7 @@
 			
 				doc = self._tokenizer.get_document()
 				for cat in desktop.getCategories():
-					doc.add_boolean_term(FILTER_PREFIX_XDG_CATEGORY+cat.lower())
+					doc.add_boolean_term(cap_string(FILTER_PREFIX_XDG_CATEGORY+cat.lower()))
 		else:
 			log.debug("Unable to look up app info for %s" % actor)
 		
@@ -649,25 +679,25 @@
 		"""Adds the filtering rules to the doc. Filtering rules will
 		   not affect the relevancy ranking of the event/doc"""
 		if event.interpretation:
-			doc.add_boolean_term (FILTER_PREFIX_EVENT_INTERPRETATION+event.interpretation)
+			doc.add_boolean_term (cap_string(FILTER_PREFIX_EVENT_INTERPRETATION+event.interpretation))
 		if event.manifestation:
-			doc.add_boolean_term (FILTER_PREFIX_EVENT_MANIFESTATION+event.manifestation)
+			doc.add_boolean_term (cap_string(FILTER_PREFIX_EVENT_MANIFESTATION+event.manifestation))
 		if event.actor:
-			doc.add_boolean_term (FILTER_PREFIX_ACTOR+mangle_uri(event.actor))
+			doc.add_boolean_term (cap_string(FILTER_PREFIX_ACTOR+mangle_uri(event.actor)))
 		
 		for su in event.subjects:
 			if su.uri:
-				doc.add_boolean_term (FILTER_PREFIX_SUBJECT_URI+mangle_uri(su.uri))
+				doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_URI+mangle_uri(su.uri)))
 			if su.interpretation:
-				doc.add_boolean_term (FILTER_PREFIX_SUBJECT_INTERPRETATION+su.interpretation)
+				doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_INTERPRETATION+su.interpretation))
 			if su.manifestation:
-				doc.add_boolean_term (FILTER_PREFIX_SUBJECT_MANIFESTATION+su.manifestation)
+				doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_MANIFESTATION+su.manifestation))
 			if su.origin:
-				doc.add_boolean_term (FILTER_PREFIX_SUBJECT_ORIGIN+mangle_uri(su.origin))
+				doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_ORIGIN+mangle_uri(su.origin)))
 			if su.mimetype:
-				doc.add_boolean_term (FILTER_PREFIX_SUBJECT_MIMETYPE+su.mimetype)
+				doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_MIMETYPE+su.mimetype))
 			if su.storage:
-				doc.add_boolean_term (FILTER_PREFIX_SUBJECT_STORAGE+su.storage)
+				doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_STORAGE+su.storage))
 	
 	@synchronized (INDEX_LOCK)
 	def _index_event_real (self, event):
@@ -766,6 +796,3 @@
 		
 		return "%s..%sms" % (time_range.begin, time_range.end)
 
-if __name__ == "__main__":
-	indexer = Indexer(None)
-	print indexer._compile_filter_query([Event.new_for_values(subject_interpretation="http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Document")])


Follow ups