← Back to team overview

zeitgeist team mailing list archive

[Branch ~zeitgeist/zeitgeist/bluebird] Rev 391: Merged lp:~mhr3/zeitgeist/fts-extras

 

Merge authors:
  Michal Hruby (mhr3)
Related merge proposals:
  https://code.launchpad.net/~mhr3/zeitgeist/fts-extras/+merge/92430
  proposed by: Michal Hruby (mhr3)
  review: Approve - Siegfried Gevatter (rainct)
------------------------------------------------------------
revno: 391 [merge]
committer: Michal Hruby <michal.mhr@xxxxxxxxx>
branch nick: zeitgeist
timestamp: Fri 2012-02-10 13:30:21 +0100
message:
  Merged lp:~mhr3/zeitgeist/fts-extras
modified:
  configure.ac
  extensions/fts++/Makefile.am
  extensions/fts++/fts.cpp
  extensions/fts++/fts.h
  extensions/fts++/fts.vapi
  extensions/fts++/indexer.cpp
  extensions/fts++/indexer.h
  extensions/fts++/stringutils.cpp
  extensions/fts++/stringutils.h
  extensions/fts++/test/Makefile.am
  extensions/fts++/test/test-indexer.cpp
  extensions/fts++/test/test-stringutils.cpp
  extensions/fts++/zeitgeist-fts.vala
  extensions/fts.vala
  src/remote.vala


--
lp:zeitgeist
https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird

Your team Zeitgeist Framework Team is subscribed to branch lp:zeitgeist.
To unsubscribe from this branch go to https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird/+edit-subscription
=== modified file 'configure.ac'
--- configure.ac	2012-02-08 18:54:58 +0000
+++ configure.ac	2012-02-09 15:32:36 +0000
@@ -40,6 +40,30 @@
 AC_SUBST(ZEITGEIST_LIBS)
 
 #################################################
+# Dee-ICU check
+#################################################
+DEE_ICU_REQUIRED=1.0.2
+
+AC_ARG_WITH([dee-icu],
+	AS_HELP_STRING([--with-dee-icu[=@<:@no/auto/yes@:>@]],
+		[Build the FTS extension with dee-icu]),
+	[with_dee_icu=$withval],
+	[with_dee_icu="auto"])
+
+if test "x$with_dee_icu" = "xauto" ; then
+	PKG_CHECK_EXISTS([dee-icu-1.0 >= $DEE_ICU_REQUIRED],
+		with_dee_icu="yes",
+		with_dee_icu="no")
+fi
+
+if test "x$with_dee_icu" = "xyes" ; then
+	PKG_CHECK_MODULES(DEE_ICU, dee-icu-1.0 >= $DEE_ICU_REQUIRED)
+	AC_DEFINE(HAVE_DEE_ICU, 1, [Have dee-icu])
+fi
+
+AM_CONDITIONAL(HAVE_DEE_ICU, test "x$with_dee_icu" = "xyes")
+
+#################################################
 # DBus service
 #################################################
 
@@ -88,3 +112,16 @@
 fi
 
 AC_OUTPUT
+
+cat <<EOF
+
+${PACKAGE}-${VERSION}
+
+  Build Environment
+    Install Prefix:     ${prefix}
+
+  Optional dependencies
+    dee-icu:            ${with_dee_icu}
+
+EOF
+

=== modified file 'extensions/fts++/Makefile.am'
--- extensions/fts++/Makefile.am	2012-02-08 18:54:58 +0000
+++ extensions/fts++/Makefile.am	2012-02-09 15:32:36 +0000
@@ -76,6 +76,11 @@
 	-lxapian \
 	$(NULL)
 
+if HAVE_DEE_ICU
+AM_CPPFLAGS += $(DEE_ICU_CFLAGS)
+zeitgeist_fts_LDADD += $(DEE_ICU_LIBS)
+endif
+
 BUILT_SOURCES = \
 	zeitgeist-internal.stamp \
 	zeitgeist-fts_vala.stamp \

=== modified file 'extensions/fts++/fts.cpp'
--- extensions/fts++/fts.cpp	2012-02-09 09:32:33 +0000
+++ extensions/fts++/fts.cpp	2012-02-09 18:34:36 +0000
@@ -84,6 +84,36 @@
   return results;
 }
 
+GPtrArray*
+zeitgeist_indexer_search_with_relevancies (ZeitgeistIndexer *indexer,
+                                           const gchar *search_string,
+                                           ZeitgeistTimeRange *time_range,
+                                           GPtrArray *templates,
+                                           guint offset,
+                                           guint count,
+                                           ZeitgeistResultType result_type,
+                                           gdouble **relevancies,
+                                           gint *relevancies_size,
+                                           guint *matches,
+                                           GError **error)
+{
+  GPtrArray *results;
+  ZeitgeistFTS::Controller *_indexer;
+
+  g_return_val_if_fail (indexer != NULL, NULL);
+  g_return_val_if_fail (search_string != NULL, NULL);
+  g_return_val_if_fail (ZEITGEIST_IS_TIME_RANGE (time_range), NULL);
+  g_return_val_if_fail (error == NULL || *error == NULL, NULL);
+
+  _indexer = (ZeitgeistFTS::Controller*) indexer;
+
+  results = _indexer->indexer->SearchWithRelevancies (
+      search_string, time_range, templates, offset, count, result_type,
+      relevancies, relevancies_size, matches, error);
+
+  return results;
+}
+
 void zeitgeist_indexer_index_events (ZeitgeistIndexer *indexer,
                                      GPtrArray *events)
 {

=== modified file 'extensions/fts++/fts.h'
--- extensions/fts++/fts.h	2012-02-09 09:32:33 +0000
+++ extensions/fts++/fts.h	2012-02-09 18:34:36 +0000
@@ -43,6 +43,19 @@
                                                     guint *matches,
                                                     GError **error);
 
+GPtrArray*         zeitgeist_indexer_search_with_relevancies
+                                                   (ZeitgeistIndexer *indexer,
+                                                    const gchar *search_string,
+                                                    ZeitgeistTimeRange *time_range,
+                                                    GPtrArray *templates,
+                                                    guint offset,
+                                                    guint count,
+                                                    ZeitgeistResultType result_type,
+                                                    gdouble **relevancies,
+                                                    gint *relevancies_size,
+                                                    guint *matches,
+                                                    GError **error);
+
 void               zeitgeist_indexer_index_events  (ZeitgeistIndexer *indexer,
                                                     GPtrArray *events);
 

=== modified file 'extensions/fts++/fts.vapi'
--- extensions/fts++/fts.vapi	2012-02-07 17:02:30 +0000
+++ extensions/fts++/fts.vapi	2012-02-09 18:34:36 +0000
@@ -14,6 +14,16 @@
                                             ResultType result_type,
                                             out uint matches) throws GLib.Error;
 
+    public GLib.GenericArray<Event> search_with_relevancies (
+                                            string search_string,
+                                            TimeRange time_range,
+                                            GLib.GenericArray<Event> templates,
+                                            uint offset,
+                                            uint count,
+                                            ResultType result_type,
+                                            out double[] relevancies,
+                                            out uint matches) throws GLib.Error;
+
     public void index_events (GLib.GenericArray<Event> events);
 
     public void delete_events (uint[] event_ids);

=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp	2012-02-09 09:37:48 +0000
+++ extensions/fts++/indexer.cpp	2012-02-10 11:54:32 +0000
@@ -356,10 +356,40 @@
   }
 }
 
+std::string Indexer::PreprocessString (std::string const& input)
+{
+  if (input.empty ()) return input;
+
+  std::string result (StringUtils::RemoveUnderscores (input));
+  // a simple heuristic for the uncamelcaser
+  size_t num_digits = StringUtils::CountDigits (result);
+  if (result.length () > 3 && num_digits < result.length () / 2)
+  {
+    // FIXME: process digits?, atm they stay attached to the text
+    result = StringUtils::UnCamelcase (result);
+  }
+
+  std::string folded (StringUtils::AsciiFold (result));
+  if (!folded.empty ())
+  {
+    result += ' ';
+    result += folded;
+  }
+
+#ifdef DEBUG_PREPROCESSING
+  if (input != result)
+    g_debug ("processed: %s\n-> %s", input.c_str (), result.c_str ());
+#endif
+
+  return result;
+}
+
 void Indexer::IndexText (std::string const& text)
 {
-  // FIXME: ascii folding!
   tokenizer->index_text (text, 5);
+  // this is by definition already a human readable display string,
+  // so it shouldn't need removal of underscores and uncamelcase
+  tokenizer->index_text (StringUtils::AsciiFold (text), 5);
 }
 
 void Indexer::IndexUri (std::string const& uri, std::string const& origin)
@@ -403,9 +433,10 @@
     gchar *pn = g_file_get_parse_name (f);
     gchar *basename = g_path_get_basename (pn);
 
-    // FIXME: remove unscores, CamelCase and process digits
-    tokenizer->index_text (basename, 5);
-    tokenizer->index_text (basename, 5, "N");
+    // remove underscores, CamelCase and process digits
+    std::string processed (PreprocessString (basename));
+    tokenizer->index_text (processed, 5);
+    tokenizer->index_text (processed, 5, "N");
 
     g_free (basename);
     // limit the directory indexing to just a few levels
@@ -420,17 +451,17 @@
     g_free (dir);
     g_free (pn);
 
-    while (path_component.length () > 2 && 
+    while (path_component.length () > 2 &&
         weight_index < G_N_ELEMENTS (path_weights))
     {
       // if this is already home directory we don't want it
-      if (path_component.length () == home_dir_path.length () &&
-          path_component == home_dir_path) return;
+      if (path_component == home_dir_path) return;
 
       gchar *name = g_path_get_basename (path_component.c_str ());
 
-      // FIXME: un-underscore, uncamelcase, ascii fold
-      tokenizer->index_text (name, path_weights[weight_index++]);
+      // un-underscore, uncamelcase, ascii fold
+      processed = PreprocessString (name);
+      tokenizer->index_text (processed, path_weights[weight_index++]);
 
       dir = g_path_get_dirname (path_component.c_str ());
       path_component = dir;
@@ -471,9 +502,10 @@
     
     if (g_utf8_validate (unescaped_basename, -1, NULL))
     {
-      // FIXME: remove unscores, CamelCase and process digits
-      tokenizer->index_text (unescaped_basename, 5);
-      tokenizer->index_text (unescaped_basename, 5, "N");
+      // remove underscores, CamelCase and process digits
+      std::string processed (PreprocessString (unescaped_basename));
+      tokenizer->index_text (processed, 5);
+      tokenizer->index_text (processed, 5, "N");
     }
 
     // and also index hostname (taken from origin field if possible)
@@ -505,6 +537,7 @@
   {
     // we *really* don't want to index anything with this scheme
   }
+  // how about special casing (s)ftp and ssh?
   else
   {
     std::string authority, path, query;
@@ -593,12 +626,11 @@
   unsigned name_weight = is_subject ? 5 : 2;
   unsigned comment_weight = 2;
 
-  // FIXME: ascii folding somewhere
-
   val = g_app_info_get_display_name (ai);
   if (val && val[0] != '\0')
   {
-    std::string display_name (val);
+    std::string display_name (PreprocessString (val));
+
     tokenizer->index_text (display_name, name_weight);
     tokenizer->index_text (display_name, name_weight, "A");
   }
@@ -606,9 +638,14 @@
   val = g_desktop_app_info_get_generic_name (dai);
   if (val && val[0] != '\0')
   {
+    // this shouldn't need uncamelcasing
     std::string generic_name (val);
+    std::string generic_name_folded (StringUtils::AsciiFold (generic_name));
+
     tokenizer->index_text (generic_name, name_weight);
     tokenizer->index_text (generic_name, name_weight, "A");
+    tokenizer->index_text (generic_name_folded, name_weight);
+    tokenizer->index_text (generic_name_folded, name_weight, "A");
   }
 
   if (!is_subject) return true;
@@ -642,7 +679,35 @@
   return true;
 }
 
-GPtrArray* Indexer::Search (const gchar *search_string,
+std::string Indexer::CompileQueryString (const gchar *search_string,
+                                         ZeitgeistTimeRange *time_range,
+                                         GPtrArray *templates)
+{
+  std::string query_string (search_string);
+
+  if (templates && templates->len > 0)
+  {
+    std::string filters (CompileEventFilterQuery (templates));
+    query_string = "(" + query_string + ") AND (" + filters + ")";
+  }
+
+  if (time_range)
+  {
+    gint64 start_time = zeitgeist_time_range_get_start (time_range);
+    gint64 end_time = zeitgeist_time_range_get_end (time_range);
+
+    if (start_time > 0 || end_time < G_MAXINT64)
+    {
+      std::string time_filter (CompileTimeRangeFilterQuery (start_time, end_time));
+      query_string = "(" + query_string + ") AND (" + time_filter + ")";
+    }
+  }
+
+  g_debug ("query: %s", query_string.c_str ());
+  return query_string;
+}
+
+GPtrArray* Indexer::Search (const gchar *search,
                             ZeitgeistTimeRange *time_range,
                             GPtrArray *templates,
                             guint offset,
@@ -654,28 +719,22 @@
   GPtrArray *results = NULL;
   try
   {
-    std::string query_string(search_string);
-
-    if (templates && templates->len > 0)
-    {
-      std::string filters (CompileEventFilterQuery (templates));
-      query_string = "(" + query_string + ") AND (" + filters + ")";
-    }
-
-    if (time_range)
-    {
-      gint64 start_time = zeitgeist_time_range_get_start (time_range);
-      gint64 end_time = zeitgeist_time_range_get_end (time_range);
-
-      if (start_time > 0 || end_time < G_MAXINT64)
-      {
-        std::string time_filter (CompileTimeRangeFilterQuery (start_time, end_time));
-        query_string = "(" + query_string + ") AND (" + time_filter + ")";
-      }
-    }
-
-    // FIXME: which result types coalesce?
-    guint maxhits = count * 3;
+    std::string query_string (CompileQueryString (search, time_range, templates));
+
+    // When sorting by some result types, we need to fetch some extra events
+    // from the Xapian index because the final result set will be coalesced
+    // on some property of the event
+    guint maxhits;
+    if (result_type == 100 ||
+        result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
+    {
+      maxhits = count;
+    }
+    else
+    {
+      maxhits = count * 3;
+    }
 
     if (result_type == 100)
     {
@@ -686,7 +745,6 @@
       enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
     }
 
-    g_debug ("query: %s", query_string.c_str ());
     Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));
     enquire->set_query (q);
     Xapian::MSet hits (enquire->get_mset (offset, maxhits));
@@ -753,7 +811,119 @@
   }
   catch (Xapian::Error const& e)
   {
-    g_warning ("Failed to index event: %s", e.get_msg ().c_str ());
+    g_warning ("Failed to search index: %s", e.get_msg ().c_str ());
+    g_set_error_literal (error,
+                         ZEITGEIST_ENGINE_ERROR,
+                         ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR,
+                         e.get_msg ().c_str ());
+  }
+
+  return results;
+}
+
+GPtrArray* Indexer::SearchWithRelevancies (const gchar *search,
+                                           ZeitgeistTimeRange *time_range,
+                                           GPtrArray *templates,
+                                           guint offset,
+                                           guint count,
+                                           ZeitgeistResultType result_type,
+                                           gdouble **relevancies,
+                                           gint *relevancies_size,
+                                           guint *matches,
+                                           GError **error)
+{
+  GPtrArray *results = NULL;
+  try
+  {
+    std::string query_string (CompileQueryString (search, time_range, templates));
+
+    guint maxhits = count;
+
+    if (result_type == 100)
+    {
+      enquire->set_sort_by_relevance ();
+    }
+    else
+    {
+      enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
+    }
+
+    Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));
+    enquire->set_query (q);
+    Xapian::MSet hits (enquire->get_mset (offset, maxhits));
+    Xapian::doccount hitcount = hits.get_matches_estimated ();
+
+    if (result_type == 100)
+    {
+      std::vector<unsigned> event_ids;
+      std::vector<gdouble> relevancy_arr;
+      Xapian::MSetIterator iter, end;
+      for (iter = hits.begin (), end = hits.end (); iter != end; ++iter)
+      {
+        Xapian::Document doc(iter.get_document ());
+        double unserialized =
+          Xapian::sortable_unserialise (doc.get_value (VALUE_EVENT_ID));
+        unsigned event_id = static_cast<unsigned>(unserialized);
+        event_ids.push_back (event_id);
+
+        double rank = iter.get_percent () / 100.;
+        relevancy_arr.push_back (rank);
+      }
+
+      results = zeitgeist_db_reader_get_events (zg_reader,
+                                                &event_ids[0],
+                                                event_ids.size (),
+                                                NULL,
+                                                error);
+
+      if (results->len != relevancy_arr.size ())
+      {
+        g_warning ("Results don't match relevancies!");
+        g_set_error_literal (error,
+                             ZEITGEIST_ENGINE_ERROR,
+                             ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR,
+                             "Internal database error");
+        return NULL;
+      }
+
+      if (relevancies)
+      {
+        *relevancies = (gdouble*) g_memdup (&relevancy_arr[0],
+                                            sizeof (gdouble) * results->len);
+      }
+      if (relevancies_size)
+      {
+        *relevancies_size = relevancy_arr.size ();
+      }
+    }
+    else
+    {
+      g_set_error_literal (error,
+                           ZEITGEIST_ENGINE_ERROR,
+                           ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
+                           "Only RELEVANCY result type is supported");
+      /*
+       * perhaps something like this could be used here?
+      std::map<unsigned, gdouble> relevancy_map;
+      foreach (...)
+      {
+        double rank = iter.get_percent () / 100.;
+        if (rank > relevancy_map[event_id])
+        {
+          relevancy_map[event_id] = rank;
+        }
+      }
+      */
+    }
+
+    if (matches)
+    {
+      *matches = hitcount;
+    }
+  }
+  catch (Xapian::Error const& e)
+  {
+    g_warning ("Failed to search index: %s", e.get_msg ().c_str ());
     g_set_error_literal (error,
                          ZEITGEIST_ENGINE_ERROR,
                          ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR,

=== modified file 'extensions/fts++/indexer.h'
--- extensions/fts++/indexer.h	2012-02-09 09:37:48 +0000
+++ extensions/fts++/indexer.h	2012-02-10 11:30:52 +0000
@@ -77,7 +77,7 @@
   void DeleteEvent (guint32 event_id);
   void SetDbMetadata (std::string const& key, std::string const& value);
 
-  GPtrArray* Search (const gchar *search_string,
+  GPtrArray* Search (const gchar *search,
                      ZeitgeistTimeRange *time_range,
                      GPtrArray *templates,
                      guint offset,
@@ -85,11 +85,26 @@
                      ZeitgeistResultType result_type,
                      guint *matches,
                      GError **error);
+  GPtrArray* SearchWithRelevancies (const gchar *search,
+                                    ZeitgeistTimeRange *time_range,
+                                    GPtrArray *templates,
+                                    guint offset,
+                                    guint count,
+                                    ZeitgeistResultType result_type,
+                                    gdouble **relevancies,
+                                    gint *relevancies_size,
+                                    guint *matches,
+                                    GError **error);
 
 private:
   std::string ExpandType (std::string const& prefix, const gchar* unparsed_uri);
   std::string CompileEventFilterQuery (GPtrArray *templates);
   std::string CompileTimeRangeFilterQuery (gint64 start, gint64 end);
+  std::string CompileQueryString (const gchar *search,
+                                  ZeitgeistTimeRange *time_range,
+                                  GPtrArray *templates);
+
+  std::string PreprocessString (std::string const& input);
 
   void AddDocFilters (ZeitgeistEvent *event, Xapian::Document &doc);
   void IndexText (std::string const& text);

=== modified file 'extensions/fts++/stringutils.cpp'
--- extensions/fts++/stringutils.cpp	2012-02-09 09:32:33 +0000
+++ extensions/fts++/stringutils.cpp	2012-02-10 11:54:32 +0000
@@ -17,9 +17,14 @@
  * Authored by Mikkel Kamstrup Erlandsen <mikkel.kamstrup@xxxxxxxxx>
  *
  */
+
+#include "stringutils.h"
 #include <string>
+#include <algorithm>
 
-#include "stringutils.h"
+#ifdef HAVE_DEE_ICU
+#include <dee-icu.h>
+#endif
 
 using namespace std;
 
@@ -123,6 +128,87 @@
   }
 }
 
+string RemoveUnderscores (string const &input)
+{
+  string result (input);
+  std::replace (result.begin (), result.end (), '_', ' ');
+
+  return result;
+}
+
+static bool is_digit (char c) { return c >= '0' && c <= '9'; }
+
+size_t CountDigits (string const &input)
+{
+  return std::count_if (input.begin (), input.end (), is_digit);
+}
+
+static GRegex *camelcase_matcher = NULL;
+
+static gboolean
+matcher_cb (const GMatchInfo *match_info, GString *result, gpointer user_data)
+{
+  gint start_pos;
+  g_match_info_fetch_pos (match_info, 0, &start_pos, NULL);
+  if (start_pos != 0) g_string_append_c (result, ' ');
+  gchar *word = g_match_info_fetch (match_info, 0);
+  g_string_append (result, word);
+  g_free (word);
+
+  return FALSE;
+}
+
+string UnCamelcase (string const &input)
+{
+  if (camelcase_matcher == NULL)
+  {
+    camelcase_matcher = g_regex_new ("(?<=^|[[:lower:]])[[:upper:]]+[^[:upper:]]+", G_REGEX_OPTIMIZE, (GRegexMatchFlags) 0, NULL);
+    if (camelcase_matcher == NULL) g_critical ("Unable to create matcher!");
+  }
+
+  gchar *result = g_regex_replace_eval (camelcase_matcher, input.c_str (),
+                                        input.length (), 0,
+                                        (GRegexMatchFlags) 0,
+                                        matcher_cb, NULL, NULL);
+
+  string ret (result);
+  g_free (result);
+  return ret;
+}
+
+#ifdef HAVE_DEE_ICU
+static DeeICUTermFilter *icu_filter = NULL;
+
+/**
+ * Use ascii folding filter on the input text and return folded version
+ * of the original string.
+ *
+ * Note that if the folded version is exactly the same as the original,
+ * an empty string will be returned.
+ */
+string AsciiFold (string const& input)
+{
+  if (icu_filter == NULL)
+  {
+    icu_filter = dee_icu_term_filter_new_ascii_folder ();
+    if (icu_filter == NULL) return "";
+  }
+
+  // FIXME: check first if the input contains any non-ascii chars?
+
+  gchar *folded = dee_icu_term_filter_apply (icu_filter, input.c_str ());
+  string result (folded);
+  g_free (folded);
+
+  return result == input ? "" : result;
+}
+#else
+string AsciiFold (string const& input)
+{
+  return "";
+}
+#endif
+
 } /* namespace StringUtils */
 
 } /* namespace ZeitgeistFTS */

=== modified file 'extensions/fts++/stringutils.h'
--- extensions/fts++/stringutils.h	2012-02-09 09:32:33 +0000
+++ extensions/fts++/stringutils.h	2012-02-10 10:19:52 +0000
@@ -37,6 +37,14 @@
                std::string &path,
                std::string &basename);
 
+std::string RemoveUnderscores (std::string const &input);
+
+size_t CountDigits (std::string const &input);
+
+std::string UnCamelcase (std::string const &input);
+
+std::string AsciiFold (std::string const& input);
+
 } /* namespace StringUtils */
 
 } /* namespace ZeitgeistFTS */

=== modified file 'extensions/fts++/test/Makefile.am'
--- extensions/fts++/test/Makefile.am	2012-02-08 18:54:58 +0000
+++ extensions/fts++/test/Makefile.am	2012-02-09 15:32:36 +0000
@@ -25,3 +25,8 @@
   -lxapian \
   $(NULL)
 
+if HAVE_DEE_ICU
+AM_CPPFLAGS += $(DEE_ICU_CFLAGS)
+test_fts_LDADD += $(DEE_ICU_LIBS)
+endif
+

=== modified file 'extensions/fts++/test/test-indexer.cpp'
--- extensions/fts++/test/test-indexer.cpp	2012-02-09 09:32:33 +0000
+++ extensions/fts++/test/test-indexer.cpp	2012-02-10 12:07:27 +0000
@@ -145,6 +145,26 @@
   return event;
 }
 
+static ZeitgeistEvent* create_test_event5 (void)
+{
+  ZeitgeistEvent *event = zeitgeist_event_new ();
+  ZeitgeistSubject *subject = zeitgeist_subject_new ();
+  
+  zeitgeist_subject_set_interpretation (subject, ZEITGEIST_NFO_SOURCE_CODE);
+  zeitgeist_subject_set_manifestation (subject, ZEITGEIST_NFO_FILE_DATA_OBJECT);
+  zeitgeist_subject_set_uri (subject, "file:///home/username/projects/GLibSignalImplementation.cpp");
+  zeitgeist_subject_set_text (subject, "Because c++ is awesome");
+  zeitgeist_subject_set_mimetype (subject, "text/x-c++src");
+
+  zeitgeist_event_set_interpretation (event, ZEITGEIST_ZG_CREATE_EVENT);
+  zeitgeist_event_set_manifestation (event, ZEITGEIST_ZG_USER_ACTIVITY);
+  zeitgeist_event_set_actor (event, "application://gedit.desktop");
+  zeitgeist_event_add_subject (event, subject);
+
+  g_object_unref (subject);
+  return event;
+}
+
 // Steals the event, ref it if you want to keep it
 static guint
 index_event (Fixture *fix, ZeitgeistEvent *event)
@@ -426,6 +446,71 @@
 }
 
 static void
+test_simple_underscores (Fixture *fix, gconstpointer data)
+{
+  guint matches;
+  guint event_id;
+  ZeitgeistEvent* event;
+  ZeitgeistSubject *subject;
+
+  // add test events to DBs
+  index_event (fix, create_test_event1 ());
+  index_event (fix, create_test_event2 ());
+  index_event (fix, create_test_event3 ());
+  event_id = index_event (fix, create_test_event4 ());
+
+  GPtrArray *results =
+    zeitgeist_indexer_search (fix->indexer,
+                              "fabulo*",
+                              zeitgeist_time_range_new_anytime (),
+                              g_ptr_array_new (),
+                              0,
+                              10,
+                              ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS,
+                              &matches,
+                              NULL);
+
+  g_assert_cmpuint (matches, >, 0);
+  g_assert_cmpuint (results->len, ==, 1);
+
+  event = (ZeitgeistEvent*) results->pdata[0];
+  g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id);
+}
+
+static void
+test_simple_camelcase (Fixture *fix, gconstpointer data)
+{
+  guint matches;
+  guint event_id;
+  ZeitgeistEvent* event;
+  ZeitgeistSubject *subject;
+
+  // add test events to DBs
+  index_event (fix, create_test_event1 ());
+  index_event (fix, create_test_event2 ());
+  index_event (fix, create_test_event3 ());
+  index_event (fix, create_test_event4 ());
+  event_id = index_event (fix, create_test_event5 ());
+
+  GPtrArray *results =
+    zeitgeist_indexer_search (fix->indexer,
+                              "signal",
+                              zeitgeist_time_range_new_anytime (),
+                              g_ptr_array_new (),
+                              0,
+                              10,
+                              ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS,
+                              &matches,
+                              NULL);
+
+  g_assert_cmpuint (matches, >, 0);
+  g_assert_cmpuint (results->len, ==, 1);
+
+  event = (ZeitgeistEvent*) results->pdata[0];
+  g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id);
+}
+
+static void
 test_simple_cjk (Fixture *fix, gconstpointer data)
 {
   guint matches;
@@ -517,6 +602,10 @@
               setup, test_simple_noexpand, teardown);
   g_test_add ("/Zeitgeist/FTS/Indexer/SimpleNoexpandValid", Fixture, 0,
               setup, test_simple_noexpand_valid, teardown);
+  g_test_add ("/Zeitgeist/FTS/Indexer/SimpleUnderscores", Fixture, 0,
+              setup, test_simple_underscores, teardown);
+  g_test_add ("/Zeitgeist/FTS/Indexer/SimpleCamelcase", Fixture, 0,
+              setup, test_simple_camelcase, teardown);
   g_test_add ("/Zeitgeist/FTS/Indexer/URLUnescape", Fixture, 0,
               setup, test_simple_url_unescape, teardown);
   g_test_add ("/Zeitgeist/FTS/Indexer/IDNSupport", Fixture, 0,

=== modified file 'extensions/fts++/test/test-stringutils.cpp'
--- extensions/fts++/test/test-stringutils.cpp	2012-02-09 09:32:33 +0000
+++ extensions/fts++/test/test-stringutils.cpp	2012-02-10 11:54:32 +0000
@@ -163,6 +163,91 @@
   g_assert_cmpstr ("type=A", ==, query.c_str ());
 }
 
+static void
+test_ascii_fold (Fixture *fix, gconstpointer data)
+{
+  std::string folded;
+
+  folded = StringUtils::AsciiFold ("");
+  g_assert_cmpstr ("", ==, folded.c_str ());
+
+  // if the original matches the folded version, AsciiFold returns ""
+  folded = StringUtils::AsciiFold ("a");
+  g_assert_cmpstr ("", ==, folded.c_str ());
+
+  folded = StringUtils::AsciiFold ("abcdef");
+  g_assert_cmpstr ("", ==, folded.c_str ());
+
+  folded = StringUtils::AsciiFold ("å");
+  g_assert_cmpstr ("a", ==, folded.c_str ());
+
+  folded = StringUtils::AsciiFold ("åå");
+  g_assert_cmpstr ("aa", ==, folded.c_str ());
+
+  folded = StringUtils::AsciiFold ("aåaåa");
+  g_assert_cmpstr ("aaaaa", ==, folded.c_str ());
+}
+
+static void
+test_underscores (Fixture *fix, gconstpointer data)
+{
+  g_assert_cmpstr ("", ==, StringUtils::RemoveUnderscores ("").c_str ());
+
+  g_assert_cmpstr (" ", ==, StringUtils::RemoveUnderscores ("_").c_str ());
+
+  g_assert_cmpstr ("   ", ==, StringUtils::RemoveUnderscores ("___").c_str ());
+
+  g_assert_cmpstr ("abcd", ==, StringUtils::RemoveUnderscores ("abcd").c_str ());
+
+  g_assert_cmpstr (" abcd ", ==, StringUtils::RemoveUnderscores ("_abcd_").c_str ());
+
+  g_assert_cmpstr ("a b c d", ==, StringUtils::RemoveUnderscores ("a_b_c_d").c_str ());
+}
+
+static void
+test_uncamelcase (Fixture *fix, gconstpointer data)
+{
+  g_assert_cmpstr ("", ==, StringUtils::UnCamelcase ("").c_str ());
+
+  g_assert_cmpstr ("abcd", ==, StringUtils::UnCamelcase ("abcd").c_str ());
+
+  g_assert_cmpstr ("Abcd", ==, StringUtils::UnCamelcase ("Abcd").c_str ());
+
+  g_assert_cmpstr ("ABCD", ==, StringUtils::UnCamelcase ("ABCD").c_str ());
+
+  g_assert_cmpstr ("ABcd", ==, StringUtils::UnCamelcase ("ABcd").c_str ());
+
+  g_assert_cmpstr ("Abcd Ef", ==, StringUtils::UnCamelcase ("AbcdEf").c_str ());
+
+  g_assert_cmpstr ("Text Editor", ==, StringUtils::UnCamelcase ("Text Editor").c_str ());
+
+  g_assert_cmpstr ("py Karaoke", ==, StringUtils::UnCamelcase ("pyKaraoke").c_str ());
+
+  g_assert_cmpstr ("Zeitgeist Project", ==, StringUtils::UnCamelcase ("ZeitgeistProject").c_str ());
+
+  g_assert_cmpstr ("Very Nice Camel Case Text", ==, StringUtils::UnCamelcase ("VeryNiceCamelCaseText").c_str ());
+
+  g_assert_cmpstr ("Ňeedš Ťo Wórk Óń Útf Čhářacters As WelL", ==,
+      StringUtils::UnCamelcase ("ŇeedšŤoWórkÓńÚtfČhářactersAsWelL").c_str ());
+}
+
+static void
+test_count_digits (Fixture *fix, gconstpointer data)
+{
+  g_assert_cmpuint (0, ==, StringUtils::CountDigits (""));
+
+  g_assert_cmpuint (0, ==, StringUtils::CountDigits ("abcdefghijklmnopqrstuvwxyz"));
+
+  g_assert_cmpuint (10, ==, StringUtils::CountDigits ("0123456789"));
+
+  g_assert_cmpuint (1, ==, StringUtils::CountDigits ("abc3"));
+
+  g_assert_cmpuint (3, ==, StringUtils::CountDigits ("::123__poa//weee"));
+
+  g_assert_cmpuint (5, ==, StringUtils::CountDigits ("PCN30129.JPG"));
+
+}
+
 G_BEGIN_DECLS
 
 void test_stringutils_create_suite (void)
@@ -173,6 +258,16 @@
               setup, test_mangle, teardown);
   g_test_add ("/Zeitgeist/FTS/StringUtils/SplitUri", Fixture, 0,
               setup, test_split, teardown);
+  g_test_add ("/Zeitgeist/FTS/StringUtils/RemoveUnderscores", Fixture, 0,
+              setup, test_underscores, teardown);
+  g_test_add ("/Zeitgeist/FTS/StringUtils/UnCamelcase", Fixture, 0,
+              setup, test_uncamelcase, teardown);
+  g_test_add ("/Zeitgeist/FTS/StringUtils/CountDigits", Fixture, 0,
+              setup, test_count_digits, teardown);
+#ifdef HAVE_DEE_ICU
+  g_test_add ("/Zeitgeist/FTS/StringUtils/AsciiFold", Fixture, 0,
+              setup, test_ascii_fold, teardown);
+#endif
 }
 
 G_END_DECLS

=== modified file 'extensions/fts++/zeitgeist-fts.vala'
--- extensions/fts++/zeitgeist-fts.vala	2012-02-09 09:32:33 +0000
+++ extensions/fts++/zeitgeist-fts.vala	2012-02-09 18:34:36 +0000
@@ -132,6 +132,23 @@
             events = Events.to_variant (results);
         }
 
+        public async void search_with_relevancies (
+                                  string query_string, Variant time_range,
+                                  Variant filter_templates,
+                                  uint offset, uint count, uint result_type,
+                                  out Variant events, out double[] relevancies,
+                                  out uint matches)
+            throws Error
+        { // Decodes the Variant arguments, delegates to the indexer, re-encodes the results.
+            var tr = new TimeRange.from_variant (time_range); // Variant -> TimeRange
+            var templates = Events.from_variant (filter_templates); // Variant -> event templates
+            var results = instance.indexer.search_with_relevancies ( // fills relevancies and matches directly
+                    query_string, tr, templates, offset, count,
+                    (ResultType) result_type, out relevancies, out matches); // uint cast to ResultType
+            // Pack the indexer's results into the "events" out parameter.
+            events = Events.to_variant (results);
+        }
+
         private static void name_acquired_callback (DBusConnection conn)
         {
             name_acquired = true;

=== modified file 'extensions/fts.vala'
--- extensions/fts.vala	2012-02-07 12:47:44 +0000
+++ extensions/fts.vala	2012-02-10 09:35:31 +0000
@@ -31,6 +31,14 @@
             uint offset, uint count, uint result_type,
             [DBus (signature = "a(asaasay)")] out Variant events,
             out uint matches) throws Error;
+        public abstract async void search_with_relevancies ( // async; results come back through three out parameters
+            string query_string,
+            [DBus (signature = "(xx)")] Variant time_range, // D-Bus struct of two int64 values
+            [DBus (signature = "a(asaasay)")] Variant filter_templates, // D-Bus array of (as, aas, ay) structs
+            uint offset, uint count, uint result_type,
+            [DBus (signature = "a(asaasay)")] out Variant events, // out: same wire type as filter_templates
+            out double[] relevancies, // out: presumably one score per returned event -- TODO confirm
+            out uint matches) throws Error; // out: presumably the total number of matches -- TODO confirm
     }
 
     /* Because of a Vala bug we have to define the proxy interface outside of
@@ -55,6 +63,7 @@
         private const string INDEXER_NAME = "org.gnome.zeitgeist.SimpleIndexer";
 
         private RemoteSimpleIndexer siin;
+        private bool siin_connection_failed = false;
         private uint registration_id;
         private MonitorManager? notifier;
 
@@ -67,6 +76,8 @@
         {
             if (Utils.using_in_memory_database ()) return;
 
+            // FIXME: check dbus and see if fts is installed?
+
             // installing a monitor from the daemon will ensure that we don't
             // miss any notifications that would be emitted in between
             // zeitgeist start and fts daemon start
@@ -109,23 +120,40 @@
             try
             {
                 siin = conn.get_proxy.end<RemoteSimpleIndexer> (res);
+                siin_connection_failed = false;
             }
             catch (IOError err)
             {
+                siin_connection_failed = true;
                 warning ("%s", err.message);
             }
         }
 
-        public async void search (string query_string, Variant time_range,
-            Variant filter_templates, uint offset, uint count, uint result_type,
-            out Variant events, out uint matches) throws Error
+        public async void wait_for_proxy () throws Error // polls for the siin proxy, throws if it never appears
         {
+            int i = 0; // number of polling rounds done so far
+            while (this.siin == null && i < 6 && !siin_connection_failed) // at most 6 rounds; stop early on a failed connection
+            {
+                Timeout.add_full (Priority.DEFAULT_IDLE, 250, // resume this coroutine in 250 ms
+                                  wait_for_proxy.callback);
+                i++;
+                yield; // suspend until the timeout source fires
+            }
+            // Either the proxy arrived, the connection failed, or 6 x 250 ms elapsed.
             if (siin == null || !(siin is DBusProxy))
             {
                 // FIXME: queue until we have the proxy
                 throw new EngineError.DATABASE_ERROR (
                     "Not connected to SimpleIndexer");
             }
+        }
+
+        public async void search (string query_string, Variant time_range,
+            Variant filter_templates, uint offset, uint count, uint result_type,
+            out Variant events, out uint matches) throws Error
+        {
+            if (siin == null) yield wait_for_proxy ();
+
             var timer = new Timer ();
             yield siin.search (query_string, time_range, filter_templates,
                                offset, count, result_type,
@@ -134,6 +162,24 @@
                 (uint) events.n_children (), matches, timer.elapsed ());
         }
 
+        public async void search_with_relevancies ( // forwards the query to the indexer proxy (siin)
+            string query_string, Variant time_range,
+            Variant filter_templates, uint offset, uint count, uint result_type,
+            out Variant events, out double[] relevancies, out uint matches)
+            throws Error
+        {
+            if (siin == null) yield wait_for_proxy (); // throws if the proxy does not become available
+            // The timer measures the proxy call for the debug message below.
+            var timer = new Timer ();
+            yield siin.search_with_relevancies ( // arguments are passed through unchanged
+                query_string, time_range, filter_templates,
+                offset, count, result_type,
+                out events, out relevancies, out matches);
+            // Log how many events came back, the reported match count and the elapsed time.
+            debug ("Got %u[/%u] results from indexer (in %f seconds)",
+                (uint) events.n_children (), matches, timer.elapsed ());
+        }
+
     }
 
     [ModuleInit]

=== modified file 'src/remote.vala'
--- src/remote.vala	2012-02-05 14:52:13 +0000
+++ src/remote.vala	2012-02-09 18:34:36 +0000
@@ -121,6 +121,13 @@
             uint offset, uint count, uint result_type,
             [DBus (signature = "a(asaasay)")] out Variant events,
             out uint matches) throws Error;
+        public abstract async void search_with_relevancies ( // async; results come back through three out parameters
+            string query_string,
+            [DBus (signature = "(xx)")] Variant time_range, // D-Bus struct of two int64 values
+            [DBus (signature = "a(asaasay)")] Variant filter_templates, // D-Bus array of (as, aas, ay) structs
+            uint offset, uint count, uint result_type,
+            [DBus (signature = "a(asaasay)")] out Variant events, // out: same wire type as filter_templates
+            out double[] relevancies, out uint matches) throws Error; // relevancies: presumably one score per event -- TODO confirm
     }
     
     /* FIXME: Remove this! Only here because of a bug in Vala (see ext-fts) */