← Back to team overview

zeitgeist team mailing list archive

[Merge] lp:~mhr3/zeitgeist/fts-secondary-sorting into lp:zeitgeist

 

Michal Hruby has proposed merging lp:~mhr3/zeitgeist/fts-secondary-sorting into lp:zeitgeist.

Requested reviews:
  Zeitgeist Framework Team (zeitgeist)

For more details, see:
https://code.launchpad.net/~mhr3/zeitgeist/fts-secondary-sorting/+merge/96479

Adds secondary sorting based on ResultType to the SearchWithRelevancies method.
-- 
https://code.launchpad.net/~mhr3/zeitgeist/fts-secondary-sorting/+merge/96479
Your team Zeitgeist Framework Team is requested to review the proposed merge of lp:~mhr3/zeitgeist/fts-secondary-sorting into lp:zeitgeist.
=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp	2012-03-07 16:08:26 +0000
+++ extensions/fts++/indexer.cpp	2012-03-07 22:37:19 +0000
@@ -23,6 +23,7 @@
 #include <xapian.h>
 #include <queue>
 #include <vector>
+#include <cmath>
 
 #include <gio/gio.h>
 #include <gio/gdesktopappinfo.h>
@@ -804,7 +805,6 @@
 
       if (event_templates->len > 0)
       {
-        ZeitgeistTimeRange *time_range = zeitgeist_time_range_new_anytime ();
         results = zeitgeist_db_reader_find_events (zg_reader,
                                                    time_range,
                                                    event_templates,
@@ -813,8 +813,6 @@
                                                    result_type,
                                                    NULL,
                                                    error);
-
-        g_object_unref (time_range);
       }
       else
       {
@@ -841,6 +839,34 @@
   return results;
 }
 
+// GCompareDataFunc for g_ptr_array_sort_with_data(): a and b point at
+// ZeitgeistEvent* slots, user_data at a std::map of event id -> relevancy.
+// Sorts by descending relevancy; ids missing from the map count as 0.0.
+static gint
+sort_events_by_relevance (gconstpointer a, gconstpointer b, gpointer user_data)
+{
+  typedef std::map<unsigned, gdouble> RelevancyMap;
+  RelevancyMap const& relevancy_map = *(static_cast<RelevancyMap*> (user_data));
+  ZeitgeistEvent *const *e1 = static_cast<ZeitgeistEvent *const *> (a);
+  ZeitgeistEvent *const *e2 = static_cast<ZeitgeistEvent *const *> (b);
+  RelevancyMap::const_iterator it;
+  gdouble rel1 = 0.0, rel2 = 0.0;
+
+  it = relevancy_map.find (zeitgeist_event_get_id (*e1));
+  if (it != relevancy_map.end ()) rel1 = it->second;
+  it = relevancy_map.find (zeitgeist_event_get_id (*e2));
+  if (it != relevancy_map.end ()) rel2 = it->second;
+
+  gdouble delta = rel1 - rel2;
+  // Equal relevancy: report a tie instead of comparing slot addresses, which
+  // say nothing about the original order once the sort has moved elements.
+  // A stable sort (guaranteed by GLib since 2.32) then keeps the input order.
+  if (std::fabs (delta) < 0.00001) return 0;
+
+  // we want the higher ranked events first
+  return (delta < 0) ? 1 : -1;
+}
+
 GPtrArray* Indexer::SearchWithRelevancies (const gchar *search,
                                            ZeitgeistTimeRange *time_range,
                                            GPtrArray *templates,
@@ -860,24 +886,51 @@
 
     guint maxhits = count;
 
-    if (result_type == RELEVANCY_RESULT_TYPE)
-    {
-      enquire->set_sort_by_relevance ();
-    }
-    else
-    {
-      enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
-    }
-
     if (storage_state != ZEITGEIST_STORAGE_STATE_ANY)
     {
       g_set_error_literal (error,
                            ZEITGEIST_ENGINE_ERROR,
                            ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
-                           "Only ANY stogate state is supported");
+                           "Only ANY storage state is supported");
       return NULL;
     }
 
+    if (result_type == RELEVANCY_RESULT_TYPE)
+    {
+      enquire->set_sort_by_relevance ();
+    }
+    else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
+    {
+      enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, true);
+      enquire->set_collapse_key (VALUE_EVENT_ID);
+    }
+    else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS ||
+        result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
+    {
+      enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, true);
+      enquire->set_collapse_key (VALUE_URI_HASH);
+    }
+    else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
+    {
+      // FIXME: not really correct but close :)
+      enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, true);
+      enquire->set_collapse_key (VALUE_URI_HASH);
+      maxhits *= 3;
+    }
+    else
+    {
+      // throw an error for these?
+      enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, true);
+      enquire->set_collapse_key (VALUE_EVENT_ID);
+      maxhits *= 3;
+    }
+
     Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));
     enquire->set_query (q);
     Xapian::MSet hits (enquire->get_mset (offset, maxhits));
@@ -906,6 +959,8 @@
                                                 NULL,
                                                 error);
 
+      if (error && *error) return NULL;
+
       if (results->len != relevancy_arr.size ())
       {
         g_warning ("Results don't match relevancies!");
@@ -928,22 +983,70 @@
     }
     else
     {
-      g_set_error_literal (error,
-                           ZEITGEIST_ENGINE_ERROR,
-                           ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
-                           "Only RELEVANCY result type is supported");
-      /*
-       * perhaps something like this could be used here?
+      // we'll use the result type only for secondary sorting, relevancy
+      // is still primary!
+      GPtrArray *event_templates;
+      event_templates = g_ptr_array_new_with_free_func (g_object_unref);
       std::map<unsigned, gdouble> relevancy_map;
-      foreach (...)
+      Xapian::MSetIterator iter, end;
+      for (iter = hits.begin (), end = hits.end (); iter != end; ++iter)
       {
+        Xapian::Document doc(iter.get_document ());
+        double unserialized =
+          Xapian::sortable_unserialise (doc.get_value (VALUE_EVENT_ID));
+        unsigned event_id = static_cast<unsigned>(unserialized);
+
+        ZeitgeistEvent *event = zeitgeist_event_new ();
+        zeitgeist_event_set_id (event, event_id);
+        g_ptr_array_add (event_templates, event);
+
         double rank = iter.get_percent () / 100.;
         if (rank > relevancy_map[event_id])
         {
           relevancy_map[event_id] = rank;
         }
       }
-      */
+
+      if (event_templates->len > 0)
+      {
+        // let's ask zeitgeist for sorting based on result type
+        results = zeitgeist_db_reader_find_events (zg_reader,
+                                                   time_range,
+                                                   event_templates,
+                                                   ZEITGEIST_STORAGE_STATE_ANY,
+                                                   0,
+                                                   result_type,
+                                                   NULL,
+                                                   error);
+
+        if (error && *error) return NULL;
+
+        g_ptr_array_sort_with_data (results, sort_events_by_relevance,
+                                    &relevancy_map);
+
+        if (relevancies)
+        {
+          *relevancies = g_new (gdouble, results->len);
+          for (unsigned i = 0; i < results->len; i++)
+          {
+            ZeitgeistEvent *event = (ZeitgeistEvent*) g_ptr_array_index (results, i);
+            (*relevancies)[i] = relevancy_map[zeitgeist_event_get_id (event)];
+          }
+        }
+
+        if (relevancies_size)
+        {
+          *relevancies_size = results->len;
+        }
+      }
+      else
+      {
+        results = g_ptr_array_new ();
+        if (relevancies) *relevancies = NULL;
+        if (relevancies_size) *relevancies_size = 0;
+      }
+
+      g_ptr_array_unref (event_templates);
     }
 
     if (matches)

=== modified file 'extensions/fts++/test/test-indexer.cpp'
--- extensions/fts++/test/test-indexer.cpp	2012-02-14 16:56:04 +0000
+++ extensions/fts++/test/test-indexer.cpp	2012-03-07 22:37:19 +0000
@@ -163,6 +163,26 @@
   return event;
 }
 
+// Test fixture: a MODIFY/USER_ACTIVITY event from libreoffice-impress with
+// one PDF presentation subject whose text is left unset (NULL).
+static ZeitgeistEvent* create_test_event6 (void)
+{
+  ZeitgeistEvent *ev = zeitgeist_event_new ();
+  ZeitgeistSubject *subj = zeitgeist_subject_new ();
+  zeitgeist_subject_set_uri (subj, "file:///home/username/Documents/CamelCasePresentation.pdf");
+  zeitgeist_subject_set_mimetype (subj, "application/pdf");
+  zeitgeist_subject_set_text (subj, NULL);
+  zeitgeist_subject_set_interpretation (subj, ZEITGEIST_NFO_PRESENTATION);
+  zeitgeist_subject_set_manifestation (subj, ZEITGEIST_NFO_FILE_DATA_OBJECT);
+
+  zeitgeist_event_set_actor (ev, "application://libreoffice-impress.desktop");
+  zeitgeist_event_set_interpretation (ev, ZEITGEIST_ZG_MODIFY_EVENT);
+  zeitgeist_event_set_manifestation (ev, ZEITGEIST_ZG_USER_ACTIVITY);
+  zeitgeist_event_add_subject (ev, subj);
+  g_object_unref (subj);
+  return ev;
+}
+
 // Steals the event, ref it if you want to keep it
 static guint
 index_event (Fixture *fix, ZeitgeistEvent *event)
@@ -172,6 +192,7 @@
   guint *event_ids;
   int num_events_inserted;
 
+  zeitgeist_event_set_timestamp (event, zeitgeist_timestamp_now ());
   // add event to DBs
   events = g_ptr_array_new ();
   g_ptr_array_add (events, event);
@@ -586,6 +607,86 @@
   g_assert_cmpstr (zeitgeist_subject_get_text (subject), ==, "IDNwiki");
 }
 
+// Searching for "text" with result type 100 must return exactly the first
+// indexed event, with a relevancy of at least 1.0 and subject text "text".
+static void
+test_simple_relevancies_query (Fixture *fix, gconstpointer data)
+{
+  guint num_matches;
+  gint num_relevancies;
+  gdouble *scores;
+
+  // add test events to DBs; only the first one's id is checked below
+  guint first_id = index_event (fix, create_test_event1 ());
+  index_event (fix, create_test_event2 ());
+  index_event (fix, create_test_event3 ());
+  index_event (fix, create_test_event4 ());
+
+  GPtrArray *found =
+    zeitgeist_indexer_search_with_relevancies (fix->indexer,
+                              "text",
+                              zeitgeist_time_range_new_anytime (),
+                              g_ptr_array_new (),
+                              ZEITGEIST_STORAGE_STATE_ANY,
+                              0,
+                              10,
+                              (ZeitgeistResultType) 100,
+                              &scores, &num_relevancies,
+                              &num_matches,
+                              NULL);
+
+  g_assert_cmpuint (num_matches, >, 0);
+  g_assert_cmpuint (found->len, ==, 1);
+  g_assert_cmpint (num_relevancies, ==, 1);
+  g_assert_cmpfloat (scores[0], >=, 1.0);
+
+  ZeitgeistEvent *hit = (ZeitgeistEvent*) g_ptr_array_index (found, 0);
+  g_assert_cmpuint (zeitgeist_event_get_id (hit), ==, first_id);
+
+  ZeitgeistSubject *subject = (ZeitgeistSubject*)
+    g_ptr_array_index (zeitgeist_event_get_subjects (hit), 0);
+  g_assert_cmpstr (zeitgeist_subject_get_text (subject), ==, "text");
+}
+
+// Indexes test events 1-6, then checks that searching for "user*" with
+// MOST_RECENT_SUBJECTS yields three events with test event 6 first.
+static void
+test_simple_relevancies_subject_query (Fixture *fix, gconstpointer data)
+{
+  guint matches;
+  gdouble *relevancies;
+  gint relevancies_size;
+
+  index_event (fix, create_test_event1 ());
+  index_event (fix, create_test_event2 ());
+  index_event (fix, create_test_event3 ());
+  index_event (fix, create_test_event4 ());
+  index_event (fix, create_test_event5 ());
+  guint event_id6 = index_event (fix, create_test_event6 ());
+
+  GPtrArray *results =
+    zeitgeist_indexer_search_with_relevancies (fix->indexer,
+                              "user*",
+                              zeitgeist_time_range_new_anytime (),
+                              g_ptr_array_new (),
+                              ZEITGEIST_STORAGE_STATE_ANY,
+                              0,
+                              10,
+                              ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS,
+                              &relevancies, &relevancies_size,
+                              &matches,
+                              NULL);
+
+  g_assert_cmpuint (matches, >, 0);
+  g_assert_cmpuint (results->len, ==, 3);
+  g_assert_cmpint (relevancies_size, ==, 3);
+
+  // we're creating event 6 after 5 and 4, so it has to be more recent (but it seems
+  // that number of terms indexed matters as well, so careful with the relevancies)
+  g_assert_cmpuint (event_id6, ==,
+      zeitgeist_event_get_id ((ZeitgeistEvent*) results->pdata[0]));
+}
+
 G_BEGIN_DECLS
 
 static void discard_message (const gchar *domain,
@@ -619,6 +720,10 @@
               setup, test_simple_idn_support, teardown);
   g_test_add ("/Zeitgeist/FTS/Indexer/CJK", Fixture, 0,
               setup, test_simple_cjk, teardown);
+  g_test_add ("/Zeitgeist/FTS/Indexer/Relevancies", Fixture, 0,
+              setup, test_simple_relevancies_query, teardown);
+  g_test_add ("/Zeitgeist/FTS/Indexer/RelevanciesSubject", Fixture, 0,
+              setup, test_simple_relevancies_subject_query, teardown);
 
   // get rid of the "rebuilding index..." messages
   g_log_set_handler (NULL, G_LOG_LEVEL_MESSAGE, discard_message, NULL);


Follow ups