← Back to team overview

zeitgeist team mailing list archive

[Branch ~zeitgeist/zeitgeist/bluebird] Rev 423: Merge lp:~mhr3/zeitgeist/fts-secondary-sorting

 

Merge authors:
  Michal Hruby (mhr3)
Related merge proposals:
  https://code.launchpad.net/~mhr3/zeitgeist/fts-secondary-sorting/+merge/96479
  proposed by: Michal Hruby (mhr3)
  review: Approve - Siegfried Gevatter (rainct)
------------------------------------------------------------
revno: 423 [merge]
committer: Michal Hruby <michal.mhr@xxxxxxxxx>
branch nick: zeitgeist
timestamp: Wed 2012-03-14 13:34:10 +0100
message:
  Merge lp:~mhr3/zeitgeist/fts-secondary-sorting
modified:
  extensions/fts++/indexer.cpp
  extensions/fts++/test/test-indexer.cpp


--
lp:zeitgeist
https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird

Your team Zeitgeist Framework Team is subscribed to branch lp:zeitgeist.
To unsubscribe from this branch go to https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird/+edit-subscription
=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp	2012-03-12 14:22:16 +0000
+++ extensions/fts++/indexer.cpp	2012-03-14 12:31:51 +0000
@@ -824,7 +824,6 @@
 
       if (event_templates->len > 0)
       {
-        ZeitgeistTimeRange *time_range = zeitgeist_time_range_new_anytime ();
         results = zeitgeist_db_reader_find_events (zg_reader,
                                                    time_range,
                                                    event_templates,
@@ -833,8 +832,6 @@
                                                    result_type,
                                                    NULL,
                                                    error);
-
-        g_object_unref (time_range);
       }
       else
       {
@@ -861,6 +858,208 @@
   return results;
 }
 
+static guint32*
+find_event_ids_for_combined_template (ZeitgeistDbReader *zg_reader,
+                                      ZeitgeistWhereClause *query_clause, // steals
+                                      GPtrArray *event_templates, // steals
+                                      guint count,
+                                      ZeitgeistResultType result_type,
+                                      gint *event_ids_length,
+                                      GError **error)
+{
+  g_return_val_if_fail (error == NULL || (error && *error == NULL), NULL);
+
+  ZeitgeistWhereClause *uri_where;
+  uri_where = zeitgeist_db_reader_get_where_clause_from_event_templates (
+      zg_reader, event_templates, error);
+  g_ptr_array_unref (event_templates);
+
+  zeitgeist_where_clause_extend (query_clause, uri_where);
+  g_object_unref (G_OBJECT (uri_where));
+
+  guint32 *event_ids;
+  event_ids = zeitgeist_db_reader_find_event_ids_for_clause (zg_reader,
+      query_clause, count, result_type, event_ids_length, error);
+
+  g_object_unref (query_clause);
+
+  return event_ids;
+}
+
+static GPtrArray*
+find_events_for_result_type_and_ids (ZeitgeistDbReader *zg_reader,
+                                     ZeitgeistTimeRange *time_range,
+                                     GPtrArray *templates,
+                                     ZeitgeistStorageState storage_state,
+                                     unsigned count,
+                                     ZeitgeistResultType result_type,
+                                     std::vector<unsigned> const& event_ids,
+                                     std::map<unsigned, gdouble> &relevancy_map,
+                                     GError **error)
+{
+  GPtrArray *results = NULL;
+  results = zeitgeist_db_reader_get_events (zg_reader,
+                                            const_cast<unsigned*>(&event_ids[0]),
+                                            event_ids.size (),
+                                            NULL,
+                                            error);
+
+  if (error && *error) return NULL;
+
+  if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
+      result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
+    return results;
+
+  if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
+      result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS ||
+      result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
+      result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
+  {
+    // need to get the uris from the events and do another find_events call
+    GPtrArray *event_templates;
+    event_templates = g_ptr_array_new_with_free_func (g_object_unref);
+    std::map<std::string, unsigned> remapper;
+
+    for (unsigned i = 0; i < results->len; i++)
+    {
+      ZeitgeistEvent* original_event = (ZeitgeistEvent*) results->pdata[i];
+      unsigned event_id = zeitgeist_event_get_id (original_event);
+      GPtrArray *subjects = zeitgeist_event_get_subjects (original_event);
+      if (subjects == NULL) continue;
+      for (unsigned j = 0; j < subjects->len; j++)
+      {
+        const gchar *subj_uri = zeitgeist_subject_get_uri ((ZeitgeistSubject*) subjects->pdata[j]);
+        if (subj_uri == NULL) continue;
+        remapper[subj_uri] = event_id;
+        ZeitgeistEvent *event = zeitgeist_event_new ();
+        ZeitgeistSubject *subject = zeitgeist_subject_new ();
+        zeitgeist_subject_set_uri (subject, subj_uri);
+        zeitgeist_event_add_subject (event, subject); // FIXME: leaks?
+        g_ptr_array_add (event_templates, event);
+      }
+    }
+
+    g_ptr_array_unref (results);
+
+    // construct custom where clause which combines the original template
+    // with the uris we found
+    ZeitgeistWhereClause *where;
+    where = zeitgeist_db_reader_get_where_clause_for_query (zg_reader,
+        time_range, templates, storage_state, error);
+
+    guint32 *real_event_ids;
+    gint real_event_ids_length;
+
+    real_event_ids = find_event_ids_for_combined_template (zg_reader,
+        where, event_templates, count, result_type, &real_event_ids_length,
+        error);
+
+    if (error && *error) return NULL;
+
+    results = zeitgeist_db_reader_get_events (zg_reader,
+                                              real_event_ids,
+                                              real_event_ids_length,
+                                              NULL,
+                                              error);
+
+    g_free (real_event_ids);
+    real_event_ids = NULL;
+
+    if (error && *error) return NULL;
+
+    // the event ids might have changed, we need to update the relevancy_map
+    for (unsigned i = 0; i < results->len; i++)
+    {
+      ZeitgeistEvent* original_event = (ZeitgeistEvent*) results->pdata[i];
+      unsigned event_id = zeitgeist_event_get_id (original_event);
+      GPtrArray *subjects = zeitgeist_event_get_subjects (original_event);
+      if (subjects == NULL) continue;
+      for (unsigned j = 0; j < subjects->len; j++)
+      {
+        const gchar *subj_uri = zeitgeist_subject_get_uri ((ZeitgeistSubject*) subjects->pdata[j]);
+        if (subj_uri == NULL) continue;
+        relevancy_map[event_id] = relevancy_map[remapper[subj_uri]];
+      }
+    }
+
+  }
+  else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+      result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN ||
+      result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
+      result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
+  {
+    // need to get the origins from the events and do another find_events call
+    GPtrArray *event_templates;
+    event_templates = g_ptr_array_new_with_free_func (g_object_unref);
+    std::map<std::string, unsigned> remapper;
+
+    for (unsigned i = 0; i < results->len; i++)
+    {
+      ZeitgeistEvent* original_event = (ZeitgeistEvent*) results->pdata[i];
+      unsigned event_id = zeitgeist_event_get_id (original_event);
+      GPtrArray *subjects = zeitgeist_event_get_subjects (original_event);
+      if (subjects == NULL) continue;
+      for (unsigned j = 0; j < subjects->len; j++)
+      {
+        const gchar *subj_origin = zeitgeist_subject_get_origin ((ZeitgeistSubject*) subjects->pdata[j]);
+        if (subj_origin == NULL) continue;
+        remapper[subj_origin] = event_id;
+        ZeitgeistEvent *event = zeitgeist_event_new ();
+        ZeitgeistSubject *subject = zeitgeist_subject_new ();
+        zeitgeist_subject_set_origin (subject, subj_origin);
+        zeitgeist_event_add_subject (event, subject); // FIXME: leaks?
+        g_ptr_array_add (event_templates, event);
+      }
+    }
+
+    g_ptr_array_unref (results);
+
+    // construct custom where clause which combines the original template
+    // with the origins we found
+    ZeitgeistWhereClause *where;
+    where = zeitgeist_db_reader_get_where_clause_for_query (zg_reader,
+        time_range, templates, storage_state, error);
+
+    guint32 *real_event_ids;
+    gint real_event_ids_length;
+
+    real_event_ids = find_event_ids_for_combined_template (zg_reader,
+        where, event_templates, count, result_type, &real_event_ids_length,
+        error);
+
+    if (error && *error) return NULL;
+
+    results = zeitgeist_db_reader_get_events (zg_reader,
+                                              real_event_ids,
+                                              real_event_ids_length,
+                                              NULL,
+                                              error);
+
+    if (error && *error) return NULL;
+
+    g_free (real_event_ids);
+    real_event_ids = NULL;
+
+    // the event ids might have changed, we need to update the relevancy_map
+    for (unsigned i = 0; i < results->len; i++)
+    {
+      ZeitgeistEvent* original_event = (ZeitgeistEvent*) results->pdata[i];
+      unsigned event_id = zeitgeist_event_get_id (original_event);
+      GPtrArray *subjects = zeitgeist_event_get_subjects (original_event);
+      if (subjects == NULL) continue;
+      for (unsigned j = 0; j < subjects->len; j++)
+      {
+        const gchar *subj_origin = zeitgeist_subject_get_origin ((ZeitgeistSubject*) subjects->pdata[j]);
+        if (subj_origin == NULL) continue;
+        relevancy_map[event_id] = relevancy_map[remapper[subj_origin]];
+      }
+    }
+
+  }
+
+  return results;
+}
+
 GPtrArray* Indexer::SearchWithRelevancies (const gchar *search,
                                            ZeitgeistTimeRange *time_range,
                                            GPtrArray *templates,
@@ -880,21 +1079,58 @@
 
     guint maxhits = count;
 
-    if (result_type == RELEVANCY_RESULT_TYPE)
-    {
-      enquire->set_sort_by_relevance ();
-    }
-    else
-    {
-      enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
-    }
-
     if (storage_state != ZEITGEIST_STORAGE_STATE_ANY)
     {
-      g_set_error_literal (error,
-                           ZEITGEIST_ENGINE_ERROR,
-                           ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
-                           "Only ANY stogate state is supported");
+      // FIXME: add support for this by grabbing (un)available storages
+      // from the storage table and appending them to the query
+      g_set_error_literal (error,
+                           ZEITGEIST_ENGINE_ERROR,
+                           ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
+                           "Only ANY storage state is supported");
+      return NULL;
+    }
+
+    bool reversed_sort =
+      result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
+      result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
+      result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
+      result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+      result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN;
+
+    if (result_type == RELEVANCY_RESULT_TYPE)
+    {
+      enquire->set_sort_by_relevance ();
+    }
+    else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
+    {
+      enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort);
+      enquire->set_collapse_key (VALUE_EVENT_ID);
+    }
+    else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS ||
+        result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
+    {
+      enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort);
+      enquire->set_collapse_key (VALUE_URI_HASH);
+    }
+    else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
+    {
+      // FIXME: not really correct but close :)
+      enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort);
+      enquire->set_collapse_key (VALUE_URI_HASH);
+      maxhits *= 3;
+    }
+    else
+    {
+      g_set_error_literal (error,
+                           ZEITGEIST_ENGINE_ERROR,
+                           ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
+                           "Requested result type is not supported");
       return NULL;
     }
 
@@ -926,6 +1162,8 @@
                                                 NULL,
                                                 error);
 
+      if (error && *error) return NULL;
+
       if (results->len != relevancy_arr.size ())
       {
         g_warning ("Results don't match relevancies!");
@@ -948,22 +1186,56 @@
     }
     else
     {
-      g_set_error_literal (error,
-                           ZEITGEIST_ENGINE_ERROR,
-                           ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
-                           "Only RELEVANCY result type is supported");
-      /*
-       * perhaps something like this could be used here?
+      std::vector<unsigned> event_ids;
       std::map<unsigned, gdouble> relevancy_map;
-      foreach (...)
+      Xapian::MSetIterator iter, end;
+      for (iter = hits.begin (), end = hits.end (); iter != end; ++iter)
       {
+        Xapian::Document doc(iter.get_document ());
+        double unserialized =
+          Xapian::sortable_unserialise (doc.get_value (VALUE_EVENT_ID));
+        unsigned event_id = static_cast<unsigned>(unserialized);
+
+        event_ids.push_back (event_id);
+
         double rank = iter.get_percent () / 100.;
         if (rank > relevancy_map[event_id])
         {
           relevancy_map[event_id] = rank;
         }
       }
-      */
+
+      results = find_events_for_result_type_and_ids (zg_reader, time_range,
+                                                     templates, storage_state,
+                                                     count, result_type,
+                                                     event_ids,
+                                                     relevancy_map, error);
+
+      if (error && *error) return NULL;
+
+      if (results == NULL)
+      {
+        results = g_ptr_array_new ();
+        if (relevancies) *relevancies = NULL;
+        if (relevancies_size) *relevancies_size = 0;
+      }
+      else
+      {
+        if (relevancies)
+        {
+          *relevancies = g_new (gdouble, results->len);
+          for (unsigned i = 0; i < results->len; i++)
+          {
+            ZeitgeistEvent *event = (ZeitgeistEvent*) g_ptr_array_index (results, i);
+            (*relevancies)[i] = relevancy_map[zeitgeist_event_get_id (event)];
+          }
+        }
+
+        if (relevancies_size)
+        {
+          *relevancies_size = results->len;
+        }
+      }
     }
 
     if (matches)

=== modified file 'extensions/fts++/test/test-indexer.cpp'
--- extensions/fts++/test/test-indexer.cpp	2012-02-14 16:56:04 +0000
+++ extensions/fts++/test/test-indexer.cpp	2012-03-11 18:58:01 +0000
@@ -163,6 +163,26 @@
   return event;
 }
 
+static ZeitgeistEvent* create_test_event6 (void)
+{
+  ZeitgeistEvent *event = zeitgeist_event_new ();
+  ZeitgeistSubject *subject = zeitgeist_subject_new ();
+  
+  zeitgeist_subject_set_interpretation (subject, ZEITGEIST_NFO_PRESENTATION);
+  zeitgeist_subject_set_manifestation (subject, ZEITGEIST_NFO_FILE_DATA_OBJECT);
+  zeitgeist_subject_set_uri (subject, "file:///home/username/Documents/CamelCasePresentation.pdf");
+  zeitgeist_subject_set_text (subject, NULL);
+  zeitgeist_subject_set_mimetype (subject, "application/pdf");
+
+  zeitgeist_event_set_interpretation (event, ZEITGEIST_ZG_MODIFY_EVENT);
+  zeitgeist_event_set_manifestation (event, ZEITGEIST_ZG_USER_ACTIVITY);
+  zeitgeist_event_set_actor (event, "application://libreoffice-impress.desktop");
+  zeitgeist_event_add_subject (event, subject);
+
+  g_object_unref (subject);
+  return event;
+}
+
 // Steals the event, ref it if you want to keep it
 static guint
 index_event (Fixture *fix, ZeitgeistEvent *event)
@@ -172,6 +192,7 @@
   guint *event_ids;
   int num_events_inserted;
 
+  zeitgeist_event_set_timestamp (event, zeitgeist_timestamp_now ());
   // add event to DBs
   events = g_ptr_array_new ();
   g_ptr_array_add (events, event);
@@ -586,6 +607,88 @@
   g_assert_cmpstr (zeitgeist_subject_get_text (subject), ==, "IDNwiki");
 }
 
+static void
+test_simple_relevancies_query (Fixture *fix, gconstpointer data)
+{
+  guint matches;
+  guint event_id;
+  gdouble *relevancies;
+  gint relevancies_size;
+  ZeitgeistEvent* event;
+ 
+  // add test events to DBs
+  event_id = index_event (fix, create_test_event1 ());
+  index_event (fix, create_test_event2 ());
+  index_event (fix, create_test_event3 ());
+  index_event (fix, create_test_event4 ());
+
+  GPtrArray *results =
+    zeitgeist_indexer_search_with_relevancies (fix->indexer,
+                              "text",
+                              zeitgeist_time_range_new_anytime (),
+                              g_ptr_array_new (),
+                              ZEITGEIST_STORAGE_STATE_ANY,
+                              0,
+                              10,
+                              (ZeitgeistResultType) 100,
+                              &relevancies, &relevancies_size,
+                              &matches,
+                              NULL);
+
+  g_assert_cmpuint (matches, >, 0);
+  g_assert_cmpuint (results->len, ==, 1);
+  g_assert_cmpint (relevancies_size, ==, 1);
+  g_assert_cmpfloat (relevancies[0], >=, 1.0);
+
+  event = (ZeitgeistEvent*) results->pdata[0];
+  g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id);
+
+  ZeitgeistSubject *subject = (ZeitgeistSubject*)
+    g_ptr_array_index (zeitgeist_event_get_subjects (event), 0);
+  g_assert_cmpstr (zeitgeist_subject_get_text (subject), ==, "text");
+}
+
+static void
+test_simple_relevancies_subject_query (Fixture *fix, gconstpointer data)
+{
+  guint matches;
+  gdouble *relevancies;
+  gint relevancies_size;
+  guint event_id4, event_id5, event_id6;
+ 
+  // add test events to DBs
+  index_event (fix, create_test_event1 ());
+  index_event (fix, create_test_event2 ());
+  index_event (fix, create_test_event3 ());
+  event_id4 = index_event (fix, create_test_event4 ());
+  usleep (50000);
+  event_id5 = index_event (fix, create_test_event5 ());
+  usleep (50000);
+  event_id6 = index_event (fix, create_test_event6 ());
+
+  GPtrArray *results =
+    zeitgeist_indexer_search_with_relevancies (fix->indexer,
+                              "user*",
+                              zeitgeist_time_range_new_anytime (),
+                              g_ptr_array_new (),
+                              ZEITGEIST_STORAGE_STATE_ANY,
+                              0,
+                              10,
+                              ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS,
+                              &relevancies, &relevancies_size,
+                              &matches,
+                              NULL);
+
+  g_assert_cmpuint (matches, >, 0);
+  g_assert_cmpuint (results->len, ==, 3);
+  g_assert_cmpint (relevancies_size, ==, 3);
+
+  // we're creating event 6 after 5 and 4, so it has to be more recent (but it seems
+  // that number of terms indexed matters as well, so careful with the relevancies)
+  g_assert_cmpuint (event_id6, ==,
+      zeitgeist_event_get_id ((ZeitgeistEvent*) results->pdata[0]));
+}
+
 G_BEGIN_DECLS
 
 static void discard_message (const gchar *domain,
@@ -619,6 +722,10 @@
               setup, test_simple_idn_support, teardown);
   g_test_add ("/Zeitgeist/FTS/Indexer/CJK", Fixture, 0,
               setup, test_simple_cjk, teardown);
+  g_test_add ("/Zeitgeist/FTS/Indexer/Relevancies", Fixture, 0,
+              setup, test_simple_relevancies_query, teardown);
+  g_test_add ("/Zeitgeist/FTS/Indexer/RelevanciesSubject", Fixture, 0,
+              setup, test_simple_relevancies_subject_query, teardown);
 
   // get rid of the "rebuilding index..." messages
   g_log_set_handler (NULL, G_LOG_LEVEL_MESSAGE, discard_message, NULL);