zeitgeist team mailing list archive
-
zeitgeist team
-
Mailing list archive
-
Message #05049
[Merge] lp:~mhr3/zeitgeist/fts-secondary-sorting into lp:zeitgeist
Michal Hruby has proposed merging lp:~mhr3/zeitgeist/fts-secondary-sorting into lp:zeitgeist.
Requested reviews:
Zeitgeist Framework Team (zeitgeist)
For more details, see:
https://code.launchpad.net/~mhr3/zeitgeist/fts-secondary-sorting/+merge/96479
Implements secondary sorting based on ResultType in the SearchWithRelevancies method.
--
https://code.launchpad.net/~mhr3/zeitgeist/fts-secondary-sorting/+merge/96479
Your team Zeitgeist Framework Team is requested to review the proposed merge of lp:~mhr3/zeitgeist/fts-secondary-sorting into lp:zeitgeist.
=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp 2012-03-07 16:08:26 +0000
+++ extensions/fts++/indexer.cpp 2012-03-07 22:37:19 +0000
@@ -23,6 +23,7 @@
#include <xapian.h>
#include <queue>
#include <vector>
+#include <cmath>
#include <gio/gio.h>
#include <gio/gdesktopappinfo.h>
@@ -804,7 +805,6 @@
if (event_templates->len > 0)
{
- ZeitgeistTimeRange *time_range = zeitgeist_time_range_new_anytime ();
results = zeitgeist_db_reader_find_events (zg_reader,
time_range,
event_templates,
@@ -813,8 +813,6 @@
result_type,
NULL,
error);
-
- g_object_unref (time_range);
}
else
{
@@ -841,6 +839,34 @@
return results;
}
+static gint
+sort_events_by_relevance (gconstpointer a, gconstpointer b, gpointer user_data)
+{
+ gdouble rel1 = 0.0;
+ gdouble rel2 = 0.0;
+ std::map<unsigned, gdouble>::const_iterator it;
+ ZeitgeistEvent **e1 = (ZeitgeistEvent**) a;
+ ZeitgeistEvent **e2 = (ZeitgeistEvent**) b;
+ std::map<unsigned, gdouble> const& relevancy_map =
+ *(static_cast<std::map<unsigned, gdouble>*> (user_data));
+
+ it = relevancy_map.find (zeitgeist_event_get_id (*e1));
+ if (it != relevancy_map.end ()) rel1 = it->second;
+
+ it = relevancy_map.find (zeitgeist_event_get_id (*e2));
+ if (it != relevancy_map.end ()) rel2 = it->second;
+
+ gdouble delta = rel1 - rel2;
+ if (fabs (delta) < 0.00001)
+ {
+ // relevancy of both items is the same, let's make use of stable sort
+ return e1 > e2 ? 1 : -1;
+ }
+
+ // we want the higher ranked events first
+ return (delta < 0) ? 1 : -1;
+}
+
GPtrArray* Indexer::SearchWithRelevancies (const gchar *search,
ZeitgeistTimeRange *time_range,
GPtrArray *templates,
@@ -860,24 +886,51 @@
guint maxhits = count;
- if (result_type == RELEVANCY_RESULT_TYPE)
- {
- enquire->set_sort_by_relevance ();
- }
- else
- {
- enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
- }
-
if (storage_state != ZEITGEIST_STORAGE_STATE_ANY)
{
g_set_error_literal (error,
ZEITGEIST_ENGINE_ERROR,
ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
- "Only ANY stogate state is supported");
+ "Only ANY storage state is supported");
return NULL;
}
+ if (result_type == RELEVANCY_RESULT_TYPE)
+ {
+ enquire->set_sort_by_relevance ();
+ }
+ else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
+ {
+ enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, true);
+ enquire->set_collapse_key (VALUE_EVENT_ID);
+ }
+ else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
+ {
+ enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, true);
+ enquire->set_collapse_key (VALUE_URI_HASH);
+ }
+ else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
+ {
+ // FIXME: not really correct but close :)
+ enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, true);
+ enquire->set_collapse_key (VALUE_URI_HASH);
+ maxhits *= 3;
+ }
+ else
+ {
+ // throw an error for these?
+ enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, true);
+ enquire->set_collapse_key (VALUE_EVENT_ID);
+ maxhits *= 3;
+ }
+
Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));
enquire->set_query (q);
Xapian::MSet hits (enquire->get_mset (offset, maxhits));
@@ -906,6 +959,8 @@
NULL,
error);
+ if (error && *error) return NULL;
+
if (results->len != relevancy_arr.size ())
{
g_warning ("Results don't match relevancies!");
@@ -928,22 +983,70 @@
}
else
{
- g_set_error_literal (error,
- ZEITGEIST_ENGINE_ERROR,
- ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
- "Only RELEVANCY result type is supported");
- /*
- * perhaps something like this could be used here?
+ // we'll use the result type only for secondary sorting, relevancy
+ // is still primary!
+ GPtrArray *event_templates;
+ event_templates = g_ptr_array_new_with_free_func (g_object_unref);
std::map<unsigned, gdouble> relevancy_map;
- foreach (...)
+ Xapian::MSetIterator iter, end;
+ for (iter = hits.begin (), end = hits.end (); iter != end; ++iter)
{
+ Xapian::Document doc(iter.get_document ());
+ double unserialized =
+ Xapian::sortable_unserialise (doc.get_value (VALUE_EVENT_ID));
+ unsigned event_id = static_cast<unsigned>(unserialized);
+
+ ZeitgeistEvent *event = zeitgeist_event_new ();
+ zeitgeist_event_set_id (event, event_id);
+ g_ptr_array_add (event_templates, event);
+
double rank = iter.get_percent () / 100.;
if (rank > relevancy_map[event_id])
{
relevancy_map[event_id] = rank;
}
}
- */
+
+ if (event_templates->len > 0)
+ {
+ // let's ask zeitgeist for sorting based on result type
+ results = zeitgeist_db_reader_find_events (zg_reader,
+ time_range,
+ event_templates,
+ ZEITGEIST_STORAGE_STATE_ANY,
+ 0,
+ result_type,
+ NULL,
+ error);
+
+ if (error && *error) return NULL;
+
+ g_ptr_array_sort_with_data (results, sort_events_by_relevance,
+ &relevancy_map);
+
+ if (relevancies)
+ {
+ *relevancies = g_new (gdouble, results->len);
+ for (unsigned i = 0; i < results->len; i++)
+ {
+ ZeitgeistEvent *event = (ZeitgeistEvent*) g_ptr_array_index (results, i);
+ (*relevancies)[i] = relevancy_map[zeitgeist_event_get_id (event)];
+ }
+ }
+
+ if (relevancies_size)
+ {
+ *relevancies_size = results->len;
+ }
+ }
+ else
+ {
+ results = g_ptr_array_new ();
+ if (relevancies) *relevancies = NULL;
+ if (relevancies_size) *relevancies_size = 0;
+ }
+
+ g_ptr_array_unref (event_templates);
}
if (matches)
=== modified file 'extensions/fts++/test/test-indexer.cpp'
--- extensions/fts++/test/test-indexer.cpp 2012-02-14 16:56:04 +0000
+++ extensions/fts++/test/test-indexer.cpp 2012-03-07 22:37:19 +0000
@@ -163,6 +163,26 @@
return event;
}
+static ZeitgeistEvent* create_test_event6 (void)
+{
+ ZeitgeistEvent *event = zeitgeist_event_new ();
+ ZeitgeistSubject *subject = zeitgeist_subject_new ();
+
+ zeitgeist_subject_set_interpretation (subject, ZEITGEIST_NFO_PRESENTATION);
+ zeitgeist_subject_set_manifestation (subject, ZEITGEIST_NFO_FILE_DATA_OBJECT);
+ zeitgeist_subject_set_uri (subject, "file:///home/username/Documents/CamelCasePresentation.pdf");
+ zeitgeist_subject_set_text (subject, NULL);
+ zeitgeist_subject_set_mimetype (subject, "application/pdf");
+
+ zeitgeist_event_set_interpretation (event, ZEITGEIST_ZG_MODIFY_EVENT);
+ zeitgeist_event_set_manifestation (event, ZEITGEIST_ZG_USER_ACTIVITY);
+ zeitgeist_event_set_actor (event, "application://libreoffice-impress.desktop");
+ zeitgeist_event_add_subject (event, subject);
+
+ g_object_unref (subject);
+ return event;
+}
+
// Steals the event, ref it if you want to keep it
static guint
index_event (Fixture *fix, ZeitgeistEvent *event)
@@ -172,6 +192,7 @@
guint *event_ids;
int num_events_inserted;
+ zeitgeist_event_set_timestamp (event, zeitgeist_timestamp_now ());
// add event to DBs
events = g_ptr_array_new ();
g_ptr_array_add (events, event);
@@ -586,6 +607,86 @@
g_assert_cmpstr (zeitgeist_subject_get_text (subject), ==, "IDNwiki");
}
+static void
+test_simple_relevancies_query (Fixture *fix, gconstpointer data)
+{
+ guint matches;
+ guint event_id;
+ gdouble *relevancies;
+ gint relevancies_size;
+ ZeitgeistEvent* event;
+
+ // add test events to DBs
+ event_id = index_event (fix, create_test_event1 ());
+ index_event (fix, create_test_event2 ());
+ index_event (fix, create_test_event3 ());
+ index_event (fix, create_test_event4 ());
+
+ GPtrArray *results =
+ zeitgeist_indexer_search_with_relevancies (fix->indexer,
+ "text",
+ zeitgeist_time_range_new_anytime (),
+ g_ptr_array_new (),
+ ZEITGEIST_STORAGE_STATE_ANY,
+ 0,
+ 10,
+ (ZeitgeistResultType) 100,
+ &relevancies, &relevancies_size,
+ &matches,
+ NULL);
+
+ g_assert_cmpuint (matches, >, 0);
+ g_assert_cmpuint (results->len, ==, 1);
+ g_assert_cmpint (relevancies_size, ==, 1);
+ g_assert_cmpfloat (relevancies[0], >=, 1.0);
+
+ event = (ZeitgeistEvent*) results->pdata[0];
+ g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id);
+
+ ZeitgeistSubject *subject = (ZeitgeistSubject*)
+ g_ptr_array_index (zeitgeist_event_get_subjects (event), 0);
+ g_assert_cmpstr (zeitgeist_subject_get_text (subject), ==, "text");
+}
+
+static void
+test_simple_relevancies_subject_query (Fixture *fix, gconstpointer data)
+{
+ guint matches;
+ gdouble *relevancies;
+ gint relevancies_size;
+ guint event_id4, event_id5, event_id6;
+
+ // add test events to DBs
+ index_event (fix, create_test_event1 ());
+ index_event (fix, create_test_event2 ());
+ index_event (fix, create_test_event3 ());
+ event_id4 = index_event (fix, create_test_event4 ());
+ event_id5 = index_event (fix, create_test_event5 ());
+ event_id6 = index_event (fix, create_test_event6 ());
+
+ GPtrArray *results =
+ zeitgeist_indexer_search_with_relevancies (fix->indexer,
+ "user*",
+ zeitgeist_time_range_new_anytime (),
+ g_ptr_array_new (),
+ ZEITGEIST_STORAGE_STATE_ANY,
+ 0,
+ 10,
+ ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS,
+ &relevancies, &relevancies_size,
+ &matches,
+ NULL);
+
+ g_assert_cmpuint (matches, >, 0);
+ g_assert_cmpuint (results->len, ==, 3);
+ g_assert_cmpint (relevancies_size, ==, 3);
+
+ // we're creating event 6 after 5 and 4, so it has to be more recent (but it seems
+ // that number of terms indexed matters as well, so careful with the relevancies)
+ g_assert_cmpuint (event_id6, ==,
+ zeitgeist_event_get_id ((ZeitgeistEvent*) results->pdata[0]));
+}
+
G_BEGIN_DECLS
static void discard_message (const gchar *domain,
@@ -619,6 +720,10 @@
setup, test_simple_idn_support, teardown);
g_test_add ("/Zeitgeist/FTS/Indexer/CJK", Fixture, 0,
setup, test_simple_cjk, teardown);
+ g_test_add ("/Zeitgeist/FTS/Indexer/Relevancies", Fixture, 0,
+ setup, test_simple_relevancies_query, teardown);
+ g_test_add ("/Zeitgeist/FTS/Indexer/RelevanciesSubject", Fixture, 0,
+ setup, test_simple_relevancies_subject_query, teardown);
// get rid of the "rebuilding index..." messages
g_log_set_handler (NULL, G_LOG_LEVEL_MESSAGE, discard_message, NULL);
Follow ups