zeitgeist team mailing list archive
-
zeitgeist team
-
Mailing list archive
-
Message #05113
[Branch ~zeitgeist/zeitgeist/bluebird] Rev 423: Merge lp:~mhr3/zeitgeist/fts-secondary-sorting
Merge authors:
Michal Hruby (mhr3)
Related merge proposals:
https://code.launchpad.net/~mhr3/zeitgeist/fts-secondary-sorting/+merge/96479
proposed by: Michal Hruby (mhr3)
review: Approve - Siegfried Gevatter (rainct)
------------------------------------------------------------
revno: 423 [merge]
committer: Michal Hruby <michal.mhr@xxxxxxxxx>
branch nick: zeitgeist
timestamp: Wed 2012-03-14 13:34:10 +0100
message:
Merge lp:~mhr3/zeitgeist/fts-secondary-sorting
modified:
extensions/fts++/indexer.cpp
extensions/fts++/test/test-indexer.cpp
--
lp:zeitgeist
https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird
Your team Zeitgeist Framework Team is subscribed to branch lp:zeitgeist.
To unsubscribe from this branch go to https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird/+edit-subscription
=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp 2012-03-12 14:22:16 +0000
+++ extensions/fts++/indexer.cpp 2012-03-14 12:31:51 +0000
@@ -824,7 +824,6 @@
if (event_templates->len > 0)
{
- ZeitgeistTimeRange *time_range = zeitgeist_time_range_new_anytime ();
results = zeitgeist_db_reader_find_events (zg_reader,
time_range,
event_templates,
@@ -833,8 +832,6 @@
result_type,
NULL,
error);
-
- g_object_unref (time_range);
}
else
{
@@ -861,6 +858,208 @@
return results;
}
+// Finds the ids of events matching both query_clause and event_templates.
+//
+// The where clause derived from event_templates is appended to query_clause
+// and the combined clause is passed, together with count and result_type, to
+// zeitgeist_db_reader_find_event_ids_for_clause.
+//
+// Ownership: once the argument check has passed, query_clause and
+// event_templates are consumed (unreffed) on every path.
+//
+// Returns the id array produced by the reader, with its length stored in
+// *event_ids_length (callers in this file release the array with g_free).
+// Returns NULL and stores a length of 0 when either where clause is missing
+// or could not be built.
+static guint32*
+find_event_ids_for_combined_template (ZeitgeistDbReader *zg_reader,
+                                      ZeitgeistWhereClause *query_clause, // steals
+                                      GPtrArray *event_templates, // steals
+                                      guint count,
+                                      ZeitgeistResultType result_type,
+                                      gint *event_ids_length,
+                                      GError **error)
+{
+  g_return_val_if_fail (error == NULL || (error && *error == NULL), NULL);
+
+  ZeitgeistWhereClause *uri_where;
+  uri_where = zeitgeist_db_reader_get_where_clause_from_event_templates (
+      zg_reader, event_templates, error);
+  g_ptr_array_unref (event_templates);
+
+  // Building a clause can fail; never hand a NULL clause to the GObject API
+  // and never run the query with an error already pending.
+  if (uri_where == NULL || query_clause == NULL || (error && *error))
+  {
+    if (uri_where != NULL) g_object_unref (G_OBJECT (uri_where));
+    if (query_clause != NULL) g_object_unref (query_clause);
+    if (event_ids_length != NULL) *event_ids_length = 0;
+    return NULL;
+  }
+
+  zeitgeist_where_clause_extend (query_clause, uri_where);
+  g_object_unref (G_OBJECT (uri_where));
+
+  guint32 *event_ids;
+  event_ids = zeitgeist_db_reader_find_event_ids_for_clause (zg_reader,
+      query_clause, count, result_type, event_ids_length, error);
+
+  g_object_unref (query_clause);
+
+  return event_ids;
+}
+
+// Reads the property a grouped result type is keyed on: the subject's origin
+// when by_origin is set, its uri otherwise.
+static const gchar*
+fts_subject_group_key (ZeitgeistSubject *subject, bool by_origin)
+{
+  return by_origin ? zeitgeist_subject_get_origin (subject)
+                   : zeitgeist_subject_get_uri (subject);
+}
+
+// Creates an event template holding a single subject whose uri (or origin,
+// when by_origin is set) is key. The caller owns the returned event.
+static ZeitgeistEvent*
+fts_new_group_template (const gchar *key, bool by_origin)
+{
+  ZeitgeistEvent *event = zeitgeist_event_new ();
+  ZeitgeistSubject *subject = zeitgeist_subject_new ();
+
+  if (by_origin)
+    zeitgeist_subject_set_origin (subject, key);
+  else
+    zeitgeist_subject_set_uri (subject, key);
+
+  zeitgeist_event_add_subject (event, subject);
+  // NOTE(review): assumes add_subject keeps its own reference (the test
+  // suite's create_test_event* helpers unref the same way) -- confirm.
+  g_object_unref (subject);
+
+  return event;
+}
+
+// Second query pass for the *_SUBJECTS and *_ORIGIN result types.
+//
+// Collects the uris (or origins, when by_origin is set) of the subjects of
+// seed_events, combines them with the original query (time_range, templates,
+// storage_state) and fetches the events that combined query yields.
+// relevancy_map is then updated so each returned event takes the relevancy
+// of the seed event that shares its uri/origin.
+//
+// Steals seed_events. Returns NULL when an error was reported through
+// error, or when the reader produced no result array.
+static GPtrArray*
+fts_requery_by_subject_group (ZeitgeistDbReader *zg_reader,
+                              ZeitgeistTimeRange *time_range,
+                              GPtrArray *templates,
+                              ZeitgeistStorageState storage_state,
+                              unsigned count,
+                              ZeitgeistResultType result_type,
+                              GPtrArray *seed_events, // steals
+                              bool by_origin,
+                              std::map<unsigned, gdouble> &relevancy_map,
+                              GError **error)
+{
+  // A local error lets failures be detected even if the caller passed NULL.
+  GError *local_error = NULL;
+
+  GPtrArray *event_templates;
+  event_templates = g_ptr_array_new_with_free_func (g_object_unref);
+  // maps uri/origin -> id of the (last) seed event carrying it
+  std::map<std::string, unsigned> remapper;
+
+  for (unsigned i = 0; i < seed_events->len; i++)
+  {
+    ZeitgeistEvent *seed = (ZeitgeistEvent*) seed_events->pdata[i];
+    // NOTE(review): defensive; the reader may leave NULL entries for ids
+    // that are no longer in the DB -- confirm.
+    if (seed == NULL) continue;
+    unsigned event_id = zeitgeist_event_get_id (seed);
+    GPtrArray *subjects = zeitgeist_event_get_subjects (seed);
+    if (subjects == NULL) continue;
+    for (unsigned j = 0; j < subjects->len; j++)
+    {
+      const gchar *key = fts_subject_group_key (
+          (ZeitgeistSubject*) subjects->pdata[j], by_origin);
+      if (key == NULL) continue;
+      remapper[key] = event_id;
+      g_ptr_array_add (event_templates,
+                       fts_new_group_template (key, by_origin));
+    }
+  }
+
+  g_ptr_array_unref (seed_events);
+
+  // construct custom where clause which combines the original template
+  // with the uris/origins we found
+  ZeitgeistWhereClause *where;
+  where = zeitgeist_db_reader_get_where_clause_for_query (zg_reader,
+      time_range, templates, storage_state, &local_error);
+
+  if (local_error != NULL || where == NULL)
+  {
+    if (where != NULL) g_object_unref (where);
+    g_ptr_array_unref (event_templates);
+    if (local_error != NULL) g_propagate_error (error, local_error);
+    return NULL;
+  }
+
+  gint real_event_ids_length = 0;
+  // both `where` and `event_templates` are consumed by this call
+  guint32 *real_event_ids = find_event_ids_for_combined_template (zg_reader,
+      where, event_templates, count, result_type, &real_event_ids_length,
+      &local_error);
+
+  if (local_error != NULL)
+  {
+    g_free (real_event_ids);
+    g_propagate_error (error, local_error);
+    return NULL;
+  }
+
+  GPtrArray *results = zeitgeist_db_reader_get_events (zg_reader,
+      real_event_ids, real_event_ids_length, NULL, &local_error);
+
+  // release the ids before looking at the error so they cannot leak
+  g_free (real_event_ids);
+  real_event_ids = NULL;
+
+  if (local_error != NULL)
+  {
+    if (results != NULL) g_ptr_array_unref (results);
+    g_propagate_error (error, local_error);
+    return NULL;
+  }
+
+  if (results == NULL) return NULL;
+
+  // the event ids might have changed, we need to update the relevancy_map
+  for (unsigned i = 0; i < results->len; i++)
+  {
+    ZeitgeistEvent *found = (ZeitgeistEvent*) results->pdata[i];
+    if (found == NULL) continue;
+    unsigned event_id = zeitgeist_event_get_id (found);
+    GPtrArray *subjects = zeitgeist_event_get_subjects (found);
+    if (subjects == NULL) continue;
+    for (unsigned j = 0; j < subjects->len; j++)
+    {
+      const gchar *key = fts_subject_group_key (
+          (ZeitgeistSubject*) subjects->pdata[j], by_origin);
+      if (key == NULL) continue;
+      // Subjects that were not part of the seed set carry no relevancy;
+      // skipping them avoids resetting the event's relevancy to 0 and
+      // inserting bogus entries into either map.
+      std::map<std::string, unsigned>::const_iterator seed_id =
+          remapper.find (key);
+      if (seed_id == remapper.end ()) continue;
+      relevancy_map[event_id] = relevancy_map[seed_id->second];
+    }
+  }
+
+  return results;
+}
+
+// Fetches the events for a set of ids obtained from the index and adapts
+// them to result_type.
+//
+// For *_EVENTS result types (and any type not handled below) the events for
+// event_ids are returned as they are. For *_SUBJECTS / *_ORIGIN result types
+// the uris / origins of those events are used for a second query restricted
+// by time_range, templates and storage_state and limited to count; in that
+// case relevancy_map is updated for the ids of the events finally returned.
+//
+// Returns a GPtrArray of events owned by the caller, or NULL when an error
+// was stored in error or the reader returned no array.
+static GPtrArray*
+find_events_for_result_type_and_ids (ZeitgeistDbReader *zg_reader,
+                                     ZeitgeistTimeRange *time_range,
+                                     GPtrArray *templates,
+                                     ZeitgeistStorageState storage_state,
+                                     unsigned count,
+                                     ZeitgeistResultType result_type,
+                                     std::vector<unsigned> const& event_ids,
+                                     std::map<unsigned, gdouble> &relevancy_map,
+                                     GError **error)
+{
+  GError *local_error = NULL;
+
+  // taking &event_ids[0] of an empty vector is undefined behaviour
+  unsigned *ids = event_ids.empty () ?
+      NULL : const_cast<unsigned*>(&event_ids[0]);
+
+  GPtrArray *results = zeitgeist_db_reader_get_events (zg_reader,
+                                                       ids,
+                                                       event_ids.size (),
+                                                       NULL,
+                                                       &local_error);
+
+  if (local_error != NULL)
+  {
+    if (results != NULL) g_ptr_array_unref (results);
+    g_propagate_error (error, local_error);
+    return NULL;
+  }
+
+  if (results == NULL) return NULL;
+
+  bool group_by_uri =
+      result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
+      result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS ||
+      result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
+      result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS;
+
+  bool group_by_origin =
+      result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+      result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN ||
+      result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
+      result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN;
+
+  // *_EVENTS (and anything unexpected) needs no second pass
+  if (!group_by_uri && !group_by_origin) return results;
+
+  // need to get the uris/origins from the events and do another find_events
+  // call; `results` is handed over to the helper
+  return fts_requery_by_subject_group (zg_reader, time_range, templates,
+                                       storage_state, count, result_type,
+                                       results, group_by_origin,
+                                       relevancy_map, error);
+}
+
GPtrArray* Indexer::SearchWithRelevancies (const gchar *search,
ZeitgeistTimeRange *time_range,
GPtrArray *templates,
@@ -880,21 +1079,58 @@
guint maxhits = count;
- if (result_type == RELEVANCY_RESULT_TYPE)
- {
- enquire->set_sort_by_relevance ();
- }
- else
- {
- enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
- }
-
if (storage_state != ZEITGEIST_STORAGE_STATE_ANY)
{
- g_set_error_literal (error,
- ZEITGEIST_ENGINE_ERROR,
- ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
- "Only ANY stogate state is supported");
+ // FIXME: add support for this by grabbing (un)available storages
+ // from the storage table and appending them to the query
+ g_set_error_literal (error,
+ ZEITGEIST_ENGINE_ERROR,
+ ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
+ "Only ANY storage state is supported");
+ return NULL;
+ }
+
+ bool reversed_sort =
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN;
+
+ if (result_type == RELEVANCY_RESULT_TYPE)
+ {
+ enquire->set_sort_by_relevance ();
+ }
+ else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
+ {
+ enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort);
+ enquire->set_collapse_key (VALUE_EVENT_ID);
+ }
+ else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
+ {
+ enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort);
+ enquire->set_collapse_key (VALUE_URI_HASH);
+ }
+ else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
+ {
+ // FIXME: not really correct but close :)
+ enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort);
+ enquire->set_collapse_key (VALUE_URI_HASH);
+ maxhits *= 3;
+ }
+ else
+ {
+ g_set_error_literal (error,
+ ZEITGEIST_ENGINE_ERROR,
+ ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
+ "Requested result type is not supported");
return NULL;
}
@@ -926,6 +1162,8 @@
NULL,
error);
+ if (error && *error) return NULL;
+
if (results->len != relevancy_arr.size ())
{
g_warning ("Results don't match relevancies!");
@@ -948,22 +1186,56 @@
}
else
{
- g_set_error_literal (error,
- ZEITGEIST_ENGINE_ERROR,
- ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
- "Only RELEVANCY result type is supported");
- /*
- * perhaps something like this could be used here?
+ std::vector<unsigned> event_ids;
std::map<unsigned, gdouble> relevancy_map;
- foreach (...)
+ Xapian::MSetIterator iter, end;
+ for (iter = hits.begin (), end = hits.end (); iter != end; ++iter)
{
+ Xapian::Document doc(iter.get_document ());
+ double unserialized =
+ Xapian::sortable_unserialise (doc.get_value (VALUE_EVENT_ID));
+ unsigned event_id = static_cast<unsigned>(unserialized);
+
+ event_ids.push_back (event_id);
+
double rank = iter.get_percent () / 100.;
if (rank > relevancy_map[event_id])
{
relevancy_map[event_id] = rank;
}
}
- */
+
+ results = find_events_for_result_type_and_ids (zg_reader, time_range,
+ templates, storage_state,
+ count, result_type,
+ event_ids,
+ relevancy_map, error);
+
+ if (error && *error) return NULL;
+
+ if (results == NULL)
+ {
+ results = g_ptr_array_new ();
+ if (relevancies) *relevancies = NULL;
+ if (relevancies_size) *relevancies_size = 0;
+ }
+ else
+ {
+ if (relevancies)
+ {
+ *relevancies = g_new (gdouble, results->len);
+ for (unsigned i = 0; i < results->len; i++)
+ {
+ ZeitgeistEvent *event = (ZeitgeistEvent*) g_ptr_array_index (results, i);
+ (*relevancies)[i] = relevancy_map[zeitgeist_event_get_id (event)];
+ }
+ }
+
+ if (relevancies_size)
+ {
+ *relevancies_size = results->len;
+ }
+ }
}
if (matches)
=== modified file 'extensions/fts++/test/test-indexer.cpp'
--- extensions/fts++/test/test-indexer.cpp 2012-02-14 16:56:04 +0000
+++ extensions/fts++/test/test-indexer.cpp 2012-03-11 18:58:01 +0000
@@ -163,6 +163,26 @@
return event;
}
+// Test fixture: a user-activity "modify" event from libreoffice-impress on a
+// PDF presentation. The subject's text is explicitly set to NULL.
+static ZeitgeistEvent* create_test_event6 (void)
+{
+  ZeitgeistEvent *ev = zeitgeist_event_new ();
+  zeitgeist_event_set_interpretation (ev, ZEITGEIST_ZG_MODIFY_EVENT);
+  zeitgeist_event_set_manifestation (ev, ZEITGEIST_ZG_USER_ACTIVITY);
+  zeitgeist_event_set_actor (ev, "application://libreoffice-impress.desktop");
+
+  ZeitgeistSubject *subj = zeitgeist_subject_new ();
+  zeitgeist_subject_set_interpretation (subj, ZEITGEIST_NFO_PRESENTATION);
+  zeitgeist_subject_set_manifestation (subj, ZEITGEIST_NFO_FILE_DATA_OBJECT);
+  zeitgeist_subject_set_uri (subj, "file:///home/username/Documents/CamelCasePresentation.pdf");
+  zeitgeist_subject_set_text (subj, NULL);
+  zeitgeist_subject_set_mimetype (subj, "application/pdf");
+
+  // attach the subject, then drop the local reference to it
+  zeitgeist_event_add_subject (ev, subj);
+  g_object_unref (subj);
+
+  return ev;
+}
+
// Steals the event, ref it if you want to keep it
static guint
index_event (Fixture *fix, ZeitgeistEvent *event)
@@ -172,6 +192,7 @@
guint *event_ids;
int num_events_inserted;
+ zeitgeist_event_set_timestamp (event, zeitgeist_timestamp_now ());
// add event to DBs
events = g_ptr_array_new ();
g_ptr_array_add (events, event);
@@ -586,6 +607,88 @@
g_assert_cmpstr (zeitgeist_subject_get_text (subject), ==, "IDNwiki");
}
+// Indexes four events and searches for "text", expecting exactly one hit:
+// the first indexed event, reported with a relevancy of at least 1.0.
+static void
+test_simple_relevancies_query (Fixture *fix, gconstpointer data)
+{
+  // Out-parameters start from known values so the assertions below never
+  // read indeterminate memory if the search fails.
+  guint matches = 0;
+  guint event_id;
+  gdouble *relevancies = NULL;
+  gint relevancies_size = 0;
+  ZeitgeistEvent* event;
+
+  // add test events to DBs
+  event_id = index_event (fix, create_test_event1 ());
+  index_event (fix, create_test_event2 ());
+  index_event (fix, create_test_event3 ());
+  index_event (fix, create_test_event4 ());
+
+  // NOTE(review): 100 is presumably the indexer's RELEVANCY_RESULT_TYPE,
+  // which does not seem to be visible from the test -- confirm.
+  GPtrArray *results =
+    zeitgeist_indexer_search_with_relevancies (fix->indexer,
+                                               "text",
+                                               zeitgeist_time_range_new_anytime (),
+                                               g_ptr_array_new (),
+                                               ZEITGEIST_STORAGE_STATE_ANY,
+                                               0,
+                                               10,
+                                               (ZeitgeistResultType) 100,
+                                               &relevancies, &relevancies_size,
+                                               &matches,
+                                               NULL);
+
+  // fail with a readable assertion instead of dereferencing NULL
+  g_assert (results != NULL);
+  g_assert_cmpuint (matches, >, 0);
+  g_assert_cmpuint (results->len, ==, 1);
+  g_assert_cmpint (relevancies_size, ==, 1);
+  g_assert (relevancies != NULL);
+  g_assert_cmpfloat (relevancies[0], >=, 1.0);
+
+  event = (ZeitgeistEvent*) results->pdata[0];
+  g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id);
+
+  ZeitgeistSubject *subject = (ZeitgeistSubject*)
+    g_ptr_array_index (zeitgeist_event_get_subjects (event), 0);
+  g_assert_cmpstr (zeitgeist_subject_get_text (subject), ==, "text");
+}
+
+// Indexes six events (the last three spaced 50 ms apart) and searches for
+// "user*" with MOST_RECENT_SUBJECTS, expecting three results with the most
+// recently indexed event first.
+static void
+test_simple_relevancies_subject_query (Fixture *fix, gconstpointer data)
+{
+  // Out-parameters start from known values so the assertions below never
+  // read indeterminate memory if the search fails.
+  guint matches = 0;
+  gdouble *relevancies = NULL;
+  gint relevancies_size = 0;
+  guint event_id6;
+
+  // add test events to DBs
+  index_event (fix, create_test_event1 ());
+  index_event (fix, create_test_event2 ());
+  index_event (fix, create_test_event3 ());
+  index_event (fix, create_test_event4 ());
+  // the pauses make sure events 4, 5 and 6 get distinct timestamps
+  usleep (50000);
+  index_event (fix, create_test_event5 ());
+  usleep (50000);
+  event_id6 = index_event (fix, create_test_event6 ());
+
+  GPtrArray *results =
+    zeitgeist_indexer_search_with_relevancies (fix->indexer,
+                                               "user*",
+                                               zeitgeist_time_range_new_anytime (),
+                                               g_ptr_array_new (),
+                                               ZEITGEIST_STORAGE_STATE_ANY,
+                                               0,
+                                               10,
+                                               ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS,
+                                               &relevancies, &relevancies_size,
+                                               &matches,
+                                               NULL);
+
+  // fail with a readable assertion instead of dereferencing NULL
+  g_assert (results != NULL);
+  g_assert_cmpuint (matches, >, 0);
+  g_assert_cmpuint (results->len, ==, 3);
+  g_assert_cmpint (relevancies_size, ==, 3);
+
+  // Event 6 is created after events 4 and 5, so it has to be the most recent
+  // one. The number of indexed terms seems to influence the ranking as well,
+  // so be careful when asserting on the relevancies themselves.
+  g_assert_cmpuint (event_id6, ==,
+      zeitgeist_event_get_id ((ZeitgeistEvent*) results->pdata[0]));
+}
+
G_BEGIN_DECLS
static void discard_message (const gchar *domain,
@@ -619,6 +722,10 @@
setup, test_simple_idn_support, teardown);
g_test_add ("/Zeitgeist/FTS/Indexer/CJK", Fixture, 0,
setup, test_simple_cjk, teardown);
+ g_test_add ("/Zeitgeist/FTS/Indexer/Relevancies", Fixture, 0,
+ setup, test_simple_relevancies_query, teardown);
+ g_test_add ("/Zeitgeist/FTS/Indexer/RelevanciesSubject", Fixture, 0,
+ setup, test_simple_relevancies_subject_query, teardown);
// get rid of the "rebuilding index..." messages
g_log_set_handler (NULL, G_LOG_LEVEL_MESSAGE, discard_message, NULL);