zeitgeist team mailing list archive
-
zeitgeist team
-
Mailing list archive
-
Message #05224
[Merge] lp:~zeitgeist/zeitgeist/fts-origin-hashing into lp:zeitgeist
Siegfried Gevatter has proposed merging lp:~zeitgeist/zeitgeist/fts-origin-hashing into lp:zeitgeist.
Requested reviews:
Zeitgeist Framework Team (zeitgeist)
For more details, see:
https://code.launchpad.net/~zeitgeist/zeitgeist/fts-origin-hashing/+merge/98281
--
https://code.launchpad.net/~zeitgeist/zeitgeist/fts-origin-hashing/+merge/98281
Your team Zeitgeist Framework Team is requested to review the proposed merge of lp:~zeitgeist/zeitgeist/fts-origin-hashing into lp:zeitgeist.
=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp 2012-03-16 20:03:05 +0000
+++ extensions/fts++/indexer.cpp 2012-03-19 20:32:18 +0000
@@ -43,6 +43,7 @@
const Xapian::valueno VALUE_EVENT_ID = 0;
const Xapian::valueno VALUE_TIMESTAMP = 1;
const Xapian::valueno VALUE_URI_HASH = 2;
+const Xapian::valueno VALUE_ORIGIN_HASH = 3;
#define QUERY_PARSER_FLAGS \
Xapian::QueryParser::FLAG_PHRASE | Xapian::QueryParser::FLAG_BOOLEAN | \
@@ -778,7 +779,9 @@
}
else
{
- enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
+ bool reversed_sort = not
+ zeitgeist_result_type_is_sort_order_asc (result_type);
+ enquire->set_sort_by_value (VALUE_TIMESTAMP, reversed_sort);
}
if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
@@ -786,7 +789,19 @@
result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
{
- enquire->set_collapse_key (VALUE_URI_HASH);
+ enquire->set_collapse_key (VALUE_URI_HASH);
+ }
+ else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
+ {
+ enquire->set_collapse_key (VALUE_ORIGIN_HASH);
+ }
+ else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
+ {
+ enquire->set_collapse_key (VALUE_EVENT_ID);
}
Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));
@@ -1096,12 +1111,8 @@
return NULL;
}
- bool reversed_sort =
- result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
- result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
- result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
- result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
- result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN;
+ bool reversed_sort = not
+ zeitgeist_result_type_is_sort_order_asc (result_type);
if (result_type == RELEVANCY_RESULT_TYPE)
{
@@ -1126,10 +1137,8 @@
result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
{
- // FIXME: not really correct but close :)
enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort);
- enquire->set_collapse_key (VALUE_URI_HASH);
- maxhits *= 3;
+ enquire->set_collapse_key (VALUE_ORIGIN_HASH);
}
else
{
@@ -1311,19 +1320,36 @@
return; // ignore this event completely...
}
+ guint8 uri_hash[HASH_LENGTH + 1];
+
// We need the subject URI so we can use Xapian's collapse key feature
// for *_SUBJECT grouping. However, to save space, we'll just save a hash.
// A better option would be using URI's id, but for that we'd need a SQL
// query that'd be subject to races.
// FIXME(?): This doesn't work for events with multiple subjects.
g_checksum_update (checksum, (guchar *) uri.c_str (), -1);
- guint8 uri_hash[HASH_LENGTH + 1];
gsize hash_size = HASH_LENGTH;
g_checksum_get_digest (checksum, uri_hash, &hash_size);
g_checksum_reset (checksum);
g_assert (hash_size == HASH_LENGTH);
doc.add_value (VALUE_URI_HASH, std::string((char *) uri_hash, hash_size));
+ size_t colon_pos = uri.find (':');
+ // FIXME: use current_origin once we have that
+ val = zeitgeist_subject_get_origin (subject);
+ // Make sure the schemes of the URI and the origin are the same,
+ // to avoid saving some junk.
+ if (val && colon_pos != std::string::npos
+ && strncmp (uri.c_str (), val, colon_pos+1) == 0)
+ {
+ g_checksum_update (checksum, (guchar *) val, -1);
+ g_checksum_get_digest (checksum, uri_hash, &hash_size);
+ g_checksum_reset (checksum);
+ g_assert (hash_size == HASH_LENGTH);
+ doc.add_value (VALUE_ORIGIN_HASH,
+ std::string((char *) uri_hash, hash_size));
+ }
+
val = zeitgeist_subject_get_text (subject);
if (val && val[0] != '\0')
{
=== modified file 'src/datamodel.vala'
--- src/datamodel.vala 2012-02-18 21:33:57 +0000
+++ src/datamodel.vala 2012-03-19 20:32:18 +0000
@@ -230,10 +230,60 @@
// different origin ordered
// by the popularity of the
// origins
- LEAST_POPULAR_EVENT_ORIGIN = 30, // The last event of each
+ LEAST_POPULAR_EVENT_ORIGIN = 30; // The last event of each
// different origin, ordered
// ascendingly by the
// popularity of the origin
+
+ /*
+ * Returns true if the results for the given result_type will be sorted
+ * in ascending order by date, false if they'll be sorted in descending order.
+ **/
+ public static bool is_sort_order_asc (ResultType result_type)
+ {
+ switch (result_type)
+ {
+ // FIXME: Why are LEAST_POPULAR_* using ASC?
+ case ResultType.LEAST_RECENT_EVENTS:
+ case ResultType.LEAST_RECENT_EVENT_ORIGIN:
+ case ResultType.LEAST_POPULAR_EVENT_ORIGIN:
+ case ResultType.LEAST_RECENT_SUBJECTS:
+ case ResultType.LEAST_POPULAR_SUBJECTS:
+ case ResultType.LEAST_RECENT_CURRENT_URI:
+ case ResultType.LEAST_POPULAR_CURRENT_URI:
+ case ResultType.LEAST_RECENT_ACTOR:
+ case ResultType.LEAST_POPULAR_ACTOR:
+ case ResultType.OLDEST_ACTOR:
+ case ResultType.LEAST_RECENT_ORIGIN:
+ case ResultType.LEAST_POPULAR_ORIGIN:
+ case ResultType.LEAST_RECENT_SUBJECT_INTERPRETATION:
+ case ResultType.LEAST_POPULAR_SUBJECT_INTERPRETATION:
+ case ResultType.LEAST_RECENT_MIMETYPE:
+ case ResultType.LEAST_POPULAR_MIMETYPE:
+ return true;
+
+ case ResultType.MOST_RECENT_EVENTS:
+ case ResultType.MOST_RECENT_EVENT_ORIGIN:
+ case ResultType.MOST_POPULAR_EVENT_ORIGIN:
+ case ResultType.MOST_RECENT_SUBJECTS:
+ case ResultType.MOST_POPULAR_SUBJECTS:
+ case ResultType.MOST_RECENT_CURRENT_URI:
+ case ResultType.MOST_POPULAR_CURRENT_URI:
+ case ResultType.MOST_RECENT_ACTOR:
+ case ResultType.MOST_POPULAR_ACTOR:
+ case ResultType.MOST_RECENT_ORIGIN:
+ case ResultType.MOST_POPULAR_ORIGIN:
+ case ResultType.MOST_RECENT_SUBJECT_INTERPRETATION:
+ case ResultType.MOST_POPULAR_SUBJECT_INTERPRETATION:
+ case ResultType.MOST_RECENT_MIMETYPE:
+ case ResultType.MOST_POPULAR_MIMETYPE:
+ return false;
+
+ default:
+ warning ("Unrecognized ResultType: %u", (uint) result_type);
+ return true;
+ }
+ }
}
/*
=== modified file 'src/db-reader.vala'
--- src/db-reader.vala 2012-03-17 14:47:09 +0000
+++ src/db-reader.vala 2012-03-19 20:32:18 +0000
@@ -173,109 +173,107 @@
switch (result_type)
{
case ResultType.MOST_RECENT_EVENTS:
- sql += where_sql + " ORDER BY timestamp DESC";
+ sql += where_sql + " ORDER BY ";
break;
case ResultType.LEAST_RECENT_EVENTS:
- sql += where_sql + " ORDER BY timestamp ASC";
+ sql += where_sql + " ORDER BY ";
break;
case ResultType.MOST_RECENT_EVENT_ORIGIN:
- sql += group_and_sort ("origin", where_sql, false);
+ sql += group_and_sort ("origin", where_sql);
break;
case ResultType.LEAST_RECENT_EVENT_ORIGIN:
- sql += group_and_sort ("origin", where_sql, true);
+ sql += group_and_sort ("origin", where_sql);
break;
case ResultType.MOST_POPULAR_EVENT_ORIGIN:
- sql += group_and_sort ("origin", where_sql, false, false);
+ sql += group_and_sort ("origin", where_sql, false);
break;
case ResultType.LEAST_POPULAR_EVENT_ORIGIN:
- sql += group_and_sort ("origin", where_sql, true, true);
+ sql += group_and_sort ("origin", where_sql, true);
break;
case ResultType.MOST_RECENT_SUBJECTS:
- sql += group_and_sort ("subj_id", where_sql, false);
+ sql += group_and_sort ("subj_id", where_sql);
break;
case ResultType.LEAST_RECENT_SUBJECTS:
- sql += group_and_sort ("subj_id", where_sql, true);
+ sql += group_and_sort ("subj_id", where_sql);
break;
case ResultType.MOST_POPULAR_SUBJECTS:
- sql += group_and_sort ("subj_id", where_sql, false, false);
+ sql += group_and_sort ("subj_id", where_sql, false);
break;
case ResultType.LEAST_POPULAR_SUBJECTS:
- sql += group_and_sort ("subj_id", where_sql, true, true);
+ sql += group_and_sort ("subj_id", where_sql, true);
break;
case ResultType.MOST_RECENT_CURRENT_URI:
- sql += group_and_sort ("subj_id_current", where_sql, false);
+ sql += group_and_sort ("subj_id_current", where_sql);
break;
case ResultType.LEAST_RECENT_CURRENT_URI:
- sql += group_and_sort ("subj_id_current", where_sql, true);
+ sql += group_and_sort ("subj_id_current", where_sql);
break;
case ResultType.MOST_POPULAR_CURRENT_URI:
- sql += group_and_sort ("subj_id_current", where_sql,
- false, false);
+ sql += group_and_sort ("subj_id_current", where_sql, false);
break;
case ResultType.LEAST_POPULAR_CURRENT_URI:
- sql += group_and_sort ("subj_id_current", where_sql,
- true, true);
+ sql += group_and_sort ("subj_id_current", where_sql, true);
break;
case ResultType.MOST_RECENT_ACTOR:
- sql += group_and_sort ("actor", where_sql, false);
+ sql += group_and_sort ("actor", where_sql);
break;
case ResultType.LEAST_RECENT_ACTOR:
- sql += group_and_sort ("actor", where_sql, true);
+ sql += group_and_sort ("actor", where_sql);
break;
case ResultType.MOST_POPULAR_ACTOR:
- sql += group_and_sort ("actor", where_sql, false, false);
+ sql += group_and_sort ("actor", where_sql, false);
break;
case ResultType.LEAST_POPULAR_ACTOR:
- sql += group_and_sort ("actor", where_sql, true, true);
+ sql += group_and_sort ("actor", where_sql, true);
break;
case ResultType.OLDEST_ACTOR:
- sql += group_and_sort ("actor", where_sql, true, null, "min");
+ sql += group_and_sort ("actor", where_sql, null, "min");
break;
case ResultType.MOST_RECENT_ORIGIN:
- sql += group_and_sort ("subj_origin", where_sql, false);
+ sql += group_and_sort ("subj_origin", where_sql);
break;
case ResultType.LEAST_RECENT_ORIGIN:
- sql += group_and_sort ("subj_origin", where_sql, true);
+ sql += group_and_sort ("subj_origin", where_sql);
break;
case ResultType.MOST_POPULAR_ORIGIN:
- sql += group_and_sort ("subj_origin", where_sql, false, false);
+ sql += group_and_sort ("subj_origin", where_sql, false);
break;
case ResultType.LEAST_POPULAR_ORIGIN:
- sql += group_and_sort ("subj_origin", where_sql, true, true);
+ sql += group_and_sort ("subj_origin", where_sql, true);
break;
case ResultType.MOST_RECENT_SUBJECT_INTERPRETATION:
- sql += group_and_sort ("subj_interpretation", where_sql, false);
+ sql += group_and_sort ("subj_interpretation", where_sql);
break;
case ResultType.LEAST_RECENT_SUBJECT_INTERPRETATION:
- sql += group_and_sort ("subj_interpretation", where_sql, true);
+ sql += group_and_sort ("subj_interpretation", where_sql);
break;
case ResultType.MOST_POPULAR_SUBJECT_INTERPRETATION:
- sql += group_and_sort ("subj_interpretation", where_sql,
- false, false);
+ sql += group_and_sort ("subj_interpretation", where_sql, false);
break;
case ResultType.LEAST_POPULAR_SUBJECT_INTERPRETATION:
- sql += group_and_sort ("subj_interpretation", where_sql,
- true, true);
+ sql += group_and_sort ("subj_interpretation", where_sql, true);
break;
case ResultType.MOST_RECENT_MIMETYPE:
- sql += group_and_sort ("subj_mimetype", where_sql, false);
+ sql += group_and_sort ("subj_mimetype", where_sql);
break;
case ResultType.LEAST_RECENT_MIMETYPE:
- sql += group_and_sort ("subj_mimetype", where_sql, true);
+ sql += group_and_sort ("subj_mimetype", where_sql);
break;
case ResultType.MOST_POPULAR_MIMETYPE:
- sql += group_and_sort ("subj_mimetype", where_sql,
- false, false);
+ sql += group_and_sort ("subj_mimetype", where_sql, false);
break;
case ResultType.LEAST_POPULAR_MIMETYPE:
- sql += group_and_sort ("subj_mimetype", where_sql,
- true, true);
+ sql += group_and_sort ("subj_mimetype", where_sql, true);
break;
default:
string error_message = "Invalid ResultType.";
warning (error_message);
throw new EngineError.INVALID_ARGUMENT (error_message);
}
+
+ // complete the sort rule
+ bool time_asc = ResultType.is_sort_order_asc ((ResultType) result_type);
+ sql += " timestamp %s".printf ((time_asc) ? "ASC" : "DESC");
int rc;
Sqlite.Statement stmt;
@@ -578,10 +576,8 @@
// Used by find_event_ids
private string group_and_sort (string field, string where_sql,
- bool time_asc=false, bool? count_asc=null,
- string aggregation_type="max")
+ bool? count_asc=null, string aggregation_type="max")
{
- string time_sorting = (time_asc) ? "ASC" : "DESC";
string aggregation_sql = "";
string order_sql = "";
@@ -599,7 +595,7 @@
FROM event_view %s
GROUP BY %s)
GROUP BY %s
- ORDER BY %s timestamp %s
+ ORDER BY %s
""".printf (
field,
aggregation_type,
@@ -607,7 +603,7 @@
where_sql,
field,
field,
- order_sql, time_sorting);
+ order_sql);
}
// Used by find_event_ids
Follow ups