zeitgeist team mailing list archive
-
zeitgeist team
-
Mailing list archive
-
Message #05233
[Branch ~zeitgeist/zeitgeist/bluebird] Rev 440: Merge lp:~zeitgeist/zeitgeist/fts-origin-hashing
Merge authors:
Michal Hruby (mhr3)
Related merge proposals:
https://code.launchpad.net/~zeitgeist/zeitgeist/fts-origin-hashing/+merge/98281
proposed by: Siegfried Gevatter (rainct)
review: Approve - Siegfried Gevatter (rainct)
------------------------------------------------------------
revno: 440 [merge]
committer: Michal Hruby <michal.mhr@xxxxxxxxx>
branch nick: zeitgeist
timestamp: Mon 2012-03-19 22:44:48 +0100
message:
Merge lp:~zeitgeist/zeitgeist/fts-origin-hashing
modified:
extensions/fts++/indexer.cpp
extensions/fts++/indexer.h
--
lp:zeitgeist
https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird
Your team Zeitgeist Framework Team is subscribed to branch lp:zeitgeist.
To unsubscribe from this branch go to https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird/+edit-subscription
=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp 2012-03-19 19:56:38 +0000
+++ extensions/fts++/indexer.cpp 2012-03-19 21:33:59 +0000
@@ -43,6 +43,7 @@
const Xapian::valueno VALUE_EVENT_ID = 0;
const Xapian::valueno VALUE_TIMESTAMP = 1;
const Xapian::valueno VALUE_URI_HASH = 2;
+const Xapian::valueno VALUE_ORIGIN_HASH = 3;
#define QUERY_PARSER_FLAGS \
Xapian::QueryParser::FLAG_PHRASE | Xapian::QueryParser::FLAG_BOOLEAN | \
@@ -763,7 +764,11 @@
result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS ||
result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
- result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
{
maxhits = count;
}
@@ -795,8 +800,7 @@
result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
{
- // FIXME: not really correct but close :)
- enquire->set_collapse_key (VALUE_URI_HASH);
+ enquire->set_collapse_key (VALUE_ORIGIN_HASH);
}
else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
@@ -1137,10 +1141,8 @@
result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
{
- // FIXME: not really correct but close :)
enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort);
- enquire->set_collapse_key (VALUE_URI_HASH);
- maxhits *= 3;
+ enquire->set_collapse_key (VALUE_ORIGIN_HASH);
}
else
{
@@ -1272,6 +1274,16 @@
return results;
}
+static void
+get_digest_for_uri (GChecksum *checksum, const gchar *uri,
+ guint8 *digest, gsize *digest_size)
+{
+ g_checksum_update (checksum, (guchar *) uri, -1);
+ g_checksum_get_digest (checksum, digest, digest_size);
+ g_checksum_reset (checksum);
+ g_assert (digest_size == NULL || *digest_size == HASH_LENGTH);
+}
+
void Indexer::IndexEvent (ZeitgeistEvent *event)
{
try
@@ -1322,19 +1334,28 @@
return; // ignore this event completely...
}
+ guint8 uri_hash[HASH_LENGTH + 1];
+ gsize hash_size = HASH_LENGTH;
+
// We need the subject URI so we can use Xapian's collapse key feature
// for *_SUBJECT grouping. However, to save space, we'll just save a hash.
// A better option would be using URI's id, but for that we'd need a SQL
// query that'd be subject to races.
// FIXME(?): This doesn't work for events with multiple subjects.
- g_checksum_update (checksum, (guchar *) uri.c_str (), -1);
- guint8 uri_hash[HASH_LENGTH + 1];
- gsize hash_size = HASH_LENGTH;
- g_checksum_get_digest (checksum, uri_hash, &hash_size);
- g_checksum_reset (checksum);
- g_assert (hash_size == HASH_LENGTH);
+ get_digest_for_uri (checksum, uri.c_str (), uri_hash, &hash_size);
doc.add_value (VALUE_URI_HASH, std::string((char *) uri_hash, hash_size));
+ size_t colon_pos = uri.find (':');
+ // FIXME: use current_origin here once it is available
+ val = zeitgeist_subject_get_origin (subject);
+ // make sure the schemes of the URI and origin are the same
+ if (val && colon_pos != std::string::npos && strncmp (uri.c_str (), val, colon_pos+1) == 0)
+ {
+ hash_size = HASH_LENGTH;
+ get_digest_for_uri (checksum, val, uri_hash, &hash_size);
+ doc.add_value (VALUE_ORIGIN_HASH, std::string((char *) uri_hash, hash_size));
+ }
+
val = zeitgeist_subject_get_text (subject);
if (val && val[0] != '\0')
{
=== modified file 'extensions/fts++/indexer.h'
--- extensions/fts++/indexer.h 2012-03-12 14:22:16 +0000
+++ extensions/fts++/indexer.h 2012-03-19 21:42:52 +0000
@@ -29,7 +29,7 @@
namespace ZeitgeistFTS {
-const std::string INDEX_VERSION = "2";
+const std::string INDEX_VERSION = "3";
class Indexer
{