← Back to team overview

zeitgeist team mailing list archive

[Branch ~zeitgeist/zeitgeist/bluebird] Rev 440: Merge lp:~zeitgeist/zeitgeist/fts-origin-hashing

 

Merge authors:
  Michal Hruby (mhr3)
Related merge proposals:
  https://code.launchpad.net/~zeitgeist/zeitgeist/fts-origin-hashing/+merge/98281
  proposed by: Siegfried Gevatter (rainct)
  review: Approve - Siegfried Gevatter (rainct)
------------------------------------------------------------
revno: 440 [merge]
committer: Michal Hruby <michal.mhr@xxxxxxxxx>
branch nick: zeitgeist
timestamp: Mon 2012-03-19 22:44:48 +0100
message:
  Merge lp:~zeitgeist/zeitgeist/fts-origin-hashing
modified:
  extensions/fts++/indexer.cpp
  extensions/fts++/indexer.h


--
lp:zeitgeist
https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird

Your team Zeitgeist Framework Team is subscribed to branch lp:zeitgeist.
To unsubscribe from this branch go to https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird/+edit-subscription
=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp	2012-03-19 19:56:38 +0000
+++ extensions/fts++/indexer.cpp	2012-03-19 21:33:59 +0000
@@ -43,6 +43,7 @@
 const Xapian::valueno VALUE_EVENT_ID = 0;
 const Xapian::valueno VALUE_TIMESTAMP = 1;
 const Xapian::valueno VALUE_URI_HASH = 2;
+const Xapian::valueno VALUE_ORIGIN_HASH = 3;
 
 #define QUERY_PARSER_FLAGS \
   Xapian::QueryParser::FLAG_PHRASE | Xapian::QueryParser::FLAG_BOOLEAN | \
@@ -763,7 +764,11 @@
         result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
         result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS ||
         result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
-        result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS ||
+        result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
     {
       maxhits = count;
     }
@@ -795,8 +800,7 @@
         result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
         result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
     {
-      // FIXME: not really correct but close :)
-      enquire->set_collapse_key (VALUE_URI_HASH);
+      enquire->set_collapse_key (VALUE_ORIGIN_HASH);
     }
     else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
         result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
@@ -1137,10 +1141,8 @@
         result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
         result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
     {
-      // FIXME: not really correct but close :)
       enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort);
-      enquire->set_collapse_key (VALUE_URI_HASH);
-      maxhits *= 3;
+      enquire->set_collapse_key (VALUE_ORIGIN_HASH);
     }
     else
     {
@@ -1272,6 +1274,16 @@
   return results;
 }
 
+static void
+get_digest_for_uri (GChecksum *checksum, const gchar *uri,
+                    guint8 *digest, gsize *digest_size)
+{
+  g_checksum_update (checksum, (guchar *) uri, -1);
+  g_checksum_get_digest (checksum, digest, digest_size);
+  g_checksum_reset (checksum);
+  g_assert (digest_size == NULL || *digest_size == HASH_LENGTH);
+}
+
 void Indexer::IndexEvent (ZeitgeistEvent *event)
 {
   try
@@ -1322,19 +1334,28 @@
         return; // ignore this event completely...
       }
 
+      guint8 uri_hash[HASH_LENGTH + 1];
+      gsize hash_size = HASH_LENGTH;
+
       // We need the subject URI so we can use Xapian's collapse key feature
       // for *_SUBJECT grouping. However, to save space, we'll just save a hash.
       // A better option would be using URI's id, but for that we'd need a SQL
       // query that'd be subject to races.
       // FIXME(?): This doesn't work for events with multiple subjects.
-      g_checksum_update (checksum, (guchar *) uri.c_str (), -1);
-      guint8 uri_hash[HASH_LENGTH + 1];
-      gsize hash_size = HASH_LENGTH;
-      g_checksum_get_digest (checksum, uri_hash, &hash_size);
-      g_checksum_reset (checksum);
-      g_assert (hash_size == HASH_LENGTH);
+      get_digest_for_uri (checksum, uri.c_str (), uri_hash, &hash_size);
       doc.add_value (VALUE_URI_HASH, std::string((char *) uri_hash, hash_size));
 
+      size_t colon_pos = uri.find (':');
+      // FIXME: current_origin once we have that
+      val = zeitgeist_subject_get_origin (subject);
+      // make sure the schemes of the URI and origin are the same
+      if (val && colon_pos != std::string::npos && strncmp (uri.c_str (), val, colon_pos+1) == 0)
+      {
+        hash_size = HASH_LENGTH;
+        get_digest_for_uri (checksum, val, uri_hash, &hash_size);
+        doc.add_value (VALUE_ORIGIN_HASH, std::string((char *) uri_hash, hash_size));
+      }
+
       val = zeitgeist_subject_get_text (subject);
       if (val && val[0] != '\0')
       {

=== modified file 'extensions/fts++/indexer.h'
--- extensions/fts++/indexer.h	2012-03-12 14:22:16 +0000
+++ extensions/fts++/indexer.h	2012-03-19 21:42:52 +0000
@@ -29,7 +29,7 @@
 
 namespace ZeitgeistFTS {
 
-const std::string INDEX_VERSION = "2";
+const std::string INDEX_VERSION = "3";
 
 class Indexer
 {