← Back to team overview

zeitgeist team mailing list archive

[Merge] lp:~zeitgeist/zeitgeist/fts-origin-hashing into lp:zeitgeist

 

Siegfried Gevatter has proposed merging lp:~zeitgeist/zeitgeist/fts-origin-hashing into lp:zeitgeist.

Requested reviews:
  Zeitgeist Framework Team (zeitgeist)

For more details, see:
https://code.launchpad.net/~zeitgeist/zeitgeist/fts-origin-hashing/+merge/98281
-- 
https://code.launchpad.net/~zeitgeist/zeitgeist/fts-origin-hashing/+merge/98281
Your team Zeitgeist Framework Team is requested to review the proposed merge of lp:~zeitgeist/zeitgeist/fts-origin-hashing into lp:zeitgeist.
=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp	2012-03-16 20:03:05 +0000
+++ extensions/fts++/indexer.cpp	2012-03-19 20:32:18 +0000
@@ -43,6 +43,7 @@
 const Xapian::valueno VALUE_EVENT_ID = 0;
 const Xapian::valueno VALUE_TIMESTAMP = 1;
 const Xapian::valueno VALUE_URI_HASH = 2;
+const Xapian::valueno VALUE_ORIGIN_HASH = 3;
 
 #define QUERY_PARSER_FLAGS \
   Xapian::QueryParser::FLAG_PHRASE | Xapian::QueryParser::FLAG_BOOLEAN | \
@@ -778,7 +779,9 @@
     }
     else
     {
-      enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
+      bool reversed_sort = not
+          zeitgeist_result_type_is_sort_order_asc (result_type);
+      enquire->set_sort_by_value (VALUE_TIMESTAMP, reversed_sort);
     }
 
     if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
@@ -786,7 +789,19 @@
         result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
         result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
     {
-        enquire->set_collapse_key (VALUE_URI_HASH);
+      enquire->set_collapse_key (VALUE_URI_HASH);
+    }
+    else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
+    {
+      enquire->set_collapse_key (VALUE_ORIGIN_HASH);
+    }
+    else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
+        result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
+    {
+      enquire->set_collapse_key (VALUE_EVENT_ID);
     }
 
     Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));
@@ -1096,12 +1111,8 @@
       return NULL;
     }
 
-    bool reversed_sort =
-      result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
-      result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
-      result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
-      result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
-      result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN;
+    bool reversed_sort = not
+        zeitgeist_result_type_is_sort_order_asc (result_type);
 
     if (result_type == RELEVANCY_RESULT_TYPE)
     {
@@ -1126,10 +1137,8 @@
         result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
         result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
     {
-      // FIXME: not really correct but close :)
       enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort);
-      enquire->set_collapse_key (VALUE_URI_HASH);
-      maxhits *= 3;
+      enquire->set_collapse_key (VALUE_ORIGIN_HASH);
     }
     else
     {
@@ -1311,19 +1320,36 @@
         return; // ignore this event completely...
       }
 
+      guint8 uri_hash[HASH_LENGTH + 1];
+
       // We need the subject URI so we can use Xapian's collapse key feature
       // for *_SUBJECT grouping. However, to save space, we'll just save a hash.
       // A better option would be using URI's id, but for that we'd need a SQL
       // query that'd be subject to races.
       // FIXME(?): This doesn't work for events with multiple subjects.
       g_checksum_update (checksum, (guchar *) uri.c_str (), -1);
-      guint8 uri_hash[HASH_LENGTH + 1];
       gsize hash_size = HASH_LENGTH;
       g_checksum_get_digest (checksum, uri_hash, &hash_size);
       g_checksum_reset (checksum);
       g_assert (hash_size == HASH_LENGTH);
       doc.add_value (VALUE_URI_HASH, std::string((char *) uri_hash, hash_size));
 
+      size_t colon_pos = uri.find (':');
+      // FIXME: use current_origin once we have that
+      val = zeitgeist_subject_get_origin (subject);
+      // Make sure the schemes of the URI and the origin are the same,
+      // to avoid saving junk.
+      if (val && colon_pos != std::string::npos
+              && strncmp (uri.c_str (), val, colon_pos+1) == 0)
+      {
+          g_checksum_update (checksum, (guchar *) val, -1);
+          g_checksum_get_digest (checksum, uri_hash, &hash_size);
+          g_checksum_reset (checksum);
+          g_assert (hash_size == HASH_LENGTH);
+          doc.add_value (VALUE_ORIGIN_HASH,
+                         std::string((char *) uri_hash, hash_size));
+      }
+
       val = zeitgeist_subject_get_text (subject);
       if (val && val[0] != '\0')
       {

=== modified file 'src/datamodel.vala'
--- src/datamodel.vala	2012-02-18 21:33:57 +0000
+++ src/datamodel.vala	2012-03-19 20:32:18 +0000
@@ -230,10 +230,60 @@
                                                    // different origin ordered
                                                    // by the popularity of the
                                                    // origins
-        LEAST_POPULAR_EVENT_ORIGIN           = 30, //   The last event of each
+        LEAST_POPULAR_EVENT_ORIGIN           = 30; //   The last event of each
                                                    // different origin, ordered
                                                    // ascendingly by the
                                                    // popularity of the origin
+
+        /*
+         * Returns true if the results for the given result_type are sorted by
+         * ascending date, false if descending (unknown types: warn, true).
+         **/
+        public static bool is_sort_order_asc (ResultType result_type)
+        {
+            switch (result_type)
+            {
+                // FIXME: Why are LEAST_POPULAR_* using ASC?
+                case ResultType.LEAST_RECENT_EVENTS:
+                case ResultType.LEAST_RECENT_EVENT_ORIGIN:
+                case ResultType.LEAST_POPULAR_EVENT_ORIGIN:
+                case ResultType.LEAST_RECENT_SUBJECTS:
+                case ResultType.LEAST_POPULAR_SUBJECTS:
+                case ResultType.LEAST_RECENT_CURRENT_URI:
+                case ResultType.LEAST_POPULAR_CURRENT_URI:
+                case ResultType.LEAST_RECENT_ACTOR:
+                case ResultType.LEAST_POPULAR_ACTOR:
+                case ResultType.OLDEST_ACTOR:
+                case ResultType.LEAST_RECENT_ORIGIN:
+                case ResultType.LEAST_POPULAR_ORIGIN:
+                case ResultType.LEAST_RECENT_SUBJECT_INTERPRETATION:
+                case ResultType.LEAST_POPULAR_SUBJECT_INTERPRETATION:
+                case ResultType.LEAST_RECENT_MIMETYPE:
+                case ResultType.LEAST_POPULAR_MIMETYPE:
+                    return true; // LEAST_* and OLDEST_ACTOR: ascending
+
+                case ResultType.MOST_RECENT_EVENTS:
+                case ResultType.MOST_RECENT_EVENT_ORIGIN:
+                case ResultType.MOST_POPULAR_EVENT_ORIGIN:
+                case ResultType.MOST_RECENT_SUBJECTS:
+                case ResultType.MOST_POPULAR_SUBJECTS:
+                case ResultType.MOST_RECENT_CURRENT_URI:
+                case ResultType.MOST_POPULAR_CURRENT_URI:
+                case ResultType.MOST_RECENT_ACTOR:
+                case ResultType.MOST_POPULAR_ACTOR:
+                case ResultType.MOST_RECENT_ORIGIN:
+                case ResultType.MOST_POPULAR_ORIGIN:
+                case ResultType.MOST_RECENT_SUBJECT_INTERPRETATION:
+                case ResultType.MOST_POPULAR_SUBJECT_INTERPRETATION:
+                case ResultType.MOST_RECENT_MIMETYPE:
+                case ResultType.MOST_POPULAR_MIMETYPE:
+                    return false; // MOST_*: descending
+
+                default:
+                    warning ("Unrecognized ResultType: %u", (uint) result_type);
+                    return true; // unknown value: fall back to ascending
+            }
+        }
     }
 
     /*

=== modified file 'src/db-reader.vala'
--- src/db-reader.vala	2012-03-17 14:47:09 +0000
+++ src/db-reader.vala	2012-03-19 20:32:18 +0000
@@ -173,109 +173,107 @@
         switch (result_type)
         {
             case ResultType.MOST_RECENT_EVENTS:
-                sql += where_sql + " ORDER BY timestamp DESC";
+                sql += where_sql + " ORDER BY ";
                 break;
             case ResultType.LEAST_RECENT_EVENTS:
-                sql += where_sql + " ORDER BY timestamp ASC";
+                sql += where_sql + " ORDER BY ";
                 break;
             case ResultType.MOST_RECENT_EVENT_ORIGIN:
-                sql += group_and_sort ("origin", where_sql, false);
+                sql += group_and_sort ("origin", where_sql);
                 break;
             case ResultType.LEAST_RECENT_EVENT_ORIGIN:
-                sql += group_and_sort ("origin", where_sql, true);
+                sql += group_and_sort ("origin", where_sql);
                 break;
             case ResultType.MOST_POPULAR_EVENT_ORIGIN:
-                sql += group_and_sort ("origin", where_sql, false, false);
+                sql += group_and_sort ("origin", where_sql, false);
                 break;
             case ResultType.LEAST_POPULAR_EVENT_ORIGIN:
-                sql += group_and_sort ("origin", where_sql, true, true);
+                sql += group_and_sort ("origin", where_sql, true);
                 break;
             case ResultType.MOST_RECENT_SUBJECTS:
-                sql += group_and_sort ("subj_id", where_sql, false);
+                sql += group_and_sort ("subj_id", where_sql);
                 break;
             case ResultType.LEAST_RECENT_SUBJECTS:
-                sql += group_and_sort ("subj_id", where_sql, true);
+                sql += group_and_sort ("subj_id", where_sql);
                 break;
             case ResultType.MOST_POPULAR_SUBJECTS:
-                sql += group_and_sort ("subj_id", where_sql, false, false);
+                sql += group_and_sort ("subj_id", where_sql, false);
                 break;
             case ResultType.LEAST_POPULAR_SUBJECTS:
-                sql += group_and_sort ("subj_id", where_sql, true, true);
+                sql += group_and_sort ("subj_id", where_sql, true);
                 break;
             case ResultType.MOST_RECENT_CURRENT_URI:
-                sql += group_and_sort ("subj_id_current", where_sql, false);
+                sql += group_and_sort ("subj_id_current", where_sql);
                 break;
             case ResultType.LEAST_RECENT_CURRENT_URI:
-                sql += group_and_sort ("subj_id_current", where_sql, true);
+                sql += group_and_sort ("subj_id_current", where_sql);
                 break;
             case ResultType.MOST_POPULAR_CURRENT_URI:
-                sql += group_and_sort ("subj_id_current", where_sql,
-                    false, false);
+                sql += group_and_sort ("subj_id_current", where_sql, false);
                 break;
             case ResultType.LEAST_POPULAR_CURRENT_URI:
-                sql += group_and_sort ("subj_id_current", where_sql,
-                    true, true);
+                sql += group_and_sort ("subj_id_current", where_sql, true);
                 break;
             case ResultType.MOST_RECENT_ACTOR:
-                sql += group_and_sort ("actor", where_sql, false);
+                sql += group_and_sort ("actor", where_sql);
                 break;
             case ResultType.LEAST_RECENT_ACTOR:
-                sql += group_and_sort ("actor", where_sql, true);
+                sql += group_and_sort ("actor", where_sql);
                 break;
             case ResultType.MOST_POPULAR_ACTOR:
-                sql += group_and_sort ("actor", where_sql, false, false);
+                sql += group_and_sort ("actor", where_sql, false);
                 break;
             case ResultType.LEAST_POPULAR_ACTOR:
-                sql += group_and_sort ("actor", where_sql, true, true);
+                sql += group_and_sort ("actor", where_sql, true);
                 break;
             case ResultType.OLDEST_ACTOR:
-                sql += group_and_sort ("actor", where_sql, true, null, "min");
+                sql += group_and_sort ("actor", where_sql, null, "min");
                 break;
             case ResultType.MOST_RECENT_ORIGIN:
-                sql += group_and_sort ("subj_origin", where_sql, false);
+                sql += group_and_sort ("subj_origin", where_sql);
                 break;
             case ResultType.LEAST_RECENT_ORIGIN:
-                sql += group_and_sort ("subj_origin", where_sql, true);
+                sql += group_and_sort ("subj_origin", where_sql);
                 break;
             case ResultType.MOST_POPULAR_ORIGIN:
-                sql += group_and_sort ("subj_origin", where_sql, false, false);
+                sql += group_and_sort ("subj_origin", where_sql, false);
                 break;
             case ResultType.LEAST_POPULAR_ORIGIN:
-                sql += group_and_sort ("subj_origin", where_sql, true, true);
+                sql += group_and_sort ("subj_origin", where_sql, true);
                 break;
             case ResultType.MOST_RECENT_SUBJECT_INTERPRETATION:
-                sql += group_and_sort ("subj_interpretation", where_sql, false);
+                sql += group_and_sort ("subj_interpretation", where_sql);
                 break;
             case ResultType.LEAST_RECENT_SUBJECT_INTERPRETATION:
-                sql += group_and_sort ("subj_interpretation", where_sql, true);
+                sql += group_and_sort ("subj_interpretation", where_sql);
                 break;
             case ResultType.MOST_POPULAR_SUBJECT_INTERPRETATION:
-                sql += group_and_sort ("subj_interpretation", where_sql,
-                    false, false);
+                sql += group_and_sort ("subj_interpretation", where_sql, false);
                 break;
             case ResultType.LEAST_POPULAR_SUBJECT_INTERPRETATION:
-                sql += group_and_sort ("subj_interpretation", where_sql,
-                    true, true);
+                sql += group_and_sort ("subj_interpretation", where_sql, true);
                 break;
             case ResultType.MOST_RECENT_MIMETYPE:
-                sql += group_and_sort ("subj_mimetype", where_sql, false);
+                sql += group_and_sort ("subj_mimetype", where_sql);
                 break;
             case ResultType.LEAST_RECENT_MIMETYPE:
-                sql += group_and_sort ("subj_mimetype", where_sql, true);
+                sql += group_and_sort ("subj_mimetype", where_sql);
                 break;
             case ResultType.MOST_POPULAR_MIMETYPE:
-                sql += group_and_sort ("subj_mimetype", where_sql,
-                    false, false);
+                sql += group_and_sort ("subj_mimetype", where_sql, false);
                 break;
             case ResultType.LEAST_POPULAR_MIMETYPE:
-                sql += group_and_sort ("subj_mimetype", where_sql,
-                    true, true);
+                sql += group_and_sort ("subj_mimetype", where_sql, true);
                 break;
             default:
                 string error_message = "Invalid ResultType.";
                 warning (error_message);
                 throw new EngineError.INVALID_ARGUMENT (error_message);
         }
+        
+        // Append the timestamp sort term to the ORDER BY clause left open above
+        bool time_asc = ResultType.is_sort_order_asc ((ResultType) result_type);
+        sql += " timestamp %s".printf ((time_asc) ? "ASC" : "DESC");
 
         int rc;
         Sqlite.Statement stmt;
@@ -578,10 +576,8 @@
 
     // Used by find_event_ids
     private string group_and_sort (string field, string where_sql,
-        bool time_asc=false, bool? count_asc=null,
-        string aggregation_type="max")
+        bool? count_asc=null, string aggregation_type="max")
     {
-        string time_sorting = (time_asc) ? "ASC" : "DESC";
         string aggregation_sql = "";
         string order_sql = "";
 
@@ -599,7 +595,7 @@
                 FROM event_view %s
                 GROUP BY %s)
             GROUP BY %s
-            ORDER BY %s timestamp %s
+            ORDER BY %s 
             """.printf (
                 field,
                 aggregation_type,
@@ -607,7 +603,7 @@
                 where_sql,
                 field,
                 field,
-                order_sql, time_sorting);
+                order_sql);
     }
 
     // Used by find_event_ids


Follow ups