← Back to team overview

zeitgeist team mailing list archive

[Branch ~zeitgeist/zeitgeist/bluebird] Rev 420: Index only recognized uri schemes

 

------------------------------------------------------------
revno: 420
committer: Michal Hruby <michal.mhr@xxxxxxxxx>
branch nick: zeitgeist
timestamp: Mon 2012-03-12 15:22:16 +0100
message:
  Index only recognized uri schemes
modified:
  extensions/fts++/indexer.cpp
  extensions/fts++/indexer.h


--
lp:zeitgeist
https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird

Your team, Zeitgeist Framework Team, is subscribed to branch lp:zeitgeist.
To unsubscribe from this branch, go to https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird/+edit-subscription
=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp	2012-03-07 16:08:26 +0000
+++ extensions/fts++/indexer.cpp	2012-03-12 14:22:16 +0000
@@ -106,9 +106,17 @@
     
     g_assert (g_checksum_type_get_length (G_CHECKSUM_MD5) == HASH_LENGTH);
     this->checksum = g_checksum_new (G_CHECKSUM_MD5);
-    if (!this->checksum)
-        g_critical ("GChecksum initialization failed.");
-
+    if (!this->checksum) g_critical ("GChecksum initialization failed.");
+
+    GError *error = NULL;
+    /* we need to be careful with what we log, for example ubuntuone logs its
+     * weird uids and that screws up the index */
+    this->uri_schemes_regex = g_regex_new (
+        "(file|http[s]?|[s]?ftp|ssh|smb|dav[s]?|application)$", G_REGEX_OPTIMIZE,
+        (GRegexMatchFlags) 0, &error);
+
+    if (error)
+      g_critical ("Unable to initialize uri scheme regex: %s", error->message);
   }
   catch (const Xapian::Error &xp_error)
   {
@@ -399,7 +407,7 @@
   tokenizer->index_text (StringUtils::AsciiFold (text), 5);
 }
 
-void Indexer::IndexUri (std::string const& uri, std::string const& origin)
+bool Indexer::IndexUri (std::string const& uri, std::string const& origin)
 {
   GFile *f = g_file_new_for_uri (uri.c_str ());
 
@@ -407,12 +415,21 @@
   if (scheme == NULL)
   {
     g_warning ("Invalid URI: %s", uri.c_str ());
-    return;
+    g_object_unref (f);
+    return false;
   }
 
   std::string scheme_str(scheme);
   g_free (scheme);
 
+  // do we support this scheme?
+  if (!g_regex_match (uri_schemes_regex, scheme_str.c_str (),
+        (GRegexMatchFlags) 0, NULL))
+  {
+    g_object_unref (f);
+    return false;
+  }
+
   if (scheme_str == "file")
   {
     // FIXME: special case some typical filenames (like photos)
@@ -462,7 +479,7 @@
         weight_index < G_N_ELEMENTS (path_weights))
     {
       // if this is already home directory we don't want it
-      if (path_component == home_dir_path) return;
+      if (path_component == home_dir_path) break;
 
       gchar *name = g_path_get_basename (path_component.c_str ());
 
@@ -481,10 +498,11 @@
     // mailto:username@xxxxxxxxxx
     size_t scheme_len = scheme_str.length () + 1;
     size_t at_pos = uri.find ('@', scheme_len);
-    if (at_pos == std::string::npos) return;
-
-    tokenizer->index_text (uri.substr (scheme_len, at_pos - scheme_len), 5);
-    tokenizer->index_text (uri.substr (at_pos + 1), 1);
+    if (at_pos != std::string::npos)
+    {
+      tokenizer->index_text (uri.substr (scheme_len, at_pos - scheme_len), 5);
+      tokenizer->index_text (uri.substr (at_pos + 1), 1);
+    }
   }
   else if (scheme_str.compare (0, 4, "http") == 0)
   {
@@ -578,6 +596,8 @@
   }
 
   g_object_unref (f);
+
+  return true;
 }
 
 bool Indexer::IndexActor (std::string const& actor, bool is_subject)
@@ -1035,15 +1055,11 @@
         if (!IndexActor (uri, true))
           IndexUri (uri, origin);
       }
-      else if (uri.compare (0, 10, "ubuntuone:") == 0)
+      else if (!IndexUri (uri, origin))
       {
-        // U1 logs its uids, we don't want to index those
+        // unsupported uri scheme
         return;
       }
-      else
-      {
-        IndexUri (uri, origin);
-      }
     }
 
     AddDocFilters (event, doc);

=== modified file 'extensions/fts++/indexer.h'
--- extensions/fts++/indexer.h	2012-03-07 16:08:26 +0000
+++ extensions/fts++/indexer.h	2012-03-12 14:22:16 +0000
@@ -57,6 +57,7 @@
     if (query_parser) delete query_parser;
     if (db) delete db;
     if (checksum) g_checksum_free (checksum);
+    if (uri_schemes_regex) g_regex_unref (uri_schemes_regex);
 
     for (AppInfoMap::iterator it = app_info_cache.begin ();
          it != app_info_cache.end (); ++it)
@@ -111,7 +112,7 @@
 
   void AddDocFilters (ZeitgeistEvent *event, Xapian::Document &doc);
   void IndexText (std::string const& text);
-  void IndexUri (std::string const& uri, std::string const& origin);
+  bool IndexUri (std::string const& uri, std::string const& origin);
   bool IndexActor (std::string const& actor, bool is_subject);
 
   gboolean ClearFailedLookupsCb ();
@@ -123,7 +124,8 @@
   Xapian::TermGenerator    *tokenizer;
   AppInfoMap                app_info_cache;
   ApplicationSet            failed_lookups;
-  GChecksum                 *checksum;
+  GChecksum                *checksum;
+  GRegex                   *uri_schemes_regex; 
 
   guint                     clear_failed_id;
   std::string               home_dir_path;