zeitgeist team mailing list archive
-
zeitgeist team
-
Mailing list archive
-
Message #05073
[Branch ~zeitgeist/zeitgeist/bluebird] Rev 420: Index only recognized uri schemes
------------------------------------------------------------
revno: 420
committer: Michal Hruby <michal.mhr@xxxxxxxxx>
branch nick: zeitgeist
timestamp: Mon 2012-03-12 15:22:16 +0100
message:
Index only recognized uri schemes
modified:
extensions/fts++/indexer.cpp
extensions/fts++/indexer.h
--
lp:zeitgeist
https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird
Your team Zeitgeist Framework Team is subscribed to branch lp:zeitgeist.
To unsubscribe from this branch go to https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird/+edit-subscription
=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp 2012-03-07 16:08:26 +0000
+++ extensions/fts++/indexer.cpp 2012-03-12 14:22:16 +0000
@@ -106,9 +106,17 @@
g_assert (g_checksum_type_get_length (G_CHECKSUM_MD5) == HASH_LENGTH);
this->checksum = g_checksum_new (G_CHECKSUM_MD5);
- if (!this->checksum)
- g_critical ("GChecksum initialization failed.");
-
+ if (!this->checksum) g_critical ("GChecksum initialization failed.");
+
+ GError *error = NULL;
+ /* we need to be careful with what we log, for example ubuntuone logs its
+ * weird uids and that screws up the index */
+ this->uri_schemes_regex = g_regex_new (
+ "(file|http[s]?|[s]?ftp|ssh|smb|dav[s]?|application)$", G_REGEX_OPTIMIZE,
+ (GRegexMatchFlags) 0, &error);
+
+ if (error)
+ g_critical ("Unable to initialize uri scheme regex: %s", error->message);
}
catch (const Xapian::Error &xp_error)
{
@@ -399,7 +407,7 @@
tokenizer->index_text (StringUtils::AsciiFold (text), 5);
}
-void Indexer::IndexUri (std::string const& uri, std::string const& origin)
+bool Indexer::IndexUri (std::string const& uri, std::string const& origin)
{
GFile *f = g_file_new_for_uri (uri.c_str ());
@@ -407,12 +415,21 @@
if (scheme == NULL)
{
g_warning ("Invalid URI: %s", uri.c_str ());
- return;
+ g_object_unref (f);
+ return false;
}
std::string scheme_str(scheme);
g_free (scheme);
+ // do we support this scheme?
+ if (!g_regex_match (uri_schemes_regex, scheme_str.c_str (),
+ (GRegexMatchFlags) 0, NULL))
+ {
+ g_object_unref (f);
+ return false;
+ }
+
if (scheme_str == "file")
{
// FIXME: special case some typical filenames (like photos)
@@ -462,7 +479,7 @@
weight_index < G_N_ELEMENTS (path_weights))
{
// if this is already home directory we don't want it
- if (path_component == home_dir_path) return;
+ if (path_component == home_dir_path) break;
gchar *name = g_path_get_basename (path_component.c_str ());
@@ -481,10 +498,11 @@
// mailto:username@xxxxxxxxxx
size_t scheme_len = scheme_str.length () + 1;
size_t at_pos = uri.find ('@', scheme_len);
- if (at_pos == std::string::npos) return;
-
- tokenizer->index_text (uri.substr (scheme_len, at_pos - scheme_len), 5);
- tokenizer->index_text (uri.substr (at_pos + 1), 1);
+ if (at_pos != std::string::npos)
+ {
+ tokenizer->index_text (uri.substr (scheme_len, at_pos - scheme_len), 5);
+ tokenizer->index_text (uri.substr (at_pos + 1), 1);
+ }
}
else if (scheme_str.compare (0, 4, "http") == 0)
{
@@ -578,6 +596,8 @@
}
g_object_unref (f);
+
+ return true;
}
bool Indexer::IndexActor (std::string const& actor, bool is_subject)
@@ -1035,15 +1055,11 @@
if (!IndexActor (uri, true))
IndexUri (uri, origin);
}
- else if (uri.compare (0, 10, "ubuntuone:") == 0)
+ else if (!IndexUri (uri, origin))
{
- // U1 logs its uids, we don't want to index those
+ // unsupported uri scheme
return;
}
- else
- {
- IndexUri (uri, origin);
- }
}
AddDocFilters (event, doc);
=== modified file 'extensions/fts++/indexer.h'
--- extensions/fts++/indexer.h 2012-03-07 16:08:26 +0000
+++ extensions/fts++/indexer.h 2012-03-12 14:22:16 +0000
@@ -57,6 +57,7 @@
if (query_parser) delete query_parser;
if (db) delete db;
if (checksum) g_checksum_free (checksum);
+ if (uri_schemes_regex) g_regex_unref (uri_schemes_regex);
for (AppInfoMap::iterator it = app_info_cache.begin ();
it != app_info_cache.end (); ++it)
@@ -111,7 +112,7 @@
void AddDocFilters (ZeitgeistEvent *event, Xapian::Document &doc);
void IndexText (std::string const& text);
- void IndexUri (std::string const& uri, std::string const& origin);
+ bool IndexUri (std::string const& uri, std::string const& origin);
bool IndexActor (std::string const& actor, bool is_subject);
gboolean ClearFailedLookupsCb ();
@@ -123,7 +124,8 @@
Xapian::TermGenerator *tokenizer;
AppInfoMap app_info_cache;
ApplicationSet failed_lookups;
- GChecksum *checksum;
+ GChecksum *checksum;
+ GRegex *uri_schemes_regex;
guint clear_failed_id;
std::string home_dir_path;