divmod-dev team mailing list archive: Message #00320
[Merge] lp:~exarkun/divmod.org/spambayes-fewer-potatoes into lp:divmod.org
Jean-Paul Calderone has proposed merging lp:~exarkun/divmod.org/spambayes-fewer-potatoes into lp:divmod.org.
Requested reviews:
Divmod-dev (divmod-dev)
Related bugs:
Bug #1040874 in Quotient: "Database persistence for spambayes classifier data could be faster"
https://bugs.launchpad.net/quotient/+bug/1040874
For more details, see:
https://code.launchpad.net/~exarkun/divmod.org/spambayes-fewer-potatoes/+merge/121094
Some speedups to the new spambayes database layer. Benchmark results are on the ticket (sorry, Launchpad, how do you work exactly?)
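
For anyone who wants the shape of the change before reading the diff: the speedups come from replacing per-token SQLite round trips with one batched SELECT per classified document and one batched INSERT (via executemany) per training operation. Below is a minimal, illustrative Python 2 sketch of that pattern using the stdlib sqlite3 module; the bayes table mirrors the columns used in the diff, but the CREATE TABLE statement is a plausible stand-in (the diff does not show the full schema) and the surrounding names are made up for the example, not code from the branch:

    import sqlite3

    db = sqlite3.connect(':memory:')
    db.execute(
        "CREATE TABLE bayes (word TEXT PRIMARY KEY, nspam INTEGER, nham INTEGER)")

    # One batched write per trained document, instead of one INSERT per token.
    pending = {
        u"spam": (u"spam", 3, 0),
        u"puppies": (u"puppies", 1, 2),
    }
    db.executemany(
        "INSERT OR REPLACE INTO bayes (word, nspam, nham) VALUES (?, ?, ?)",
        pending.itervalues())
    db.commit()

    # One batched read per classified document, instead of one SELECT per token.
    tokens = [u"spam", u"puppies", u"never-seen"]
    placeholders = ", ".join("?" * len(tokens))
    rows = db.execute(
        "SELECT word, nspam, nham FROM bayes WHERE word IN (%s)" % (placeholders,),
        tokens).fetchall()
    print rows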
--
https://code.launchpad.net/~exarkun/divmod.org/spambayes-fewer-potatoes/+merge/121094
Your team Divmod-dev is requested to review the proposed merge of lp:~exarkun/divmod.org/spambayes-fewer-potatoes into lp:divmod.org.
=== added directory 'Quotient/benchmarks'
=== added file 'Quotient/benchmarks/spambayes'
--- Quotient/benchmarks/spambayes 1970-01-01 00:00:00 +0000
+++ Quotient/benchmarks/spambayes 2012-08-23 22:01:18 +0000
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+
+# Benchmark of Quotient spambayes filter, both training and classification.
+
+import sys, tempfile, random, time
+
+from xquotient.spam import _SQLite3Classifier
+
+words = list(open('/usr/share/dict/words', 'r'))
+
+TRAINING_FACTOR = 50
+MESSAGE_FACTOR = 500
+
+def adj(duration):
+    return duration / (TRAINING_FACTOR * MESSAGE_FACTOR) * 1000.0
+
+
+def main(argv):
+    prng = random.Random()
+    prng.seed(12345)
+    prng.shuffle(words)
+
+    classifier = _SQLite3Classifier(tempfile.mktemp())
+
+    before = time.time()
+    for i in range(TRAINING_FACTOR):
+        classifier.learn(words[i:i + MESSAGE_FACTOR], True)
+
+    for i in range(TRAINING_FACTOR, TRAINING_FACTOR * 2):
+        classifier.learn(words[i:i + MESSAGE_FACTOR], False)
+    after = time.time()
+
+    print 'Learning: %.2f ms/word' % (adj(after - before),)
+
+    before = time.time()
+    for i in range(TRAINING_FACTOR * 2):
+        classifier.spamprob(words[i:i + MESSAGE_FACTOR])
+    after = time.time()
+
+    print 'Guessing: %.2f ms/word' % (adj(after - before),)
+
+
+if __name__ == '__main__':
+    main(sys.argv)
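
A note on reading the benchmark's output: adj converts a total wall-clock duration in seconds into milliseconds per word-operation, normalizing by TRAINING_FACTOR * MESSAGE_FACTOR. A quick worked example with a hypothetical duration:

    >>> TRAINING_FACTOR, MESSAGE_FACTOR = 50, 500
    >>> duration = 25.0   # hypothetical seconds for one whole phase
    >>> duration / (TRAINING_FACTOR * MESSAGE_FACTOR) * 1000.0
    1.0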
=== modified file 'Quotient/xquotient/spam.py'
--- Quotient/xquotient/spam.py 2012-08-20 19:51:00 +0000
+++ Quotient/xquotient/spam.py 2012-08-23 22:01:18 +0000
@@ -382,6 +382,21 @@
        statements. These are executed any time the classifier database is
        opened, with the expected failure that occurs when the schema has
        already been initialized handled and disregarded.
+
+    @ivar _readCache: Word information that is already known, either because
+        it has already been read from the database once or because we wrote
+        the information to the database. Keys are unicode tokens; values are
+        three-element sequences of the token, its nspam count, and its nham
+        count. This is used to hold word info between two different Spambayes
+        hooks, C{_getclues} and C{_wordinfoget}. The former has access to all
+        tokens in a particular document; the latter is a potato-programming
+        mistake. Loading all of the values at once in C{_getclues} is a big
+        performance win.
+
+    @ivar _writeCache: Word information that is on its way to the database
+        due to training. This has the same shape as C{_readCache}. Word info
+        is held here until training on one document is complete, then all the
+        word info is dumped into the database in a single SQL operation (via
+        I{executemany}).
    """
    SCHEMA = [
@@ -413,6 +428,7 @@
        return get, set, None, doc
    nspam = property(*nspam())
+
    def nham():
        doc = """
        A property which reflects the number of messages trained as ham, while
@@ -438,6 +454,9 @@
"""
classifier.Classifier.__init__(self)
self.databaseName = databaseName
+ self._readCache = {}
+ self._writeCache = {}
+
# Open the database, possibly initializing it if it has not yet been
# initialized, and then load the necessary global state from it (nspam,
# nham).
@@ -484,6 +503,47 @@
        self.cursor.execute('UPDATE state SET nspam=?, nham=?', (self._nspam, self._nham))
+    def _getclues(self, wordstream):
+        """
+        Hook into the classification process to speed it up.
+
+        See the base implementation for details about what C{_getclues} is
+        supposed to do. This implementation extends the base to look into the
+        wordstream and load all the necessary information with the minimum
+        amount of SQLite3 work, then calls up to the base implementation to
+        let it do the actual classification-related work.
+
+        @param wordstream: An iterable (probably a generator) of tokens from
+            the document to be classified.
+        """
+        # Materialize the wordstream so we can inspect it here and still hand
+        # it to the base implementation for consumption.
+        wordstream = list(wordstream)
+
+        # Find all the tokens we don't have in memory already.
+        missing = []
+        for word in wordstream:
+            if isinstance(word, str):
+                word = word.decode('utf-8', 'replace')
+            if word not in self._readCache:
+                missing.append(word)
+
+        # Load their state with a single query. SQLite accepts an empty list
+        # on the right-hand side of IN, so this is safe even if every token
+        # is already cached.
+        self.cursor.execute(
+            "SELECT word, nspam, nham FROM bayes WHERE word IN (%s)" % (
+                ", ".join("?" * len(missing)),),
+            missing)
+        rows = self.cursor.fetchall()
+
+        # Save them for later.
+        for row in rows:
+            self._readCache[row[0]] = row
+
+        # Let the base class do its thing, which will involve asking us about
+        # the state we just cached.
+        return classifier.Classifier._getclues(self, wordstream)
+
+
    def _get(self, word):
        """
        Load the training data for the given word.
@@ -497,13 +557,22 @@
"""
if isinstance(word, str):
word = word.decode('utf-8', 'replace')
-
- self.cursor.execute(
- "SELECT word, nspam, nham FROM bayes WHERE word=?", (word,))
- rows = self.cursor.fetchall()
- if rows:
- return rows[0]
- return None
+ try:
+ # Check to see if we already have this word's info in memory.
+ row = self._readCache[word]
+ except KeyError:
+ # If not, load it from the database.
+ self.cursor.execute(
+ "SELECT word, nspam, nham FROM bayes WHERE word=?", (word,))
+ rows = self.cursor.fetchall()
+ if rows:
+ # Add it to the cache and return it.
+ self._readCache[rows[0][0]] = rows[0]
+ return rows[0]
+ return None
+ else:
+ # Otherwise return what we knew already.
+ return row
    def _set(self, word, nspam, nham):
@@ -519,10 +588,7 @@
"""
if isinstance(word, str):
word = word.decode('utf-8', 'replace')
- self.cursor.execute(
- "INSERT OR REPLACE INTO bayes (word, nspam, nham) "
- "VALUES (?, ?, ?)",
- (word, nspam, nham))
+ self._readCache[word] = self._writeCache[word] = (word, nspam, nham)
    def _delete(self, word):
@@ -532,10 +598,12 @@
        @param word: A word (or any other kind of token) to lose training
            information about.
        @type word: C{str} or C{unicode} (but really, C{unicode} please)
+
+        @raise NotImplementedError: Deletion is not actually supported in
+            this backend. Fortunately, Quotient does not need it (it never
+            calls C{unlearn}).
        """
-        if isinstance(word, str):
-            word = word.decode('utf-8', 'replace')
-        self.cursor.execute("DELETE FROM bayes WHERE word=?", (word,))
+        raise NotImplementedError("There is no support for deletion.")
    def _post_training(self):
@@ -545,6 +613,11 @@
        transaction, which contains all of the database modifications for each
        token in that message.
        """
+        writes = self._writeCache.itervalues()
+        self._writeCache = {}
+        self.cursor.executemany(
+            "INSERT OR REPLACE INTO bayes (word, nspam, nham) "
+            "VALUES (?, ?, ?)", writes)
        self.db.commit()
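
One subtlety in _post_training worth calling out: in Python 2, itervalues() is a lazy iterator over the dict object it was called on, so rebinding self._writeCache to a fresh dict before executemany consumes the iterator is safe; the iterator keeps walking the old dict, and no new training data can sneak into the batch. A standalone Python 2 illustration (the names here are hypothetical):

    cache = {u"spam": (u"spam", 3, 0)}
    writes = cache.itervalues()  # lazy iterator over the *old* dict object
    cache = {}                   # rebinds the name; the old dict lives on
    print list(writes)           # prints [(u'spam', 3, 0)]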
=== modified file 'Quotient/xquotient/test/test_spambayes.py'
--- Quotient/xquotient/test/test_spambayes.py 2012-08-20 19:53:37 +0000
+++ Quotient/xquotient/test/test_spambayes.py 2012-08-23 22:01:18 +0000
@@ -42,6 +42,54 @@
        self.assertEqual(bayes.nspam, 1)
+    def test_spamTokenRecorded(self):
+        """
+        The first time a token is encountered during spam training, a row is
+        inserted into the database counting it as once a spam token, never a
+        ham token.
+        """
+        self.classifier.train(StringIO("spam bad gross"), True)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"spam")
+        self.assertEqual((u"spam", 1, 0), wordInfo)
+
+
+    def test_hamTokenRecorded(self):
+        """
+        The first time a token is encountered during ham training, a row is
+        inserted into the database counting it as never a spam token, once a
+        ham token.
+        """
+        self.classifier.train(StringIO("justice sunshine puppies"), False)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"sunshine")
+        self.assertEqual((u"sunshine", 0, 1), wordInfo)
+
+
+    def test_spamTokenIncremented(self):
+        """
+        Encountered on a subsequent spam training operation, an existing word
+        info row has its spam count incremented and its ham count left alone.
+        """
+        self.classifier.train(StringIO("justice sunshine puppies"), False)
+        self.classifier.train(StringIO("spam bad puppies"), True)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"puppies")
+        self.assertEqual((u"puppies", 1, 1), wordInfo)
+
+
+    def test_hamTokenIncremented(self):
+        """
+        Encountered on a subsequent ham training operation, an existing word
+        info row has its spam count left alone and its ham count incremented.
+        """
+        self.classifier.train(StringIO("spam bad puppies"), True)
+        self.classifier.train(StringIO("justice sunshine puppies"), False)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"puppies")
+        self.assertEqual((u"puppies", 1, 1), wordInfo)
+
+
    def test_nham(self):
        """
        L{SQLite3Classifier} tracks, in memory, the number of ham messages it
@@ -71,6 +119,17 @@
self.classifier.score(StringIO("spamfulness words of spam")) > 0.99)
+    def test_spamClassificationWithoutCache(self):
+        """
+        Like L{test_spamClassification}, but ensure no instance cache is used
+        to satisfy word info lookups.
+        """
+        self.classifier.train(StringIO("spam words of spamfulness"), True)
+        classifier = Hammie(spam._SQLite3Classifier(self.path), mode='r')
+        self.assertTrue(
+            classifier.score(StringIO("spamfulness words of spam")) > 0.99)
+
+
    def test_hamClassification(self):
        """
        L{SQLite3Classifier} can be trained with a ham message so as to later
@@ -81,6 +140,17 @@
self.classifier.score(StringIO("words, very nice")) < 0.01)
+    def test_hamClassificationWithoutCache(self):
+        """
+        Like L{test_hamClassification}, but ensure no instance cache is used
+        to satisfy word info lookups.
+        """
+        self.classifier.train(StringIO("very nice words"), False)
+        classifier = Hammie(spam._SQLite3Classifier(self.path), mode='r')
+        self.assertTrue(
+            classifier.score(StringIO("words, very nice")) < 0.01)
+
+
class SpambayesFilterTestCase(unittest.TestCase, MessageCreationMixin):
"""