← Back to team overview

divmod-dev team mailing list archive

[Merge] lp:~exarkun/divmod.org/spambayes-fewer-potatoes into lp:divmod.org

 

Jean-Paul Calderone has proposed merging lp:~exarkun/divmod.org/spambayes-fewer-potatoes into lp:divmod.org.

Requested reviews:
  Divmod-dev (divmod-dev)
Related bugs:
  Bug #1040874 in Quotient: "Database persistence for spambayes classifier data could be faster"
  https://bugs.launchpad.net/quotient/+bug/1040874

For more details, see:
https://code.launchpad.net/~exarkun/divmod.org/spambayes-fewer-potatoes/+merge/121094

Some speedups to the new spambayes database layer.  Benchmark results on the ticket (sorry, Launchpad, how do you work exactly?)

-- 
https://code.launchpad.net/~exarkun/divmod.org/spambayes-fewer-potatoes/+merge/121094
Your team Divmod-dev is requested to review the proposed merge of lp:~exarkun/divmod.org/spambayes-fewer-potatoes into lp:divmod.org.
=== added directory 'Quotient/benchmarks'
=== added file 'Quotient/benchmarks/spambayes'
--- Quotient/benchmarks/spambayes	1970-01-01 00:00:00 +0000
+++ Quotient/benchmarks/spambayes	2012-08-23 22:01:18 +0000
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+
+# Benchmark of Quotient spambayes filter, both training and classification.
+
+import sys, tempfile, random, time
+
+from xquotient.spam import _SQLite3Classifier
+
+words = list(open('/usr/share/dict/words', 'r'))
+
+TRAINING_FACTOR = 50
+MESSAGE_FACTOR = 500
+
+def adj(duration):
+   return duration / (TRAINING_FACTOR * MESSAGE_FACTOR) * 1000.0
+
+
+def main(argv):
+   prng = random.Random()
+   prng.seed(12345)
+   prng.shuffle(words)
+
+   classifier = _SQLite3Classifier(tempfile.mktemp())
+
+   before = time.time()
+   for i in range(TRAINING_FACTOR):
+      classifier.learn(words[i:i + MESSAGE_FACTOR], True)
+
+   for i in range(TRAINING_FACTOR, TRAINING_FACTOR * 2):
+      classifier.learn(words[i:i + MESSAGE_FACTOR], False)
+   after = time.time()
+
+   print 'Learning: %.2f ms/word' % (adj(after - before),)
+
+   before = time.time()
+   for i in range(TRAINING_FACTOR * 2):
+      classifier.spamprob(words[i:i + MESSAGE_FACTOR])
+   after = time.time()
+
+   print 'Guessing: %.2f ms/word' % (adj(after - before),)
+
+
+if __name__ == '__main__':
+   main(sys.argv)

=== modified file 'Quotient/xquotient/spam.py'
--- Quotient/xquotient/spam.py	2012-08-20 19:51:00 +0000
+++ Quotient/xquotient/spam.py	2012-08-23 22:01:18 +0000
@@ -382,6 +382,21 @@
         statements.  These are executed any time the classifier database is
         opened, with the expected failure which occurs any time the schema has
         already been initialized handled and disregarded.
+
+    @ivar _readCache: Word information that is already known, either because it
+        has already been read from the database once or because we wrote the
+        information to the database.  Keys are unicode tokens, values are
+        three-sequences of token, nspam, and nham counts.  This is used to hold
+        word info between two different Spambayes hooks, C{_getclues} and
+        C{_wordinfoget}.  The former has access to all tokens in a particular
+        document, the latter is a potato-programming mistake.  Loading all of
+        the values at once in C{_getclues} is a big performance win.
+
+    @ivar _writeCache: Word information that is on its way to the database due
+        to training.  This has the same shape as C{_readCache}.  Word info is
+        held here until training on one document is complete, then all the word
+        info is dumped into the database in a single SQL operation (via
+        I{executemany}).
     """
 
     SCHEMA = [
@@ -413,6 +428,7 @@
         return get, set, None, doc
     nspam = property(*nspam())
 
+
     def nham():
         doc = """
         A property which reflects the number of messages trained as ham, while
@@ -438,6 +454,9 @@
         """
         classifier.Classifier.__init__(self)
         self.databaseName = databaseName
+        self._readCache = {}
+        self._writeCache = {}
+
         # Open the database, possibly initializing it if it has not yet been
         # initialized, and then load the necessary global state from it (nspam,
         # nham).
@@ -484,6 +503,47 @@
             self.cursor.execute('UPDATE state SET nspam=?, nham=?', (self._nspam, self._nham))
 
 
+    def _getclues(self, wordstream):
+        """
+        Hook into the classification process to speed it up.
+
+        See the base implementation for details about what C{_getclues} is
+        supposed to do.  This implementation extends the base to look into
+        wordstream and load all the necessary information with the minimum
+        amount of SQLite3 work, then calls up to the base implementation to let
+        it do the actual classification-related work.
+
+        @param wordstream: An iterable (probably a generator) of tokens from the
+            document to be classified.
+        """
+        # Make sure we can consume it and give it to the base implementation for
+        # consumption.
+        wordstream = list(wordstream)
+
+        # Find all the tokens we don't have in memory already
+        missing = []
+        for word in wordstream:
+            if isinstance(word, str):
+                word = word.decode('utf-8', 'replace')
+            if word not in self._readCache:
+                missing.append(word)
+
+        # Load their state
+        self.cursor.execute(
+            "SELECT word, nspam, nham FROM bayes WHERE word IN (%s)" % (
+                ", ".join("?" * len(missing))),
+            missing)
+        rows = self.cursor.fetchall()
+
+        # Save them for later
+        for row in rows:
+            self._readCache[row[0]] = row
+
+        # Let the base class do its thing, which will involve asking us about
+        # that state we just cached.
+        return classifier.Classifier._getclues(self, wordstream)
+
+
     def _get(self, word):
         """
         Load the training data for the given word.
@@ -497,13 +557,22 @@
         """
         if isinstance(word, str):
             word = word.decode('utf-8', 'replace')
-
-        self.cursor.execute(
-            "SELECT word, nspam, nham FROM bayes WHERE word=?", (word,))
-        rows = self.cursor.fetchall()
-        if rows:
-            return rows[0]
-        return None
+        try:
+            # Check to see if we already have this word's info in memory.
+            row = self._readCache[word]
+        except KeyError:
+            # If not, load it from the database.
+            self.cursor.execute(
+                "SELECT word, nspam, nham FROM bayes WHERE word=?", (word,))
+            rows = self.cursor.fetchall()
+            if rows:
+                # Add it to the cache and return it.
+                self._readCache[rows[0][0]] = rows[0]
+                return rows[0]
+            return None
+        else:
+            # Otherwise return what we knew already.
+            return row
 
 
     def _set(self, word, nspam, nham):
@@ -519,10 +588,7 @@
         """
         if isinstance(word, str):
             word = word.decode('utf-8', 'replace')
-        self.cursor.execute(
-            "INSERT OR REPLACE INTO bayes (word, nspam, nham) "
-            "VALUES (?, ?, ?)",
-            (word, nspam, nham))
+        self._readCache[word] = self._writeCache[word] = (word, nspam, nham)
 
 
     def _delete(self, word):
@@ -532,10 +598,12 @@
         @param word: A word (or any other kind of token) to lose training
             information about.
         @type word: C{str} or C{unicode} (but really, C{unicode} please)
+
+        @raise NotImplementedError: Deletion is not actually supported in this
+            backend.  Fortunately, Quotient does not need it (it never calls
+            C{unlearn}).
         """
-        if isinstance(word, str):
-            word = word.decode('utf-8', 'replace')
-        self.cursor.execute("DELETE FROM bayes WHERE word=?", (word,))
+        raise NotImplementedError("There is no support for deletion.")
 
 
     def _post_training(self):
@@ -545,6 +613,11 @@
         transaction, which contains all of the database modifications for each
         token in that message.
         """
+        writes = self._writeCache.itervalues()
+        self._writeCache = {}
+        self.cursor.executemany(
+            "INSERT OR REPLACE INTO bayes (word, nspam, nham) "
+            "VALUES (?, ?, ?)", writes)
         self.db.commit()
 
 

=== modified file 'Quotient/xquotient/test/test_spambayes.py'
--- Quotient/xquotient/test/test_spambayes.py	2012-08-20 19:53:37 +0000
+++ Quotient/xquotient/test/test_spambayes.py	2012-08-23 22:01:18 +0000
@@ -42,6 +42,54 @@
         self.assertEqual(bayes.nspam, 1)
 
 
+    def test_spamTokenRecorded(self):
+        """
+        The first time a token is encountered during spam training, a row is
+        inserted into the database counting it as once a spam token, never a ham
+        token.
+        """
+        self.classifier.train(StringIO("spam bad gross"), True)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"spam")
+        self.assertEqual((u"spam", 1, 0), wordInfo)
+
+
+    def test_hamTokenRecorded(self):
+        """
+        The first time a token is encountered during ham training, a row is
+        inserted into the database counting it as never a spam token, once a ham
+        token.
+        """
+        self.classifier.train(StringIO("justice sunshine puppies"), False)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"sunshine")
+        self.assertEqual((u"sunshine", 0, 1), wordInfo)
+
+
+    def test_spamTokenIncremented(self):
+        """
+        Encountered on a subsequent spam training operation, an existing word
+        info row has its spam count incremented and its ham count left alone.
+        """
+        self.classifier.train(StringIO("justice sunshine puppies"), False)
+        self.classifier.train(StringIO("spam bad puppies"), True)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"puppies")
+        self.assertEqual((u"puppies", 1, 1), wordInfo)
+
+
+    def test_hamTokenIncremented(self):
+        """
+        Encountered on a subsequent ham training operation, an existing word
+        info row has its spam count left alone and its ham count incremented.
+        """
+        self.classifier.train(StringIO("spam bad puppies"), True)
+        self.classifier.train(StringIO("justice sunshine puppies"), False)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"puppies")
+        self.assertEqual((u"puppies", 1, 1), wordInfo)
+
+
     def test_nham(self):
         """
         L{SQLite3Classifier} tracks, in memory, the number of ham messages it
@@ -71,6 +119,17 @@
             self.classifier.score(StringIO("spamfulness words of spam")) > 0.99)
 
 
+    def test_spamClassificationWithoutCache(self):
+        """
+        Like L{test_spamClassification}, but ensure no instance cache is used to
+        satisfy word info lookups.
+        """
+        self.classifier.train(StringIO("spam words of spamfulness"), True)
+        classifier = Hammie(spam._SQLite3Classifier(self.path), mode='r')
+        self.assertTrue(
+            classifier.score(StringIO("spamfulness words of spam")) > 0.99)
+
+
     def test_hamClassification(self):
         """
         L{SQLite3Classifier} can be trained with a ham message so as to later
@@ -81,6 +140,17 @@
             self.classifier.score(StringIO("words, very nice")) < 0.01)
 
 
+    def test_hamClassificationWithoutCache(self):
+        """
+        Like L{test_hamClassification}, but ensure no instance cache is used to
+        satisfy word info lookups.
+        """
+        self.classifier.train(StringIO("very nice words"), False)
+        classifier = Hammie(spam._SQLite3Classifier(self.path), mode='r')
+        self.assertTrue(
+            classifier.score(StringIO("words, very nice")) < 0.01)
+
+
 
 class SpambayesFilterTestCase(unittest.TestCase, MessageCreationMixin):
     """


Follow ups