

[Merge] lp:~flacoste/launchpad/ppr-constant-memory into lp:launchpad/devel

 

Francis J. Lacoste has proposed merging lp:~flacoste/launchpad/ppr-constant-memory into lp:launchpad/devel.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)


This is my second iteration on making the page-performance-report use less
memory.

This time around, I managed to get it to run in constant memory.

I dropped numpy and now compute all statistics using SQL.
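
For context, here is a minimal sketch of the idea, not the actual
implementation: SQLite does the grouping and aggregation, and Python only
post-processes. The real get_times() in the diff below computes the variance
for every group with a single self-join instead of one query per key, and
also derives medians and histograms in SQL; the toy data here is illustrative.

    import math
    import sqlite3

    con = sqlite3.connect(':memory:')
    con.execute('CREATE TABLE request (pageid TEXT, time REAL)')
    con.executemany(
        'INSERT INTO request VALUES (?, ?)',
        [('+bug', 1.5), ('+bug', 2.5), ('+root', 0.5)])

    # count/sum/avg come straight from SQL aggregates, grouped by key.
    stats = con.execute(
        'SELECT pageid, count(time), sum(time), avg(time) '
        'FROM request GROUP BY pageid').fetchall()

    results = {}
    for pageid, n, total, mean in stats:
        # Variance is the mean of the squared differences to the mean.
        # SQLite has no power() or sqrt(), so the square is written as a
        # product and the square root is taken in Python.
        (sq_diff,) = con.execute(
            'SELECT sum((time - ?) * (time - ?)) '
            'FROM request WHERE pageid = ?',
            (mean, mean, pageid)).fetchone()
        results[pageid] = (n, total, mean, math.sqrt(sq_diff / n))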

I also added some automated tests. They are very coarse: they check that the
stats computed for a fixed set of requests match expected results that were
hand-calculated in a spreadsheet.

Another drive-by change is that I increased the cache_size on Robert's
suggestion. Local testing seems to show that it makes only a small difference
(the main effect is a reduction in sys time, which isn't the bulk of the
total).
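
For reference, the whole tuning amounts to a few pragmas on the connection
(the filename below is illustrative). cache_size is counted in pages, and
with SQLite's default 1 KB page size 400000 pages is roughly the ~400M noted
in the diff:

    import sqlite3

    con = sqlite3.connect('ppr-requests.db', isolation_level='EXCLUSIVE')
    # ~400 MB of page cache (the value is in pages, ~1 KB each by default).
    con.execute('PRAGMA cache_size = 400000')
    # The database is a throwaway scratch file, so durability can be traded
    # away for speed.
    con.execute('PRAGMA synchronous = off')
    con.execute('PRAGMA journal_mode = off')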

The whole report now takes at least twice as long to generate as it did with
the previous algorithm.

The previous version took ~1m52 to generate a report for 300k records; the
new one takes ~4m. But the big advantage is that it runs in a constant amount
of memory, which means we'll be able to generate weekly and monthly reports
(once we have more space on sodium; space is currently too tight to generate
them).




-- 
https://code.launchpad.net/~flacoste/launchpad/ppr-constant-memory/+merge/39578
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~flacoste/launchpad/ppr-constant-memory into lp:launchpad/devel.
=== modified file 'lib/lp/scripts/utilities/pageperformancereport.py'
--- lib/lp/scripts/utilities/pageperformancereport.py	2010-10-25 21:47:16 +0000
+++ lib/lp/scripts/utilities/pageperformancereport.py	2010-10-28 21:34:57 +0000
@@ -10,15 +10,15 @@
 from ConfigParser import RawConfigParser
 from datetime import datetime
 import os.path
+import math
 import re
 import subprocess
 from textwrap import dedent
 import sqlite3
 import tempfile
+import textwrap
 import time
-import warnings
 
-import numpy
 import simplejson as json
 import sre_constants
 import zc.zservertracelog.tracereport
@@ -27,9 +27,6 @@
 from canonical.launchpad.scripts.logger import log
 from lp.scripts.helpers import LPOptionParser
 
-# We don't care about conversion to nan, they are expected.
-warnings.filterwarnings(
-    'ignore', '.*converting a masked element to nan.', UserWarning)
 
 class Request(zc.zservertracelog.tracereport.Request):
     url = None
@@ -82,7 +79,6 @@
     mean = 0 # Mean time per hit.
     median = 0 # Median time per hit.
     std = 0 # Standard deviation per hit.
-    ninetyninth_percentile_time = 0
     histogram = None # # Request times histogram.
 
     total_sqltime = 0 # Total time spent waiting for SQL to process.
@@ -95,51 +91,38 @@
     median_sqlstatements = 0
     std_sqlstatements = 0
 
-    def __init__(self, times, timeout):
-        """Compute the stats based on times.
-
-        Times is a list of (app_time, sql_statements, sql_times).
-
-        The histogram is a list of request counts per 1 second bucket.
-        ie. histogram[0] contains the number of requests taking between 0 and
-        1 second, histogram[1] contains the number of requests taking between
-        1 and 2 seconds etc. histogram is None if there are no requests in
-        this Category.
+    @property
+    def ninetyninth_percentile_time(self):
+        """Time under which 99% of requests are rendered.
+
+        This is estimated as 3 std deviations from the mean. Given that
+        in a daily report, many URLs or PageIds won't have 100 requests, it's
+        more useful to use this estimator.
         """
-        if not times:
-            return
-
-        self.total_hits = len(times)
-
-        # Ignore missing values (-1) in computation.
-        times_array = numpy.ma.masked_values(
-            numpy.asarray(times, dtype=numpy.float32), -1.)
-
-        self.total_time, self.total_sqlstatements, self.total_sqltime = (
-            times_array.sum(axis=0))
-
-        self.mean, self.mean_sqlstatements, self.mean_sqltime = (
-            times_array.mean(axis=0))
-
-        self.median, self.median_sqlstatements, self.median_sqltime = (
-            numpy.median(times_array, axis=0))
-
-        self.std, self.std_sqlstatements, self.std_sqltime = (
-            numpy.std(times_array, axis=0))
-
-        # This is an approximation which may not be true: we don't know if we
-        # have a std distribution or not. We could just find the 99th
-        # percentile by counting. Shock. Horror; however this appears pretty
-        # good based on eyeballing things so far - once we're down in the 2-3
-        # second range for everything we may want to revisit.
-        self.ninetyninth_percentile_time = self.mean + self.std*3
-
-        histogram_width = int(timeout*1.5)
-        histogram_times = numpy.clip(times_array[:,0], 0, histogram_width)
-        histogram = numpy.histogram(
-            histogram_times, normed=True, range=(0, histogram_width),
-            bins=histogram_width)
-        self.histogram = zip(histogram[1], histogram[0])
+        return self.mean + 3*self.std
+
+    @property
+    def relative_histogram(self):
+        """Return an histogram where the frequency is relative."""
+        if self.histogram:
+            return [[x, float(f)/self.total_hits] for x, f in self.histogram]
+        else:
+            return None
+
+
+    def text(self):
+        """Return a textual version of the stats."""
+        return textwrap.dedent("""
+        <Stats for %d requests:
+            Time:     total=%.2f; mean=%.2f; median=%.2f; std=%.2f
+            SQL time: total=%.2f; mean=%.2f; median=%.2f; std=%.2f
+            SQL stmt: total=%.f;  mean=%.2f; median=%.f; std=%.2f
+            >""" % (
+                self.total_hits, self.total_time, self.mean, self.median,
+                self.std, self.total_sqltime, self.mean_sqltime,
+                self.median_sqltime, self.std_sqltime,
+                self.total_sqlstatements, self.mean_sqlstatements,
+                self.median_sqlstatements, self.std_sqlstatements))
 
 
 class SQLiteRequestTimes:
@@ -154,12 +137,15 @@
         self.con = sqlite3.connect(self.filename, isolation_level='EXCLUSIVE')
         log.debug('Using request database %s' % self.filename)
         # Some speed optimization.
+        self.con.execute('PRAGMA cache_size = 400000') # ~400M
         self.con.execute('PRAGMA synchronous = off')
         self.con.execute('PRAGMA journal_mode = off')
 
         self.categories = categories
         self.store_all_request = options.pageids or options.top_urls
-        self.timeout = options.timeout
+
+        # Histogram has a bin per second up to 1.5 our timeout.
+        self.histogram_width = int(options.timeout*1.5)
         self.cur = self.con.cursor()
 
         # Create the tables, ignore errors about them being already present.
@@ -193,17 +179,16 @@
                 else:
                     raise
 
+        # A table that will be used in a join to compute histogram.
+        self.cur.execute("CREATE TEMP TABLE histogram (bin INT)")
+        for x in range(self.histogram_width):
+            self.cur.execute('INSERT INTO histogram VALUES (?)', (x,))
+
     def add_request(self, request):
         """Add a request to the cache."""
         sql_statements = request.sql_statements
         sql_seconds = request.sql_seconds
 
-        # Store missing value as -1, as it makes dealing with those
-        # easier with numpy.
-        if sql_statements is None:
-            sql_statements = -1
-        if sql_seconds is None:
-            sql_seconds = -1
         for idx, category in enumerate(self.categories):
             if category.match(request):
                 self.con.execute(
@@ -213,7 +198,7 @@
         if self.store_all_request:
             pageid = request.pageid or 'Unknown'
             self.con.execute(
-                "INSERT INTO request VALUES (?,?,?,?,?)", 
+                "INSERT INTO request VALUES (?,?,?,?,?)",
                 (pageid, request.url, request.app_seconds, sql_statements,
                     sql_seconds))
 
@@ -223,72 +208,150 @@
 
     def get_category_times(self):
         """Return the times for each category."""
-        category_query = 'SELECT * FROM category_request ORDER BY category'
-
-        empty_stats = Stats([], 0)
-        categories = dict(self.get_times(category_query))
+
+        times = self.get_times('category_request', 'category')
+
         return [
-            (category, categories.get(idx, empty_stats))
-            for idx, category in enumerate(self.categories)]
+            (category, times.get(i, Stats()))
+            for i, category in enumerate(self.categories)]
+
+    def get_times(self, table, column):
+        """Return the stats for unique value of table.column"""
+
+        times = {}
+        median_idx_by_key = {}
+        replacements = dict(table=table, column=column)
+
+        # Compute count, total and average.
+        self.cur.execute('''
+            CREATE TEMPORARY TABLE IF NOT EXISTS {table}_stats AS
+            SELECT {column},
+                count(time) AS time_n, sum(time) AS total_time,
+                avg(time) AS mean_time,
+                count(sql_time) AS sqltime_n, sum(sql_time) AS total_sqltime,
+                avg(sql_time) AS mean_sqltime,
+                count(sql_statements) AS sqlstatements_n,
+                sum(sql_statements) AS total_sqlstatements,
+                avg(sql_statements) AS mean_sqlstatements
+            FROM {table}
+            GROUP BY {column}
+            '''.format(**replacements))
+        self.cur.execute('''
+            SELECT {column}, time_n, total_time, mean_time,
+                   sqltime_n,
+                   coalesce(total_sqltime, 0), coalesce(mean_sqltime, 0),
+                   sqlstatements_n, coalesce(total_sqlstatements, 0),
+                   coalesce(mean_sqlstatements, 0)
+              FROM {table}_stats
+              '''.format(**replacements))
+        for row in self.cur.fetchall():
+            stats = times.setdefault(row[0], Stats())
+            (stats.total_hits, stats.total_time, stats.mean,
+                sqltime_n, stats.total_sqltime, stats.mean_sqltime,
+                sqlstatements_n, stats.total_sqlstatements,
+                stats.mean_sqlstatements) = row[1:]
+            # Store the index of the median for each field.
+            median_idx = median_idx_by_key.setdefault(row[0], {})
+            median_idx['time'] = int((stats.total_hits-1)/2)
+            median_idx['sql_time'] = int((sqltime_n-1)/2)
+            median_idx['sql_statements'] = int((sqlstatements_n-1)/2)
+
+        # Compute std deviation.
+        # The variance is the average of the sum of the square difference to
+        # the mean.
+        # The standard deviation is the square-root of the variance.
+        # sqlite doesn't support ** or POWER so we expand the expression.
+        # For the same reason, we do the square root in python.
+        self.cur.execute('''
+            SELECT {table}_stats.{column},
+                time_square_diff/time_n AS var_time,
+                coalesce(sqltime_square_diff/sqltime_n, 0) AS var_sqltime,
+                coalesce(sqlstatements_square_diff/sqlstatements_n, 0)
+                    AS var_sqlstatements
+            FROM {table}_stats JOIN (
+                SELECT {table}.{column},
+                    sum((time-mean_time)*(time-mean_time))
+                        AS time_square_diff,
+                    sum((sql_time-mean_sqltime)*(sql_time-mean_sqltime))
+                        AS sqltime_square_diff,
+                    sum((sql_statements-mean_sqlstatements)*
+                        (sql_statements-mean_sqlstatements))
+                        AS sqlstatements_square_diff
+                  FROM {table} JOIN {table}_stats
+                    ON ({table}.{column}= {table}_stats.{column})
+               GROUP BY {table}.{column}
+                ) AS {table}_square_diff ON (
+                    {table}_stats.{column} = {table}_square_diff.{column})
+                '''.format(**replacements))
+        for row in self.cur.fetchall():
+            stats = times[row[0]]
+            (stats.std, stats.std_sqltime, stats.std_sqlstatements) = [
+                math.sqrt(x) for x in row[1:]]
+
+        # Compute the median.
+        for field, median_attribute in [
+                ('time', 'median'),
+                ('sql_time', 'median_sqltime'),
+                ('sql_statements', 'median_sqlstatements')]:
+            self.cur.execute('''
+                SELECT {column}, {field} FROM {table}
+                WHERE {field} IS NOT NULL
+              ORDER BY {column}, {field}
+                      '''.format(field=field, **replacements))
+            idx = 0
+            current_key = None
+            for key, value in self.cur.fetchall():
+                if key != current_key:
+                    idx = 0
+                    median_idx = median_idx_by_key[key][field]
+                    current_key = key
+                if idx == median_idx:
+                    stats = times[key]
+                    setattr(stats, median_attribute, value)
+                idx += 1
+
+        # Compute the histogram of requests.
+        self.cur.execute('''
+            SELECT {column}, bin, count(time)
+              FROM {table} JOIN histogram ON (
+                histogram.bin = CAST (min(time, {last_bin_index}) AS INTEGER))
+              GROUP BY {column}, bin
+              '''.format(
+                      last_bin_index=(self.histogram_width-1),
+                      **replacements))
+        for key, bin, n in self.cur.fetchall():
+            stats = times[key]
+            if stats.histogram is None:
+                # Create an empty histogram.
+                stats.histogram = [
+                    [x, 0] for x in range(self.histogram_width)]
+            stats.histogram[bin][1] = n
+
+        return times
 
     def get_top_urls_times(self, top_n):
         """Return the times for the Top URL by total time"""
-        top_url_query = '''
-            SELECT url, time, sql_statements, sql_time
-            FROM request WHERE url IN (
-                SELECT url FROM (SELECT url, sum(time) FROM request
-                    GROUP BY url
-                    ORDER BY sum(time) DESC
-                    LIMIT %d))
-            ORDER BY url
-        ''' % top_n
+        # Get the requests from the top N urls by total time.
+        self.cur.execute('''
+            CREATE TEMPORARY TABLE IF NOT EXISTS top_n_url_request AS
+            SELECT request.url, time, sql_statements, sql_time
+            FROM request JOIN (
+                SELECT url, sum(time) FROM request
+                GROUP BY url
+                ORDER BY sum(time) DESC
+                LIMIT %d) AS top_n_url
+                ON (request.url = top_n_url.url)
+        ''' % top_n)
         # Sort the result by total time
         return sorted(
-            self.get_times(top_url_query), key=lambda x: x[1].total_time,
-            reverse=True)
+            self.get_times('top_n_url_request', 'url').items(),
+            key=lambda x: x[1].total_time, reverse=True)
 
     def get_pageid_times(self):
         """Return the times for the pageids."""
-        pageid_query = '''
-            SELECT pageid, time, sql_statements, sql_time
-            FROM request
-            ORDER BY pageid
-        '''
-        return self.get_times(pageid_query)
-
-    def get_times(self, query):
-        """Return a list of key, stats based on the query.
-
-        The query should return rows of the form:
-            [key, app_time, sql_statements, sql_times]
-
-        And should be sorted on key.
-        """
-        times = []
-        current_key = None
-        results = []
-        self.cur.execute(query)
-        while True:
-            rows = self.cur.fetchmany()
-            if len(rows) == 0:
-                break
-            for row in rows:
-                # We are encountering a new group...
-                if row[0] != current_key:
-                    # Compute the stats of the previous group
-                    if current_key != None:
-                        results.append(
-                            (current_key, Stats(times, self.timeout)))
-                    # Initialize the new group.
-                    current_key = row[0]
-                    times = []
-
-                times.append(row[1:])
-        # Compute the stats of the last group
-        if current_key != None:
-            results.append((current_key, Stats(times, self.timeout)))
-
-        return results
+        # Sort the result by pageid
+        return sorted(
+            self.get_times('request', 'pageid').items())
 
     def close(self, remove=False):
         """Close the SQLite connection.
@@ -684,7 +747,7 @@
     histograms = []
 
     def handle_times(html_title, stats):
-        histograms.append(stats.histogram)
+        histograms.append(stats.relative_histogram)
         print >> outf, dedent("""\
             <tr>
             <th class="category-title">%s</th>

=== added file 'lib/lp/scripts/utilities/tests/test_pageperformancereport.py'
--- lib/lp/scripts/utilities/tests/test_pageperformancereport.py	1970-01-01 00:00:00 +0000
+++ lib/lp/scripts/utilities/tests/test_pageperformancereport.py	2010-10-28 21:34:57 +0000
@@ -0,0 +1,232 @@
+# Copyright 2010 Canonical Ltd.  This software is licensed under the
+# GNU Affero General Public License version 3 (see the file LICENSE).
+
+"""Test the pageperformancereport script."""
+
+__metaclass__ = type
+
+import unittest
+
+from lp.testing import TestCase
+
+from lp.scripts.utilities.pageperformancereport import (
+    Category,
+    SQLiteRequestTimes,
+    Stats,
+    )
+
+class FakeOptions:
+    timeout = 4
+    db_file = None
+    pageids = True
+    top_urls = True
+
+    def __init__(self, **kwargs):
+        """Assign all arguments as attributes."""
+        self.__dict__.update(kwargs)
+
+class FakeRequest:
+    def __init__(self, url, app_seconds, sql_statements=None, 
+                 sql_seconds=None, pageid=None):
+        self.url = url
+        self.pageid = pageid
+        self.app_seconds = app_seconds
+        self.sql_statements = sql_statements
+        self.sql_seconds = sql_seconds
+
+
+class FakeStats(Stats):
+    def __init__(self, **kwargs):
+        # Override the constructor to just store the values.
+        self.__dict__.update(kwargs)
+
+FAKE_REQUESTS = [
+    FakeRequest('/', 0.5, pageid='+root'),
+    FakeRequest('/bugs', 4.5, 56, 3.0, pageid='+bugs'),
+    FakeRequest('/bugs', 4.2, 56, 2.2, pageid='+bugs'),
+    FakeRequest('/bugs', 5.5, 76, 4.0, pageid='+bugs'),
+    FakeRequest('/ubuntu', 2.5, 6, 2.0, pageid='+distribution'),
+    FakeRequest('/launchpad', 3.5, 3, 3.0, pageid='+project'),
+    FakeRequest('/bzr', 2.5, 4, 2.0, pageid='+project'),
+    FakeRequest('/bugs/1', 20.5, 567, 14.0, pageid='+bug'),
+    FakeRequest('/bugs/1', 15.5, 567, 9.0, pageid='+bug'),
+    FakeRequest('/bugs/5', 1.5, 30, 1.2, pageid='+bug'),
+    FakeRequest('/lazr', 1.0, 16, 0.3, pageid='+project'),
+    FakeRequest('/drizzle', 0.9, 11, 1.3, pageid='+project'),
+    ]
+
+
+# The category stats computed for the above 12 requests.
+CATEGORY_STATS = [
+    (Category('All', ''), FakeStats(
+        total_hits=12, total_time=62.60, mean=5.22, median=2.5, std=5.99,
+        total_sqltime=42, mean_sqltime=3.82, median_sqltime=2.2,
+        std_sqltime=3.89,
+        total_sqlstatements=1392, mean_sqlstatements=126.55,
+        median_sqlstatements=30, std_sqlstatements=208.94,
+        histogram=[[0, 2], [1, 2], [2, 2], [3, 1], [4, 2], [5, 3]],
+        )),
+    (Category('Test', ''), FakeStats()),
+    (Category('Bugs', ''), FakeStats(
+        total_hits=6, total_time=51.70, mean=8.62, median=4.5, std=6.90,
+        total_sqltime=33.40, mean_sqltime=5.57, median_sqltime=3,
+        std_sqltime=4.52,
+        total_sqlstatements=1352, mean_sqlstatements=225.33,
+        median_sqlstatements=56, std_sqlstatements=241.96,
+        histogram=[[0, 0], [1, 1], [2, 0], [3, 0], [4, 2], [5, 3]],
+        )),
+    ]
+
+
+# The top 3 URL stats computed for the above 12 requests.
+TOP_3_URL_STATS = [
+    ('/bugs/1', FakeStats(
+        total_hits=2, total_time=36.0, mean=18.0, median=15.5, std=2.50,
+        total_sqltime=23.0, mean_sqltime=11.5, median_sqltime=9.0,
+        std_sqltime=2.50,
+        total_sqlstatements=1134, mean_sqlstatements=567.0,
+        median_sqlstatements=567, std_statements=0,
+        histogram=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [5, 2]],
+        )),
+    ('/bugs', FakeStats(
+        total_hits=3, total_time=14.2, mean=4.73, median=4.5, std=0.56,
+        total_sqltime=9.2, mean_sqltime=3.07, median_sqltime=3,
+        std_sqltime=0.74,
+        total_sqlstatements=188, mean_sqlstatements=62.67,
+        median_sqlstatements=56, std_sqlstatements=9.43,
+        histogram=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 2], [5, 1]],
+        )),
+    ('/launchpad', FakeStats(
+        total_hits=1, total_time=3.5, mean=3.5, median=3.5, std=0,
+        total_sqltime=3.0, mean_sqltime=3, median_sqltime=3, std_sqltime=0,
+        total_sqlstatements=3, mean_sqlstatements=3,
+        median_sqlstatements=3, std_sqlstatements=0,
+        histogram=[[0, 0], [1, 0], [2, 0], [3, 1], [4, 0], [5, 0]],
+        )),
+    ]
+
+
+# The pageid stats computed for the above 12 requests.
+PAGEID_STATS = [
+    ('+bug', FakeStats(
+        total_hits=3, total_time=37.5, mean=12.5, median=15.5, std=8.04,
+        total_sqltime=24.2, mean_sqltime=8.07, median_sqltime=9,
+        std_sqltime=5.27,
+        total_sqlstatements=1164, mean_sqlstatements=388,
+        median_sqlstatements=567, std_sqlstatements=253.14,
+        histogram=[[0, 0], [1, 1], [2, 0], [3, 0], [4, 0], [5, 2]],
+        )),
+    ('+bugs', FakeStats(
+        total_hits=3, total_time=14.2, mean=4.73, median=4.5, std=0.56,
+        total_sqltime=9.2, mean_sqltime=3.07, median_sqltime=3,
+        std_sqltime=0.74,
+        total_sqlstatements=188, mean_sqlstatements=62.67,
+        median_sqlstatements=56, std_sqlstatements=9.43,
+        histogram=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 2], [5, 1]],
+        )),
+    ('+distribution', FakeStats(
+        total_hits=1, total_time=2.5, mean=2.5, median=2.5, std=0,
+        total_sqltime=2.0, mean_sqltime=2, median_sqltime=2, std_sqltime=0,
+        total_sqlstatements=6, mean_sqlstatements=6,
+        median_sqlstatements=6, std_sqlstatements=0,
+        histogram=[[0, 0], [1, 0], [2, 1], [3, 0], [4, 0], [5, 0]],
+        )),
+    ('+project', FakeStats(
+        total_hits=4, total_time=7.9, mean=1.98, median=1, std=1.08,
+        total_sqltime=6.6, mean_sqltime=1.65, median_sqltime=1.3,
+        std_sqltime=0.99,
+        total_sqlstatements=34, mean_sqlstatements=8.5,
+        median_sqlstatements=4, std_sqlstatements=5.32,
+        histogram=[[0, 1], [1, 1], [2, 1], [3, 1], [4, 0], [5, 0]],
+        )),
+    ('+root', FakeStats(
+        total_hits=1, total_time=0.5, mean=0.5, median=0.5, std=0,
+        histogram=[[0, 1], [1, 0], [2, 0], [3, 0], [4, 0], [5, 0]],
+        )),
+    ]
+
+class TestSQLiteTimes(TestCase):
+    """Tests the SQLiteTimes backend."""
+
+    def setUp(self):
+        TestCase.setUp(self)
+        self.categories = [
+            Category('All', '.*'), Category('Test', '.*test.*'),
+            Category('Bugs', '.*bugs.*')]
+        self.db = SQLiteRequestTimes(self.categories, FakeOptions())
+        self.addCleanup(self.db.close)
+
+    def test_histogram_table_is_created(self):
+        # A histogram table with one row by histogram bin should be
+        # present.
+        self.db.cur.execute('SELECT bin FROM histogram')
+        # Default timeout is 4.
+        self.assertEquals(
+            range(6), [row[0] for row in self.db.cur.fetchall()])
+
+    def test_add_report_null_missing_sql_fields(self):
+        # Ensure that missing sql_statements and sql_time values are
+        # inserted as NULL.
+        self.db.add_request(FakeRequest('/', 10.0))
+        # Request should be inserted into the All category (index 0)
+        # and the normal request table.
+        self.db.cur.execute(
+            '''SELECT sql_statements, sql_time 
+               FROM category_request WHERE category = 0''')
+        self.assertEquals([(None, None)], self.db.cur.fetchall())
+
+        self.db.cur.execute(
+            """SELECT sql_statements, sql_time
+               FROM request WHERE url = '/'""")
+        self.assertEquals([(None, None)], self.db.cur.fetchall())
+
+    def setUpRequests(self):
+        """Insert some requests into the db."""
+        for r in FAKE_REQUESTS:
+            self.db.add_request(r)
+
+    def assertStatsAreEquals(self, expected, results):
+        self.assertEquals(
+            len(expected), len(results), 'Wrong number of results')
+        for idx in range(len(results)):
+            self.assertEquals(expected[idx][0], results[idx][0],
+                "Wrong key for results %d" % idx)
+            key = results[idx][0]
+            self.assertEquals(expected[idx][1].text(), results[idx][1].text(),
+                "Wrong stats for results %d (%s)" % (idx, key))
+            self.assertEquals(
+                expected[idx][1].histogram, results[idx][1].histogram,
+                "Wrong histogram for results %d (%s)" % (idx, key))
+
+    def test_get_category_times(self):
+        self.setUpRequests()
+        category_times = self.db.get_category_times()
+        self.assertStatsAreEquals(CATEGORY_STATS, category_times)
+
+    def test_get_url_times(self):
+        self.setUpRequests()
+        url_times = self.db.get_top_urls_times(3)
+        self.assertStatsAreEquals(TOP_3_URL_STATS, url_times)
+
+    def test_get_pageid_times(self):
+        self.setUpRequests()
+        pageid_times = self.db.get_pageid_times()
+        self.assertStatsAreEquals(PAGEID_STATS, pageid_times)
+
+
+class TestStats(TestCase):
+    """Tests for the stats class."""
+
+    def test_relative_histogram(self):
+        # Test that relative histogram gives an histogram using
+        # relative frequency.
+        stats = Stats()
+        stats.total_hits = 100
+        stats.histogram = [[0, 50], [1, 10], [2, 33], [3, 0], [4, 0], [5, 7]]
+        self.assertEquals(
+            [[0, 0.5], [1, .1], [2, .33], [3, 0], [4, 0], [5, .07]],
+            stats.relative_histogram)
+
+
+def test_suite():
+    return unittest.TestLoader().loadTestsFromName(__name__)

=== modified file 'setup.py'
--- setup.py	2010-09-18 08:00:27 +0000
+++ setup.py	2010-10-28 21:34:57 +0000
@@ -51,7 +51,6 @@
         'meliae',
         'mercurial',
         'mocker',
-        'numpy',
         'oauth',
         'paramiko',
         'python-memcached',

=== modified file 'versions.cfg'
--- versions.cfg	2010-10-27 04:23:52 +0000
+++ versions.cfg	2010-10-28 21:34:57 +0000
@@ -45,7 +45,6 @@
 mercurial = 1.6.2
 mocker = 0.10.1
 mozrunner = 1.3.4
-numpy = 1.3.0
 oauth = 1.0
 paramiko = 1.7.4
 Paste = 1.7.2