launchpad-reviewers team mailing list archive

[Merge] lp:~flacoste/launchpad/ppr-constant-memory into lp:launchpad/devel

 

Francis J. Lacoste has proposed merging lp:~flacoste/launchpad/ppr-constant-memory into lp:launchpad/devel.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)


This is my third (and hopefully final) round of refactoring of the page
performance report.

The second SQLite version, which ran in constant memory, was taking a long
time (>8 hours) to compute the report for 8M requests. It was also thrashing
the machine with a lot of IO.

After searching the net, I found a couple of ways to compute the stats we are
gathering on-line, as the data streams by, with minimal storage. Those are
the algorithms I'm implementing here.
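
For reference, this is the shape of the on-line mean/variance update I'm
using (the Welford / Knuth algorithm). It's only an illustrative sketch with
made-up names; the real code is the OnlineStatsCalculator class in the diff
below:

    import math

    class WelfordSketch:
        """On-line mean/variance in constant storage (illustrative only)."""

        def __init__(self):
            self.count = 0
            self.mean = 0.0
            self.m2 = 0.0  # Sum of squared differences from the running mean.

        def update(self, x):
            # Each value is seen exactly once; only three numbers are kept.
            self.count += 1
            delta = x - self.mean
            self.mean += delta / self.count
            self.m2 += delta * (x - self.mean)

        @property
        def variance(self):
            return self.m2 / self.count if self.count else 0.0

        @property
        def std(self):
            return math.sqrt(self.variance)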

A big change is that we are no longer computing the exact median, only an
approximation using a variation on the median-of-medians technique. That's
where the bulk of the memory is going: it took 900M on my local machine to
compute the report for 300K records. I'll see how much we use on devpad when
generating a daily report. If that's too much, I suggest we simply drop the
median computation. (It took 2m42s of user time to generate, which is way
better than both SQLite versions and within about 50s of the
keep-everything-in-memory version.)
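
To make the approximation concrete, here is a rough sketch of the bucketed
median-of-medians idea from the Cantone and Hofri paper. It simplifies the
final selection step compared to the real OnlineApproximateMedian class in
the diff, so treat it as an illustration of the memory behaviour rather than
the exact code:

    class ApproxMedianSketch:
        """Approximate median using small fixed-size buckets (illustrative)."""

        def __init__(self, bucket_size=9):
            self.bucket_size = bucket_size
            self.buckets = []  # buckets[i] holds medians promoted i times.

        def update(self, x):
            i = 0
            while True:
                if i == len(self.buckets):
                    self.buckets.append([])
                bucket = self.buckets[i]
                bucket.append(x)
                if len(bucket) < self.bucket_size:
                    return
                # The bucket is full: promote its median to the next level
                # and recycle the bucket.
                x = sorted(bucket)[self.bucket_size // 2]
                del bucket[:]
                i += 1

        @property
        def median(self):
            # Simplified final step: take the median of the surviving values.
            survivors = [x for bucket in self.buckets for x in bucket]
            if not survivors:
                return 0
            return sorted(survivors)[(len(survivors) - 1) // 2]

Only about bucket_size * log(n) values stay resident at any time, which is
what keeps the pass over millions of requests in roughly constant memory.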

Other changes:
- I switched to using the gzip and bz2 Python built-in modules instead of
  external processes. Hopefully this will get rid of some warnings that were
  happening in production about unclosed streams. (A short sketch follows
  after this list.)
- I added saving the 99th percentile and mean values for selected categories
  in cricket format. This will allow us to chart our progress using tuolumne.
  (An example of the output format is shown below.)
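
The compressed-log change simply opens the trace logs through the standard
library instead of piping through gunzip/bunzip2. A minimal sketch of the
idea (the file name and parse_line in the usage comment are made up):

    import bz2
    import gzip
    import os.path

    def open_trace_log(filename):
        """Open a possibly compressed trace log for reading."""
        ext = os.path.splitext(filename)[1]
        if ext == '.bz2':
            return bz2.BZ2File(filename, 'r')
        elif ext == '.gz':
            return gzip.GzipFile(filename, 'r')
        return open(filename, 'r')

    # for line in open_trace_log('launchpad-trace.log.gz'):
    #     parse_line(line)

The cricket metrics are written as simple name:value@timestamp lines (one
per metric, via a ':'-delimited csv writer), e.g. with made-up values:

    ppr_bugs_99:4.250000@1288368000
    ppr_bugs_mean:1.780000@1288368000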

-- 
https://code.launchpad.net/~flacoste/launchpad/ppr-constant-memory/+merge/39666
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~flacoste/launchpad/ppr-constant-memory into lp:launchpad/devel.
=== modified file 'lib/lp/scripts/utilities/pageperformancereport.py'
--- lib/lp/scripts/utilities/pageperformancereport.py	2010-10-25 21:47:16 +0000
+++ lib/lp/scripts/utilities/pageperformancereport.py	2010-10-29 22:11:28 +0000
@@ -6,19 +6,19 @@
 __metaclass__ = type
 __all__ = ['main']
 
+import bz2
 from cgi import escape as html_quote
 from ConfigParser import RawConfigParser
+import csv
 from datetime import datetime
+import gzip
+import math
 import os.path
 import re
-import subprocess
 from textwrap import dedent
-import sqlite3
-import tempfile
+import textwrap
 import time
-import warnings
 
-import numpy
 import simplejson as json
 import sre_constants
 import zc.zservertracelog.tracereport
@@ -27,9 +27,6 @@
 from canonical.launchpad.scripts.logger import log
 from lp.scripts.helpers import LPOptionParser
 
-# We don't care about conversion to nan, they are expected.
-warnings.filterwarnings(
-    'ignore', '.*converting a masked element to nan.', UserWarning)
 
 class Request(zc.zservertracelog.tracereport.Request):
     url = None
@@ -58,6 +55,7 @@
 
     Requests belong to a Category if the URL matches a regular expression.
     """
+
     def __init__(self, title, regexp):
         self.title = title
         self.regexp = regexp
@@ -71,8 +69,128 @@
         return cmp(self.title.lower(), other.title.lower())
 
 
+class OnlineStatsCalculator:
+    """Object that can compute count, sum, mean, variance and median.
+
+    It computes these value incrementally and using minimal storage
+    using the Welford / Knuth algorithm described at
+    http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
+    """
+
+    def __init__(self):
+        self.count = 0
+        self.sum = 0
+        self.M2 = 0.0 # Sum of square difference
+        self.mean = 0.0
+
+    def update(self, x):
+        """Incrementally update the stats when adding x to the set.
+
+        None values are ignored.
+        """
+        if x is None:
+            return
+        self.count += 1
+        self.sum += x
+        delta = x - self.mean
+        self.mean = float(self.sum)/self.count
+        self.M2 += delta*(x - self.mean)
+
+    @property
+    def variance(self):
+        """Return the population variance."""
+        if self.count == 0:
+            return 0
+        else:
+            return self.M2/self.count
+
+    @property
+    def std(self):
+        """Return the standard deviation."""
+        if self.count == 0:
+            return 0
+        else:
+            return math.sqrt(self.variance)
+
+
+class OnlineApproximateMedian:
+    """Approximate the median of a set of elements.
+
+    This implements a space-efficient algorithm which only sees each value
+    once. (It keeps on the order of B * log_B(n) elements in memory, where
+    B is the bucket size.)
+
+    It was described and analysed in
+    D. Cantone and M. Hofri,
+    "Analysis of An Approximate Median Selection Algorithm"
+    ftp://ftp.cs.wpi.edu/pub/techreports/pdf/06-17.pdf
+
+    This algorithm is similar to Tukey's median of medians technique. It
+    computes the median among groups of bucket_size values, and then the
+    median among those medians.
+    """
+
+    def __init__(self, bucket_size=9):
+        """Creates a new estimator.
+
+        It approximates the median by finding the median among each
+        successive bucket_size element. And then using these medians for other
+        round of selection.
+
+        The bucket size should be a low odd-integer.
+        """
+        self.count = 0
+        self.bucket_size = bucket_size
+        # Index of the median in a completed bucket.
+        self.median_idx = bucket_size/2
+        self.buckets = []
+
+    def update(self, x):
+        """Update with x."""
+        if x is None:
+            return
+
+        self.count += 1
+        i = 0
+        while True:
+            # Create bucket on demand.
+            if i == len(self.buckets):
+                self.buckets.append([])
+            bucket = self.buckets[i]
+            bucket.append(x)
+            if len(bucket) == self.bucket_size:
+                # Select the median in this bucket, and promote it.
+                x = sorted(bucket)[self.median_idx]
+                # Free the bucket for the next round.
+                del bucket[:]
+                i += 1
+                continue
+            else:
+                break
+
+    @property
+    def median(self):
+        """Return the median."""
+        if self.count == 0:
+            return 0
+
+        # Find the 'weighted' median by assigning a weight to each
+        # element proportional to how many rounds of selection it survived.
+        candidates = []
+        total_weight = 0
+        for i, bucket in enumerate(self.buckets):
+            weight = self.bucket_size ** i
+            for x in bucket:
+                total_weight += weight
+                candidates.append([weight, x])
+        # Make weight relative.
+        candidates = [
+            ((float(weight)/total_weight)*x, x)
+            for weight, x in candidates]
+        return sorted(candidates)[(len(candidates)-1)/2][1]
+
+
 class Stats:
-    """Bag to hold request statistics.
+    """Bag to hold and compute request statistics.
 
     All times are in seconds.
     """
@@ -82,7 +200,6 @@
     mean = 0 # Mean time per hit.
     median = 0 # Median time per hit.
     std = 0 # Standard deviation per hit.
-    ninetyninth_percentile_time = 0
     histogram = None # # Request times histogram.
 
     total_sqltime = 0 # Total time spent waiting for SQL to process.
@@ -95,212 +212,175 @@
     median_sqlstatements = 0
     std_sqlstatements = 0
 
-    def __init__(self, times, timeout):
-        """Compute the stats based on times.
-
-        Times is a list of (app_time, sql_statements, sql_times).
-
-        The histogram is a list of request counts per 1 second bucket.
-        ie. histogram[0] contains the number of requests taking between 0 and
-        1 second, histogram[1] contains the number of requests taking between
-        1 and 2 seconds etc. histogram is None if there are no requests in
-        this Category.
+    @property
+    def ninetyninth_percentile_time(self):
+        """Time under which 99% of requests are rendered.
+
+        This is estimated as 3 std deviations from the mean. Given that
+        in a daily report, many URLs or PageIds won't have 100 requests, it's
+        more useful to use this estimator.
         """
-        if not times:
-            return
-
-        self.total_hits = len(times)
-
-        # Ignore missing values (-1) in computation.
-        times_array = numpy.ma.masked_values(
-            numpy.asarray(times, dtype=numpy.float32), -1.)
-
-        self.total_time, self.total_sqlstatements, self.total_sqltime = (
-            times_array.sum(axis=0))
-
-        self.mean, self.mean_sqlstatements, self.mean_sqltime = (
-            times_array.mean(axis=0))
-
-        self.median, self.median_sqlstatements, self.median_sqltime = (
-            numpy.median(times_array, axis=0))
-
-        self.std, self.std_sqlstatements, self.std_sqltime = (
-            numpy.std(times_array, axis=0))
-
-        # This is an approximation which may not be true: we don't know if we
-        # have a std distribution or not. We could just find the 99th
-        # percentile by counting. Shock. Horror; however this appears pretty
-        # good based on eyeballing things so far - once we're down in the 2-3
-        # second range for everything we may want to revisit.
-        self.ninetyninth_percentile_time = self.mean + self.std*3
-
-        histogram_width = int(timeout*1.5)
-        histogram_times = numpy.clip(times_array[:,0], 0, histogram_width)
-        histogram = numpy.histogram(
-            histogram_times, normed=True, range=(0, histogram_width),
-            bins=histogram_width)
-        self.histogram = zip(histogram[1], histogram[0])
-
-
-class SQLiteRequestTimes:
-    """SQLite-based request times computation."""
+        return self.mean + 3*self.std
+
+    @property
+    def relative_histogram(self):
+        """Return an histogram where the frequency is relative."""
+        if self.histogram:
+            return [[x, float(f)/self.total_hits] for x, f in self.histogram]
+        else:
+            return None
+
+    def text(self):
+        """Return a textual version of the stats."""
+        return textwrap.dedent("""
+        <Stats for %d requests:
+            Time:     total=%.2f; mean=%.2f; median=%.2f; std=%.2f
+            SQL time: total=%.2f; mean=%.2f; median=%.2f; std=%.2f
+            SQL stmt: total=%.f;  mean=%.2f; median=%.f; std=%.2f
+            >""" % (
+                self.total_hits, self.total_time, self.mean, self.median,
+                self.std, self.total_sqltime, self.mean_sqltime,
+                self.median_sqltime, self.std_sqltime,
+                self.total_sqlstatements, self.mean_sqlstatements,
+                self.median_sqlstatements, self.std_sqlstatements))
+
+
+class OnlineStats(Stats):
+    """Implementation of stats that can be computed online.
+
+    You call update() for each request and the stats are updated incrementally
+    with minimum storage space.
+    """
+
+    def __init__(self, histogram_width):
+        self.time_stats = OnlineStatsCalculator()
+        self.time_median_approximate = OnlineApproximateMedian()
+        self.sql_time_stats = OnlineStatsCalculator()
+        self.sql_time_median_approximate = OnlineApproximateMedian()
+        self.sql_statements_stats = OnlineStatsCalculator()
+        self.sql_statements_median_approximate = OnlineApproximateMedian()
+        self._histogram = [
+            [x, 0] for x in range(histogram_width)]
+
+    @property
+    def total_hits(self):
+        return self.time_stats.count
+
+    @property
+    def total_time(self):
+        return self.time_stats.sum
+
+    @property
+    def mean(self):
+        return self.time_stats.mean
+
+    @property
+    def median(self):
+        return self.time_median_approximate.median
+
+    @property
+    def std(self):
+        return self.time_stats.std
+
+    @property
+    def total_sqltime(self):
+        return self.sql_time_stats.sum
+
+    @property
+    def mean_sqltime(self):
+        return self.sql_time_stats.mean
+
+    @property
+    def median_sqltime(self):
+        return self.sql_time_median_approximate.median
+
+    @property
+    def std_sqltime(self):
+        return self.sql_time_stats.std
+
+    @property
+    def total_sqlstatements(self):
+        return self.sql_statements_stats.sum
+
+    @property
+    def mean_sqlstatements(self):
+        return self.sql_statements_stats.mean
+
+    @property
+    def median_sqlstatements(self):
+        return self.sql_statements_median_approximate.median
+
+    @property
+    def std_sqlstatements(self):
+        return self.sql_statements_stats.std
+
+    @property
+    def histogram(self):
+        if self.time_stats.count:
+            return self._histogram
+        else:
+            return None
+
+    def update(self, request):
+        """Update the stats based on request."""
+        self.time_stats.update(request.app_seconds)
+        self.time_median_approximate.update(request.app_seconds)
+        self.sql_time_stats.update(request.sql_seconds)
+        self.sql_time_median_approximate.update(request.sql_seconds)
+        self.sql_statements_stats.update(request.sql_statements)
+        self.sql_statements_median_approximate.update(request.sql_statements)
+
+        idx = int(min(len(self.histogram)-1, request.app_seconds))
+        self.histogram[idx][1] += 1
+
+
+class RequestTimes:
+    """Collect the """
 
     def __init__(self, categories, options):
-        if options.db_file is None:
-            fd, self.filename = tempfile.mkstemp(suffix='.db', prefix='ppr')
-            os.close(fd)
-        else:
-            self.filename = options.db_file
-        self.con = sqlite3.connect(self.filename, isolation_level='EXCLUSIVE')
-        log.debug('Using request database %s' % self.filename)
-        # Some speed optimization.
-        self.con.execute('PRAGMA synchronous = off')
-        self.con.execute('PRAGMA journal_mode = off')
-
-        self.categories = categories
-        self.store_all_request = options.pageids or options.top_urls
-        self.timeout = options.timeout
-        self.cur = self.con.cursor()
-
-        # Create the tables, ignore errors about them being already present.
-        try:
-            self.cur.execute('''
-                CREATE TABLE category_request (
-                    category INTEGER,
-                    time REAL,
-                    sql_statements INTEGER,
-                    sql_time REAL)
-                    ''');
-        except sqlite3.OperationalError, e:
-            if 'already exists' in str(e):
-                pass
-            else:
-                raise
-
-        if self.store_all_request:
-            try:
-                self.cur.execute('''
-                    CREATE TABLE request (
-                        pageid TEXT,
-                        url TEXT,
-                        time REAL,
-                        sql_statements INTEGER,
-                        sql_time REAL)
-                        ''');
-            except sqlite3.OperationalError, e:
-                if 'already exists' in str(e):
-                    pass
-                else:
-                    raise
+        self.by_pageids = options.pageids
+        self.by_urls = options.top_urls
+
+        # Histogram has a bin per second up to 1.5 times our timeout.
+        self.histogram_width = int(options.timeout*1.5)
+        self.category_times = [
+            (category, OnlineStats(self.histogram_width))
+            for category in categories]
+        self.url_times = {}
+        self.pageid_times = {}
 
     def add_request(self, request):
-        """Add a request to the cache."""
-        sql_statements = request.sql_statements
-        sql_seconds = request.sql_seconds
-
-        # Store missing value as -1, as it makes dealing with those
-        # easier with numpy.
-        if sql_statements is None:
-            sql_statements = -1
-        if sql_seconds is None:
-            sql_seconds = -1
-        for idx, category in enumerate(self.categories):
+        """Add a request to the ."""
+        for category, stats in self.category_times:
             if category.match(request):
-                self.con.execute(
-                    "INSERT INTO category_request VALUES (?,?,?,?)",
-                    (idx, request.app_seconds, sql_statements, sql_seconds))
+                stats.update(request)
 
-        if self.store_all_request:
+        if self.by_pageids:
             pageid = request.pageid or 'Unknown'
-            self.con.execute(
-                "INSERT INTO request VALUES (?,?,?,?,?)", 
-                (pageid, request.url, request.app_seconds, sql_statements,
-                    sql_seconds))
+            stats = self.pageid_times.setdefault(
+                pageid, OnlineStats(self.histogram_width))
+            stats.update(request)
 
-    def commit(self):
-        """Call commit on the underlying connection."""
-        self.con.commit()
+        if self.by_urls:
+            stats = self.url_times.setdefault(
+                request.url, OnlineStats(self.histogram_width))
+            stats.update(request)
 
     def get_category_times(self):
         """Return the times for each category."""
-        category_query = 'SELECT * FROM category_request ORDER BY category'
-
-        empty_stats = Stats([], 0)
-        categories = dict(self.get_times(category_query))
-        return [
-            (category, categories.get(idx, empty_stats))
-            for idx, category in enumerate(self.categories)]
+        return self.category_times
 
     def get_top_urls_times(self, top_n):
         """Return the times for the Top URL by total time"""
-        top_url_query = '''
-            SELECT url, time, sql_statements, sql_time
-            FROM request WHERE url IN (
-                SELECT url FROM (SELECT url, sum(time) FROM request
-                    GROUP BY url
-                    ORDER BY sum(time) DESC
-                    LIMIT %d))
-            ORDER BY url
-        ''' % top_n
         # Sort the result by total time
         return sorted(
-            self.get_times(top_url_query), key=lambda x: x[1].total_time,
-            reverse=True)
+            self.url_times.items(),
+            key=lambda x: x[1].total_time, reverse=True)[:top_n]
 
     def get_pageid_times(self):
         """Return the times for the pageids."""
-        pageid_query = '''
-            SELECT pageid, time, sql_statements, sql_time
-            FROM request
-            ORDER BY pageid
-        '''
-        return self.get_times(pageid_query)
-
-    def get_times(self, query):
-        """Return a list of key, stats based on the query.
-
-        The query should return rows of the form:
-            [key, app_time, sql_statements, sql_times]
-
-        And should be sorted on key.
-        """
-        times = []
-        current_key = None
-        results = []
-        self.cur.execute(query)
-        while True:
-            rows = self.cur.fetchmany()
-            if len(rows) == 0:
-                break
-            for row in rows:
-                # We are encountering a new group...
-                if row[0] != current_key:
-                    # Compute the stats of the previous group
-                    if current_key != None:
-                        results.append(
-                            (current_key, Stats(times, self.timeout)))
-                    # Initialize the new group.
-                    current_key = row[0]
-                    times = []
-
-                times.append(row[1:])
-        # Compute the stats of the last group
-        if current_key != None:
-            results.append((current_key, Stats(times, self.timeout)))
-
-        return results
-
-    def close(self, remove=False):
-        """Close the SQLite connection.
-
-        :param remove: If true, the DB file will be removed.
-        """
-        self.con.close()
-        if remove:
-            log.debug('Deleting request database.')
-            os.unlink(self.filename)
-        else:
-            log.debug('Keeping request database %s.' % self.filename)
+        # Sort the result by pageid
+        return sorted(self.pageid_times.items())
 
 
 def main():
@@ -339,17 +419,13 @@
         # Default to 12: the staging timeout.
         default=12, type="int",
         help="The configured timeout value : determines high risk page ids.")
-    parser.add_option(
-        "--db-file", dest="db_file",
-        default=None, metavar="FILE",
-        help="Do not parse the records, generate reports from the DB file.")
 
     options, args = parser.parse_args()
 
     if not os.path.isdir(options.directory):
         parser.error("Directory %s does not exist" % options.directory)
 
-    if len(args) == 0 and options.db_file is None:
+    if len(args) == 0:
         parser.error("At least one zserver tracelog file must be provided")
 
     if options.from_ts is not None and options.until_ts is not None:
@@ -383,22 +459,17 @@
     if len(categories) == 0:
         parser.error("No data in [categories] section of configuration.")
 
-    times = SQLiteRequestTimes(categories, options)
-
-    if len(args) > 0:
-        parse(args, times, options)
-        times.commit()
-
-    log.debug('Generating category statistics...')
+    times = RequestTimes(categories, options)
+
+    parse(args, times, options)
+
     category_times = times.get_category_times()
 
     pageid_times = []
     url_times= []
     if options.top_urls:
-        log.debug('Generating top %d urls statistics...' % options.top_urls)
         url_times = times.get_top_urls_times(options.top_urls)
     if options.pageids:
-        log.debug('Generating pageid statistics...')
         pageid_times = times.get_pageid_times()
 
     def _report_filename(filename):
@@ -436,7 +507,30 @@
         open(report_filename, 'w'), None, pageid_times, None,
         options.timeout - 2)
 
-    times.close(options.db_file is None)
+    # Output metrics for selected categories.
+    report_filename = _report_filename('metrics.dat')
+    log.info('Saving category_metrics %s', report_filename)
+    metrics_file = open(report_filename, 'w')
+    writer = csv.writer(metrics_file, delimiter=':')
+    date = options.until_ts or options.from_ts or datetime.utcnow()
+    date = time.mktime(date.timetuple())
+
+    for option in script_config.options('metrics'):
+        name = script_config.get('metrics', option)
+        found = False
+        for category, stats in category_times:
+            if category.title == name:
+                writer.writerows([
+                    ("%s_99" % option, "%f@%d" % (
+                        stats.ninetyninth_percentile_time, date)),
+                    ("%s_mean" % option, "%f@%d" % (stats.mean, date))])
+                found = True
+                break
+        if not found:
+            log.warning("Can't find category %s for metric %s" % (
+                option, name))
+    metrics_file.close()
+
     return 0
 
 
@@ -447,17 +541,9 @@
     """
     ext = os.path.splitext(filename)[1]
     if ext == '.bz2':
-        p = subprocess.Popen(
-            ['bunzip2', '-c', filename],
-            stdout=subprocess.PIPE, stdin=subprocess.PIPE)
-        p.stdin.close()
-        return p.stdout
+        return bz2.BZ2File(filename, 'r')
     elif ext == '.gz':
-        p = subprocess.Popen(
-            ['gunzip', '-c', filename],
-            stdout=subprocess.PIPE, stdin=subprocess.PIPE)
-        p.stdin.close()
-        return p.stdout
+        return gzip.GzipFile(filename, 'r')
     else:
         return open(filename, mode)
 
@@ -684,7 +770,7 @@
     histograms = []
 
     def handle_times(html_title, stats):
-        histograms.append(stats.histogram)
+        histograms.append(stats.relative_histogram)
         print >> outf, dedent("""\
             <tr>
             <th class="category-title">%s</th>
@@ -810,4 +896,3 @@
         </body>
         </html>
         """)
-

=== added file 'lib/lp/scripts/utilities/tests/test_pageperformancereport.py'
--- lib/lp/scripts/utilities/tests/test_pageperformancereport.py	1970-01-01 00:00:00 +0000
+++ lib/lp/scripts/utilities/tests/test_pageperformancereport.py	2010-10-29 22:11:28 +0000
@@ -0,0 +1,290 @@
+# Copyright 2010 Canonical Ltd.  This software is licensed under the
+# GNU Affero General Public License version 3 (see the file LICENSE).
+
+"""Test the pageperformancereport script."""
+
+__metaclass__ = type
+
+import unittest
+
+from lp.testing import TestCase
+
+from lp.scripts.utilities.pageperformancereport import (
+    Category,
+    OnlineApproximateMedian,
+    OnlineStatsCalculator,
+    RequestTimes,
+    Stats,
+    )
+
+
+class FakeOptions:
+    timeout = 4
+    db_file = None
+    pageids = True
+    top_urls = True
+
+    def __init__(self, **kwargs):
+        """Assign all arguments as attributes."""
+        self.__dict__.update(kwargs)
+
+
+class FakeRequest:
+    def __init__(self, url, app_seconds, sql_statements=None,
+                 sql_seconds=None, pageid=None):
+        self.url = url
+        self.pageid = pageid
+        self.app_seconds = app_seconds
+        self.sql_statements = sql_statements
+        self.sql_seconds = sql_seconds
+
+
+class FakeStats(Stats):
+    def __init__(self, **kwargs):
+        # Override the constructor to just store the values.
+        self.__dict__.update(kwargs)
+
+
+FAKE_REQUESTS = [
+    FakeRequest('/', 0.5, pageid='+root'),
+    FakeRequest('/bugs', 4.5, 56, 3.0, pageid='+bugs'),
+    FakeRequest('/bugs', 4.2, 56, 2.2, pageid='+bugs'),
+    FakeRequest('/bugs', 5.5, 76, 4.0, pageid='+bugs'),
+    FakeRequest('/ubuntu', 2.5, 6, 2.0, pageid='+distribution'),
+    FakeRequest('/launchpad', 3.5, 3, 3.0, pageid='+project'),
+    FakeRequest('/bzr', 2.5, 4, 2.0, pageid='+project'),
+    FakeRequest('/bugs/1', 20.5, 567, 14.0, pageid='+bug'),
+    FakeRequest('/bugs/1', 15.5, 567, 9.0, pageid='+bug'),
+    FakeRequest('/bugs/5', 1.5, 30, 1.2, pageid='+bug'),
+    FakeRequest('/lazr', 1.0, 16, 0.3, pageid='+project'),
+    FakeRequest('/drizzle', 0.9, 11, 1.3, pageid='+project'),
+    ]
+
+
+# The category stats computed for the above 12 requests.
+CATEGORY_STATS = [
+    # Median is an approximation.
+    # Real values are: 2.50, 2.20, 30
+    (Category('All', ''), FakeStats(
+        total_hits=12, total_time=62.60, mean=5.22, median=1.0, std=5.99,
+        total_sqltime=42, mean_sqltime=3.82, median_sqltime=1.3,
+        std_sqltime=3.89,
+        total_sqlstatements=1392, mean_sqlstatements=126.55,
+        median_sqlstatements=16, std_sqlstatements=208.94,
+        histogram=[[0, 2], [1, 2], [2, 2], [3, 1], [4, 2], [5, 3]],
+        )),
+    (Category('Test', ''), FakeStats()),
+    (Category('Bugs', ''), FakeStats(
+        total_hits=6, total_time=51.70, mean=8.62, median=4.5, std=6.90,
+        total_sqltime=33.40, mean_sqltime=5.57, median_sqltime=3,
+        std_sqltime=4.52,
+        total_sqlstatements=1352, mean_sqlstatements=225.33,
+        median_sqlstatements=56, std_sqlstatements=241.96,
+        histogram=[[0, 0], [1, 1], [2, 0], [3, 0], [4, 2], [5, 3]],
+        )),
+    ]
+
+
+# The top 3 URL stats computed for the above 12 requests.
+TOP_3_URL_STATS = [
+    ('/bugs/1', FakeStats(
+        total_hits=2, total_time=36.0, mean=18.0, median=15.5, std=2.50,
+        total_sqltime=23.0, mean_sqltime=11.5, median_sqltime=9.0,
+        std_sqltime=2.50,
+        total_sqlstatements=1134, mean_sqlstatements=567.0,
+        median_sqlstatements=567, std_statements=0,
+        histogram=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [5, 2]],
+        )),
+    ('/bugs', FakeStats(
+        total_hits=3, total_time=14.2, mean=4.73, median=4.5, std=0.56,
+        total_sqltime=9.2, mean_sqltime=3.07, median_sqltime=3,
+        std_sqltime=0.74,
+        total_sqlstatements=188, mean_sqlstatements=62.67,
+        median_sqlstatements=56, std_sqlstatements=9.43,
+        histogram=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 2], [5, 1]],
+        )),
+    ('/launchpad', FakeStats(
+        total_hits=1, total_time=3.5, mean=3.5, median=3.5, std=0,
+        total_sqltime=3.0, mean_sqltime=3, median_sqltime=3, std_sqltime=0,
+        total_sqlstatements=3, mean_sqlstatements=3,
+        median_sqlstatements=3, std_sqlstatements=0,
+        histogram=[[0, 0], [1, 0], [2, 0], [3, 1], [4, 0], [5, 0]],
+        )),
+    ]
+
+
+# The pageid stats computed for the above 12 requests.
+PAGEID_STATS = [
+    ('+bug', FakeStats(
+        total_hits=3, total_time=37.5, mean=12.5, median=15.5, std=8.04,
+        total_sqltime=24.2, mean_sqltime=8.07, median_sqltime=9,
+        std_sqltime=5.27,
+        total_sqlstatements=1164, mean_sqlstatements=388,
+        median_sqlstatements=567, std_sqlstatements=253.14,
+        histogram=[[0, 0], [1, 1], [2, 0], [3, 0], [4, 0], [5, 2]],
+        )),
+    ('+bugs', FakeStats(
+        total_hits=3, total_time=14.2, mean=4.73, median=4.5, std=0.56,
+        total_sqltime=9.2, mean_sqltime=3.07, median_sqltime=3,
+        std_sqltime=0.74,
+        total_sqlstatements=188, mean_sqlstatements=62.67,
+        median_sqlstatements=56, std_sqlstatements=9.43,
+        histogram=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 2], [5, 1]],
+        )),
+    ('+distribution', FakeStats(
+        total_hits=1, total_time=2.5, mean=2.5, median=2.5, std=0,
+        total_sqltime=2.0, mean_sqltime=2, median_sqltime=2, std_sqltime=0,
+        total_sqlstatements=6, mean_sqlstatements=6,
+        median_sqlstatements=6, std_sqlstatements=0,
+        histogram=[[0, 0], [1, 0], [2, 1], [3, 0], [4, 0], [5, 0]],
+        )),
+    ('+project', FakeStats(
+        total_hits=4, total_time=7.9, mean=1.98, median=1, std=1.08,
+        total_sqltime=6.6, mean_sqltime=1.65, median_sqltime=1.3,
+        std_sqltime=0.99,
+        total_sqlstatements=34, mean_sqlstatements=8.5,
+        median_sqlstatements=4, std_sqlstatements=5.32,
+        histogram=[[0, 1], [1, 1], [2, 1], [3, 1], [4, 0], [5, 0]],
+        )),
+    ('+root', FakeStats(
+        total_hits=1, total_time=0.5, mean=0.5, median=0.5, std=0,
+        histogram=[[0, 1], [1, 0], [2, 0], [3, 0], [4, 0], [5, 0]],
+        )),
+    ]
+
+
+class TestRequestTimes(TestCase):
+    """Tests the RequestTimes backend."""
+
+    def setUp(self):
+        TestCase.setUp(self)
+        self.categories = [
+            Category('All', '.*'), Category('Test', '.*test.*'),
+            Category('Bugs', '.*bugs.*')]
+        self.db = RequestTimes(self.categories, FakeOptions())
+
+    def setUpRequests(self):
+        """Insert some requests into the db."""
+        for r in FAKE_REQUESTS:
+            self.db.add_request(r)
+
+    def assertStatsAreEquals(self, expected, results):
+        self.assertEquals(
+            len(expected), len(results), 'Wrong number of results')
+        for idx in range(len(results)):
+            self.assertEquals(expected[idx][0], results[idx][0],
+                "Wrong key for results %d" % idx)
+            key = results[idx][0]
+            self.assertEquals(expected[idx][1].text(), results[idx][1].text(),
+                "Wrong stats for results %d (%s)" % (idx, key))
+            self.assertEquals(
+                expected[idx][1].histogram, results[idx][1].histogram,
+                "Wrong histogram for results %d (%s)" % (idx, key))
+
+    def test_get_category_times(self):
+        self.setUpRequests()
+        category_times = self.db.get_category_times()
+        self.assertStatsAreEquals(CATEGORY_STATS, category_times)
+
+    def test_get_url_times(self):
+        self.setUpRequests()
+        url_times = self.db.get_top_urls_times(3)
+        self.assertStatsAreEquals(TOP_3_URL_STATS, url_times)
+
+    def test_get_pageid_times(self):
+        self.setUpRequests()
+        pageid_times = self.db.get_pageid_times()
+        self.assertStatsAreEquals(PAGEID_STATS, pageid_times)
+
+
+class TestStats(TestCase):
+    """Tests for the stats class."""
+
+    def test_relative_histogram(self):
+        # Test that relative histogram gives a histogram using
+        # relative frequency.
+        stats = Stats()
+        stats.total_hits = 100
+        stats.histogram = [[0, 50], [1, 10], [2, 33], [3, 0], [4, 0], [5, 7]]
+        self.assertEquals(
+            [[0, 0.5], [1, .1], [2, .33], [3, 0], [4, 0], [5, .07]],
+            stats.relative_histogram)
+
+
+class TestOnlineStatsCalculator(TestCase):
+    """Tests for the online stats calculator."""
+
+    def setUp(self):
+        TestCase.setUp(self)
+        self.stats = OnlineStatsCalculator()
+
+    def test_stats_for_empty_set(self):
+        # Test the stats when there is no input.
+        self.assertEquals(0, self.stats.count)
+        self.assertEquals(0, self.stats.sum)
+        self.assertEquals(0, self.stats.mean)
+        self.assertEquals(0, self.stats.variance)
+        self.assertEquals(0, self.stats.std)
+
+    def test_stats_for_one_value(self):
+        # Test the stats when adding one element.
+        self.stats.update(5)
+        self.assertEquals(1, self.stats.count)
+        self.assertEquals(5, self.stats.sum)
+        self.assertEquals(5, self.stats.mean)
+        self.assertEquals(0, self.stats.variance)
+        self.assertEquals(0, self.stats.std)
+
+    def test_None_are_ignored(self):
+        self.stats.update(None)
+        self.assertEquals(0, self.stats.count)
+
+    def test_stats_for_3_values(self):
+        for x in [3, 6, 9]:
+            self.stats.update(x)
+        self.assertEquals(3, self.stats.count)
+        self.assertEquals(18, self.stats.sum)
+        self.assertEquals(6, self.stats.mean)
+        self.assertEquals(6, self.stats.variance)
+        self.assertEquals("2.45", "%.2f" % self.stats.std)
+
+
+SHUFFLE_RANGE_100 = [
+    25, 79, 99, 76, 60, 63, 87, 77, 51, 82, 42, 96, 93, 58, 32, 66, 75,
+     2, 26, 22, 11, 73, 61, 83, 65, 68, 44, 81, 64,  3, 33, 34, 15,  1,
+    92, 27, 90, 74, 46, 57, 59, 31, 13, 19, 89, 29, 56, 94, 50, 49, 62,
+    37, 21, 35, 5, 84, 88, 16, 8, 23, 40, 6, 48, 10, 97, 0, 53, 17, 30,
+    18, 43, 86, 12, 71, 38, 78, 36, 7, 45, 47, 80, 54, 39, 91, 98, 24,
+    55, 14, 52, 20, 69, 85, 95, 28, 4, 9, 67, 70, 41, 72
+    ]
+
+
+class TestOnlineApproximateMedian(TestCase):
+    """Tests for the approximate median computation."""
+
+    def setUp(self):
+        TestCase.setUp(self)
+        self.estimator = OnlineApproximateMedian()
+
+    def test_median_is_0_when_no_input(self):
+        self.assertEquals(0, self.estimator.median)
+
+    def test_median_is_true_median_for_n_lower_than_bucket_size(self):
+        for x in range(9):
+            self.estimator.update(x)
+        self.assertEquals(4, self.estimator.median)
+
+    def test_None_input_is_ignored(self):
+        self.estimator.update(1)
+        self.estimator.update(None)
+        self.assertEquals(1, self.estimator.median)
+
+    def test_approximate_median_is_good_enough(self):
+        for x in SHUFFLE_RANGE_100:
+            self.estimator.update(x)
+        # True median is 50, 52 is good enough :-)
+        self.assertEquals(52, self.estimator.median)
+
+
+def test_suite():
+    return unittest.TestLoader().loadTestsFromName(__name__)

=== modified file 'setup.py'
--- setup.py	2010-09-18 08:00:27 +0000
+++ setup.py	2010-10-29 22:11:28 +0000
@@ -51,7 +51,6 @@
         'meliae',
         'mercurial',
         'mocker',
-        'numpy',
         'oauth',
         'paramiko',
         'python-memcached',

=== modified file 'utilities/page-performance-report.ini'
--- utilities/page-performance-report.ini	2010-10-24 21:00:11 +0000
+++ utilities/page-performance-report.ini	2010-10-29 22:11:28 +0000
@@ -45,3 +45,10 @@
 Private XML-RPC=^https?://xmlrpc-private\.
 Shipit=^https?://shipit\.
 
+[metrics]
+ppr_all=All launchpad except opstats
+ppr_bugs=Bugs
+ppr_api=API
+ppr_code=Code
+ppr_translations=Translations
+ppr_registry=Registry

=== modified file 'versions.cfg'
--- versions.cfg	2010-10-27 04:23:52 +0000
+++ versions.cfg	2010-10-29 22:11:28 +0000
@@ -45,7 +45,6 @@
 mercurial = 1.6.2
 mocker = 0.10.1
 mozrunner = 1.3.4
-numpy = 1.3.0
 oauth = 1.0
 paramiko = 1.7.4
 Paste = 1.7.2

