
launchpad-reviewers team mailing list archive

[Merge] lp:~flacoste/launchpad/ppr-merge into lp:launchpad/devel

 

Francis J. Lacoste has proposed merging lp:~flacoste/launchpad/ppr-merge into lp:launchpad/devel with lp:~flacoste/launchpad/ppr-constant-memory as a prerequisite.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)


Hello,

This is a follow-up to my previous constant-memory branch. It adds the option
of generating a report by aggregating the RequestTimes data structures pickled
in previous runs.

This will allow us to generate weekly, monthly, and even yearly page
performance reports with minimal fuss.
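
Concretely, the new --merge mode boils down to unpickling each file and
summing the RequestTimes instances. A minimal sketch of the loop (the file
names here are hypothetical):

    import bz2
    import cPickle

    # Each file holds a RequestTimes instance pickled by a previous run.
    merged = None
    for filename in ['ppr-week1.pickle.bz2', 'ppr-week2.pickle.bz2']:
        f = bz2.BZ2File(filename, 'r')
        stats = cPickle.load(f)
        f.close()
        merged = stats if merged is None else merged + stats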

I did this by adding an __add__ method to all the data structures involved.
There are only two tricky algorithms. The first is how to merge the variances:
I used the algorithm referenced on the Wikipedia page
http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance

The transcription on that page didn't actually produce the expected results,
so I went back to the paper and used the original formula.
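
For reference, here is a standalone sketch of that pairwise update (formula
2.1b), with names mirroring the OnlineStatsCalculator attributes; the numbers
come from the new test case:

    def combine(count_a, sum_a, m2_a, count_b, sum_b, m2_b):
        # M2 is the sum of squared differences from the mean. Assumes
        # both sides are non-empty; the real code guards for that.
        count = count_a + count_b
        m2 = m2_a + m2_b + (
            (float(count_a) / (count_b * count)) *
            ((float(count_b) / count_a) * sum_a - sum_b) ** 2)
        return count, sum_a + sum_b, m2

    # [3, 6, 9] has sum 18 and M2 18; [1, 2, 3] has sum 6 and M2 2.
    count, total, m2 = combine(3, 18.0, 18.0, 3, 6.0, 2.0)
    assert (count, total, m2) == (6, 24.0, 44.0)  # M2 of the six values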

The way to merge two approximate medians is home-made. I went with what
seemed to make the most sense, and it seems to produce OK results.
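
To make the idea concrete, here is a self-contained sketch of the merge,
assuming a full bucket keeps its median and promotes it to the next level,
as update() already does; the example mirrors the new test case:

    def merge_buckets(own, other, bucket_size=3):
        # Level i holds values that survived i rounds of median
        # selection, so the other estimator's values are replayed at
        # the level they had already reached.
        buckets = [list(b) for b in own]
        for level, bucket in enumerate(other):
            for x in bucket:
                i = level
                while True:
                    while i >= len(buckets):
                        buckets.append([])
                    buckets[i].append(x)
                    if len(buckets[i]) < bucket_size:
                        break
                    # Full bucket: keep its median, promote it one level.
                    x = sorted(buckets[i])[bucket_size / 2]
                    buckets[i] = []
                    i += 1
        return buckets

    assert merge_buckets(
        [[1, 3], [4, 5], [6, 3]], [[], [3, 6], [3, 7]]
        ) == [[1, 3], [6], [3, 7], [4]]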


-- 
https://code.launchpad.net/~flacoste/launchpad/ppr-merge/+merge/40014
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~flacoste/launchpad/ppr-merge into lp:launchpad/devel.
=== modified file 'lib/lp/scripts/utilities/pageperformancereport.py'
--- lib/lp/scripts/utilities/pageperformancereport.py	2010-11-03 19:58:08 +0000
+++ lib/lp/scripts/utilities/pageperformancereport.py	2010-11-03 19:58:27 +0000
@@ -9,6 +9,7 @@
 import bz2
 import cPickle
 from cgi import escape as html_quote
+import copy
 from ConfigParser import RawConfigParser
 import csv
 from datetime import datetime
@@ -69,6 +70,11 @@
     def __cmp__(self, other):
         return cmp(self.title.lower(), other.title.lower())
 
+    def __deepcopy__(self, memo):
+        # We provide __deepcopy__ because the copy module doesn't
+        # handle compiled regular expressions by default.
+        return Category(self.title, self.regexp)
+
 
 class OnlineStatsCalculator:
     """Object that can compute count, sum, mean, variance and median.
@@ -113,6 +119,30 @@
         else:
             return math.sqrt(self.variance)
 
+    def __add__(self, other):
+        """Adds this and another OnlineStatsCalculator.
+
+        The result combines the stats of the two objects.
+        """
+        results = OnlineStatsCalculator()
+        results.count = self.count + other.count
+        results.sum = self.sum + other.sum
+        if self.count > 0 and other.count > 0:
+            # This is 2.1b in Chan, Tony F.; Golub, Gene H.; LeVeque,
+            # Randall J. (1979), "Updating Formulae and a Pairwise
+            # Algorithm for Computing Sample Variances", Technical Report
+            # STAN-CS-79-773, Department of Computer Science, Stanford
+            # University,
+            # ftp://reports.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
+            results.M2 = self.M2 + other.M2 + (
+                (float(self.count)/(other.count*results.count)) *
+                ((float(other.count)/self.count)*self.sum - other.sum)**2)
+        else:
+            results.M2 = self.M2 + other.M2  # One of the two is empty.
+        if results.count > 0:
+            results.mean = float(results.sum)/results.count
+        return results
+
 
 class OnlineApproximateMedian:
     """Approximate the median of a set of elements.
@@ -139,23 +169,22 @@
 
         The bucket size should be a low odd-integer.
         """
-        self.count = 0
         self.bucket_size = bucket_size
         # Index of the median in a completed bucket.
         self.median_idx = bucket_size/2
         self.buckets = []
 
-    def update(self, x):
+    def update(self, x, order=0):
         """Update with x."""
         if x is None:
             return
 
-        self.count += 1
-        i = 0
+        i = order
         while True:
             # Create bucket on demand.
-            if i == len(self.buckets):
-                self.buckets.append([])
+            if i >= len(self.buckets):
+                for _ in range((i+1) - len(self.buckets)):
+                    self.buckets.append([])
             bucket = self.buckets[i]
             bucket.append(x)
             if len(bucket) == self.bucket_size:
@@ -171,9 +200,6 @@
     @property
     def median(self):
         """Return the median."""
-        if self.count == 0:
-            return 0
-
         # Find the 'weighted' median by assigning a weight to each
         # element proportional to how far they have been selected.
         candidates = []
@@ -183,12 +209,30 @@
             for x in bucket:
                 total_weight += weight
                 candidates.append([weight, x])
+
+        if len(candidates) == 0:
+            return 0
         # Make weight relative.
         candidates = [
             ((float(weight)/total_weight)*x, x)
             for weight, x in candidates]
         return sorted(candidates)[(len(candidates)-1)/2][1]
 
+    def __add__(self, other):
+        """Merge two approximator together.
+
+        All candidates from the other are merged through the standard
+        algorithm, starting at the same level. So an item that went through
+        two rounds of selection, will be compared with other items having
+        gone through the same number of rounds.
+        """
+        results = OnlineApproximateMedian(self.bucket_size)
+        results.buckets = copy.deepcopy(self.buckets)
+        for i, bucket in enumerate(other.buckets):
+            for x in bucket:
+                results.update(x, i)
+        return results
+
 
 class Stats:
     """Bag to hold and compute request statistics.
@@ -334,6 +378,21 @@
         idx = int(min(len(self.histogram)-1, request.app_seconds))
         self.histogram[idx][1] += 1
 
+    def __add__(self, other):
+        """Merge another OnlineStats with this one."""
+        results = copy.deepcopy(self)
+        results.time_stats += other.time_stats
+        results.time_median_approximate += other.time_median_approximate
+        results.sql_time_stats += other.sql_time_stats
+        results.sql_time_median_approximate += (
+            other.sql_time_median_approximate)
+        results.sql_statements_stats += other.sql_statements_stats
+        results.sql_statements_median_approximate += (
+            other.sql_statements_median_approximate)
+        for i, (n, f) in enumerate(other._histogram):
+            results._histogram[i][1] += f
+        return results
+
 
 class RequestTimes:
     """Collect the """
@@ -404,6 +463,43 @@
         # Sort the result by pageid
         return sorted(self.pageid_times.items())
 
+    def __add__(self, other):
+        """Merge two RequestTimes together."""
+        results = copy.deepcopy(self)
+        for other_category, other_stats in other.category_times:
+            found = False
+            for i, (category, stats) in enumerate(self.category_times):
+                if category.title == other_category.title:
+                    results.category_times[i] = (
+                        category, stats + other_stats)
+                    found = True
+                    break
+            if not found:
+                results.category_times.append(
+                    (other_category, copy.deepcopy(other_stats)))
+
+        url_times = results.url_times
+        for url, stats in other.url_times.items():
+            if url in url_times:
+                url_times[url] += stats
+            else:
+                url_times[url] = copy.deepcopy(stats)
+        # Only keep top_urls_cache_size entries.
+        if len(url_times) > self.top_urls_cache_size:
+            results.url_times = dict(
+                sorted(url_times.items(),
+                       key=lambda x: x[1].total_time,
+                       reverse=True)[:self.top_urls_cache_size])
+
+        pageid_times = results.pageid_times
+        for pageid, stats in other.pageid_times.items():
+            if pageid in pageid_times:
+                pageid_times[pageid] += stats
+            else:
+                pageid_times[pageid] = copy.deepcopy(stats)
+
+        return results
+
 
 def main():
     parser = LPOptionParser("%prog [args] tracelog [...]")
@@ -441,6 +537,11 @@
         # Default to 12: the staging timeout.
         default=12, type="int",
         help="The configured timeout value : determines high risk page ids.")
+    parser.add_option(
+        "--merge", dest="merge",
+        default=False, action='store_true',
+        help="Files are interpreted as pickled stats and are aggregated for" +
+        "the report.")
 
     options, args = parser.parse_args()
 
@@ -455,6 +556,9 @@
             parser.error(
                 "--from timestamp %s is before --until timestamp %s"
                 % (options.from_ts, options.until_ts))
+    if options.from_ts is not None or options.until_ts is not None:
+        if options.merge:
+            parser.error('--from and --until cannot be used with --merge')
 
     for filename in args:
         if not os.path.exists(filename):
@@ -483,7 +587,14 @@
 
     times = RequestTimes(categories, options)
 
-    parse(args, times, options)
+    if options.merge:
+        for filename in args:
+            log.info('Merging %s...' % filename)
+            f = bz2.BZ2File(filename, 'r')
+            times += cPickle.load(f)
+            f.close()
+    else:
+        parse(args, times, options)
 
     category_times = times.get_category_times()
 

=== modified file 'lib/lp/scripts/utilities/tests/test_pageperformancereport.py'
--- lib/lp/scripts/utilities/tests/test_pageperformancereport.py	2010-11-03 19:58:08 +0000
+++ lib/lp/scripts/utilities/tests/test_pageperformancereport.py	2010-11-03 19:58:27 +0000
@@ -12,6 +12,7 @@
 from lp.scripts.utilities.pageperformancereport import (
     Category,
     OnlineApproximateMedian,
+    OnlineStats,
     OnlineStatsCalculator,
     RequestTimes,
     Stats,
@@ -196,9 +197,28 @@
         pageid_times = self.db.get_pageid_times()
         self.assertStatsAreEquals(PAGEID_STATS, pageid_times)
 
+    def test___add__(self):
+        # Ensure that adding two RequestTimes together results in
+        # a merge of their constituent stats.
+        db1 = self.db
+        db2 = RequestTimes(self.categories, FakeOptions())
+        db1.add_request(FakeRequest('/', 1.5, 5, 1.0, '+root'))
+        db1.add_request(FakeRequest('/bugs', 3.5, 15, 1.0, '+bugs'))
+        db2.add_request(FakeRequest('/bugs/1', 5.0, 30, 4.0, '+bug'))
+        results = db1 + db2
+        self.assertEquals(3, results.category_times[0][1].total_hits)
+        self.assertEquals(0, results.category_times[1][1].total_hits)
+        self.assertEquals(2, results.category_times[2][1].total_hits)
+        self.assertEquals(1, results.pageid_times['+root'].total_hits)
+        self.assertEquals(1, results.pageid_times['+bugs'].total_hits)
+        self.assertEquals(1, results.pageid_times['+bug'].total_hits)
+        self.assertEquals(1, results.url_times['/'].total_hits)
+        self.assertEquals(1, results.url_times['/bugs'].total_hits)
+        self.assertEquals(1, results.url_times['/bugs/1'].total_hits)
+
 
 class TestStats(TestCase):
-    """Tests for the stats class."""
+    """Tests for the Stats class."""
 
     def test_relative_histogram(self):
         # Test that relative histogram gives an histogram using
@@ -211,6 +231,26 @@
             stats.relative_histogram)
 
 
+class TestOnlineStats(TestCase):
+    """Tests for the OnlineStats class."""
+
+    def test___add__(self):
+        # Ensure that adding two OnlineStats merges their constituents.
+        stats1 = OnlineStats(4)
+        stats1.update(FakeRequest('/', 2.0, 5, 1.5))
+        stats2 = OnlineStats(4)
+        stats2.update(FakeRequest('/', 1.5, 2, 3.0))
+        stats2.update(FakeRequest('/', 5.0, 2, 2.0))
+        results = stats1 + stats2
+        self.assertEquals(3, results.total_hits)
+        self.assertEquals(2, results.median)
+        self.assertEquals(9, results.total_sqlstatements)
+        self.assertEquals(2, results.median_sqlstatements)
+        self.assertEquals(6.5, results.total_sqltime)
+        self.assertEquals(2.0, results.median_sqltime)
+        self.assertEquals([[0, 0], [1, 1], [2, 1], [3, 1]], results.histogram)
+
+
 class TestOnlineStatsCalculator(TestCase):
     """Tests for the online stats calculator."""
 
@@ -248,6 +288,36 @@
         self.assertEquals(6, self.stats.variance)
         self.assertEquals("2.45", "%.2f" % self.stats.std)
 
+    def test___add___two_empty(self):
+        stats2 = OnlineStatsCalculator()
+        results = self.stats + stats2
+        self.assertEquals(0, results.count)
+        self.assertEquals(0, results.sum)
+        self.assertEquals(0, results.mean)
+        self.assertEquals(0, results.variance)
+
+    def test___add___empty(self):
+        stats2 = OnlineStatsCalculator()
+        for x in [1, 2, 3]:
+            self.stats.update(x)
+        results = self.stats + stats2
+        self.assertEquals(3, results.count)
+        self.assertEquals(6, results.sum)
+        self.assertEquals(2, results.mean)
+        self.assertEquals(2, results.M2)
+
+    def test___add__(self):
+        stats2 = OnlineStatsCalculator()
+        for x in [3, 6, 9]:
+            self.stats.update(x)
+        for x in [1, 2, 3]:
+            stats2.update(x)
+        results = self.stats + stats2
+        self.assertEquals(6, results.count)
+        self.assertEquals(24, results.sum)
+        self.assertEquals(4, results.mean)
+        self.assertEquals(44, results.M2)
+
 
 SHUFFLE_RANGE_100 = [
     25, 79, 99, 76, 60, 63, 87, 77, 51, 82, 42, 96, 93, 58, 32, 66, 75,
@@ -285,6 +355,14 @@
         # True median is 50, 52 is good enough :-)
         self.assertEquals(52, self.estimator.median)
 
+    def test___add__(self):
+        median1 = OnlineApproximateMedian(3)
+        median1.buckets = [[1, 3], [4, 5], [6, 3]]
+        median2 = OnlineApproximateMedian(3)
+        median2.buckets = [[], [3, 6], [3, 7]]
+        results = median1 + median2
+        self.assertEquals([[1, 3], [6], [3, 7], [4]], results.buckets)
+
 
 def test_suite():
     return unittest.TestLoader().loadTestsFromName(__name__)