launchpad-reviewers team mailing list archive
Message #01803
[Merge] lp:~flacoste/launchpad/ppr-merge into lp:launchpad/devel
Francis J. Lacoste has proposed merging lp:~flacoste/launchpad/ppr-merge into lp:launchpad/devel with lp:~flacoste/launchpad/ppr-constant-memory as a prerequisite.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
Hello,
This is a follow-up on my previous constant-memory branch. It adds the option
of generating a report by aggregating the RequestTimes data structures pickled
in previous runs.
This will allow us to generate weekly, monthly and even yearly page
performance reports with minimal fuss.
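To make the intended workflow concrete, here is a rough sketch of the new
merge mode (it mirrors the change to main() in the diff below; the file names
are hypothetical, and categories/options come from the script's usual setup):

    import bz2
    import cPickle

    from lp.scripts.utilities.pageperformancereport import RequestTimes

    # Each file holds a RequestTimes instance pickled by an earlier run.
    times = RequestTimes(categories, options)
    for filename in ['ppr-week-44.pickle.bz2', 'ppr-week-45.pickle.bz2']:
        f = bz2.BZ2File(filename, 'r')
        times += cPickle.load(f)  # RequestTimes.__add__ aggregates the stats
        f.close()
    # 'times' now holds the combined statistics and the report is generated
    # from it exactly as before.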
I did this by adding an __add__ method to all the data structures involved.
There are only two tricky algorithms; the first is how to merge the variance.
I used the algorithm referenced on the Wikipedia page
http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
The transcription on that page didn't actually produce the expected results,
so I went back to the paper and used the original formula.
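Here is a small self-contained sketch (not part of the branch; combine and
naive are names made up for the illustration) of the formula as I ended up
writing it, checked against a direct computation of the sum of squared
deviations (M2):

    def combine(count_a, sum_a, m2_a, count_b, sum_b, m2_b):
        # Formula 2.1b from Chan, Golub and LeVeque (1979); assumes both
        # sides are non-empty, like OnlineStatsCalculator.__add__ below.
        count = count_a + count_b
        total = sum_a + sum_b
        delta = (float(count_b) / count_a) * sum_a - sum_b
        m2 = m2_a + m2_b + (float(count_a) / (count_b * count)) * delta ** 2
        return count, total, m2

    def naive(samples):
        # Sum of squared deviations from the mean, computed directly.
        count = len(samples)
        total = sum(samples)
        mean = float(total) / count
        return count, total, sum((x - mean) ** 2 for x in samples)

    a, b = [3, 6, 9], [1, 2, 3]
    merged = combine(*(naive(a) + naive(b)))
    assert abs(merged[2] - naive(a + b)[2]) < 1e-9  # M2 == 44, as in the test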
The way to merge two approximate medians is home-made. I went with what
seemed to make the most sense, and it seems to produce OK results.
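The idea is that each value sitting in the other estimator's buckets is fed
back through the normal selection algorithm at the bucket level it had
already reached, so values that survived one round of selection only compete
with values at the same level. A short usage sketch (the input values are
arbitrary; the import path is the one used by the test module):

    from lp.scripts.utilities.pageperformancereport import (
        OnlineApproximateMedian)

    m1 = OnlineApproximateMedian(3)
    m2 = OnlineApproximateMedian(3)
    for x in [25, 79, 99, 76, 60, 63]:
        m1.update(x)
    for x in [87, 77, 51, 82, 42, 96]:
        m2.update(x)
    merged = m1 + m2
    print merged.median  # an approximation of the median of all twelve values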
--
https://code.launchpad.net/~flacoste/launchpad/ppr-merge/+merge/40014
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~flacoste/launchpad/ppr-merge into lp:launchpad/devel.
=== modified file 'lib/lp/scripts/utilities/pageperformancereport.py'
--- lib/lp/scripts/utilities/pageperformancereport.py 2010-11-03 19:58:08 +0000
+++ lib/lp/scripts/utilities/pageperformancereport.py 2010-11-03 19:58:27 +0000
@@ -9,6 +9,7 @@
import bz2
import cPickle
from cgi import escape as html_quote
+import copy
from ConfigParser import RawConfigParser
import csv
from datetime import datetime
@@ -69,6 +70,11 @@
def __cmp__(self, other):
return cmp(self.title.lower(), other.title.lower())
+ def __deepcopy__(self, memo):
+ # We provide __deepcopy__ because the copy module doesn't handle
+ # compiled regular expressions by default.
+ return Category(self.title, self.regexp)
+
class OnlineStatsCalculator:
"""Object that can compute count, sum, mean, variance and median.
@@ -113,6 +119,30 @@
else:
return math.sqrt(self.variance)
+ def __add__(self, other):
+ """Adds this and another OnlineStatsCalculator.
+
+ The result combines the stats of the two objects.
+ """
+ results = OnlineStatsCalculator()
+ results.count = self.count + other.count
+ results.sum = self.sum + other.sum
+ if self.count > 0 and other.count > 0:
+ # This is 2.1b in Chan, Tony F.; Golub, Gene H.; LeVeque,
+ # Randall J. (1979), "Updating Formulae and a Pairwise Algorithm
+ # for Computing Sample Variances.",
+ # Technical Report STAN-CS-79-773,
+ # Department of Computer Science, Stanford University,
+ # ftp://reports.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf .
+ results.M2 = self.M2 + other.M2 + (
+ (float(self.count)/(other.count*results.count)) *
+ ((float(other.count)/self.count)*self.sum - other.sum)**2)
+ else:
+ results.M2 = self.M2 + other.M2 # One of them is 0.
+ if results.count > 0:
+ results.mean = float(results.sum)/results.count
+ return results
+
class OnlineApproximateMedian:
"""Approximate the median of a set of elements.
@@ -139,23 +169,22 @@
The bucket size should be a low odd-integer.
"""
- self.count = 0
self.bucket_size = bucket_size
# Index of the median in a completed bucket.
self.median_idx = bucket_size/2
self.buckets = []
- def update(self, x):
+ def update(self, x, order=0):
"""Update with x."""
if x is None:
return
- self.count += 1
- i = 0
+ i = order
while True:
# Create bucket on demand.
- if i == len(self.buckets):
- self.buckets.append([])
+ if i >= len(self.buckets):
+ for n in range((i+1)-len(self.buckets)):
+ self.buckets.append([])
bucket = self.buckets[i]
bucket.append(x)
if len(bucket) == self.bucket_size:
@@ -171,9 +200,6 @@
@property
def median(self):
"""Return the median."""
- if self.count == 0:
- return 0
-
# Find the 'weighted' median by assigning a weight to each
# element proportional to how far they have been selected.
candidates = []
@@ -183,12 +209,30 @@
for x in bucket:
total_weight += weight
candidates.append([weight, x])
+
+ if len(candidates) == 0:
+ return 0
# Make weight relative.
candidates = [
((float(weight)/total_weight)*x, x)
for weight, x in candidates]
return sorted(candidates)[(len(candidates)-1)/2][1]
+ def __add__(self, other):
+ """Merge two approximator together.
+
+ All candidates from the other are merged through the standard
+ algorithm, starting at the same level. So an item that went through
+ two rounds of selection, will be compared with other items having
+ gone through the same number of rounds.
+ """
+ results = OnlineApproximateMedian(self.bucket_size)
+ results.buckets = copy.deepcopy(self.buckets)
+ for i, bucket in enumerate(other.buckets):
+ for x in bucket:
+ results.update(x, i)
+ return results
+
class Stats:
"""Bag to hold and compute request statistics.
@@ -334,6 +378,21 @@
idx = int(min(len(self.histogram)-1, request.app_seconds))
self.histogram[idx][1] += 1
+ def __add__(self, other):
+ """Merge another OnlineStats with this one."""
+ results = copy.deepcopy(self)
+ results.time_stats += other.time_stats
+ results.time_median_approximate += other.time_median_approximate
+ results.sql_time_stats += other.sql_time_stats
+ results.sql_time_median_approximate += (
+ other.sql_time_median_approximate)
+ results.sql_statements_stats += other.sql_statements_stats
+ results.sql_statements_median_approximate += (
+ other.sql_statements_median_approximate)
+ for i, (n, f) in enumerate(other._histogram):
+ results._histogram[i][1] += f
+ return results
+
class RequestTimes:
"""Collect the """
@@ -404,6 +463,43 @@
# Sort the result by pageid
return sorted(self.pageid_times.items())
+ def __add__(self, other):
+ """Merge two RequestTimes together."""
+ results = copy.deepcopy(self)
+ for other_category, other_stats in other.category_times:
+ found = False
+ for i, (category, stats) in enumerate(self.category_times):
+ if category.title == other_category.title:
+ results.category_times[i] = (
+ category, stats + other_stats)
+ found = True
+ break
+ if not found:
+ results.category_times.append(
+ (other_category, copy.deepcopy(other_stats)))
+
+ url_times = results.url_times
+ for url, stats in other.url_times.items():
+ if url in url_times:
+ url_times[url] += stats
+ else:
+ url_times[url] = copy.deepcopy(stats)
+ # Only keep top_urls_cache_size entries.
+ if len(url_times) > self.top_urls_cache_size:
+ results.url_times = dict(
+ sorted(url_times.items(),
+ key=lambda x: x[1].total_time,
+ reverse=True)[:self.top_urls_cache_size])
+
+ pageid_times = results.pageid_times
+ for pageid, stats in other.pageid_times.items():
+ if pageid in pageid_times:
+ pageid_times[pageid] += stats
+ else:
+ pageid_times[pageid] = copy.deepcopy(stats)
+
+ return results
+
def main():
parser = LPOptionParser("%prog [args] tracelog [...]")
@@ -441,6 +537,11 @@
# Default to 12: the staging timeout.
default=12, type="int",
help="The configured timeout value : determines high risk page ids.")
+ parser.add_option(
+ "--merge", dest="merge",
+ default=False, action='store_true',
+ help="Files are interpreted as pickled stats and are aggregated for" +
+ "the report.")
options, args = parser.parse_args()
@@ -455,6 +556,9 @@
parser.error(
"--from timestamp %s is before --until timestamp %s"
% (options.from_ts, options.until_ts))
+ if options.from_ts is not None or options.until_ts is not None:
+ if options.merge:
+ parser.error('--from and --until cannot be used with --merge')
for filename in args:
if not os.path.exists(filename):
@@ -483,7 +587,14 @@
times = RequestTimes(categories, options)
- parse(args, times, options)
+ if options.merge:
+ for filename in args:
+ log.info('Merging %s...' % filename)
+ f = bz2.BZ2File(filename, 'r')
+ times += cPickle.load(f)
+ f.close()
+ else:
+ parse(args, times, options)
category_times = times.get_category_times()
=== modified file 'lib/lp/scripts/utilities/tests/test_pageperformancereport.py'
--- lib/lp/scripts/utilities/tests/test_pageperformancereport.py 2010-11-03 19:58:08 +0000
+++ lib/lp/scripts/utilities/tests/test_pageperformancereport.py 2010-11-03 19:58:27 +0000
@@ -12,6 +12,7 @@
from lp.scripts.utilities.pageperformancereport import (
Category,
OnlineApproximateMedian,
+ OnlineStats,
OnlineStatsCalculator,
RequestTimes,
Stats,
@@ -196,9 +197,28 @@
pageid_times = self.db.get_pageid_times()
self.assertStatsAreEquals(PAGEID_STATS, pageid_times)
+ def test___add__(self):
+ # Ensure that adding two RequestTimes together results in
+ # a merge of their constituents.
+ db1 = self.db
+ db2 = RequestTimes(self.categories, FakeOptions())
+ db1.add_request(FakeRequest('/', 1.5, 5, 1.0, '+root'))
+ db1.add_request(FakeRequest('/bugs', 3.5, 15, 1.0, '+bugs'))
+ db2.add_request(FakeRequest('/bugs/1', 5.0, 30, 4.0, '+bug'))
+ results = db1 + db2
+ self.assertEquals(3, results.category_times[0][1].total_hits)
+ self.assertEquals(0, results.category_times[1][1].total_hits)
+ self.assertEquals(2, results.category_times[2][1].total_hits)
+ self.assertEquals(1, results.pageid_times['+root'].total_hits)
+ self.assertEquals(1, results.pageid_times['+bugs'].total_hits)
+ self.assertEquals(1, results.pageid_times['+bug'].total_hits)
+ self.assertEquals(1, results.url_times['/'].total_hits)
+ self.assertEquals(1, results.url_times['/bugs'].total_hits)
+ self.assertEquals(1, results.url_times['/bugs/1'].total_hits)
+
class TestStats(TestCase):
- """Tests for the stats class."""
+ """Tests for the Stats class."""
def test_relative_histogram(self):
# Test that relative histogram gives an histogram using
@@ -211,6 +231,26 @@
stats.relative_histogram)
+class TestOnlineStats(TestCase):
+ """Tests for the OnlineStats class."""
+
+ def test___add__(self):
+ # Ensure that adding two OnlineStats merges all their constituents.
+ stats1 = OnlineStats(4)
+ stats1.update(FakeRequest('/', 2.0, 5, 1.5))
+ stats2 = OnlineStats(4)
+ stats2.update(FakeRequest('/', 1.5, 2, 3.0))
+ stats2.update(FakeRequest('/', 5.0, 2, 2.0))
+ results = stats1 + stats2
+ self.assertEquals(3, results.total_hits)
+ self.assertEquals(2, results.median)
+ self.assertEquals(9, results.total_sqlstatements)
+ self.assertEquals(2, results.median_sqlstatements)
+ self.assertEquals(6.5, results.total_sqltime)
+ self.assertEquals(2.0, results.median_sqltime)
+ self.assertEquals([[0, 0], [1, 1], [2, 1], [3, 1]], results.histogram)
+
+
class TestOnlineStatsCalculator(TestCase):
"""Tests for the online stats calculator."""
@@ -248,6 +288,36 @@
self.assertEquals(6, self.stats.variance)
self.assertEquals("2.45", "%.2f" % self.stats.std)
+ def test___add___two_empty(self):
+ stats2 = OnlineStatsCalculator()
+ results = self.stats + stats2
+ self.assertEquals(0, results.count)
+ self.assertEquals(0, results.sum)
+ self.assertEquals(0, results.mean)
+ self.assertEquals(0, results.variance)
+
+ def test___add___empty(self):
+ stats2 = OnlineStatsCalculator()
+ for x in [1, 2, 3]:
+ self.stats.update(x)
+ results = self.stats + stats2
+ self.assertEquals(3, results.count)
+ self.assertEquals(6, results.sum)
+ self.assertEquals(2, results.mean)
+ self.assertEquals(2, results.M2)
+
+ def test___add__(self):
+ stats2 = OnlineStatsCalculator()
+ for x in [3, 6, 9]:
+ self.stats.update(x)
+ for x in [1, 2, 3]:
+ stats2.update(x)
+ results = self.stats + stats2
+ self.assertEquals(6, results.count)
+ self.assertEquals(24, results.sum)
+ self.assertEquals(4, results.mean)
+ self.assertEquals(44, results.M2)
+
SHUFFLE_RANGE_100 = [
25, 79, 99, 76, 60, 63, 87, 77, 51, 82, 42, 96, 93, 58, 32, 66, 75,
@@ -285,6 +355,14 @@
# True median is 50, 52 is good enough :-)
self.assertEquals(52, self.estimator.median)
+ def test___add__(self):
+ median1 = OnlineApproximateMedian(3)
+ median1.buckets = [[1, 3], [4, 5], [6, 3]]
+ median2 = OnlineApproximateMedian(3)
+ median2.buckets = [[], [3, 6], [3, 7]]
+ results = median1 + median2
+ self.assertEquals([[1, 3], [6], [3, 7], [4]], results.buckets)
+
def test_suite():
return unittest.TestLoader().loadTestsFromName(__name__)