launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #05523
[Merge] lp:~lifeless/python-oops-datedir-repo/oops-prune into lp:python-oops-datedir-repo
Robert Collins has proposed merging lp:~lifeless/python-oops-datedir-repo/oops-prune into lp:python-oops-datedir-repo.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
Related bugs:
Bug #890875 in Python OOPS Date-dir repository: "no way to remove cruft from the repository."
https://bugs.launchpad.net/python-oops-datedir-repo/+bug/890875
For more details, see:
https://code.launchpad.net/~lifeless/python-oops-datedir-repo/oops-prune/+merge/82324
Implement incremental OOPS pruning, finishing the decoupling of Launchpad from OOPS storage.
I'm pretty happy with this branch, the only untested code is the CLI glue, which is traditionally a PITA to test - and exacerbated here by the need to talk to Launchpad itself.
For now, I've deliberately left it untested - all the complex logic is in the fully TDD and unit tested repository object.
--
https://code.launchpad.net/~lifeless/python-oops-datedir-repo/oops-prune/+merge/82324
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~lifeless/python-oops-datedir-repo/oops-prune into lp:python-oops-datedir-repo.
=== modified file 'NEWS'
--- NEWS 2011-11-13 21:03:43 +0000
+++ NEWS 2011-11-15 21:50:28 +0000
@@ -6,6 +6,24 @@
NEXT
----
+* Repository has a simple generic config API. See the set_config and get_config
+ methods. (Robert Collins)
+
+* Repository can now answer 'what is the oldest date in the repository' which
+ is useful for incremental report pruning. See the oldest_date method.
+ (Robert Collins)
+
+* Repository can perform garbage collection of a date range if a list of
+ references to keep is supplied. See the prune_unreferenced method.
+ (Robert Collins)
+
+* There is a new script bin/prune which will prune reports from a repository
+ keeping only those referenced in a given Launchpad project or project group.
+ This adds a dependency on launchpadlib, which should be pypi installable
+ and is included in the Ubuntu default install - so should be a low barrier
+ for use. If this is an issue a future release can split this out into a
+ new addon package. (Robert Collins, #890875)
+
0.0.11
------
=== added file 'oops_datedir_repo/prune.py'
--- oops_datedir_repo/prune.py 1970-01-01 00:00:00 +0000
+++ oops_datedir_repo/prune.py 2011-11-15 21:50:28 +0000
@@ -0,0 +1,155 @@
+#
+# Copyright (c) 2011, Canonical Ltd
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, version 3 only.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+# GNU Lesser General Public License version 3 (see the file LICENSE).
+
+"""Delete OOPSes that are not referenced in the bugtracker.
+
+Currently only has support for the Launchpad bug tracker.
+"""
+
+__metaclass__ = type
+
+import datetime
+import logging
+import optparse
+from textwrap import dedent
+import sys
+
+from launchpadlib.launchpad import Launchpad
+from launchpadlib.uris import lookup_service_root
+from pytz import utc
+
+import oops_datedir_repo
+
+__all__ = [
+ 'main',
+ ]
+
+
+class LaunchpadTracker:
+ """Abstracted bug tracker/forums etc - permits testing of main()."""
+
+ def __init__(self, options):
+ self.lp = Launchpad.login_anonymously(
+ 'oops-prune', options.lpinstance, version='devel')
+
+ def find_oops_references(self, start_time, end_time, project=None,
+ projectgroup=None):
+ projects = set([])
+ if project is not None:
+ projects.add(project)
+ if projectgroup is not None:
+ [projects.add(lp_proj.name)
+ for lp_proj in self.lp.project_groups[projectgroup].projects]
+ result = set()
+ lp_projects = self.lp.projects
+ one_week = datetime.timedelta(weeks=1)
+ for project in projects:
+ lp_project = lp_projects[project]
+ current_start = start_time
+ while current_start < end_time:
+ current_end = current_start + one_week
+ if current_end > end_time:
+ current_end = end_time
+ logging.info(
+ "Querying OOPS references on %s from %s to %s",
+ project, current_start, current_end)
+ result.update(lp_project.findReferencedOOPS(
+ start_date=current_start, end_date=current_end))
+ current_start = current_end
+ return result
+
+
+def main(argv=None, tracker=LaunchpadTracker, logging=logging):
+ """Console script entry point."""
+ if argv is None:
+ argv = sys.argv
+ usage = dedent("""\
+ %prog [options]
+
+ The following options must be supplied:
+ --repo
+
+ And either
+ --project
+ or
+ --projectgroup
+
+ e.g.
+ %prog --repo . --projectgroup launchpad-project
+
+ Will process every member project of launchpad-project.
+
+ When run this program will ask Launchpad for OOPS references made since
+ the last date it pruned up to, with an upper limit of one week from
+ today. It then looks in the repository for all oopses created during
+ that date range, and if they are not in the set returned by Launchpad,
+ deletes them. If the repository has never been pruned before, it will
+ pick the earliest datedir present in the repository as the start date.
+ """)
+ description = \
+ "Delete OOPS reports that are not referenced in a bug tracker."
+ parser = optparse.OptionParser(
+ description=description, usage=usage)
+ parser.add_option('--project',
+ help="Launchpad project to find references in.")
+ parser.add_option('--projectgroup',
+ help="Launchpad project group to find references in.")
+ parser.add_option('--repo', help="Path to the repository to read from.")
+ parser.add_option(
+ '--lpinstance', help="Launchpad instance to use", default="production")
+ options, args = parser.parse_args(argv[1:])
+ def needed(*optnames):
+ present = set()
+ for optname in optnames:
+ if getattr(options, optname, None) is not None:
+ present.add(optname)
+ if not present:
+ if len(optnames) == 1:
+ raise ValueError('Option "%s" must be supplied' % optname)
+ else:
+ raise ValueError(
+ 'One of options %s must be supplied' % (optnames,))
+ elif len(present) != 1:
+ raise ValueError(
+ 'Only one of options %s can be supplied' % (optnames,))
+ needed('repo')
+ needed('project', 'projectgroup')
+ logging.basicConfig(
+ filename='prune.log', filemode='w', level=logging.DEBUG)
+ repo = oops_datedir_repo.DateDirRepo(options.repo)
+ one_week = datetime.timedelta(weeks=1)
+ one_day = datetime.timedelta(days=1)
+ # max date to scan for
+ prune_until = datetime.datetime.now(utc) - one_week
+ # Get min date to scan for
+ try:
+ prune_from = repo.get_config('pruned-until')
+ except KeyError:
+ try:
+ oldest_oops = repo.oldest_date()
+ except ValueError:
+ logging.info("No OOPSes in repo, nothing to do.")
+ return 0
+ prune_from = datetime.datetime.fromordinal(oldest_oops.toordinal())
+ # get references from date range
+ finder = tracker(options)
+ references = finder.find_oops_references(
+ prune_from, prune_until, options.project, options.projectgroup)
+ # delete oops files on disk
+ repo.prune_unreferenced(prune_from, prune_until, references)
+ # stash most recent date
+ repo.set_config('pruned-until', prune_until)
+ return 0
=== modified file 'oops_datedir_repo/repository.py'
--- oops_datedir_repo/repository.py 2011-11-11 04:21:05 +0000
+++ oops_datedir_repo/repository.py 2011-11-15 21:50:28 +0000
@@ -23,11 +23,13 @@
]
import datetime
+import errno
from functools import partial
from hashlib import md5
import os.path
import stat
+import bson
from pytz import utc
import serializer
@@ -37,7 +39,19 @@
class DateDirRepo:
- """Publish oopses to a date-dir repository."""
+ """Publish oopses to a date-dir repository.
+
+ A date-dir repository is a directory containing:
+
+ * Zero or one directories called 'metadata'. If it exists this directory
+ contains any housekeeping material needed (such as a metadata.conf ini
+ file).
+
+ * Zero or more directories named like YYYY-MM-DD, which contain zero or
+ more OOPS reports. OOPS file names can take various forms, but must not
+ end in .tmp - those are considered to be OOPS reports that are currently
+ being written.
+ """
def __init__(self, error_dir, instance_id=None, serializer=None,
inherit_id=False, stash_path=False):
@@ -70,12 +84,14 @@
)
else:
self.log_namer = None
- self.root = error_dir
+ self.root = error_dir
if serializer is None:
serializer = serializer_bson
self.serializer = serializer
self.inherit_id = inherit_id
self.stash_path = stash_path
+ self.metadatadir = os.path.join(self.root, 'metadata')
+ self.config_path = os.path.join(self.metadatadir, 'config.bson')
def publish(self, report, now=None):
"""Write the report to disk.
@@ -148,13 +164,8 @@
two_days = datetime.timedelta(2)
now = datetime.date.today()
old = now - two_days
- for dirname in os.listdir(self.root):
- try:
- y, m, d = dirname.split('-')
- except ValueError:
- # Not a datedir
- continue
- date = datetime.date(int(y),int(m),int(d))
+ for dirname, (y,m,d) in self._datedirs():
+ date = datetime.date(y, m, d)
prune = date < old
dirpath = os.path.join(self.root, dirname)
files = os.listdir(dirpath)
@@ -171,3 +182,123 @@
oopsid = publisher(report)
if oopsid:
os.unlink(candidate)
+
+ def _datedirs(self):
+ """Yield each subdir which looks like a datedir."""
+ for dirname in os.listdir(self.root):
+ try:
+ y, m, d = dirname.split('-')
+ y = int(y)
+ m = int(m)
+ d = int(d)
+ except ValueError:
+ # Not a datedir
+ continue
+ yield dirname, (y, m, d)
+
+ def _read_config(self):
+ """Return the current config document from disk."""
+ try:
+ with open(self.config_path, 'rb') as config_file:
+ return bson.loads(config_file.read())
+ except IOError, e:
+ if e.errno != errno.ENOENT:
+ raise
+ return {}
+
+ def get_config(self, key):
+ """Return a key from the repository config.
+
+ :param key: A key to read from the config.
+ """
+ return self._read_config()[key]
+
+ def set_config(self, key, value):
+ """Set config option key to value.
+
+ This is written to the bson document root/metadata/config.bson
+
+ :param key: The key to set - anything that can be a key in a bson
+ document.
+ :param value: The value to set - anything that can be a value in a
+ bson document.
+ """
+ config = self._read_config()
+ config[key] = value
+ try:
+ with open(self.config_path + '.tmp', 'wb') as config_file:
+ config_file.write(bson.dumps(config))
+ except IOError, e:
+ if e.errno != errno.ENOENT:
+ raise
+ os.mkdir(self.metadatadir)
+ with open(self.config_path + '.tmp', 'wb') as config_file:
+ config_file.write(bson.dumps(config))
+ os.rename(self.config_path + '.tmp', self.config_path)
+
+ def oldest_date(self):
+ """Return the date of the oldest datedir in the repository.
+
+ If pruning / resubmission is working this should also be the date of
+ the oldest oops in the repository.
+ """
+ dirs = list(self._datedirs())
+ if not dirs:
+ raise ValueError("No OOPSes in repository.")
+ return datetime.date(*sorted(dirs)[0][1])
+
+ def prune_unreferenced(self, start_time, stop_time, references):
+ """Delete OOPS reports filed between start_time and stop_time.
+
+ A report is deleted if all of the following are true:
+
+ * it is in a datedir covered by [start_time, stop_time] inclusive of
+ the end points.
+
+ * It is not in the set references.
+
+ * Its timestamp falls between start_time and stop_time inclusively or
+ it's timestamp is outside the datedir it is in or there is no
+ timestamp on the report.
+
+ :param start_time: The lower bound to prune within.
+ :param stop_time: The upper bound to prune within.
+ :param references: An iterable of OOPS ids to keep.
+ """
+ start_date = start_time.date()
+ stop_date = stop_time.date()
+ midnight = datetime.time(tzinfo=utc)
+ for dirname, (y,m,d) in self._datedirs():
+ dirdate = datetime.date(y, m, d)
+ if dirdate < start_date or dirdate > stop_date:
+ continue
+ dirpath = os.path.join(self.root, dirname)
+ files = os.listdir(dirpath)
+ deleted = 0
+ for candidate in map(partial(os.path.join, dirpath), files):
+ if candidate.endswith('.tmp'):
+ # Old half-written oops: just remove.
+ os.unlink(candidate)
+ deleted += 1
+ continue
+ with file(candidate, 'rb') as report_file:
+ report = serializer.read(report_file)
+ report_time = report.get('time', None)
+ if (report_time is None or
+ report_time.date() < dirdate or
+ report_time.date() > dirdate):
+ # The report is oddly filed or missing a precise
+ # datestamp. Treat it like midnight on the day of the
+ # directory it was placed in - this is a lower bound on
+ # when it was actually created.
+ report_time = datetime.datetime.combine(
+ dirdate, midnight)
+ if (report_time >= start_time and
+ report_time <= stop_time and
+ report['id'] not in references):
+ # Unreferenced and prunable
+ os.unlink(candidate)
+ deleted += 1
+ if deleted == len(files):
+ # Everything in the directory was deleted.
+ os.rmdir(dirpath)
=== modified file 'oops_datedir_repo/tests/test_repository.py'
--- oops_datedir_repo/tests/test_repository.py 2011-11-11 04:21:05 +0000
+++ oops_datedir_repo/tests/test_repository.py 2011-11-15 21:50:28 +0000
@@ -26,6 +26,7 @@
import bson
from pytz import utc
import testtools
+from testtools.matchers import raises
from oops_datedir_repo import (
DateDirRepo,
@@ -234,3 +235,103 @@
repo = DateDirRepo(self.useFixture(TempDir()).path)
os.mkdir(repo.root + '/foo')
repo.republish([].append)
+
+ def test_republish_ignores_metadata_dir(self):
+ # The metadata directory is never pruned
+ repo = DateDirRepo(self.useFixture(TempDir()).path)
+ os.mkdir(repo.root + '/metadata')
+ repo.republish([].append)
+ self.assertTrue(os.path.exists(repo.root + '/metadata'))
+
+ def test_get_config_value(self):
+ # Config values can be asked for from the repository.
+ repo = DateDirRepo(self.useFixture(TempDir()).path)
+ pruned = datetime.datetime(2006, 04, 01, 00, 30, 00, tzinfo=utc)
+ repo.set_config('pruned-until', pruned)
+ # Fresh instance, no memory tricks.
+ repo = DateDirRepo(repo.root)
+ self.assertEqual(pruned, repo.get_config('pruned-until'))
+
+ def test_set_config_value(self):
+ # Config values are just keys in a bson document.
+ repo = DateDirRepo(self.useFixture(TempDir()).path)
+ pruned = datetime.datetime(2006, 04, 01, 00, 30, 00, tzinfo=utc)
+ repo.set_config('pruned-until', pruned)
+ with open(repo.root + '/metadata/config.bson', 'rb') as config_file:
+ from_bson = bson.loads(config_file.read())
+ self.assertEqual({'pruned-until': pruned}, from_bson)
+
+ def test_set_config_preserves_other_values(self):
+ # E.g. setting 'a' does not affect 'b'
+ repo = DateDirRepo(self.useFixture(TempDir()).path)
+ repo.set_config('b', 'b-value')
+ repo = DateDirRepo(repo.root)
+ repo.set_config('a', 'a-value')
+ with open(repo.root + '/metadata/config.bson', 'rb') as config_file:
+ from_bson = bson.loads(config_file.read())
+ self.assertEqual({'a': 'a-value', 'b': 'b-value'}, from_bson)
+
+ def test_oldest_date_no_contents(self):
+ repo = DateDirRepo(self.useFixture(TempDir()).path)
+ self.assertThat(lambda: repo.oldest_date(),
+ raises(ValueError("No OOPSes in repository.")))
+
+ def test_oldest_date_is_oldest(self):
+ repo = DateDirRepo(self.useFixture(TempDir()).path)
+ os.mkdir(repo.root + '/2006-04-12')
+ os.mkdir(repo.root + '/2006-04-13')
+ self.assertEqual(datetime.date(2006, 4, 12), repo.oldest_date())
+
+ def test_prune_unreferenced_no_oopses(self):
+ # This shouldn't crash.
+ repo = DateDirRepo(self.useFixture(TempDir()).path, inherit_id=True)
+ now = datetime.datetime(2006, 04, 01, 00, 30, 00, tzinfo=utc)
+ old = now - datetime.timedelta(weeks=1)
+ repo.prune_unreferenced(old, now, [])
+
+ def test_prune_unreferenced_no_references(self):
+ # When there are no references, everything specified is zerged.
+ repo = DateDirRepo(self.useFixture(TempDir()).path, inherit_id=True)
+ now = datetime.datetime(2006, 04, 01, 00, 30, 00, tzinfo=utc)
+ old = now - datetime.timedelta(weeks=1)
+ report = {'time': now - datetime.timedelta(hours=5)}
+ repo.publish(report, report['time'])
+ repo.prune_unreferenced(old, now, [])
+ self.assertThat(lambda: repo.oldest_date(), raises(ValueError))
+
+ def test_prune_unreferenced_outside_dates_kept(self):
+ # Pruning only affects stuff in the datedirs selected by the dates.
+ repo = DateDirRepo(
+ self.useFixture(TempDir()).path, inherit_id=True, stash_path=True)
+ now = datetime.datetime(2006, 04, 01, 00, 30, 00, tzinfo=utc)
+ old = now - datetime.timedelta(weeks=1)
+ before = {'time': old - datetime.timedelta(minutes=1)}
+ after = {'time': now + datetime.timedelta(minutes=1)}
+ repo.publish(before, before['time'])
+ repo.publish(after, after['time'])
+ repo.prune_unreferenced(old, now, [])
+ self.assertTrue(os.path.isfile(before['datedir_repo_filepath']))
+ self.assertTrue(os.path.isfile(after['datedir_repo_filepath']))
+
+ def test_prune_referenced_inside_dates_kept(self):
+ repo = DateDirRepo(
+ self.useFixture(TempDir()).path, inherit_id=True, stash_path=True)
+ now = datetime.datetime(2006, 04, 01, 00, 30, 00, tzinfo=utc)
+ old = now - datetime.timedelta(weeks=1)
+ report = {'id': 'foo', 'time': now - datetime.timedelta(minutes=1)}
+ repo.publish(report, report['time'])
+ repo.prune_unreferenced(old, now, ['foo'])
+ self.assertTrue(os.path.isfile(report['datedir_repo_filepath']))
+
+ def test_prune_report_midnight_gets_invalid_timed_reports(self):
+ # If a report has a wonky or missing time, pruning treats it as being
+ # timed on midnight of the datedir day it is on.
+ repo = DateDirRepo(self.useFixture(TempDir()).path, stash_path=True)
+ now = datetime.datetime(2006, 04, 01, 00, 01, 00, tzinfo=utc)
+ old = now - datetime.timedelta(minutes=2)
+ badtime = {'time': now - datetime.timedelta(weeks=2)}
+ missingtime = {}
+ repo.publish(badtime, now)
+ repo.publish(missingtime, now)
+ repo.prune_unreferenced(old, now, [])
+ self.assertThat(lambda: repo.oldest_date(), raises(ValueError))
=== modified file 'setup.py'
--- setup.py 2011-11-13 21:07:43 +0000
+++ setup.py 2011-11-15 21:50:28 +0000
@@ -40,6 +40,7 @@
install_requires = [
'bson',
'iso8601',
+ 'launchpadlib', # Needed for pruning - perhaps should be optional.
'oops',
'pytz',
],
@@ -49,4 +50,8 @@
'testtools',
]
),
+ entry_points=dict(
+ console_scripts=[ # `console_scripts` is a magic name to setuptools
+ 'prune = oops_datedir_repo.prune:main',
+ ]),
)
=== modified file 'versions.cfg'
--- versions.cfg 2011-10-07 05:52:09 +0000
+++ versions.cfg 2011-11-15 21:50:28 +0000
@@ -3,13 +3,25 @@
[versions]
bson = 0.3.2
+elementtree = 1.2.6-20050316
fixtures = 0.3.6
+httplib2 = 0.6.0
iso8601 = 0.1.4
+keyring = 0.6.2
+launchpadlib = 1.9.9
+lazr.authentication = 0.1.1
+lazr.restfulclient = 0.12.0
+lazr.uri = 1.0.2
+oauth = 1.0.0
oops = 0.0.8
pytz = 2010o
setuptools = 0.6c11
+simplejson = 2.1.3
testtools = 0.9.11
+wadllib = 1.2.0
+wsgi-intercept = 0.4
zc.recipe.egg = 1.3.2
z3c.recipe.filetemplate = 2.1.0
z3c.recipe.scripts = 1.0.1
zc.buildout = 1.5.1
+zope.interface = 3.8.0