← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/bug-694004-log-parser-glob into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/bug-694004-log-parser-glob into lp:launchpad.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  #694004 PPA Apache log parser needs to exclude error logs
  https://bugs.launchpad.net/bugs/694004

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/bug-694004-log-parser-glob/+merge/45201

The PPA Apache log parser currently generates thousands of OOPSes each day, as it erroneously considers the Apache error logs that live alongside the access logs that it intends to parse. The librarian log parser is not affected, since it is run over a synced directory that omits the error logs.

This branch allows the parser infrastructure to glob for filenames, and convinces the PPA parser to filter it to the usual access log pattern. The glob is configurable, but I believe that the default should work fine for production.

To test this I added a non-matching log file, and wrapped the script's execution in a OOPS check, confirming that the file is not considered.
-- 
https://code.launchpad.net/~wgrant/launchpad/bug-694004-log-parser-glob/+merge/45201
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/bug-694004-log-parser-glob into lp:launchpad.
=== modified file 'cronscripts/parse-ppa-apache-access-logs.py'
--- cronscripts/parse-ppa-apache-access-logs.py	2010-11-08 12:52:43 +0000
+++ cronscripts/parse-ppa-apache-access-logs.py	2011-01-05 05:24:55 +0000
@@ -32,6 +32,10 @@
         """See `ParseApacheLogs`."""
         return config.ppa_apache_log_parser.logs_root
 
+    @property
+    def log_file_glob(self):
+        return config.ppa_apache_log_parser.log_file_glob
+
     def getDownloadKey(self, path):
         """See `ParseApacheLogs`."""
         return get_ppa_file_key(path)

=== modified file 'lib/canonical/config/schema-lazr.conf'
--- lib/canonical/config/schema-lazr.conf	2010-12-23 17:14:16 +0000
+++ lib/canonical/config/schema-lazr.conf	2011-01-05 05:24:55 +0000
@@ -1588,6 +1588,7 @@
 
 [ppa_apache_log_parser]
 logs_root: /srv/ppa.launchpad.net-logs
+log_file_glob: *-access.log*
 
 
 [poimport]

=== modified file 'lib/lp/services/apachelogparser/base.py'
--- lib/lp/services/apachelogparser/base.py	2010-11-24 11:00:33 +0000
+++ lib/lp/services/apachelogparser/base.py	2011-01-05 05:24:55 +0000
@@ -23,18 +23,16 @@
 parser = apachelog.parser(apachelog.formats['extended'])
 
 
-def get_files_to_parse(root, file_names):
+def get_files_to_parse(file_paths):
     """Return an iterator of file and position where reading should start.
 
     The lines read from that position onwards will be the ones that have not
     been parsed yet.
 
-    :param root: The directory where the files are stored.
-    :param file_names: The names of the files.
+    :param file_paths: The paths to the files.
     """
     store = getUtility(IStoreSelector).get(MAIN_STORE, DEFAULT_FLAVOR)
-    for file_name in file_names:
-        file_path = os.path.join(root, file_name)
+    for file_path in file_paths:
         fd, file_size = get_fd_and_file_size(file_path)
         first_line = unicode(fd.readline())
         parsed_file = store.find(ParsedApacheLog, first_line=first_line).one()

=== modified file 'lib/lp/services/apachelogparser/script.py'
--- lib/lp/services/apachelogparser/script.py	2010-12-16 14:29:43 +0000
+++ lib/lp/services/apachelogparser/script.py	2011-01-05 05:24:55 +0000
@@ -4,6 +4,7 @@
 __metaclass__ = type
 __all__ = ['ParseApacheLogs']
 
+import glob
 import os
 
 from zope.component import getUtility
@@ -30,6 +31,9 @@
     and optionally setUpUtilities.
     """
 
+    # Glob to restrict filenames that are parsed.
+    log_file_glob = '*'
+
     def setUpUtilities(self):
         """Prepare any utilities that might be used many times."""
         pass
@@ -63,7 +67,8 @@
         raise NotImplementedError
 
     def main(self):
-        files_to_parse = get_files_to_parse(self.root, os.listdir(self.root))
+        files_to_parse = get_files_to_parse(
+            glob.glob(os.path.join(self.root, self.log_file_glob)))
 
         self.setUpUtilities()
         country_set = getUtility(ICountrySet)

=== modified file 'lib/lp/services/apachelogparser/tests/test_apachelogparser.py'
--- lib/lp/services/apachelogparser/tests/test_apachelogparser.py	2010-12-23 01:02:00 +0000
+++ lib/lp/services/apachelogparser/tests/test_apachelogparser.py	2011-01-05 05:24:55 +0000
@@ -359,6 +359,7 @@
     layer = LaunchpadZopelessLayer
     # The directory in which the sample log files live.
     root = os.path.join(here, 'apache-log-files')
+    file_path = os.path.join(root, 'launchpadlibrarian.net.access-log')
 
     def setUp(self):
         super(TestParsedFilesDetection, self).setUp()
@@ -367,31 +368,28 @@
     def test_not_parsed_file(self):
         # A file that has never been parsed will have to be parsed from the
         # start.
-        file_name = 'launchpadlibrarian.net.access-log'
-        files_to_parse = get_files_to_parse(self.root, [file_name])
+        files_to_parse = get_files_to_parse([self.file_path])
         fd, position = list(files_to_parse)[0]
         self.assertEqual(position, 0)
 
     def test_completely_parsed_file(self):
         # A file that has been completely parsed will be skipped.
-        file_name = 'launchpadlibrarian.net.access-log'
-        fd = open(os.path.join(self.root, file_name))
+        fd = open(self.file_path)
         first_line = fd.readline()
         fd.seek(0)
         ParsedApacheLog(first_line, len(fd.read()))
 
-        files_to_parse = get_files_to_parse(self.root, [file_name])
+        files_to_parse = get_files_to_parse([self.file_path])
         self.failUnlessEqual(list(files_to_parse), [])
 
     def test_parsed_file_with_new_content(self):
         # A file that has been parsed already but in which new content was
         # added will be parsed again, starting from where parsing stopped last
         # time.
-        file_name = 'launchpadlibrarian.net.access-log'
-        first_line = open(os.path.join(self.root, file_name)).readline()
+        first_line = open(self.file_path).readline()
         ParsedApacheLog(first_line, len(first_line))
 
-        files_to_parse = list(get_files_to_parse(self.root, [file_name]))
+        files_to_parse = list(get_files_to_parse([self.file_path]))
         self.assertEqual(len(files_to_parse), 1)
         fd, position = files_to_parse[0]
         # Since we parsed the first line above, we'll be told to start where
@@ -409,31 +407,33 @@
         # This file has the same name of the previous one (which has been
         # parsed already), but its first line is different, so we'll have to
         # parse it from the start.
-        fd, file_name = tempfile.mkstemp()
+        fd, new_path = tempfile.mkstemp()
         content2 = 'Different First Line\nSecond Line'
-        fd = open(file_name, 'w')
+        fd = open(new_path, 'w')
         fd.write(content2)
         fd.close()
-        files_to_parse = get_files_to_parse(self.root, [file_name])
+        files_to_parse = get_files_to_parse([new_path])
         positions = map(itemgetter(1), files_to_parse)
         self.failUnlessEqual(positions, [0])
 
     def test_fresh_gzipped_file(self):
         # get_files_to_parse() handles gzipped files just like uncompressed
         # ones.  The first time we see one, we'll parse from the beginning.
-        file_name = 'launchpadlibrarian.net.access-log.1.gz'
-        first_line = gzip.open(os.path.join(self.root, file_name)).readline()
-        files_to_parse = get_files_to_parse(self.root, [file_name])
+        gz_name = 'launchpadlibrarian.net.access-log.1.gz'
+        gz_path = os.path.join(self.root, gz_name)
+        first_line = gzip.open(gz_path).readline()
+        files_to_parse = get_files_to_parse([gz_path])
         positions = map(itemgetter(1), files_to_parse)
         self.assertEqual(positions, [0])
 
     def test_resumed_gzipped_file(self):
         # In subsequent runs of the script we will resume from where we
         # stopped last time. (Here we pretend we parsed only the first line)
-        file_name = 'launchpadlibrarian.net.access-log.1.gz'
-        first_line = gzip.open(os.path.join(self.root, file_name)).readline()
+        gz_name = 'launchpadlibrarian.net.access-log.1.gz'
+        gz_path = os.path.join(self.root, gz_name)
+        first_line = gzip.open(gz_path).readline()
         ParsedApacheLog(first_line, len(first_line))
-        files_to_parse = get_files_to_parse(self.root, [file_name])
+        files_to_parse = get_files_to_parse([gz_path])
         positions = map(itemgetter(1), files_to_parse)
         self.failUnlessEqual(positions, [len(first_line)])
 

=== modified file 'lib/lp/soyuz/scripts/ppa_apache_log_parser.py'
--- lib/lp/soyuz/scripts/ppa_apache_log_parser.py	2010-03-16 08:36:41 +0000
+++ lib/lp/soyuz/scripts/ppa_apache_log_parser.py	2011-01-05 05:24:55 +0000
@@ -3,8 +3,6 @@
 
 __all__ = ['DBUSER', 'get_ppa_file_key']
 
-import re
-
 from lp.archiveuploader.utils import re_isadeb
 
 

=== renamed file 'lib/lp/soyuz/scripts/tests/ppa-apache-log-files/ppa.launchpad.net.access-log' => 'lib/lp/soyuz/scripts/tests/ppa-apache-log-files/ppa.launchpad.net-access.log'
=== added file 'lib/lp/soyuz/scripts/tests/ppa-apache-log-files/ppa.launchpad.net-error.log'
--- lib/lp/soyuz/scripts/tests/ppa-apache-log-files/ppa.launchpad.net-error.log	1970-01-01 00:00:00 +0000
+++ lib/lp/soyuz/scripts/tests/ppa-apache-log-files/ppa.launchpad.net-error.log	2011-01-05 05:24:55 +0000
@@ -0,0 +1,1 @@
+[Mon Sep 20 11:14:20 2010] [error] [client 127.0.0.1] File does not exist: /srv/launchpad.net/foo

=== modified file 'lib/lp/soyuz/scripts/tests/test_ppa_apache_log_parser.py'
--- lib/lp/soyuz/scripts/tests/test_ppa_apache_log_parser.py	2010-08-20 20:31:18 +0000
+++ lib/lp/soyuz/scripts/tests/test_ppa_apache_log_parser.py	2011-01-05 05:24:55 +0000
@@ -6,6 +6,7 @@
 
 from zope.component import getUtility
 
+from canonical.launchpad.webapp.errorlog import globalErrorUtility
 from canonical.launchpad.webapp.interfaces import (
     DEFAULT_FLAVOR,
     IStoreSelector,
@@ -85,9 +86,13 @@
         # After the script's run, we will check that the results in the
         # database match the sample log files we use for this test:
         # lib/lp/soyuz/scripts/tests/ppa-apache-log-files
+        # In addition to the wanted access log file, there is also an
+        # error log that will be skipped by the configured glob.
         self.assertEqual(
             0, self.store.find(BinaryPackageReleaseDownloadCount).count())
 
+        last_oops = globalErrorUtility.getLastOopsReport()
+
         process = subprocess.Popen(
             'cronscripts/parse-ppa-apache-access-logs.py', shell=True,
             stdin=subprocess.PIPE, stdout=subprocess.PIPE,
@@ -96,6 +101,10 @@
         self.assertEqual(
             process.returncode, 0, "stdout:%s, stderr:%s" % (out, err))
 
+        # The error log does not match the glob, so it is not processed,
+        # and no OOPS is generated.
+        self.assertNoNewOops(last_oops)
+
         # Must commit because the changes were done in another transaction.
         import transaction
         transaction.commit()