launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #00813
[Merge] lp:~benji/launchpad/bug-622765 into lp:launchpad/devel
Benji York has proposed merging lp:~benji/launchpad/bug-622765 into lp:launchpad/devel.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
Related bugs:
#622765 apache-log-parser should parse one file at a time
https://bugs.launchpad.net/bugs/622765
The config option "logparser_max_parsed_lines" was respected only for individual files, not across multiple files (e.g., if a maximum of 1000 lines were to be parsed and there were 10 files to parse, then 10,000 lines would be parsed instead of just 1000).
This branch fixes the problem by keeping track of the number of lines parsed across calls to parse_file: parse_file now accepts a parsed_lines argument (defaulting to 0) and returns the running total as a third return value, so the caller can carry the count from one file to the next and stop once the maximum is reached.
The related tests can be run with
bin/test -c test_apachelogparser
No lint was reported by make lint.
--
https://code.launchpad.net/~benji/launchpad/bug-622765/+merge/34077
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~benji/launchpad/bug-622765 into lp:launchpad/devel.
=== modified file 'lib/canonical/launchpad/scripts/tests/test_librarian_apache_log_parser.py'
--- lib/canonical/launchpad/scripts/tests/test_librarian_apache_log_parser.py 2010-08-20 20:31:18 +0000
+++ lib/canonical/launchpad/scripts/tests/test_librarian_apache_log_parser.py 2010-08-30 15:03:44 +0000
@@ -102,7 +102,7 @@
'69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
'/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
'"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
- downloads, parsed_bytes = parse_file(
+ downloads, parsed_bytes, ignored = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_library_file_id)
self.assertEqual(
@@ -121,7 +121,7 @@
fd = StringIO(
'69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
'200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
- downloads, parsed_bytes = parse_file(
+ downloads, parsed_bytes, ignored = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_library_file_id)
self.assertEqual(
=== modified file 'lib/lp/services/apachelogparser/base.py'
--- lib/lp/services/apachelogparser/base.py 2010-08-27 18:41:01 +0000
+++ lib/lp/services/apachelogparser/base.py 2010-08-30 15:03:44 +0000
@@ -77,7 +77,7 @@
return fd, file_size
-def parse_file(fd, start_position, logger, get_download_key):
+def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
"""Parse the given file starting on the given position.
Return a dictionary mapping file_ids (from the librarian) to days to
@@ -91,7 +91,6 @@
geoip = getUtility(IGeoIP)
downloads = {}
- parsed_lines = 0
# Check for an optional max_parsed_lines config option.
max_parsed_lines = getattr(
@@ -167,7 +166,7 @@
logger.info('Parsed %d lines resulting in %d download stats.' % (
parsed_lines, len(downloads)))
- return downloads, parsed_bytes
+ return downloads, parsed_bytes, parsed_lines
def create_or_update_parsedlog_entry(first_line, parsed_bytes):
=== modified file 'lib/lp/services/apachelogparser/script.py'
--- lib/lp/services/apachelogparser/script.py 2010-08-27 18:41:01 +0000
+++ lib/lp/services/apachelogparser/script.py 2010-08-30 15:03:44 +0000
@@ -8,6 +8,7 @@
from zope.component import getUtility
+from canonical.config import config
from lp.app.errors import NotFoundError
from lp.services.apachelogparser.base import (
create_or_update_parsedlog_entry,
@@ -65,8 +66,15 @@
self.setUpUtilities()
country_set = getUtility(ICountrySet)
+ parsed_lines = 0
+ max_parsed_lines = getattr(
+ config.launchpad, 'logparser_max_parsed_lines', None)
for fd, position in files_to_parse:
- downloads, parsed_bytes = parse_file(
+ # If we've used up our budget of lines to process, stop.
+ if (max_parsed_lines is not None
+ and parsed_lines >= max_parsed_lines):
+ break
+ downloads, parsed_bytes, parsed_lines = parse_file(
fd, position, self.logger, self.getDownloadKey)
# Use a while loop here because we want to pop items from the dict
# in order to free some memory as we go along. This is a good
=== modified file 'lib/lp/services/apachelogparser/tests/test_apachelogparser.py'
--- lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2010-08-27 18:41:01 +0000
+++ lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2010-08-30 15:03:44 +0000
@@ -6,7 +6,6 @@
import os
from StringIO import StringIO
import tempfile
-import textwrap
from operator import itemgetter
import unittest
@@ -125,17 +124,18 @@
def test_parsing(self):
# The parse_file() function returns a tuple containing a dict (mapping
- # days and library file IDs to number of downloads) and the total
- # number of bytes that have been parsed from this file. In our sample
+ # days and library file IDs to number of downloads), the total number
+ # of bytes that have been parsed from this file, and the running total
+ # of lines parsed (for testing against the maximum). In our sample
# log, the file with ID 8196569 has been downloaded twice (once from
- # Argentina and once from Japan) and the files with ID 12060796
- # and 9096290 have been downloaded once. The file with ID 15018215
- # has also been downloaded once (last line of the sample log), but
+ # Argentina and once from Japan) and the files with ID 12060796 and
+ # 9096290 have been downloaded once. The file with ID 15018215 has
+ # also been downloaded once (last line of the sample log), but
# parse_file() always skips the last line as it may be truncated, so
# it doesn't show up in the dict returned.
fd = open(os.path.join(
here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
- downloads, parsed_bytes = parse_file(
+ downloads, parsed_bytes, ignored = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_path_download_key)
self.assertEqual(
@@ -159,7 +159,7 @@
# line without worrying about whether or not it's been truncated.
fd = open(os.path.join(
here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
- downloads, parsed_bytes = parse_file(
+ downloads, parsed_bytes, ignored = parse_file(
fd, start_position=self._getLastLineStart(fd), logger=self.logger,
get_download_key=get_path_download_key)
self.assertEqual(
@@ -177,7 +177,7 @@
# parsed up to the line before the one where the failure occurred.
# Here we force an unexpected error on the first line.
fd = StringIO('Not a log')
- downloads, parsed_bytes = parse_file(
+ downloads, parsed_bytes, ignored = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_path_download_key)
self.assertIn('Error', self.logger.buffer.getvalue())
@@ -188,7 +188,7 @@
"""Assert that responses with the given status are ignored."""
fd = StringIO(
self.sample_line % dict(status=status, method='GET'))
- downloads, parsed_bytes = parse_file(
+ downloads, parsed_bytes, ignored = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_path_download_key)
self.assertEqual(
@@ -213,7 +213,7 @@
"""Assert that requests with the given method are ignored."""
fd = StringIO(
self.sample_line % dict(status='200', method=method))
- downloads, parsed_bytes = parse_file(
+ downloads, parsed_bytes, ignored = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_path_download_key)
self.assertEqual(
@@ -231,7 +231,7 @@
def test_normal_request_is_not_ignored(self):
fd = StringIO(
self.sample_line % dict(status=200, method='GET'))
- downloads, parsed_bytes = parse_file(
+ downloads, parsed_bytes, ignored = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_path_download_key)
self.assertEqual(
@@ -249,22 +249,20 @@
# lines.
config.push(
'log_parser config',
- textwrap.dedent('''\
- [launchpad]
- logparser_max_parsed_lines: 2
- '''))
+ '[launchpad]\nlogparser_max_parsed_lines: 2')
self.addCleanup(config.pop, 'log_parser config')
fd = open(os.path.join(
here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
self.addCleanup(fd.close)
- downloads, parsed_bytes = parse_file(
+ downloads, parsed_bytes, parsed_lines = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_path_download_key)
# We have initially parsed only the first two lines of data,
# corresponding to one download (the first line is a 404 and
# so ignored).
+ self.assertEqual(parsed_lines, 2)
date = datetime(2008, 6, 13)
self.assertContentEqual(
downloads.items(),
@@ -276,7 +274,7 @@
# And the subsequent parse will be for the 3rd and 4th lines,
# corresponding to two downloads of the same file.
- downloads, parsed_bytes = parse_file(
+ downloads, parsed_bytes, ignored = parse_file(
fd, start_position=parsed_bytes, logger=self.logger,
get_download_key=get_path_download_key)
self.assertContentEqual(
@@ -285,6 +283,33 @@
('/8196569/mediumubuntulogo.png', {date: {'AR': 1}})])
self.assertEqual(parsed_bytes, sum(line_lengths[:4]))
+ def test_max_parsed_lines_exceeded(self):
+ # The max_parsed_lines config option limits the number of parsed
+ # lines.
+ config.push(
+ 'log_parser config',
+ '[launchpad]\nlogparser_max_parsed_lines: 2')
+ self.addCleanup(config.pop, 'log_parser config')
+ fd = open(os.path.join(
+ here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
+ self.addCleanup(fd.close)
+
+ # If we have already parsed some lines, then the number of lines
+ # parsed will be passed in (parsed_lines argument) and parse_file will
+ # take that number into account when determining if the maximum number
+ # of lines to parse has been reached.
+ parsed_lines = 1
+ downloads, parsed_bytes, parsed_lines = parse_file(
+ fd, start_position=0, logger=self.logger,
+ get_download_key=get_path_download_key, parsed_lines=parsed_lines)
+
+ # Since we told parse_file that we had already parsed 1 line and the
+ # limit is 2 lines, it only parsed the first line of the file which is
+ # a 404, so there were no downloads recorded.
+ self.assertEqual(parsed_lines, 2)
+ self.assertContentEqual(downloads.items(), [])
+ fd.seek(0)
+
class TestParsedFilesDetection(TestCase):
"""Test the detection of already parsed logs."""