← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~benji/launchpad/bug-622765 into lp:launchpad/devel

 

Benji York has proposed merging lp:~benji/launchpad/bug-622765 into lp:launchpad/devel.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  #622765 apache-log-parser should parse one file at a time
  https://bugs.launchpad.net/bugs/622765


When parsing Apache log files, the cron scripts want to pick up where
they left off.  The lp.services.apachelogparser.base.get_files_to_parse
function used to return a mapping of open files to the position at which
parsing should resume.  The problem was that all the files (which could
number in the thousands) were opened at once and returned en masse.

Since there was only one caller of get_files_to_parse and it only
iterated over the .items() of the result, it was easy to refactor the
code to instead return/consume a generator.  The result is that only one
file is open at any given time (the client consumes each file and closes
it before proceeding to the next).

Gary and I had a pre-implementation call.

The get_files_to_parse function was already well tested and only trivial
transformations were required to make the tests compatible with the new
API.

The tests can be run with the command:

    bin/test -c test_apachelogparser

I verified that the scripts still run by faking up enough of a
production environment that they would at least run to completion
without generating errors.

No lint was reported by "make lint".

-- 
https://code.launchpad.net/~benji/launchpad/bug-622765/+merge/33951
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~benji/launchpad/bug-622765 into lp:launchpad/devel.
=== modified file 'lib/lp/services/apachelogparser/base.py'
--- lib/lp/services/apachelogparser/base.py	2010-08-20 20:31:18 +0000
+++ lib/lp/services/apachelogparser/base.py	2010-08-27 19:04:45 +0000
@@ -24,7 +24,7 @@
 
 
 def get_files_to_parse(root, file_names):
-    """Return a dict mapping files to the position where reading should start.
+    """Return an iterator of file and position where reading should start.
 
     The lines read from that position onwards will be the ones that have not
     been parsed yet.
@@ -32,7 +32,6 @@
     :param root: The directory where the files are stored.
     :param file_names: The names of the files.
     """
-    files_to_parse = {}
     store = getUtility(IStoreSelector).get(MAIN_STORE, DEFAULT_FLAVOR)
     for file_name in file_names:
         file_path = os.path.join(root, file_name)
@@ -52,8 +51,7 @@
                 # parse what's new.
                 position = parsed_file.bytes_read
 
-        files_to_parse[fd] = position
-    return files_to_parse
+        yield fd, position
 
 
 def get_fd_and_file_size(file_path):

=== modified file 'lib/lp/services/apachelogparser/script.py'
--- lib/lp/services/apachelogparser/script.py	2010-08-20 20:31:18 +0000
+++ lib/lp/services/apachelogparser/script.py	2010-08-27 19:04:45 +0000
@@ -65,7 +65,7 @@
 
         self.setUpUtilities()
         country_set = getUtility(ICountrySet)
-        for fd, position in files_to_parse.items():
+        for fd, position in files_to_parse:
             downloads, parsed_bytes = parse_file(
                 fd, position, self.logger, self.getDownloadKey)
             # Use a while loop here because we want to pop items from the dict

=== modified file 'lib/lp/services/apachelogparser/tests/test_apachelogparser.py'
--- lib/lp/services/apachelogparser/tests/test_apachelogparser.py	2010-08-20 20:31:18 +0000
+++ lib/lp/services/apachelogparser/tests/test_apachelogparser.py	2010-08-27 19:04:45 +0000
@@ -7,6 +7,7 @@
 from StringIO import StringIO
 import tempfile
 import textwrap
+from operator import itemgetter
 import unittest
 
 from zope.component import getUtility
@@ -301,7 +302,8 @@
         # start.
         file_name = 'launchpadlibrarian.net.access-log'
         files_to_parse = get_files_to_parse(self.root, [file_name])
-        self.failUnlessEqual(files_to_parse.values(), [0])
+        fd, position = list(files_to_parse)[0]
+        self.assertEqual(position, 0)
 
     def test_completely_parsed_file(self):
         # A file that has been completely parsed will be skipped.
@@ -311,7 +313,8 @@
         fd.seek(0)
         ParsedApacheLog(first_line, len(fd.read()))
 
-        self.failUnlessEqual(get_files_to_parse(self.root, [file_name]), {})
+        files_to_parse = get_files_to_parse(self.root, [file_name])
+        self.failUnlessEqual(list(files_to_parse), [])
 
     def test_parsed_file_with_new_content(self):
         # A file that has been parsed already but in which new content was
@@ -321,8 +324,12 @@
         first_line = open(os.path.join(self.root, file_name)).readline()
         ParsedApacheLog(first_line, len(first_line))
 
-        files_to_parse = get_files_to_parse(self.root, [file_name])
-        self.failUnlessEqual(files_to_parse.values(), [len(first_line)])
+        files_to_parse = list(get_files_to_parse(self.root, [file_name]))
+        self.assertEqual(len(files_to_parse), 1)
+        fd, position = files_to_parse[0]
+        # Since we parsed the first line above, we'll be told to start where
+        # the first line ends.
+        self.assertEqual(position, len(first_line))
 
     def test_different_files_with_same_name(self):
         # Thanks to log rotation, two runs of our script may see files with
@@ -341,22 +348,27 @@
         fd.write(content2)
         fd.close()
         files_to_parse = get_files_to_parse(self.root, [file_name])
-        self.failUnlessEqual(files_to_parse.values(), [0])
+        positions = map(itemgetter(1), files_to_parse)
+        self.failUnlessEqual(positions, [0])
 
-    def test_gzipped_file(self):
+    def test_fresh_gzipped_file(self):
         # get_files_to_parse() handles gzipped files just like uncompressed
-        # ones.
-        # The first time we see one, we'll parse from the beginning.
+        # ones.  The first time we see one, we'll parse from the beginning.
         file_name = 'launchpadlibrarian.net.access-log.1.gz'
         first_line = gzip.open(os.path.join(self.root, file_name)).readline()
         files_to_parse = get_files_to_parse(self.root, [file_name])
-        self.failUnlessEqual(files_to_parse.values(), [0])
+        positions = map(itemgetter(1), files_to_parse)
+        self.assertEqual(positions, [0])
 
-        # And in subsequent runs of the script we will resume from where we
+    def test_resumed_gzipped_file(self):
+        # In subsequent runs of the script we will resume from where we
         # stopped last time. (Here we pretend we parsed only the first line)
+        file_name = 'launchpadlibrarian.net.access-log.1.gz'
+        first_line = gzip.open(os.path.join(self.root, file_name)).readline()
         ParsedApacheLog(first_line, len(first_line))
         files_to_parse = get_files_to_parse(self.root, [file_name])
-        self.failUnlessEqual(files_to_parse.values(), [len(first_line)])
+        positions = map(itemgetter(1), files_to_parse)
+        self.failUnlessEqual(positions, [len(first_line)])
 
 
 class Test_create_or_update_parsedlog_entry(TestCase):