launchpad-reviewers team mailing list archive

Thread
Date

[Merge] ~cjwatson/launchpad:py3-apachelogparser into launchpad:master

To: mp+393759@xxxxxxxxxxxxxxxxxx
From: Colin Watson <cjwatson@xxxxxxxxxxxxx>
Date: Fri, 13 Nov 2020 22:36:23 -0000
Reply-to: mp+393759@xxxxxxxxxxxxxxxxxx
Sender: bounces@xxxxxxxxxxxxx

Colin Watson has proposed merging ~cjwatson/launchpad:py3-apachelogparser into launchpad:master.

Commit message:
Fix apachelogparser for Python 3

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/393759
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:py3-apachelogparser into launchpad:master.

diff --git a/lib/lp/services/apachelogparser/base.py b/lib/lp/services/apachelogparser/base.py
index e5fbaa7..6b3a15a 100644
--- a/lib/lp/services/apachelogparser/base.py
+++ b/lib/lp/services/apachelogparser/base.py
@@ -4,6 +4,7 @@
 from datetime import datetime
 import gzip
 import os
+import struct
 
 from contrib import apachelog
 from lazr.uri import (
@@ -35,7 +36,7 @@ def get_files_to_parse(file_paths):
     file_paths = sorted(file_paths, key=lambda path: os.stat(path).st_mtime)
     for file_path in file_paths:
         fd, file_size = get_fd_and_file_size(file_path)
-        first_line = six.ensure_text(fd.readline())
+        first_line = six.ensure_text(fd.readline(), errors='replace')
         parsed_file = store.find(ParsedApacheLog, first_line=first_line).one()
         position = 0
         if parsed_file is not None:
@@ -56,8 +57,8 @@ def get_files_to_parse(file_paths):
 def get_fd_and_file_size(file_path):
     """Return a file descriptor and the file size for the given file path.
 
-    The file descriptor will have the default mode ('r') and will be seeked to
-    the beginning.
+    The file descriptor will be read-only, opened in binary mode, and with
+    its position set to the beginning of the file.
 
     The file size returned is that of the uncompressed file, in case the given
     file_path points to a gzipped file.
@@ -68,11 +69,10 @@ def get_fd_and_file_size(file_path):
         # module in Python 2.6.
         fd = gzip.open(file_path)
         fd.fileobj.seek(-4, os.SEEK_END)
-        isize = gzip.read32(fd.fileobj)   # may exceed 2GB
-        file_size = isize & 0xffffffffL
+        file_size = struct.unpack('<I', fd.fileobj.read(4))[0]
         fd.fileobj.seek(0)
     else:
-        fd = open(file_path)
+        fd = open(file_path, 'rb')
         file_size = os.path.getsize(file_path)
     return fd, file_size
 
@@ -106,6 +106,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
             break
 
         line = next_line
+        line_text = six.ensure_text(line, errors='replace')
 
         # Always skip the last line as it may be truncated since we're
         # rsyncing live logs, unless there is only one line for us to
@@ -122,7 +123,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
             parsed_lines += 1
             parsed_bytes += len(line)
             host, date, status, request = get_host_date_status_and_request(
-                line)
+                line_text)
 
             if status != '200':
                 continue
@@ -158,7 +159,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
             # successfully, log this as an error and break the loop so that
             # we return.
             parsed_bytes -= len(line)
-            logger.error('Error (%s) while parsing "%s"' % (e, line))
+            logger.error('Error (%s) while parsing "%s"' % (e, line_text))
             break
 
     if parsed_lines > 0:
@@ -170,7 +171,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
 
 def create_or_update_parsedlog_entry(first_line, parsed_bytes):
     """Create or update the ParsedApacheLog with the given first_line."""
-    first_line = six.ensure_text(first_line)
+    first_line = six.ensure_text(first_line, errors='replace')
     parsed_file = IStore(ParsedApacheLog).find(
         ParsedApacheLog, first_line=first_line).one()
     if parsed_file is None:
diff --git a/lib/lp/services/apachelogparser/tests/test_apachelogparser.py b/lib/lp/services/apachelogparser/tests/test_apachelogparser.py
index ee0ef78..513ce5d 100644
--- a/lib/lp/services/apachelogparser/tests/test_apachelogparser.py
+++ b/lib/lp/services/apachelogparser/tests/test_apachelogparser.py
@@ -3,9 +3,8 @@
 
 from datetime import datetime
 import gzip
-from operator import itemgetter
+import io
 import os
-from StringIO import StringIO
 import tempfile
 import time
 
@@ -199,8 +198,10 @@ class TestLogFileParsing(TestCase):
         # also been downloaded once (last line of the sample log), but
         # parse_file() always skips the last line as it may be truncated, so
         # it doesn't show up in the dict returned.
-        fd = open(os.path.join(
-            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
+        fd = open(
+            os.path.join(
+                here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
+            'rb')
         downloads, parsed_bytes, parsed_lines = parse_file(
             fd, start_position=0, logger=self.logger,
             get_download_key=get_path_download_key)
@@ -223,8 +224,10 @@ class TestLogFileParsing(TestCase):
         # When there's only the last line of a given file for us to parse, we
         # assume the file has been rotated and it's safe to parse its last
         # line without worrying about whether or not it's been truncated.
-        fd = open(os.path.join(
-            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
+        fd = open(
+            os.path.join(
+                here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
+            'rb')
         downloads, parsed_bytes, parsed_lines = parse_file(
             fd, start_position=self._getLastLineStart(fd), logger=self.logger,
             get_download_key=get_path_download_key)
@@ -242,7 +245,7 @@ class TestLogFileParsing(TestCase):
         # When there's an unexpected error, we log it and return as if we had
         # parsed up to the line before the one where the failure occurred.
         # Here we force an unexpected error on the first line.
-        fd = StringIO('Not a log')
+        fd = io.BytesIO(b'Not a log')
         downloads, parsed_bytes, parsed_lines = parse_file(
             fd, start_position=0, logger=self.logger,
             get_download_key=get_path_download_key)
@@ -252,8 +255,9 @@ class TestLogFileParsing(TestCase):
 
     def _assertResponseWithGivenStatusIsIgnored(self, status):
         """Assert that responses with the given status are ignored."""
-        fd = StringIO(
-            self.sample_line % dict(status=status, method='GET'))
+        fd = io.BytesIO(
+            (self.sample_line % dict(status=status, method='GET')).encode(
+                'UTF-8'))
         downloads, parsed_bytes, parsed_lines = parse_file(
             fd, start_position=0, logger=self.logger,
             get_download_key=get_path_download_key)
@@ -277,8 +281,9 @@ class TestLogFileParsing(TestCase):
 
     def _assertRequestWithGivenMethodIsIgnored(self, method):
         """Assert that requests with the given method are ignored."""
-        fd = StringIO(
-            self.sample_line % dict(status='200', method=method))
+        fd = io.BytesIO(
+            (self.sample_line % dict(status='200', method=method)).encode(
+                'UTF-8'))
         downloads, parsed_bytes, parsed_lines = parse_file(
             fd, start_position=0, logger=self.logger,
             get_download_key=get_path_download_key)
@@ -295,8 +300,9 @@ class TestLogFileParsing(TestCase):
         self._assertRequestWithGivenMethodIsIgnored('POST')
 
     def test_normal_request_is_not_ignored(self):
-        fd = StringIO(
-            self.sample_line % dict(status=200, method='GET'))
+        fd = io.BytesIO(
+            (self.sample_line % dict(status=200, method='GET')).encode(
+                'UTF-8'))
         downloads, parsed_bytes, parsed_lines = parse_file(
             fd, start_position=0, logger=self.logger,
             get_download_key=get_path_download_key)
@@ -317,8 +323,10 @@ class TestLogFileParsing(TestCase):
             'log_parser config',
             '[launchpad]\nlogparser_max_parsed_lines: 2')
         self.addCleanup(config.pop, 'log_parser config')
-        fd = open(os.path.join(
-            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
+        fd = open(
+            os.path.join(
+                here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
+            'rb')
         self.addCleanup(fd.close)
 
         downloads, parsed_bytes, parsed_lines = parse_file(
@@ -359,8 +367,10 @@ class TestLogFileParsing(TestCase):
             'log_parser config',
             '[launchpad]\nlogparser_max_parsed_lines: 2')
         self.addCleanup(config.pop, 'log_parser config')
-        fd = open(os.path.join(
-            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
+        fd = open(
+            os.path.join(
+                here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
+            'rb')
         self.addCleanup(fd.close)
 
         # We want to start parsing on line 2 so we will have a value in
@@ -413,24 +423,27 @@ class TestParsedFilesDetection(TestCase):
         for fd, _ in get_files_to_parse(file_paths):
             fd.seek(0)
             contents.append(fd.read())
-        self.assertEqual(['2\n', '1\n', '0\n'], contents)
+            fd.close()
+        self.assertEqual([b'2\n', b'1\n', b'0\n'], contents)
 
     def test_not_parsed_file(self):
         # A file that has never been parsed will have to be parsed from the
         # start.
-        files_to_parse = get_files_to_parse([self.file_path])
-        fd, position = list(files_to_parse)[0]
-        self.assertEqual(position, 0)
+        files_to_parse = list(get_files_to_parse([self.file_path]))
+        self.assertEqual(1, len(files_to_parse))
+        fd, position = files_to_parse[0]
+        fd.close()
+        self.assertEqual(0, position)
 
     def test_completely_parsed_file(self):
         # A file that has been completely parsed will be skipped.
-        fd = open(self.file_path)
-        first_line = fd.readline()
-        fd.seek(0)
-        ParsedApacheLog(first_line, len(fd.read()))
+        with open(self.file_path) as fd:
+            first_line = fd.readline()
+            fd.seek(0)
+            ParsedApacheLog(first_line, len(fd.read()))
 
         files_to_parse = get_files_to_parse([self.file_path])
-        self.assertEqual(list(files_to_parse), [])
+        self.assertEqual([], list(files_to_parse))
 
     def test_parsed_file_with_new_content(self):
         # A file that has been parsed already but in which new content was
@@ -440,11 +453,12 @@ class TestParsedFilesDetection(TestCase):
         ParsedApacheLog(first_line, len(first_line))
 
         files_to_parse = list(get_files_to_parse([self.file_path]))
-        self.assertEqual(len(files_to_parse), 1)
+        self.assertEqual(1, len(files_to_parse))
         fd, position = files_to_parse[0]
+        fd.close()
         # Since we parsed the first line above, we'll be told to start where
         # the first line ends.
-        self.assertEqual(position, len(first_line))
+        self.assertEqual(len(first_line), position)
 
     def test_different_files_with_same_name(self):
         # Thanks to log rotation, two runs of our script may see files with
@@ -459,12 +473,14 @@ class TestParsedFilesDetection(TestCase):
         # parse it from the start.
         fd, new_path = tempfile.mkstemp()
         content2 = 'Different First Line\nSecond Line'
-        fd = open(new_path, 'w')
-        fd.write(content2)
-        fd.close()
+        with open(new_path, 'w') as fd:
+            fd.write(content2)
         files_to_parse = get_files_to_parse([new_path])
-        positions = map(itemgetter(1), files_to_parse)
-        self.assertEqual(positions, [0])
+        positions = []
+        for fd, position in files_to_parse:
+            fd.close()
+            positions.append(position)
+        self.assertEqual([0], positions)
 
     def test_fresh_gzipped_file(self):
         # get_files_to_parse() handles gzipped files just like uncompressed
@@ -472,8 +488,11 @@ class TestParsedFilesDetection(TestCase):
         gz_name = 'launchpadlibrarian.net.access-log.1.gz'
         gz_path = os.path.join(self.root, gz_name)
         files_to_parse = get_files_to_parse([gz_path])
-        positions = map(itemgetter(1), files_to_parse)
-        self.assertEqual(positions, [0])
+        positions = []
+        for fd, position in files_to_parse:
+            fd.close()
+            positions.append(position)
+        self.assertEqual([0], positions)
 
     def test_resumed_gzipped_file(self):
         # In subsequent runs of the script we will resume from where we
@@ -483,8 +502,11 @@ class TestParsedFilesDetection(TestCase):
         first_line = gzip.open(gz_path).readline()
         ParsedApacheLog(first_line, len(first_line))
         files_to_parse = get_files_to_parse([gz_path])
-        positions = map(itemgetter(1), files_to_parse)
-        self.assertEqual(positions, [len(first_line)])
+        positions = []
+        for fd, position in files_to_parse:
+            fd.close()
+            positions.append(position)
+        self.assertEqual([len(first_line)], positions)
 
 
 class Test_create_or_update_parsedlog_entry(TestCase):
diff --git a/lib/lp/services/librarianserver/tests/test_apachelogparser.py b/lib/lp/services/librarianserver/tests/test_apachelogparser.py
index ffd30c7..1be4c14 100644
--- a/lib/lp/services/librarianserver/tests/test_apachelogparser.py
+++ b/lib/lp/services/librarianserver/tests/test_apachelogparser.py
@@ -2,8 +2,8 @@
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 from datetime import datetime
+import io
 import os
-from StringIO import StringIO
 import subprocess
 
 from zope.component import getUtility
@@ -95,10 +95,10 @@ class TestLibrarianLogFileParsing(TestCase):
         self.logger = BufferLogger()
 
     def test_request_to_lfa_is_parsed(self):
-        fd = StringIO(
-            '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
-            '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
-            '"https://launchpad.net/~ubuntulite/+archive"; "Mozilla"')
+        fd = io.BytesIO(
+            b'69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
+            b'/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
+            b'"https://launchpad.net/~ubuntulite/+archive"; "Mozilla"')
         downloads, parsed_bytes, ignored = parse_file(
             fd, start_position=0, logger=self.logger,
             get_download_key=get_library_file_id)
@@ -114,9 +114,9 @@ class TestLibrarianLogFileParsing(TestCase):
     def test_request_to_non_lfa_is_ignored(self):
         # A request to a path which doesn't map to a LibraryFileAlias (e.g.
         # '/') is ignored.
-        fd = StringIO(
-            '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
-            '200 2261 "https://launchpad.net/~ubuntulite/+archive"; "Mozilla"')
+        fd = io.BytesIO(
+            b'69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
+            b'200 2261 "https://launchpad.net/~ubuntulite/+archive"; "Mozilla"')
         downloads, parsed_bytes, ignored = parse_file(
             fd, start_position=0, logger=self.logger,
             get_download_key=get_library_file_id)