launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #25652
[Merge] ~cjwatson/launchpad:py3-apachelogparser into launchpad:master
Colin Watson has proposed merging ~cjwatson/launchpad:py3-apachelogparser into launchpad:master.
Commit message:
Fix apachelogparser for Python 3
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/393759
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:py3-apachelogparser into launchpad:master.
diff --git a/lib/lp/services/apachelogparser/base.py b/lib/lp/services/apachelogparser/base.py
index e5fbaa7..6b3a15a 100644
--- a/lib/lp/services/apachelogparser/base.py
+++ b/lib/lp/services/apachelogparser/base.py
@@ -4,6 +4,7 @@
from datetime import datetime
import gzip
import os
+import struct
from contrib import apachelog
from lazr.uri import (
@@ -35,7 +36,7 @@ def get_files_to_parse(file_paths):
file_paths = sorted(file_paths, key=lambda path: os.stat(path).st_mtime)
for file_path in file_paths:
fd, file_size = get_fd_and_file_size(file_path)
- first_line = six.ensure_text(fd.readline())
+ first_line = six.ensure_text(fd.readline(), errors='replace')
parsed_file = store.find(ParsedApacheLog, first_line=first_line).one()
position = 0
if parsed_file is not None:
@@ -56,8 +57,8 @@ def get_files_to_parse(file_paths):
def get_fd_and_file_size(file_path):
"""Return a file descriptor and the file size for the given file path.
- The file descriptor will have the default mode ('r') and will be seeked to
- the beginning.
+ The file descriptor will be read-only, opened in binary mode, and with
+ its position set to the beginning of the file.
The file size returned is that of the uncompressed file, in case the given
file_path points to a gzipped file.
@@ -68,11 +69,10 @@ def get_fd_and_file_size(file_path):
# module in Python 2.6.
fd = gzip.open(file_path)
fd.fileobj.seek(-4, os.SEEK_END)
- isize = gzip.read32(fd.fileobj) # may exceed 2GB
- file_size = isize & 0xffffffffL
+ file_size = struct.unpack('<I', fd.fileobj.read(4))[0]
fd.fileobj.seek(0)
else:
- fd = open(file_path)
+ fd = open(file_path, 'rb')
file_size = os.path.getsize(file_path)
return fd, file_size
@@ -106,6 +106,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
break
line = next_line
+ line_text = six.ensure_text(line, errors='replace')
# Always skip the last line as it may be truncated since we're
# rsyncing live logs, unless there is only one line for us to
@@ -122,7 +123,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
parsed_lines += 1
parsed_bytes += len(line)
host, date, status, request = get_host_date_status_and_request(
- line)
+ line_text)
if status != '200':
continue
@@ -158,7 +159,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
# successfully, log this as an error and break the loop so that
# we return.
parsed_bytes -= len(line)
- logger.error('Error (%s) while parsing "%s"' % (e, line))
+ logger.error('Error (%s) while parsing "%s"' % (e, line_text))
break
if parsed_lines > 0:
@@ -170,7 +171,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
def create_or_update_parsedlog_entry(first_line, parsed_bytes):
"""Create or update the ParsedApacheLog with the given first_line."""
- first_line = six.ensure_text(first_line)
+ first_line = six.ensure_text(first_line, errors='replace')
parsed_file = IStore(ParsedApacheLog).find(
ParsedApacheLog, first_line=first_line).one()
if parsed_file is None:
diff --git a/lib/lp/services/apachelogparser/tests/test_apachelogparser.py b/lib/lp/services/apachelogparser/tests/test_apachelogparser.py
index ee0ef78..513ce5d 100644
--- a/lib/lp/services/apachelogparser/tests/test_apachelogparser.py
+++ b/lib/lp/services/apachelogparser/tests/test_apachelogparser.py
@@ -3,9 +3,8 @@
from datetime import datetime
import gzip
-from operator import itemgetter
+import io
import os
-from StringIO import StringIO
import tempfile
import time
@@ -199,8 +198,10 @@ class TestLogFileParsing(TestCase):
# also been downloaded once (last line of the sample log), but
# parse_file() always skips the last line as it may be truncated, so
# it doesn't show up in the dict returned.
- fd = open(os.path.join(
- here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
+ fd = open(
+ os.path.join(
+ here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
+ 'rb')
downloads, parsed_bytes, parsed_lines = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_path_download_key)
@@ -223,8 +224,10 @@ class TestLogFileParsing(TestCase):
# When there's only the last line of a given file for us to parse, we
# assume the file has been rotated and it's safe to parse its last
# line without worrying about whether or not it's been truncated.
- fd = open(os.path.join(
- here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
+ fd = open(
+ os.path.join(
+ here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
+ 'rb')
downloads, parsed_bytes, parsed_lines = parse_file(
fd, start_position=self._getLastLineStart(fd), logger=self.logger,
get_download_key=get_path_download_key)
@@ -242,7 +245,7 @@ class TestLogFileParsing(TestCase):
# When there's an unexpected error, we log it and return as if we had
# parsed up to the line before the one where the failure occurred.
# Here we force an unexpected error on the first line.
- fd = StringIO('Not a log')
+ fd = io.BytesIO(b'Not a log')
downloads, parsed_bytes, parsed_lines = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_path_download_key)
@@ -252,8 +255,9 @@ class TestLogFileParsing(TestCase):
def _assertResponseWithGivenStatusIsIgnored(self, status):
"""Assert that responses with the given status are ignored."""
- fd = StringIO(
- self.sample_line % dict(status=status, method='GET'))
+ fd = io.BytesIO(
+ (self.sample_line % dict(status=status, method='GET')).encode(
+ 'UTF-8'))
downloads, parsed_bytes, parsed_lines = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_path_download_key)
@@ -277,8 +281,9 @@ class TestLogFileParsing(TestCase):
def _assertRequestWithGivenMethodIsIgnored(self, method):
"""Assert that requests with the given method are ignored."""
- fd = StringIO(
- self.sample_line % dict(status='200', method=method))
+ fd = io.BytesIO(
+ (self.sample_line % dict(status='200', method=method)).encode(
+ 'UTF-8'))
downloads, parsed_bytes, parsed_lines = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_path_download_key)
@@ -295,8 +300,9 @@ class TestLogFileParsing(TestCase):
self._assertRequestWithGivenMethodIsIgnored('POST')
def test_normal_request_is_not_ignored(self):
- fd = StringIO(
- self.sample_line % dict(status=200, method='GET'))
+ fd = io.BytesIO(
+ (self.sample_line % dict(status=200, method='GET')).encode(
+ 'UTF-8'))
downloads, parsed_bytes, parsed_lines = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_path_download_key)
@@ -317,8 +323,10 @@ class TestLogFileParsing(TestCase):
'log_parser config',
'[launchpad]\nlogparser_max_parsed_lines: 2')
self.addCleanup(config.pop, 'log_parser config')
- fd = open(os.path.join(
- here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
+ fd = open(
+ os.path.join(
+ here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
+ 'rb')
self.addCleanup(fd.close)
downloads, parsed_bytes, parsed_lines = parse_file(
@@ -359,8 +367,10 @@ class TestLogFileParsing(TestCase):
'log_parser config',
'[launchpad]\nlogparser_max_parsed_lines: 2')
self.addCleanup(config.pop, 'log_parser config')
- fd = open(os.path.join(
- here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
+ fd = open(
+ os.path.join(
+ here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
+ 'rb')
self.addCleanup(fd.close)
# We want to start parsing on line 2 so we will have a value in
@@ -413,24 +423,27 @@ class TestParsedFilesDetection(TestCase):
for fd, _ in get_files_to_parse(file_paths):
fd.seek(0)
contents.append(fd.read())
- self.assertEqual(['2\n', '1\n', '0\n'], contents)
+ fd.close()
+ self.assertEqual([b'2\n', b'1\n', b'0\n'], contents)
def test_not_parsed_file(self):
# A file that has never been parsed will have to be parsed from the
# start.
- files_to_parse = get_files_to_parse([self.file_path])
- fd, position = list(files_to_parse)[0]
- self.assertEqual(position, 0)
+ files_to_parse = list(get_files_to_parse([self.file_path]))
+ self.assertEqual(1, len(files_to_parse))
+ fd, position = files_to_parse[0]
+ fd.close()
+ self.assertEqual(0, position)
def test_completely_parsed_file(self):
# A file that has been completely parsed will be skipped.
- fd = open(self.file_path)
- first_line = fd.readline()
- fd.seek(0)
- ParsedApacheLog(first_line, len(fd.read()))
+ with open(self.file_path) as fd:
+ first_line = fd.readline()
+ fd.seek(0)
+ ParsedApacheLog(first_line, len(fd.read()))
files_to_parse = get_files_to_parse([self.file_path])
- self.assertEqual(list(files_to_parse), [])
+ self.assertEqual([], list(files_to_parse))
def test_parsed_file_with_new_content(self):
# A file that has been parsed already but in which new content was
@@ -440,11 +453,12 @@ class TestParsedFilesDetection(TestCase):
ParsedApacheLog(first_line, len(first_line))
files_to_parse = list(get_files_to_parse([self.file_path]))
- self.assertEqual(len(files_to_parse), 1)
+ self.assertEqual(1, len(files_to_parse))
fd, position = files_to_parse[0]
+ fd.close()
# Since we parsed the first line above, we'll be told to start where
# the first line ends.
- self.assertEqual(position, len(first_line))
+ self.assertEqual(len(first_line), position)
def test_different_files_with_same_name(self):
# Thanks to log rotation, two runs of our script may see files with
@@ -459,12 +473,14 @@ class TestParsedFilesDetection(TestCase):
# parse it from the start.
fd, new_path = tempfile.mkstemp()
content2 = 'Different First Line\nSecond Line'
- fd = open(new_path, 'w')
- fd.write(content2)
- fd.close()
+ with open(new_path, 'w') as fd:
+ fd.write(content2)
files_to_parse = get_files_to_parse([new_path])
- positions = map(itemgetter(1), files_to_parse)
- self.assertEqual(positions, [0])
+ positions = []
+ for fd, position in files_to_parse:
+ fd.close()
+ positions.append(position)
+ self.assertEqual([0], positions)
def test_fresh_gzipped_file(self):
# get_files_to_parse() handles gzipped files just like uncompressed
@@ -472,8 +488,11 @@ class TestParsedFilesDetection(TestCase):
gz_name = 'launchpadlibrarian.net.access-log.1.gz'
gz_path = os.path.join(self.root, gz_name)
files_to_parse = get_files_to_parse([gz_path])
- positions = map(itemgetter(1), files_to_parse)
- self.assertEqual(positions, [0])
+ positions = []
+ for fd, position in files_to_parse:
+ fd.close()
+ positions.append(position)
+ self.assertEqual([0], positions)
def test_resumed_gzipped_file(self):
# In subsequent runs of the script we will resume from where we
@@ -483,8 +502,11 @@ class TestParsedFilesDetection(TestCase):
first_line = gzip.open(gz_path).readline()
ParsedApacheLog(first_line, len(first_line))
files_to_parse = get_files_to_parse([gz_path])
- positions = map(itemgetter(1), files_to_parse)
- self.assertEqual(positions, [len(first_line)])
+ positions = []
+ for fd, position in files_to_parse:
+ fd.close()
+ positions.append(position)
+ self.assertEqual([len(first_line)], positions)
class Test_create_or_update_parsedlog_entry(TestCase):
diff --git a/lib/lp/services/librarianserver/tests/test_apachelogparser.py b/lib/lp/services/librarianserver/tests/test_apachelogparser.py
index ffd30c7..1be4c14 100644
--- a/lib/lp/services/librarianserver/tests/test_apachelogparser.py
+++ b/lib/lp/services/librarianserver/tests/test_apachelogparser.py
@@ -2,8 +2,8 @@
# GNU Affero General Public License version 3 (see the file LICENSE).
from datetime import datetime
+import io
import os
-from StringIO import StringIO
import subprocess
from zope.component import getUtility
@@ -95,10 +95,10 @@ class TestLibrarianLogFileParsing(TestCase):
self.logger = BufferLogger()
def test_request_to_lfa_is_parsed(self):
- fd = StringIO(
- '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
- '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
- '"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
+ fd = io.BytesIO(
+ b'69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
+ b'/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
+ b'"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
downloads, parsed_bytes, ignored = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_library_file_id)
@@ -114,9 +114,9 @@ class TestLibrarianLogFileParsing(TestCase):
def test_request_to_non_lfa_is_ignored(self):
# A request to a path which doesn't map to a LibraryFileAlias (e.g.
# '/') is ignored.
- fd = StringIO(
- '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
- '200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
+ fd = io.BytesIO(
+ b'69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
+ b'200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
downloads, parsed_bytes, ignored = parse_file(
fd, start_position=0, logger=self.logger,
get_download_key=get_library_file_id)