← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~cjwatson/launchpad-buildd:py3-build-logs-bytes into launchpad-buildd:master

 

Colin Watson has proposed merging ~cjwatson/launchpad-buildd:py3-build-logs-bytes into launchpad-buildd:master.

Commit message:
Treat build logs as binary files

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad-buildd/+git/launchpad-buildd/+merge/383344

Build logs can contain arbitrary binary data.  Although we generally parse and display them as text, when fetching the log tail we may start in the middle of a UTF-8 sequence, and in general we need to parse them defensively.  Treat the files themselves as binary and decode them for parsing purposes.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad-buildd:py3-build-logs-bytes into launchpad-buildd:master.
diff --git a/debian/changelog b/debian/changelog
index c8029fc..ee93dfa 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -8,6 +8,7 @@ launchpad-buildd (190) UNRELEASED; urgency=medium
   * Open temporary files in text mode in more cases.
   * Ensure that regex patterns with \-escapes are raw strings.
   * Adjust X-LXD-mode header construction for Python 3.
+  * Treat build logs as binary files.
 
  -- Colin Watson <cjwatson@xxxxxxxxxx>  Tue, 28 Apr 2020 10:19:27 +0100
 
diff --git a/lpbuildd/binarypackage.py b/lpbuildd/binarypackage.py
index 664f459..2111bbe 100644
--- a/lpbuildd/binarypackage.py
+++ b/lpbuildd/binarypackage.py
@@ -359,12 +359,12 @@ class BinaryPackageBuildManager(DebianBuildManager):
             # Check the last 4KiB for the Fail-Stage. If it failed
             # during install-deps, search for the missing dependency
             # string.
-            with open(os.path.join(self._cachepath, "buildlog")) as log:
+            with open(os.path.join(self._cachepath, "buildlog"), "rb") as log:
                 try:
                     log.seek(-4096, os.SEEK_END)
                 except IOError:
                     pass
-                tail = log.read(4096)
+                tail = log.read(4096).decode("UTF-8", "replace")
             if re.search(r"^Fail-Stage: install-deps$", tail, re.M):
                 for rx in BuildLogRegexes.MAYBEDEPFAIL:
                     log_patterns.append([rx, re.M | re.S])
@@ -389,9 +389,11 @@ class BinaryPackageBuildManager(DebianBuildManager):
                     success = SBuildExitCodes.FAILED
             elif rx in BuildLogRegexes.DEPFAIL:
                 # A depwait match forces depwait.
-                missing_dep = mo.expand(BuildLogRegexes.DEPFAIL[rx])
+                missing_dep = mo.expand(
+                    BuildLogRegexes.DEPFAIL[rx].encode("UTF-8"))
                 missing_dep = self.stripDependencies(
-                    PkgRelation.parse_relations(missing_dep))
+                    PkgRelation.parse_relations(
+                        missing_dep.decode("UTF-8", "replace")))
             else:
                 # Otherwise it was a givenback pattern, so leave it
                 # in givenback.
diff --git a/lpbuildd/builder.py b/lpbuildd/builder.py
index c5ad197..fc3a0ce 100644
--- a/lpbuildd/builder.py
+++ b/lpbuildd/builder.py
@@ -36,24 +36,24 @@ from lpbuildd.util import shell_escape
 devnull = open("/dev/null", "r")
 
 
-def _sanitizeURLs(text_seq):
-    """A generator that deletes URL passwords from a string sequence.
+def _sanitizeURLs(bytes_seq):
+    """A generator that deletes URL passwords from a bytes sequence.
 
     This generator removes user/password data from URLs if embedded
     in the latter as follows: scheme://user:passwd@netloc/path.
 
-    :param text_seq: A sequence of strings (that may contain URLs).
+    :param bytes_seq: A sequence of byte strings (that may contain URLs).
     :return: A (sanitized) line stripped of authentication credentials.
     """
     # This regular expression will be used to remove authentication
     # credentials from URLs.
-    password_re = re.compile(r'://([^:]+:[^@]+@)(\S+)')
+    password_re = re.compile(br'://([^:]+:[^@]+@)(\S+)')
     # Snap proxy passwords are UUIDs.
-    snap_proxy_auth_re = re.compile(r',proxyauth=[^:]+:[A-Za-z0-9-]+')
+    snap_proxy_auth_re = re.compile(br',proxyauth=[^:]+:[A-Za-z0-9-]+')
 
-    for line in text_seq:
-        sanitized_line = password_re.sub(r'://\2', line)
-        sanitized_line = snap_proxy_auth_re.sub('', sanitized_line)
+    for line in bytes_seq:
+        sanitized_line = password_re.sub(br'://\2', line)
+        sanitized_line = snap_proxy_auth_re.sub(b'', sanitized_line)
         yield sanitized_line
 
 
@@ -512,8 +512,12 @@ class Builder(object):
     def log(self, data):
         """Write the provided data to the log."""
         if self._log is not None:
-            self._log.write(data)
+            data_bytes = (
+                data if isinstance(data, bytes) else data.encode("UTF-8"))
+            self._log.write(data_bytes)
             self._log.flush()
+        if not isinstance(data, str):
+            data = data.decode("UTF-8", "replace")
         if data.endswith("\n"):
             data = data[:-1]
         log.msg("Build log: " + data)
@@ -522,27 +526,27 @@ class Builder(object):
         """Return the tail of the log.
 
         If the buildlog is not yet opened for writing (self._log is None),
-        return a empty string.
+        return an empty bytes object.
 
         It safely tries to open the 'buildlog', if it doesn't exist, due to
         job cleanup or buildlog sanitization race-conditions, it also returns
-        an empty string.
+        an empty bytes object.
 
-        When the 'buildlog' is present it return up to 2 KiB character of
-        the end of the file.
+        When the 'buildlog' is present it returns up to 2 KiB bytes of the
+        end of the file.
 
         The returned content will be 'sanitized', see `_sanitizeURLs` for
         further information.
         """
         if self._log is None:
-            return ""
+            return b""
 
         rlog = None
         try:
             try:
-                rlog = open(self.cachePath("buildlog"), "r")
+                rlog = open(self.cachePath("buildlog"), "rb")
             except IOError:
-                ret = ""
+                ret = b""
             else:
                 # We rely on good OS practices that keep the file handler
                 # usable once it's opened. So, if open() is ok, a subsequent
@@ -567,7 +571,7 @@ class Builder(object):
             # excerpt to be scrubbed) because it may be cut off thus
             # thwarting the detection of embedded passwords.
             clean_content_iter = _sanitizeURLs(log_lines[1:])
-            ret = '\n'.join(clean_content_iter)
+            ret = b'\n'.join(clean_content_iter)
 
         return ret
 
@@ -584,7 +588,7 @@ class Builder(object):
         """Empty the log and start again."""
         if self._log is not None:
             self._log.close()
-        self._log = open(self.cachePath("buildlog"), "w")
+        self._log = open(self.cachePath("buildlog"), "wb")
 
     def builderFail(self):
         """Cease building because the builder has a problem."""
@@ -667,14 +671,14 @@ class Builder(object):
         os.rename(log_path, unsanitized_path)
 
         # Open the unsanitized buildlog file for reading.
-        unsanitized_file = open(unsanitized_path)
+        unsanitized_file = open(unsanitized_path, 'rb')
 
         # Open the file that will hold the resulting, sanitized buildlog
         # content for writing.
         sanitized_file = None
 
         try:
-            sanitized_file = open(log_path, 'w')
+            sanitized_file = open(log_path, 'wb')
 
             # Scrub the buildlog file line by line
             clean_content_iter = _sanitizeURLs(unsanitized_file)
diff --git a/lpbuildd/debian.py b/lpbuildd/debian.py
index 7065c7d..7f8dd92 100644
--- a/lpbuildd/debian.py
+++ b/lpbuildd/debian.py
@@ -237,29 +237,27 @@ class DebianBuildManager(BuildManager):
         """
         chunk_size = 256 * 1024
         regexes = [
-            re.compile(pattern, flags)
+            re.compile(pattern.encode("UTF-8"), flags)
             for pattern, flags in patterns_and_flags]
         stop_regexes = [
-            re.compile(pattern, flags)
+            re.compile(pattern.encode("UTF-8"), flags)
             for pattern, flags in stop_patterns_and_flags]
-        buildlog = open(os.path.join(self._cachepath, "buildlog"))
-        try:
-            window = ""
+        buildlog_path = os.path.join(self._cachepath, "buildlog")
+        with open(buildlog_path, "rb") as buildlog:
+            window = b""
             chunk = buildlog.read(chunk_size)
             while chunk:
                 window += chunk
                 for regex in regexes:
                     match = regex.search(window)
                     if match is not None:
-                        return regex.pattern, match
+                        return regex.pattern.decode("UTF-8"), match
                 for regex in stop_regexes:
                     if regex.search(window) is not None:
                         return None, None
                 if len(window) > chunk_size:
                     window = window[chunk_size:]
                 chunk = buildlog.read(chunk_size)
-        finally:
-            buildlog.close()
         return None, None
 
     def iterate_SOURCES(self, success):
diff --git a/lpbuildd/sourcepackagerecipe.py b/lpbuildd/sourcepackagerecipe.py
index 4781847..b2f46cb 100644
--- a/lpbuildd/sourcepackagerecipe.py
+++ b/lpbuildd/sourcepackagerecipe.py
@@ -109,9 +109,10 @@ class SourcePackageRecipeBuildManager(DebianBuildManager):
                     r'.*: Depends: ([^ ]*( \([^)]*\))?)')
                 _, mo = self.searchLogContents([[rx, re.M]])
                 if mo:
-                    self._builder.depFail(mo.group(1))
+                    missing_dep = mo.group(1).decode("UTF-8", "replace")
+                    self._builder.depFail(missing_dep)
                     print("Returning build status: DEPFAIL")
-                    print("Dependencies: " + mo.group(1))
+                    print("Dependencies: " + missing_dep)
                 else:
                     print("Returning build status: Build failed")
                     self._builder.buildFail()
diff --git a/lpbuildd/tests/test_buildd_slave.py b/lpbuildd/tests/test_buildd_slave.py
index 53c259e..6c70529 100644
--- a/lpbuildd/tests/test_buildd_slave.py
+++ b/lpbuildd/tests/test_buildd_slave.py
@@ -112,12 +112,12 @@ class LaunchpadBuilddSlaveTests(BuilddTestCase):
         # behaviour because the BuildManager's 'is_archive_private'
         # property is initialized to False).
         self.slave.manager.is_archive_private = False
-        unsanitized = self.slave.getLogTail().splitlines()
+        unsanitized = self.slave.getLogTail().decode('UTF-8').splitlines()
 
         # Make the builder believe we are building in a private archive to
         # obtain the scrubbed log tail output.
         self.slave.manager.is_archive_private = True
-        clean = self.slave.getLogTail().splitlines()
+        clean = self.slave.getLogTail().decode('UTF-8').splitlines()
 
         # Get the differences ..
         differences = '\n'.join(difflib.unified_diff(unsanitized, clean))