launchpad-reviewers team mailing list archive

Thread
Date

[Merge] ~ilkeremrekoc/launchpad:upgrade-multipart-fix into launchpad:master

To: mp+486203@xxxxxxxxxxxxxxxxxx
From: İlker Emre Koç <mp+486203@xxxxxxxxxxxxxxxxxx>
Date: Wed, 28 May 2025 14:55:30 -0000
Reply-to: mp+486203@xxxxxxxxxxxxxxxxxx
Sender: noreply@xxxxxxxxxxxxx

İlker Emre Koç has proposed merging ~ilkeremrekoc/launchpad:upgrade-multipart-fix into launchpad:master.

Commit message:
upgrade multipart and ensure correct parsing

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~ilkeremrekoc/launchpad/+git/launchpad/+merge/486203

Multipart upgrade will affect the requests that can be
processed on launchpad by refusing to parse non-CRLF 
line-endings. But since we have old packages like apport
sending requests by manually constructing them, which
originally didn't follow HTTP standards and used LF
endings instead, we added additional layers of parsing
to ensure we can work on both CRLF and LF endings, in 
such cases.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~ilkeremrekoc/launchpad:upgrade-multipart-fix into launchpad:master.

diff --git a/lib/lp/services/webapp/servers.py b/lib/lp/services/webapp/servers.py
index f01969e..4938385 100644
--- a/lib/lp/services/webapp/servers.py
+++ b/lib/lp/services/webapp/servers.py
@@ -3,8 +3,10 @@
 
 """Definition of the internet servers that Launchpad uses."""
 
+import tempfile
 import threading
 import xmlrpc.client
+from io import BytesIO
 from urllib.parse import parse_qs
 
 import six
@@ -32,6 +34,7 @@ from zope.formlib.itemswidgets import MultiDataHelper
 from zope.formlib.widget import SimpleInputWidget
 from zope.interface import alsoProvides, implementer
 from zope.publisher.browser import BrowserRequest, BrowserResponse, TestRequest
+from zope.publisher.http import HTTPInputStream
 from zope.publisher.interfaces import NotFound
 from zope.publisher.xmlrpc import XMLRPCRequest, XMLRPCResponse
 from zope.security.interfaces import IParticipation, Unauthorized
@@ -707,6 +710,7 @@ class LaunchpadBrowserRequest(
     """
 
     retry_max_count = 5  # How many times we're willing to retry
+    buffer_size = 64 * 1024  # buffer size for the processInputs
 
     def __init__(self, body_instream, environ, response=None):
         BasicLaunchpadRequest.__init__(self, body_instream, environ, response)
@@ -765,6 +769,170 @@ class LaunchpadBrowserRequest(
         """See `ISynchronizer`."""
         pass
 
+    def _skipParseProcess(self):
+        """Skips the parsing process inside processInputs by re-creating and
+        re-inserting the input stream"""
+
+        # The cache stream stores all the previous read operation results.
+        # getCacheStream() reads the remaining body as well and adds it to the
+        # cache stream. A complete re-creation is necessary since these stream
+        # classes don't have any "seek()" method we can use to reset.
+        new_body_instream = self._body_instream.getCacheStream()
+
+        if isinstance(self._body_instream.stream, HTTPInputStream):
+            self._body_instream.stream.cacheStream.close()
+
+        self._body_instream = HTTPInputStream(new_body_instream, self._environ)
+
+    def _createResetableStream(self, environment):
+        """Creates a stream to be resetted at the end of the parsing process.
+
+        This method is a copy of the __init__ method of HTTPInputStream class
+        in zope.publisher since we use the returned value of this method in
+        the same request instances in the same way while being unable to
+        properly use that method here.
+        """
+
+        size = environment.get("CONTENT_LENGTH")
+        # There can be no size in the environment (None) or the size
+        # can be an empty string, in which case we treat it as absent.
+        if not size:
+            size = environment.get("HTTP_CONTENT_LENGTH")
+
+        if not size or int(size) < 65536:
+            resetable_stream = BytesIO()
+        else:
+            resetable_stream = tempfile.TemporaryFile()
+
+        return resetable_stream
+
+    # XXX: ilkeremrekoc 2025-03-12
+    # processInputs is a part of Zope's request class, which we override.
+    # This addition is because we are about to upgrade multipart package,
+    # which in its newer versions is strict in its parsing of requests, only
+    # accepting CRLF line-breaks throughout. And since the old versions of
+    # apport package, which is the main bug-filing tool in Ubuntu, uses LF
+    # line-breaks in its requests, we need backwards compatibility until the
+    # older Ubuntu versions' apport packages can implement CRLF SRUs (Stable
+    # Release Update). Once they do, and a sufficient section of users and
+    # Ubuntu versions make the update, we can get rid of this method here.
+    def processInputs(self):
+        """See IPublisherRequest'
+
+        Processes inputs before traversal
+
+        This method overrides `BrowserRequest.processInputs()` to parse LF or
+        CR line-breaks inside requests if they exist into CRLF line-breaks.
+        Calls the parent method afterwards for further processing.
+        """
+
+        # Trigger the parsing only when the arriving request is pointed
+        # towards apport package's entry point.
+        # Note: Aside from apport, the browsers' UI can also reach this
+        # page. This shouldn't affect our current process but should be kept
+        # in mind for any large scale changes in the future.
+        # Note: The third condition is redundant but was added as an extra
+        # security since we really don't wish to enter this block if we are
+        # not in the jurisdiction of the apport package.
+        if (
+            "PATH_INFO" in self._environ
+            and self._environ["PATH_INFO"] == "/+storeblob"
+            and self._environ["REQUEST_METHOD"] == "POST"
+            and self._environ["CONTENT_TYPE"].startswith("multipart/form-data")
+        ):
+
+            # The request's body can rise to enourmous sizes since this part
+            # of the code is for filing of bug reports, which require
+            # extensive auto generated information. As a result, we create
+            # the instance dynamically for different request sizes in the
+            # same way the zope.publisher framework's HTTPInputStream's
+            # cacheStream is created.
+            parsing_body_stream = self._createResetableStream(self._environ)
+
+            buffer = self._body_instream.read(self.buffer_size)
+
+            # If the parsing happens, the content length will change with the
+            # new line-breaks so we store the new one.
+            newContentLength = 0
+
+            log_linebreak_type = ""  # Store the linebreak type for logging
+
+            while buffer != b"":
+
+                if b"\r\n" in buffer:
+
+                    # Since we assume there is a single line-break type in the
+                    # request, we can skip the rest of the parsing when the
+                    # CRLF is found.
+                    self._skipParseProcess()
+                    parsing_body_stream.close()
+
+                    logging_context.push(linebreak_type="CRLF")
+                    super().processInputs()
+                    return
+
+                # The only CRLF in the buffer might have been split between
+                # the next buffer and the current one
+                elif buffer.endswith(b"\r"):
+                    nextChar = self._body_instream.read(1)
+                    buffer += nextChar
+
+                    if nextChar == b"\n":
+                        self._skipParseProcess()
+                        parsing_body_stream.close()
+
+                        logging_context.push(linebreak_type="CRLF")
+                        super().processInputs()
+                        return
+
+                if b"\n" in buffer:
+                    buffer = buffer.replace(b"\n", b"\r\n")
+                    newContentLength += len(buffer)
+
+                    log_linebreak_type = "LF"
+                elif b"\r" in buffer:
+                    # Technically, we don't need CR functionality in our
+                    # codebase, but in this case, not having it would make
+                    # things harder to understand as the question of "what
+                    # happens" would be left unknowable since the downstream
+                    # doesn't raise errors.
+
+                    buffer = buffer.replace(b"\r", b"\r\n")
+                    newContentLength += len(buffer)
+
+                    log_linebreak_type = "CR"
+                else:
+                    # If there are neither CRLF, LF or CR in a buffer, we
+                    # cannot be sure if the Content-Length will change later,
+                    # so, we store the buffer length just in case.
+                    newContentLength += len(buffer)
+
+                parsing_body_stream.write(buffer)
+                buffer = self._body_instream.read(self.buffer_size)
+
+            # Note that we can reach this part only if we parsed the whole
+            # request.
+
+            # Updating the content length by anticipating change.
+            self._environ["CONTENT_LENGTH"] = newContentLength
+
+            logging_context.push(linebreak_type=log_linebreak_type)
+
+            # Reset the stream before we re-create the original one.
+            parsing_body_stream.seek(0)
+
+            deleting_stream = self._body_instream
+            self._body_instream = HTTPInputStream(
+                parsing_body_stream, self._environ
+            )
+
+            if isinstance(deleting_stream.stream, HTTPInputStream):
+                deleting_stream.stream.cacheStream.close()
+            deleting_stream.cacheStream.close()
+
+        super().processInputs()
+        return
+
 
 @implementer(IBrowserFormNG)
 class BrowserFormNG:
diff --git a/lib/lp/services/webapp/tests/test_servers.py b/lib/lp/services/webapp/tests/test_servers.py
index 491fed7..bdd07ed 100644
--- a/lib/lp/services/webapp/tests/test_servers.py
+++ b/lib/lp/services/webapp/tests/test_servers.py
@@ -5,6 +5,7 @@ import io
 import unittest
 from datetime import datetime, timedelta, timezone
 from doctest import ELLIPSIS, NORMALIZE_WHITESPACE, DocTestSuite
+from textwrap import dedent
 
 import transaction
 from lazr.restful.interfaces import (
@@ -22,6 +23,7 @@ from talisker.logs import logging_context
 from testtools.matchers import ContainsDict, Equals
 from zope.component import getGlobalSiteManager, getUtility
 from zope.interface import Interface, implementer
+from zope.publisher.http import HTTPInputStream
 from zope.security.interfaces import Unauthorized
 from zope.security.management import newInteraction
 from zope.security.proxy import removeSecurityProxy
@@ -753,6 +755,427 @@ class TestLaunchpadBrowserRequest(TestCase):
         )
 
 
+# XXX: ilkeremrekoc 2025-03-12
+# This test case is for the overridden processInputs inside
+# LaunchpadBrowserRequest. The reason for the overriding is connected to
+# the changes in multipart package (which now only accepts CRLF endings) and
+# the original state of apport package (which only sent LF endings)
+# (see https://bugs.launchpad.net/ubuntu/+source/apport/+bug/2096327). Thus,
+# we add the overridden method and this test case temporarily until the apport
+# package can be SRU'd in a sufficient percantage of Ubuntu users. Both of
+# these can be removed once the prerequisite is fulfilled.
+class TestLaunchpadBrowserRequestProcessInputs(TestCase):
+    layer = FunctionalLayer
+
+    def setUp(self):
+        super().setUp()
+
+        self.body_template_str = dedent(
+            """\
+            Content-Type: multipart/mixed; \
+                boundary="===============4913381966751462219=="
+            MIME-Version: 1.0
+            \n\
+            --===============4913381966751462219==
+            Content-Type: text/plain; charset="us-ascii"
+            MIME-Version: 1.0
+            Content-Transfer-Encoding: 7bit
+            Content-Disposition: form-data; name="FORM_SUBMIT"
+            \n\
+            1
+            --===============4913381966751462219==
+            Content-Type: application/octet-stream
+            MIME-Version: 1.0
+            Content-Disposition: form-data; name="field.blob"; \
+                filename="x"
+            \n\
+            %s
+            \n\
+            --===============4913381966751462219==--
+            \n\
+            """
+        )
+
+        self.standard_blob = dedent(
+            """
+            This is for a test....
+            \n\
+            The second line of test
+            \n\
+            The third
+            """
+        )
+
+        self.body_str = self.body_template_str % self.standard_blob
+        self.body_bytes = self.body_str.encode("ASCII")
+
+        self.environ = {
+            "PATH_INFO": "/+storeblob",
+            "REQUEST_METHOD": "POST",
+            "CONTENT_TYPE": "multipart/form-data; \
+                boundary================4913381966751462219==",
+        }
+
+    def prepareRequest_lineBreakTest(self, body_bytes, environ):
+        """Return a `LaunchpadBrowserRequest` with the given form.
+
+        Also set the accepted charset to 'utf-8'.
+        """
+
+        body_stream = io.BytesIO(body_bytes)
+        input_stream = HTTPInputStream(body_stream, environ)
+
+        request = LaunchpadBrowserRequest(input_stream, environ)
+        request.charsets = ["utf-8"]
+        return request
+
+    def _assessProcessInputsResult(self, request):
+        """
+        Assesses whether the request has correctly parsed the blob field.
+
+        Passing in this case means the request's form is correctly parsed
+        and it only includes CRLF endings.
+        """
+
+        self.assertIn("field.blob", request.form)
+
+        result = request.form["field.blob"].read()
+
+        self.assertIn(b"\r\n", result)
+        self.assertNotRegex(result, b"[^\r]\n")
+
+    def test_processInputs_with_LF(self):
+        """
+        processInputs parses request bodies with LF line-breaks into CRLF.
+        """
+
+        request = self.prepareRequest_lineBreakTest(
+            self.body_bytes, self.environ
+        )
+        request.processInputs()
+
+        self._assessProcessInputsResult(request)
+
+    def test_processInputs_with_CR(self):
+        """
+        processInputs parses request bodies with CR line-breaks into CRLF.
+        """
+
+        body_bytes_with_CR = self.body_bytes.replace(b"\n", b"\r")
+
+        request = self.prepareRequest_lineBreakTest(
+            body_bytes_with_CR, self.environ
+        )
+        request.processInputs()
+
+        self._assessProcessInputsResult(request)
+
+    def test_processInputs_with_CRLF(self):
+        """
+        processInputs passes request bodies with CRLF line-breaks.
+        """
+
+        body_bytes_with_CRLF = self.body_bytes.replace(b"\n", b"\r\n")
+
+        request = self.prepareRequest_lineBreakTest(
+            body_bytes_with_CRLF, self.environ
+        )
+        request.processInputs()
+
+        self._assessProcessInputsResult(request)
+
+    def test_processInputs_with_multiple_buffer_runs(self):
+        """
+        processInputs should work even when the message overflows the buffer
+        """
+
+        request = self.prepareRequest_lineBreakTest(
+            self.body_bytes, self.environ
+        )
+        request.buffer_size = 4
+
+        request.processInputs()
+
+        self._assessProcessInputsResult(request)
+
+    def test_processInputs_with_cut_CRLF(self):
+        """
+        processInputs should work even when a CRLF is cut in the middle
+        """
+
+        body_bytes_with_CRLF = self.body_bytes.replace(b"\n", b"\r\n")
+        CR_index = body_bytes_with_CRLF.index(b"\r")
+
+        request = self.prepareRequest_lineBreakTest(
+            body_bytes_with_CRLF, self.environ
+        )
+        request.buffer_size = len(body_bytes_with_CRLF[: CR_index + 1])
+
+        request.processInputs()
+
+        self._assessProcessInputsResult(request)
+
+    def test_processInputs_different_environ_with_LF(self):
+        """
+        processInputs shouldn't work outside its domain. If LF line-breaks are
+        present outside of the apport package domain, the parsing should fail.
+        """
+
+        different_environ = self.environ.copy()
+        different_environ["PATH_INFO"] = ""
+
+        request = self.prepareRequest_lineBreakTest(
+            self.body_bytes, different_environ
+        )
+
+        request.processInputs()
+
+        self.assertNotIn("field.blob", request.form)
+
+    def test_processInputs_different_environ_with_CR(self):
+        """
+        processInputs shouldn't work outside its domain. If CR line-breaks are
+        present outside of the apport package domain, the parsing should fail.
+        """
+
+        body_bytes_with_CR = self.body_bytes.replace(b"\n", b"\r")
+
+        different_environ = self.environ.copy()
+        different_environ["PATH_INFO"] = ""
+
+        request = self.prepareRequest_lineBreakTest(
+            body_bytes_with_CR, different_environ
+        )
+
+        request.processInputs()
+
+        self.assertNotIn("field.blob", request.form)
+
+    def test_processInputs_different_environ_with_CRLF(self):
+        """
+        processInputs should work with CRLF everytime, regardless of domain.
+        """
+
+        body_bytes_with_CRLF = self.body_bytes.replace(b"\n", b"\r\n")
+
+        different_environ = self.environ.copy()
+        different_environ["PATH_INFO"] = ""
+
+        request = self.prepareRequest_lineBreakTest(
+            body_bytes_with_CRLF, different_environ
+        )
+
+        request.processInputs()
+
+        self._assessProcessInputsResult(request)
+
+    def test_processInputs_content_length(self):
+        """
+        processInputs should change the content length in proportion to the
+        number of LFs.
+        """
+
+        original_body_length = len(self.body_str)
+        lf_count = self.body_str.count("\n")
+
+        environ = self.environ.copy()
+        environ["CONTENT_LENGTH"] = original_body_length
+
+        body_bytes = self.body_str.encode("ASCII")
+
+        request = self.prepareRequest_lineBreakTest(body_bytes, environ)
+
+        request.processInputs()
+
+        # Note that processInputs change the values within "_environ"
+        # internally.
+        self.assertEqual(
+            original_body_length + lf_count, request._environ["CONTENT_LENGTH"]
+        )
+
+
+class TestLaunchpadBrowserRequestProcessInputsLarge(TestCase):
+    # Test case for the large inputs slotted into ProcessInputs method
+
+    layer = FunctionalLayer
+
+    def setUp(self):
+        super().setUp()
+
+        self.body_template_str = dedent(
+            """\
+            Content-Type: multipart/mixed; \
+                boundary="===============4913381966751462219=="
+            MIME-Version: 1.0
+            \n\
+            --===============4913381966751462219==
+            Content-Type: text/plain; charset="us-ascii"
+            MIME-Version: 1.0
+            Content-Transfer-Encoding: 7bit
+            Content-Disposition: form-data; name="FORM_SUBMIT"
+            \n\
+            1
+            --===============4913381966751462219==
+            Content-Type: application/octet-stream
+            MIME-Version: 1.0
+            Content-Disposition: form-data; name="field.blob"; \
+                filename="x"
+            \n\
+            %s
+            \n\
+            --===============4913381966751462219==--
+            \n\
+            """
+        )
+
+        self.standard_blob = dedent(
+            """
+            This is for a test....
+            \n\
+            The second line of test
+            \n\
+            The third
+            """
+        )
+
+        self.body_str = self.body_template_str % self.standard_blob
+        self.body_bytes = self.body_str.encode("ASCII")
+
+        self.environ = {
+            "PATH_INFO": "/+storeblob",
+            "REQUEST_METHOD": "POST",
+            "CONTENT_TYPE": "multipart/form-data; \
+                boundary================4913381966751462219==",
+        }
+
+    def prepareRequest_lineBreakTest(self, body_bytes, environ):
+        """Return a `LaunchpadBrowserRequest` with the given form.
+
+        Also set the accepted charset to 'utf-8'.
+        """
+
+        body_stream = io.BytesIO(body_bytes)
+        input_stream = HTTPInputStream(body_stream, environ)
+
+        request = LaunchpadBrowserRequest(input_stream, environ)
+        request.charsets = ["utf-8"]
+        return request
+
+    def _assessProcessInputsResult(self, request):
+        """
+        Assesses whether the request has correctly parsed the blob field.
+
+        Passing in this case means the request's form is correctly parsed
+        and it only includes CRLF endings.
+        """
+
+        self.assertIn("field.blob", request.form)
+
+        result = request.form["field.blob"].read()
+
+        self.assertIn(b"\r\n", result)
+        self.assertNotRegex(result, b"[^\r]\n")
+
+    def test_processInputs_above_buffer_size_limit(self):
+        """
+        processInputs should work even when the initial message overflows the
+        buffer.
+        """
+
+        template_size = len(self.body_template_str) - 2  # Subtracting "%s"
+        buffer_size = LaunchpadBrowserRequest.buffer_size
+
+        space_to_fill = buffer_size - template_size
+        blob_size = len(self.standard_blob)
+
+        # Make the whole request above the buffer size limit
+        multiply_blob = (space_to_fill // blob_size) + 1
+        large_blob = self.standard_blob * multiply_blob
+
+        large_body_str = self.body_template_str % large_blob
+        large_body_bytes = large_body_str.encode("ASCII")
+
+        lf_count = large_body_str.count("\n")
+
+        environ = self.environ.copy()
+        environ["CONTENT_LENGTH"] = len(large_body_str)
+
+        request = self.prepareRequest_lineBreakTest(large_body_bytes, environ)
+
+        old_content_length = request._environ["CONTENT_LENGTH"]
+
+        request.processInputs()
+
+        self.assertEqual(
+            old_content_length + lf_count, request._environ["CONTENT_LENGTH"]
+        )
+        self._assessProcessInputsResult(request)
+
+    def test_processInputs_just_below_buffer_size_limit(self):
+        """
+        processInputs should work even when the message gets overflowed after
+        initiation.
+        """
+
+        template_size = len(self.body_template_str) - 2  # Subtracting "%s"
+        buffer_size = LaunchpadBrowserRequest.buffer_size
+
+        space_to_fill = buffer_size - template_size
+        blob_size = len(self.standard_blob)
+
+        # Make the whole request above the buffer size limit
+        multiply_blob = space_to_fill // blob_size
+        large_blob = self.standard_blob * multiply_blob
+
+        large_body_str = self.body_template_str % large_blob
+        large_body_bytes = large_body_str.encode("ASCII")
+
+        lf_count = large_body_str.count("\n")
+
+        environ = self.environ.copy()
+        environ["CONTENT_LENGTH"] = len(large_body_str)
+
+        request = self.prepareRequest_lineBreakTest(large_body_bytes, environ)
+
+        old_content_length = request._environ["CONTENT_LENGTH"]
+
+        request.processInputs()
+
+        self.assertEqual(
+            old_content_length + lf_count, request._environ["CONTENT_LENGTH"]
+        )
+        self._assessProcessInputsResult(request)
+
+    def test_processInputs_blob_only_LF(self):
+        """
+        processInputs should work even when the whole blob within the message
+        is gibberish, or made up of entirely LFs and is extremely large.
+        """
+
+        buffer_size = LaunchpadBrowserRequest.buffer_size
+
+        lf_blob = "\n" * buffer_size * 2
+
+        lf_body_str = self.body_template_str % lf_blob
+        lf_body_bytes = lf_body_str.encode("ASCII")
+
+        lf_count = lf_body_str.count("\n")
+
+        environ = self.environ.copy()
+        environ["CONTENT_LENGTH"] = len(lf_body_str)
+
+        request = self.prepareRequest_lineBreakTest(lf_body_bytes, environ)
+
+        old_content_length = request._environ["CONTENT_LENGTH"]
+
+        request.processInputs()
+
+        self.assertEqual(
+            old_content_length + lf_count, request._environ["CONTENT_LENGTH"]
+        )
+        self._assessProcessInputsResult(request)
+
+
 class TestWebServiceRequestToBrowserRequest(WebServiceTestCase):
     def test_unicode_path_info(self):
         web_service_request = WebServiceTestRequest(
diff --git a/requirements/launchpad.txt b/requirements/launchpad.txt
index 022a99e..dfb4119 100644
--- a/requirements/launchpad.txt
+++ b/requirements/launchpad.txt
@@ -98,7 +98,7 @@ mistune==0.8.3
 monotonic==1.5
 more-itertools==10.5.0
 msgpack==1.0.2
-multipart==0.2.4
+multipart==1.2.1
 netaddr==0.7.19
 netifaces==0.11.0
 oauth==1.0