launchpad-reviewers team mailing list archive

Thread
Date
Re: [Merge] ~ilkeremrekoc/launchpad:accept-crlf-bugreports into launchpad:master

To: mp+481785@xxxxxxxxxxxxxxxxxx
From: Yuliy Schwartzburg <mp+481785@xxxxxxxxxxxxxxxxxx>
Date: Wed, 26 Feb 2025 13:07:45 -0000
In-reply-to: <174039782253.266779.14302778473473145788.launchpad@scripts-bzrsyncd.lp.internal>
Reply-to: mp+481785@xxxxxxxxxxxxxxxxxx
Sender: noreply@xxxxxxxxxxxxx
Replied

Diff comments:

> diff --git a/lib/lp/bugs/utilities/filebugdataparser.py b/lib/lp/bugs/utilities/filebugdataparser.py
> index 4806dcb..6b37f14 100644
> --- a/lib/lp/bugs/utilities/filebugdataparser.py
> +++ b/lib/lp/bugs/utilities/filebugdataparser.py
> @@ -32,6 +32,105 @@ class FileBugDataParser:
>          self.attachments = []
>          self.BUFFER_SIZE = 8192
>  
> +    # XXX: ilkeremrekoc 2025-02-26: The whole point of trying to find the "LF"
> +    # or "CRLF" is because the Apport package (which is used to send
> +    # bug-reports to Launchpad) uses LF line-breaks since its inception.
> +    # This is against the RFC standards, which standardized the "CRLF" as the
> +    # line-break for all HTTP requests. And because the main parser for zope
> +    # (which is the "multipart" package) started enforcing this standard
> +    # strictly in its newer versions, and considering we must upgrade our
> +    # dependencies, we had to make sure FileBugDataParser accepts the "CRLF"
> +    # as well. But we cannot accept only "CRLF" for now because the apport
> +    # package for every older version of Ubuntu would still use the "LF"
> +    # making it impossible to send bug-reports from older LTS versions.
> +    # So, until a sufficient number or all the apport packages in future,
> +    # current and past Ubuntu versions are patched to only send "CRLF" we
> +    # must use this method to ensure both "LF" and "CRLF" is accepted.
> +    # Once the apport package patches are done, we can revert this change,
> +    # get rid of this method and its tests and simply change
> +    # the "LF" expectance in the previous version into "CRLF" expectance.
> +    def _findLineBreakType(self) -> bytes:
> +        """Find the line-break/end_string the message is using and return it.
> +
> +        Assumptions:
> +            - The method would be run at the start of any parsing run as this
> +            method doesn't open the blob file but resets the blob-stream's
> +            cursor.
> +
> +            - The line-break can only be LF ("\n") or CRLF ("\r\n") with
> +            no CR ("\r") functionality.
> +
> +            - The message must have at least one line-break character when
> +            parsing. An empty message or one without any line-breaks doesn't
> +            count (which should be impossible anyway but I digress).
> +
> +            - The whole message must be made up of a single line-break type.
> +            If more than one type is present, something will break after this
> +            method.
> +
> +        Reads through the message until it finds an LF character ("\n") before
> +        closing and reopening the file-alias/blob to reset its cursor (as the
> +        file-alias/blob doesn't have a "seek" functionality). Then returns
> +        whichever line-break type is used in the message.
> +
> +        ...
> +        :raises AssertionError: If the method cannot find any LF linebreak on
> +        the message stream, it raises.
> +        ...
> +        :return: Either byte typed CRLF (b"\r\n") or byte typed LF (b"\n")
> +        :rtype: bytes
> +        """
> +
> +        # The LF line break is assumed to be a part of the message as it is
> +        # a part of both LF and CRLF.
> +        lf_linebreak = b"\n"
> +
> +        # A temporary buffer we don't need to save outside the scope as it is
> +        # only for finding the first line-break.
> +        temp_buffer = b""
> +
> +        while lf_linebreak not in temp_buffer:
> +            data = self.blob_file.read(self.BUFFER_SIZE)
> +
> +            # Append the data to the temp_buffer. This is to ensure any
> +            # CR ("\r") that is part of a CRLF isn't deleted in a preceding
> +            # buffer accidentally.
> +            temp_buffer += data
> +
> +            if len(data) < self.BUFFER_SIZE:
> +                # End of file.
> +
> +                if lf_linebreak not in temp_buffer:
> +                    # If the linebreak isn't present, then the message must
> +                    # be broken since the method must read from the start
> +
> +                    if (
> +                        b"\r" in temp_buffer
> +                    ):  # If the linebreak inside the whole message is only CR
> +                        raise AssertionError(
> +                            "The wrong linebreak is used. CR isn't accepted."
> +                        )
> +
> +                    raise AssertionError(
> +                        "There are no linebreaks in the blob."
> +                    )
> +                break
> +
> +        # This part is for ".seek(0)" functionality that LibraryFileAlias
> +        # lack, requiring the calls to close() -> open() back to back
> +        # to reset the stream's read cursor to the start of the file.
> +        self.blob_file.close()
> +        self.blob_file.open()
> +
> +        lf_index = temp_buffer.index(lf_linebreak)
> +
> +        # A slice is needed even if for a single character as bytes type acts
> +        # differently in slices and in singe character accesses.

Yup, good practice anyway to re-order your commits logically at the end, rather than in the order you made them.

> +        if temp_buffer[lf_index - 1 : lf_index] == b"\r":
> +            return b"\r\n"
> +
> +        return b"\n"
> +
>      def _consumeBytes(self, end_string):
>          """Read bytes from the message up to the end_string.
>  
> diff --git a/lib/lp/bugs/utilities/tests/test_filebugdataparser.py b/lib/lp/bugs/utilities/tests/test_filebugdataparser.py
> index 28d4be9..84b8e0b 100644
> --- a/lib/lp/bugs/utilities/tests/test_filebugdataparser.py
> +++ b/lib/lp/bugs/utilities/tests/test_filebugdataparser.py
> @@ -44,9 +67,65 @@ class TestFileBugDataParser(TestCase):
>          self.assertEqual(b"", parser._consumeBytes(b"0"))
>          self.assertEqual(b"", parser._consumeBytes(b"0"))
>  
> +    def test__findLineBreakType(self):
> +        # _findLineBreakType reads the whole message until either an LF or
> +        # a CRLF is found, returns that type in bytes string format and exits.
> +
> +        msg = dedent(
> +            """\
> +            Header: value
> +            Space-Folded-Header: this header
> +             is folded with a space.
> +            Tab-Folded-Header: this header
> +            \tis folded with a tab.
> +            Another-header: another-value
> +
> +            Not-A-Header: not-a-value
> +            """
> +        )
> +
> +        # Test the above message with LF endings
> +        msg_with_lf = msg.encode("ASCII")
> +        lf_parser = FileBugDataParser(MockBytesIO(msg_with_lf))
> +        lf_linebreak = lf_parser._findLineBreakType()
> +
> +        self.assertEqual(lf_linebreak, b"\n")

ack

> +
> +        # Test the above message after replacing LF with CRLF
> +        msg_with_crlf = msg.replace("\n", "\r\n").encode("ASCII")
> +        crlf_parser = FileBugDataParser(MockBytesIO(msg_with_crlf))
> +        crlf_linebreak = crlf_parser._findLineBreakType()
> +
> +        self.assertEqual(crlf_linebreak, b"\r\n")
> +
> +        # Test the above message after deleting the line-breaks
> +        # Which should break as there must be at least some line-breaks
> +        # in the message

We should also think about future users though. I don't think it's technically a problem to have no line breaks at all right? Why explicitly prevent this when it's not a necessary condition?

> +        msg_without_linebreak = msg.replace("\n", "").encode("ASCII")
> +        without_linebreak_parser = FileBugDataParser(
> +            MockBytesIO(msg_without_linebreak)
> +        )
> +
> +        self.assertRaisesWithContent(
> +            AssertionError,
> +            "There are no linebreaks in the blob.",
> +            without_linebreak_parser._findLineBreakType,
> +        )
> +
> +        # Test the above message after replacing LF with CR
> +        # Which should break as we do not accept CR line-breaks
> +        msg_with_cr = msg.replace("\n", "\r").encode("ASCII")
> +        cr_parser = FileBugDataParser(MockBytesIO(msg_with_cr))
> +
> +        self.assertRaisesWithContent(
> +            AssertionError,
> +            "The wrong linebreak is used. CR isn't accepted.",
> +            cr_parser._findLineBreakType,
> +        )
> +
>      def test_readLine(self):
>          # readLine reads a single line of the file.
> -        parser = FileBugDataParser(io.BytesIO(b"123\n456\n789"))
> +        parser = FileBugDataParser(MockBytesIO(b"123\n456\n789"))
>          self.assertEqual(b"123\n", parser.readLine())
>          self.assertEqual(b"456\n", parser.readLine())
>          self.assertEqual(b"789", parser.readLine())


-- 
https://code.launchpad.net/~ilkeremrekoc/launchpad/+git/launchpad/+merge/481785
Your team Launchpad code reviewers is requested to review the proposed merge of ~ilkeremrekoc/launchpad:accept-crlf-bugreports into launchpad:master.
References

[Merge] ~ilkeremrekoc/launchpad:accept-crlf-bugreports into launchpad:master
From: İlker Emre Koç, 2025-02-24