← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~benji/launchpad/bug-191199 into lp:launchpad

 

Benji York has proposed merging lp:~benji/launchpad/bug-191199 into lp:launchpad.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #191199 in Launchpad itself: "XML from Bugzilla is not guaranteed well-formed or correctly encoded, so treat it as such"
  https://bugs.launchpad.net/launchpad/+bug/191199

For more details, see:
https://code.launchpad.net/~benji/launchpad/bug-191199/+merge/64058

Bug 191199 describes a problem importing Bugzilla bugs that contain ASCII control characters.  This branch fixes the problem by stripping low-ASCII control characters (other than the whitespace characters \n\r\t) after encoding to UTF-8.  This is safe because non-ASCII characters in UTF-8 are represented with the high bit set and therefore won't be confused as ASCII control characters.

After fixing some minor pre-existing lint, the lint report is clear.
-- 
https://code.launchpad.net/~benji/launchpad/bug-191199/+merge/64058
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~benji/launchpad/bug-191199 into lp:launchpad.
=== modified file 'lib/lp/bugs/doc/externalbugtracker-bugzilla-oddities.txt'
--- lib/lp/bugs/doc/externalbugtracker-bugzilla-oddities.txt	2011-02-23 08:57:39 +0000
+++ lib/lp/bugs/doc/externalbugtracker-bugzilla-oddities.txt	2011-06-09 17:21:08 +0000
@@ -138,3 +138,13 @@
     ...
     UnparsableBugData: Failed to parse XML description...
 
+However, embedded control characters do not generate errors.
+
+    >>> from lp.bugs.tests.externalbugtracker import (
+    ...     AnotherBrokenBugzilla)
+    >>> broken_bugzilla = AnotherBrokenBugzilla(mozilla_bugzilla.baseurl)
+    >>> r"NOT\x01USED" in repr(broken_bugzilla._readBugItemFile())
+    True
+
+    >>> remote_bugs = ['42', '2000']
+    >>> broken_bugzilla.initializeRemoteBugDB(remote_bugs) # no exception

=== modified file 'lib/lp/bugs/externalbugtracker/bugzilla.py'
--- lib/lp/bugs/externalbugtracker/bugzilla.py	2011-05-12 21:33:10 +0000
+++ lib/lp/bugs/externalbugtracker/bugzilla.py	2011-06-09 17:21:08 +0000
@@ -12,10 +12,11 @@
     ]
 
 from email.Utils import parseaddr
-import re
 from httplib import BadStatusLine
 from urllib2 import URLError
 from xml.dom import minidom
+import re
+import string
 import xml.parsers.expat
 import xmlrpclib
 
@@ -59,7 +60,7 @@
 class Bugzilla(ExternalBugTracker):
     """An ExternalBugTrack for dealing with remote Bugzilla systems."""
 
-    batch_query_threshold = 0 # Always use the batch method.
+    batch_query_threshold = 0  # Always use the batch method.
     _test_xmlrpc_proxy = None
 
     def __init__(self, baseurl, version=None):
@@ -177,6 +178,15 @@
         # encoding that page is in, and then encode() it into the utf-8
         # that minidom requires.
         contents = encoding.guess(contents).encode("utf-8")
+        # Since the string is utf-8 encoded and utf-8 encoded string have the
+        # high bit set for non-ASCII characters, we can now strip out any
+        # ASCII control characters without touching encoded Unicode
+        # characters.
+        bad_chars = ''.join(chr(i) for i in range(0, 32))
+        for char in '\n\r\t':
+            bad_chars = bad_chars.replace(char, '')
+        trans_map = string.maketrans(bad_chars, ' ' * len(bad_chars))
+        contents = contents.translate(trans_map)
         return minidom.parseString(contents)
 
     def _probe_version(self):
@@ -489,7 +499,7 @@
         """See `ExternalBugTracker`."""
         try:
             if bug_id not in self.remote_bug_importance:
-                return "Bug %s is not in remote_bug_importance" %(bug_id)
+                return "Bug %s is not in remote_bug_importance" % bug_id
             return self.remote_bug_importance[bug_id]
         except:
             return UNKNOWN_REMOTE_IMPORTANCE

=== modified file 'lib/lp/bugs/tests/externalbugtracker.py'
--- lib/lp/bugs/tests/externalbugtracker.py	2011-03-01 23:51:13 +0000
+++ lib/lp/bugs/tests/externalbugtracker.py	2011-06-09 17:21:08 +0000
@@ -358,6 +358,11 @@
                 2000: ('RESOLVED', 'FIXED', 'LOW', 'BLOCKER')}
 
 
+class AnotherBrokenBugzilla(TestBrokenBugzilla):
+    """Test parsing of a Bugzilla which returns broken XML."""
+    bug_item_file = 'unescaped_control_character.xml'
+
+
 class TestIssuezilla(TestBugzilla):
     """Test support for Issuezilla, with slightly modified XML."""
     version_file = 'issuezilla_version.xml'

=== added file 'lib/lp/bugs/tests/testfiles/unescaped_control_character.xml'
--- lib/lp/bugs/tests/testfiles/unescaped_control_character.xml	1970-01-01 00:00:00 +0000
+++ lib/lp/bugs/tests/testfiles/unescaped_control_character.xml	2011-06-09 17:21:08 +0000
@@ -0,0 +1,17 @@
+      <li>
+
+        <bz:bug rdf:about="http://bugzilla.gnome.org/show_bug.cgi?id=%(bug_id)s">
+
+          <bz:id nc:parseType="Integer">%(bug_id)s</bz:id>
+
+          <bz:bug_severity>%(severity)s</bz:bug_severity>
+          <bz:priority>%(priority)s</bz:priority>
+          <bz:op_sys>NOT USED</bz:op_sys>
+          <bz:product>NOTUSED</bz:product>
+          <bz:bug_status>%(status)s</bz:bug_status>
+          <bz:resolution>%(resolution)s</bz:resolution>
+          <bz:short_short_desc>NOT USED</bz:short_short_desc>
+
+        </bz:bug>
+
+      </li>