launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #03883
[Merge] lp:~benji/launchpad/bug-191199 into lp:launchpad
Benji York has proposed merging lp:~benji/launchpad/bug-191199 into lp:launchpad.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
Related bugs:
Bug #191199 in Launchpad itself: "XML from Bugzilla is not guaranteed well-formed or correctly encoded, so treat it as such"
https://bugs.launchpad.net/launchpad/+bug/191199
For more details, see:
https://code.launchpad.net/~benji/launchpad/bug-191199/+merge/64058
Bug 191199 describes a problem importing Bugzilla bugs that contain ASCII control characters. This branch fixes the problem by stripping low-ASCII control characters (other than the whitespace characters \n\r\t) after encoding to UTF-8. This is safe because non-ASCII characters in UTF-8 are represented with the high bit set and therefore won't be confused as ASCII control characters.
After fixing some minor pre-existing lint, the lint report is clear.
--
https://code.launchpad.net/~benji/launchpad/bug-191199/+merge/64058
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~benji/launchpad/bug-191199 into lp:launchpad.
=== modified file 'lib/lp/bugs/doc/externalbugtracker-bugzilla-oddities.txt'
--- lib/lp/bugs/doc/externalbugtracker-bugzilla-oddities.txt 2011-02-23 08:57:39 +0000
+++ lib/lp/bugs/doc/externalbugtracker-bugzilla-oddities.txt 2011-06-09 17:21:08 +0000
@@ -138,3 +138,13 @@
...
UnparsableBugData: Failed to parse XML description...
+However, embedded control characters do not generate errors.
+
+ >>> from lp.bugs.tests.externalbugtracker import (
+ ... AnotherBrokenBugzilla)
+ >>> broken_bugzilla = AnotherBrokenBugzilla(mozilla_bugzilla.baseurl)
+ >>> r"NOT\x01USED" in repr(broken_bugzilla._readBugItemFile())
+ True
+
+ >>> remote_bugs = ['42', '2000']
+ >>> broken_bugzilla.initializeRemoteBugDB(remote_bugs) # no exception
=== modified file 'lib/lp/bugs/externalbugtracker/bugzilla.py'
--- lib/lp/bugs/externalbugtracker/bugzilla.py 2011-05-12 21:33:10 +0000
+++ lib/lp/bugs/externalbugtracker/bugzilla.py 2011-06-09 17:21:08 +0000
@@ -12,10 +12,11 @@
]
from email.Utils import parseaddr
-import re
from httplib import BadStatusLine
from urllib2 import URLError
from xml.dom import minidom
+import re
+import string
import xml.parsers.expat
import xmlrpclib
@@ -59,7 +60,7 @@
class Bugzilla(ExternalBugTracker):
"""An ExternalBugTrack for dealing with remote Bugzilla systems."""
- batch_query_threshold = 0 # Always use the batch method.
+ batch_query_threshold = 0 # Always use the batch method.
_test_xmlrpc_proxy = None
def __init__(self, baseurl, version=None):
@@ -177,6 +178,15 @@
# encoding that page is in, and then encode() it into the utf-8
# that minidom requires.
contents = encoding.guess(contents).encode("utf-8")
+ # Since the string is utf-8 encoded and utf-8 encoded string have the
+ # high bit set for non-ASCII characters, we can now strip out any
+ # ASCII control characters without touching encoded Unicode
+ # characters.
+ bad_chars = ''.join(chr(i) for i in range(0, 32))
+ for char in '\n\r\t':
+ bad_chars = bad_chars.replace(char, '')
+ trans_map = string.maketrans(bad_chars, ' ' * len(bad_chars))
+ contents = contents.translate(trans_map)
return minidom.parseString(contents)
def _probe_version(self):
@@ -489,7 +499,7 @@
"""See `ExternalBugTracker`."""
try:
if bug_id not in self.remote_bug_importance:
- return "Bug %s is not in remote_bug_importance" %(bug_id)
+ return "Bug %s is not in remote_bug_importance" % bug_id
return self.remote_bug_importance[bug_id]
except:
return UNKNOWN_REMOTE_IMPORTANCE
=== modified file 'lib/lp/bugs/tests/externalbugtracker.py'
--- lib/lp/bugs/tests/externalbugtracker.py 2011-03-01 23:51:13 +0000
+++ lib/lp/bugs/tests/externalbugtracker.py 2011-06-09 17:21:08 +0000
@@ -358,6 +358,11 @@
2000: ('RESOLVED', 'FIXED', 'LOW', 'BLOCKER')}
+class AnotherBrokenBugzilla(TestBrokenBugzilla):
+ """Test parsing of a Bugzilla which returns broken XML."""
+ bug_item_file = 'unescaped_control_character.xml'
+
+
class TestIssuezilla(TestBugzilla):
"""Test support for Issuezilla, with slightly modified XML."""
version_file = 'issuezilla_version.xml'
=== added file 'lib/lp/bugs/tests/testfiles/unescaped_control_character.xml'
--- lib/lp/bugs/tests/testfiles/unescaped_control_character.xml 1970-01-01 00:00:00 +0000
+++ lib/lp/bugs/tests/testfiles/unescaped_control_character.xml 2011-06-09 17:21:08 +0000
@@ -0,0 +1,17 @@
+ <li>
+
+ <bz:bug rdf:about="http://bugzilla.gnome.org/show_bug.cgi?id=%(bug_id)s">
+
+ <bz:id nc:parseType="Integer">%(bug_id)s</bz:id>
+
+ <bz:bug_severity>%(severity)s</bz:bug_severity>
+ <bz:priority>%(priority)s</bz:priority>
+ <bz:op_sys>NOT USED</bz:op_sys>
+ <bz:product>NOTUSED</bz:product>
+ <bz:bug_status>%(status)s</bz:bug_status>
+ <bz:resolution>%(resolution)s</bz:resolution>
+ <bz:short_short_desc>NOT USED</bz:short_short_desc>
+
+ </bz:bug>
+
+ </li>