[Merge] ~cjwatson/launchpad:py3-encoding into launchpad:master
Colin Watson has proposed merging ~cjwatson/launchpad:py3-encoding into launchpad:master.
Commit message:
Port lp.services.encoding to Python 3
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/386666
escape_nonascii_uniquely's tests become rather simpler by using a raw docstring.
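To illustrate the raw-docstring point (a standalone sketch, not code from this branch): doctest examples live inside a string literal, so an ordinary docstring consumes one level of backslash escaping before doctest ever sees the example, while a raw docstring leaves the example reading exactly the way it executes.

# Standalone sketch of the raw-docstring point above; not part of this branch.
import doctest


def normal_docstring():
    """
    >>> len('\\\\xa9')    # escaped twice: once by the docstring, once by the example
    4
    """


def raw_docstring():
    r"""
    >>> len('\\xa9')      # escaped only inside the example itself
    4
    """


if __name__ == '__main__':
    doctest.testmod()     # both examples pass silently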
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:py3-encoding into launchpad:master.
diff --git a/lib/lp/services/encoding.py b/lib/lp/services/encoding.py
index 7feb308..29a0d3f 100644
--- a/lib/lp/services/encoding.py
+++ b/lib/lp/services/encoding.py
@@ -1,3 +1,7 @@
+# -*- coding: utf-8 -*-
+# NOTE: The first line above must stay first; do not move the copyright
+# notice to the top. See http://www.python.org/dev/peps/pep-0263/.
+#
 # Copyright 2009-2019 Canonical Ltd. This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
@@ -49,8 +53,8 @@ def guess(s):
 
     ASCII is easy
 
-    >>> guess('hello')
-    u'hello'
+    >>> print(guess(b'hello'))
+    hello
 
     Unicode raises an exception to annoy lazy programmers. It should also
     catches bugs as if you have valid Unicode you shouldn't be going anywhere
@@ -63,18 +67,18 @@ def guess(s):
 
     UTF-8 is our best guess
 
-    >>> guess(u'100% Pure Beef\N{TRADE MARK SIGN}'.encode('UTF-8'))
-    u'100% Pure Beef\u2122'
+    >>> print(guess(u'100% Pure Beef\N{TRADE MARK SIGN}'.encode('UTF-8')))
+    100% Pure Beef™
 
     But we fall back to ISO-8859-1 if UTF-8 fails
 
     >>> u = u'Ol\N{LATIN SMALL LETTER E WITH ACUTE}'
     >>> u.encode('UTF-8') == u.encode('ISO-8859-1')
     False
-    >>> guess(u.encode('UTF-8'))
-    u'Ol\xe9'
-    >>> guess(u.encode('ISO-8859-1'))
-    u'Ol\xe9'
+    >>> print(guess(u.encode('UTF-8')))
+    Olé
+    >>> print(guess(u.encode('ISO-8859-1')))
+    Olé
 
     However, if the string contains ISO-8859-1 control characters, it is
     probably a CP1252 document (Windows).
@@ -82,22 +86,22 @@ def guess(s):
     >>> u = u'Show me the \N{EURO SIGN}'
     >>> u.encode('UTF-8') == u.encode('CP1252')
     False
-    >>> guess(u.encode('UTF-8'))
-    u'Show me the \u20ac'
-    >>> guess(u.encode('CP1252'))
-    u'Show me the \u20ac'
+    >>> print(guess(u.encode('UTF-8')))
+    Show me the €
+    >>> print(guess(u.encode('CP1252')))
+    Show me the €
 
     We also check for characters common in ISO-8859-15 that are uncommon
     in ISO-8859-1, and use ISO-8859-15 if they are found.
 
     >>> u = u'\N{LATIN SMALL LETTER S WITH CARON}'
-    >>> guess(u.encode('iso-8859-15'))
-    u'\u0161'
+    >>> print(guess(u.encode('iso-8859-15')))
+    š
 
     Strings with a BOM are unambiguous.
 
-    >>> guess(u'hello'.encode('UTF-16'))
-    u'hello'
+    >>> print(guess(u'hello'.encode('UTF-16')))
+    hello
 
     However, UTF-16 strings without a BOM will be interpreted as ISO-8859-1.
     I doubt this is a problem, as we are unlikely to see this except with
@@ -105,59 +109,60 @@ def guess(s):
     at the moment like ISO-2022-jp, BIG5, SHIFT-JIS etc. will be a bigger
     problem.
 
-    >>> guess(u'hello'.encode('UTF-16be'))
-    u'\x00h\x00e\x00l\x00l\x00o'
+    >>> guess(u'hello'.encode('UTF-16be')) == u'\x00h\x00e\x00l\x00l\x00o'
+    True
 
     '''
     # Calling this method with a Unicode argument indicates a hidden bug
    # that will bite you eventually -- StuartBishop 20050709
-    if isinstance(s, unicode):
+    if isinstance(s, six.text_type):
        raise TypeError(
            'encoding.guess called with Unicode string %r' % (s,)
            )
 
     # Attempt to use an objects default Unicode conversion, for objects
     # that can encode themselves as ASCII.
-    try:
-        return unicode(s)
-    except UnicodeDecodeError:
-        pass
+    if not isinstance(s, bytes):
+        try:
+            return six.text_type(s)
+        except UnicodeDecodeError:
+            pass
 
     # Detect BOM
     try:
         for bom, encoding in _boms:
             if s.startswith(bom):
-                return unicode(s[len(bom):], encoding)
+                return six.text_type(s[len(bom):], encoding)
     except UnicodeDecodeError:
         pass
 
     # Try preferred encoding
     try:
-        return unicode(s, 'UTF-8')
+        return six.text_type(s, 'UTF-8')
     except UnicodeDecodeError:
         pass
 
     # If we have characters in this range, it is probably CP1252
-    if re.search(r"[\x80-\x9f]", s) is not None:
+    if re.search(br"[\x80-\x9f]", s) is not None:
         try:
-            return unicode(s, 'CP1252')
+            return six.text_type(s, 'CP1252')
         except UnicodeDecodeError:
             pass
 
     # If we have characters in this range, it is probably ISO-8859-15
-    if re.search(r"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", s) is not None:
+    if re.search(br"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", s) is not None:
         try:
-            return unicode(s, 'ISO-8859-15')
+            return six.text_type(s, 'ISO-8859-15')
         except UnicodeDecodeError:
             pass
 
     # Otherwise we default to ISO-8859-1
-    return unicode(s, 'ISO-8859-1', 'replace')
+    return six.text_type(s, 'ISO-8859-1', 'replace')
 
 
 def escape_nonascii_uniquely(bogus_string):
-    """Replace non-ascii characters with a hex representation.
+    r"""Replace non-ascii characters with a hex representation.
 
     This is mainly for preventing emails with invalid characters from causing
     oopses. The nonascii characters could have been removed or just converted
@@ -166,24 +171,16 @@ def escape_nonascii_uniquely(bogus_string):
     all the nonascii characters have been replaced with the same ascii
     character.
 
-    Unfortunately, all the strings below are actually part of this
-    function's docstring, so python processes the backslash once before
-    doctest, and then python processes it again when doctest runs the
-    test. This makes it confusing, since four backslashes will get
-    converted into a single ascii character.
-
-    >>> print len('\xa9'), len('\\xa9'), len('\\\\xa9')
-    1 1 4
-    >>> print escape_nonascii_uniquely('hello \xa9')
-    hello \\xa9
-    >>> print escape_nonascii_uniquely('hello \\xa9')
-    hello \\xa9
+    >>> print(len('\xa9'), len('\\xa9'))
+    1 4
+    >>> print(escape_nonascii_uniquely('hello \xa9'))
+    hello \xa9
 
     This string only has ascii characters, so escape_nonascii_uniquely()
     actually has no effect.
 
-    >>> print escape_nonascii_uniquely('hello \\\\xa9')
-    hello \\xa9
+    >>> print(escape_nonascii_uniquely('hello \\xa9'))
+    hello \xa9
     """
     nonascii_regex = re.compile(r'[\200-\377]')
@@ -198,19 +195,22 @@ def escape_nonascii_uniquely(bogus_string):
 
 
 def is_ascii_only(string):
-    """Ensure that the string contains only ASCII characters.
+    r"""Ensure that the string contains only ASCII characters.
 
     >>> is_ascii_only(u'ascii only')
     True
-    >>> is_ascii_only('ascii only')
+    >>> is_ascii_only(b'ascii only')
     True
-    >>> is_ascii_only('\xf4')
+    >>> is_ascii_only(b'\xf4')
     False
     >>> is_ascii_only(u'\xf4')
     False
     """
     try:
-        string.encode('ascii')
+        if isinstance(string, bytes):
+            string.decode('ascii')
+        else:
+            string.encode('ascii')
     except UnicodeError:
         return False
     else:
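Not part of the branch, but a quick way to see the decision cascade documented in guess() above (BOM first, then UTF-8, then CP1252 when C1 control bytes appear, then ISO-8859-15 for its marker characters, and finally ISO-8859-1 with replacement) is a throwaway driver along these lines, run inside a Launchpad tree on either Python 2 or 3:

# Throwaway driver, not part of this branch; assumes a Launchpad tree so
# that lp.services.encoding is importable.  Each sample is taken from the
# docstring above and should hit a different branch of the cascade.
from __future__ import print_function

from lp.services.encoding import guess

samples = [
    (u'hello'.encode('UTF-16'), 'BOM detected'),
    (u'100% Pure Beef\u2122'.encode('UTF-8'), 'valid UTF-8'),
    (u'Show me the \u20ac'.encode('CP1252'), 'C1 bytes, so CP1252'),
    (u'\u0161'.encode('ISO-8859-15'), 'ISO-8859-15 marker byte'),
    (u'Ol\xe9'.encode('ISO-8859-1'), 'final ISO-8859-1 fallback'),
]
for raw, why in samples:
    print(why, '->', repr(guess(raw)))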
diff --git a/lib/lp/services/tests/test_encoding.py b/lib/lp/services/tests/test_encoding.py
index 4a0e235..e271e55 100644
--- a/lib/lp/services/tests/test_encoding.py
+++ b/lib/lp/services/tests/test_encoding.py
@@ -1,6 +1,8 @@
 # Copyright 2009-2019 Canonical Ltd. This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
+from __future__ import absolute_import, print_function, unicode_literals
+
 from doctest import (
     DocTestSuite,
     ELLIPSIS,
@@ -38,7 +40,13 @@ class TestWSGINativeString(TestCase):
 
 
 def test_suite():
+    globs = {
+        'absolute_import': absolute_import,
+        'print_function': print_function,
+        'unicode_literals': unicode_literals,
+    }
     return unittest.TestSuite((
         unittest.TestLoader().loadTestsFromName(__name__),
-        DocTestSuite(lp.services.encoding, optionflags=ELLIPSIS),
+        DocTestSuite(
+            lp.services.encoding, extraglobs=globs, optionflags=ELLIPSIS),
     ))
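One note on the test_suite() change: doctest works out which __future__ compiler flags to apply to its examples by looking for feature objects bound under their own names in the test globals. lp.services.encoding itself gains no __future__ imports in this branch, so seeding extraglobs with the features is presumably what keeps the print(...) and string-literal examples behaving the same on Python 2 and Python 3. The mechanism can be checked in an interpreter (this pokes at a private doctest helper, shown purely for illustration):

>>> import doctest, __future__
>>> globs = {'print_function': __future__.print_function,
...          'unicode_literals': __future__.unicode_literals}
>>> flags = doctest._extract_future_flags(globs)
>>> bool(flags & __future__.print_function.compiler_flag)
True
>>> bool(flags & __future__.unicode_literals.compiler_flag)
True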