← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~abentley/launchpad/macintosh-encoding into lp:launchpad

 

Aaron Bentley has proposed merging lp:~abentley/launchpad/macintosh-encoding into lp:launchpad.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #820039 in Launchpad itself: "process-mail.py fails with a LookupError: unknown encoding: macintosh error"
  https://bugs.launchpad.net/launchpad/+bug/820039

For more details, see:
https://code.launchpad.net/~abentley/launchpad/macintosh-encoding/+merge/71599

= Summary =
Fix bug #820039: process-mail.py fails with a LookupError: unknown encoding: macintosh error

== Proposed fix ==
Handle 'macintosh' as mac_roman, handle unknown encodings as latin-1

== Pre-implementation notes ==
Discussed with deryck

== Implementation details ==
Python itself has decided to treat 'macintosh' as 'mac_roman', but our version doesn't have this fix: http://hg.python.org/cpython/rev/0c3d763f38f1/

Unknown encodings are handled as latin-1, because they must not oops, and most encodings are somewhat readable in latin-1.  If the content contains non-ascii characters, the translation is likely to be somewhat incorrect, so a warning is emitted.

== Tests ==
bin/test test_message

== Demo and Q/A ==
Craft an email using the 'macintosh' encoding, and send it.  It should be decoded correctly.  Craft an email using the 'booga' encoding, using only ASCII characters.  It should be decoded as if it was ascii.  Craft an email using the 'booga' encoding, using high-bit characters.  It should be decoded as if it was latin-1, and a warning should be emitted.


= Launchpad lint =

Checking for conflicts and issues in changed files.

Linting changed files:
  lib/lp/services/messages/model/message.py
  lib/lp/services/messages/tests/test_message.py
-- 
https://code.launchpad.net/~abentley/launchpad/macintosh-encoding/+merge/71599
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~abentley/launchpad/macintosh-encoding into lp:launchpad.
=== modified file 'lib/lp/services/messages/model/message.py'
--- lib/lp/services/messages/model/message.py	2011-08-12 14:39:51 +0000
+++ lib/lp/services/messages/model/message.py	2011-08-15 19:24:18 +0000
@@ -27,6 +27,7 @@
     parseaddr,
     parsedate_tz,
     )
+import logging
 from operator import attrgetter
 import os.path
 
@@ -207,6 +208,10 @@
 class MessageSet:
     implements(IMessageSet)
 
+    extra_encoding_aliases = {
+        'macintosh': 'mac_roman',
+    }
+
     def get(self, rfc822msgid):
         messages = list(Message.selectBy(rfc822msgid=rfc822msgid))
         if len(messages) == 0:
@@ -231,6 +236,19 @@
         Store.of(message).flush()
         return message
 
+    @classmethod
+    def decode(self, encoded, encoding):
+        encoding = self.extra_encoding_aliases.get(encoding, encoding)
+        try:
+            return encoded.decode(encoding, 'replace')
+        except LookupError:
+            try:
+                return encoded.decode('us-ascii')
+            except UnicodeDecodeError:
+                logging.getLogger().warning(
+                    'Treating unknown encoding "%s" as latin-1.' % encoding)
+                return encoded.decode('latin-1')
+
     def _decode_header(self, header):
         r"""Decode an RFC 2047 encoded header.
 
@@ -264,7 +282,7 @@
             # cause problems in unusual encodings that we are hopefully
             # unlikely to encounter in this part of the code.
             re_encoded_bits.append(
-                (bytes.decode(charset, 'replace').encode('utf-8'), 'utf-8'))
+                (self.decode(bytes, charset).encode('utf-8'), 'utf-8'))
 
         return unicode(email.Header.make_header(re_encoded_bits))
 
@@ -441,8 +459,7 @@
                 charset = part.get_content_charset()
                 if charset is None or str(charset).lower() == 'x-unknown':
                     charset = 'latin-1'
-
-                content = content.decode(charset, 'replace')
+                content = self.decode(content, charset)
 
                 if content.strip():
                     MessageChunk(

=== modified file 'lib/lp/services/messages/tests/test_message.py'
--- lib/lp/services/messages/tests/test_message.py	2011-05-31 20:59:10 +0000
+++ lib/lp/services/messages/tests/test_message.py	2011-08-15 19:24:18 +0000
@@ -4,14 +4,16 @@
 __metaclass__ = type
 
 from cStringIO import StringIO
-from email.Message import Message
-from email.MIMEMultipart import MIMEMultipart
-from email.MIMEText import MIMEText
-from email.Utils import (
+from email.header import Header
+from email.message import Message
+from email import (
+    MIMEMultipart,
+    MIMEText,
+    )
+from email.utils import (
     formatdate,
     make_msgid,
     )
-import unittest
 
 from sqlobject import SQLObjectNotFound
 import transaction
@@ -29,17 +31,22 @@
     )
 from lp.services.job.model.job import Job
 from lp.services.mail.sendmail import MailController
-from lp.testing import TestCaseWithFactory
+from lp.testing import (
+    TestCase,
+    TestCaseWithFactory,
+    )
 from lp.testing.factory import LaunchpadObjectFactory
 
 
-class TestMessageSet(unittest.TestCase):
+class TestMessageSet(TestCase):
     """Test the methods of `MessageSet`."""
 
     layer = LaunchpadFunctionalLayer
 
+    high_characters = ''.join(chr(c) for c in range(128, 256))
+
     def setUp(self):
-        unittest.TestCase.setUp(self)
+        super(TestMessageSet, self).setUp()
         # Testing behavior, not permissions here.
         login('foo.bar@xxxxxxxxxxxxx')
         self.factory = LaunchpadObjectFactory()
@@ -126,7 +133,7 @@
         msg.attach(MIMEText('This is the body of the email.'))
         attachment = Message()
         attachment.set_payload('This is the diff, honest.')
-        attachment['Content-Type'] = 'text/x-diff'
+        attachment['content-type'] = 'text/x-diff'
         attachment['Content-Disposition'] = (
             'attachment; filename="/tmp/foo/review.diff"')
         msg.attach(attachment)
@@ -149,6 +156,53 @@
         dupe_message = MessageSet().fromEmail(email.as_string())
         self.assertNotEqual(orig_message.id, dupe_message.id)
 
+    def makeEncodedEmail(self, encoding_name, actual_encoding):
+        email = self.factory.makeEmailMessage(body=self.high_characters)
+        email.set_type('text/plain')
+        email.set_charset(encoding_name)
+        macroman = Header(self.high_characters, actual_encoding).encode()
+        new_subject = macroman.replace(actual_encoding, encoding_name)
+        email.replace_header('Subject', new_subject)
+        return email
+
+    def test_fromEmail_decodes_macintosh_encoding(self):
+        """"macintosh encoding is equivalent to MacRoman."""
+        high_decoded = self.high_characters.decode('macroman')
+        email = self.makeEncodedEmail('macintosh', 'macroman')
+        message = MessageSet().fromEmail(email.as_string())
+        self.assertEqual(high_decoded, message.subject)
+        self.assertEqual(high_decoded, message.text_contents)
+
+    def test_fromEmail_decodes_booga_encoding(self):
+        """"'booga' encoding is decoded as latin-1."""
+        high_decoded = self.high_characters.decode('latin-1')
+        email = self.makeEncodedEmail('booga', 'latin-1')
+        message = MessageSet().fromEmail(email.as_string())
+        self.assertEqual(high_decoded, message.subject)
+        self.assertEqual(high_decoded, message.text_contents)
+
+    def test_decode_utf8(self):
+        """Test decode with a known encoding."""
+        result = MessageSet.decode(u'\u1234'.encode('utf-8'), 'utf-8')
+        self.assertEqual(u'\u1234', result)
+
+    def test_decode_macintosh(self):
+        """Test decode with macintosh encoding."""
+        result = MessageSet.decode(self.high_characters, 'macintosh')
+        self.assertEqual(self.high_characters.decode('macroman'), result)
+
+    def test_decode_unknown_ascii(self):
+        """Test decode with ascii characters in an unknown encoding."""
+        result = MessageSet.decode('abcde', 'booga')
+        self.assertEqual(u'abcde', result)
+
+    def test_decode_unknown_high_characters(self):
+        """Test decode with non-ascii characters in an unknown encoding."""
+        with self.expectedLog(
+            'Treating unknown encoding "booga" as latin-1.'):
+            result = MessageSet.decode(self.high_characters, 'booga')
+        self.assertEqual(self.high_characters.decode('latin-1'), result)
+
 
 class TestMessageJob(TestCaseWithFactory):
     """Tests for MessageJob."""