openlp-core team mailing list archive

Thread
Date

[Merge] lp:~whydoubt/openlp/enhance_strip_rtf into lp:openlp

To: mp+184875@xxxxxxxxxxxxxxxxxx
From: Jeffrey Smith <whydoubt@xxxxxxxxx>
Date: Tue, 10 Sep 2013 20:55:48 -0000
Reply-to: mp+184875@xxxxxxxxxxxxxxxxxx
Sender: bounces@xxxxxxxxxxxxx

Jeffrey Smith has proposed merging lp:~whydoubt/openlp/enhance_strip_rtf into lp:openlp.

Requested reviews:
  Andreas Preikschat (googol)

For more details, see:
https://code.launchpad.net/~whydoubt/openlp/enhance_strip_rtf/+merge/184875
-- 
https://code.launchpad.net/~whydoubt/openlp/enhance_strip_rtf/+merge/184875
Your team OpenLP Core is subscribed to branch lp:openlp.

=== modified file 'openlp/plugins/songs/lib/__init__.py'
--- openlp/plugins/songs/lib/__init__.py	2013-08-31 18:17:38 +0000
+++ openlp/plugins/songs/lib/__init__.py	2013-09-10 20:55:06 +0000
@@ -46,7 +46,13 @@
 
 WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
 APOSTROPHE = re.compile('[\'`’ʻ′]', re.UNICODE)
-PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
+# PATTERN will look for the next occurrence of one of these symbols:
+#   \controlword - optionally preceded by \*, optionally followed by a number
+#   \'## - where ## is a pair of hex digits, representing a single character
+#   \# - where # is a single non-alpha character, representing a special symbol
+#   { or } - marking the beginning/end of a group
+#   a run of characters without any \ { } or end-of-line
+PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I)
 # RTF control words which specify a "destination" to be ignored.
 DESTINATIONS = frozenset((
     'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
@@ -57,8 +63,8 @@
     'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm',
     'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
     'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr',
-    'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field',
-    'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype', 'fname',
+    'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext',
+    'file', 'filetbl', 'fldinst', 'fldtype', 'fname',
     'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr',
     'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g',
     'generator', 'gridtbl', 'header', 'headerf', 'headerl',
@@ -106,6 +112,11 @@
     'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen'))
 # Translation of some special characters.
 SPECIAL_CHARS = {
+    '\n': '\n',
+    '\r': '\n',
+    '~': '\u00A0',
+    '-': '\u00AD',
+    '_': '\u2011',
     'par': '\n',
     'sect': '\n\n',
     # Required page and column break.
@@ -132,16 +143,19 @@
     'zwj': '\u200D',
     'zwnj': '\u200C'}
 CHARSET_MAPPING = {
-    'fcharset0': 'cp1252',
-    'fcharset161': 'cp1253',
-    'fcharset162': 'cp1254',
-    'fcharset163': 'cp1258',
-    'fcharset177': 'cp1255',
-    'fcharset178': 'cp1256',
-    'fcharset186': 'cp1257',
-    'fcharset204': 'cp1251',
-    'fcharset222': 'cp874',
-    'fcharset238': 'cp1250'}
+    '0': 'cp1252',
+    '128': 'cp932',
+    '129': 'cp949',
+    '134': 'cp936',
+    '161': 'cp1253',
+    '162': 'cp1254',
+    '163': 'cp1258',
+    '177': 'cp1255',
+    '178': 'cp1256',
+    '186': 'cp1257',
+    '204': 'cp1251',
+    '222': 'cp874',
+    '238': 'cp1250'}
 
 
 class VerseType(object):
@@ -351,7 +365,7 @@
             if recommendation == encodings[index][0]:
                 recommended_index = index
                 break
-    if recommended_index > 0:
+    if recommended_index > -1:
         choice = QtGui.QInputDialog.getItem(None,
             translate('SongsPlugin', 'Character Encoding'),
             translate('SongsPlugin', 'The codepage setting is responsible\n'
@@ -365,7 +379,7 @@
                 [pair[1] for pair in encodings], 0, False)
     if not choice[1]:
         return None
-    return filter(lambda item: item[1] == choice[0], encodings)[0][0]
+    return next(filter(lambda item: item[1] == choice[0], encodings))[0]
 
 
 def clean_string(string):
@@ -521,43 +535,59 @@
     curskip = 0
     # Output buffer.
     out = []
+    # Encoded buffer.
+    ebytes = bytearray()
     for match in PATTERN.finditer(text):
-        word, arg, hex, char, brace, tchar = match.groups()
+        iinu, word, arg, hex, char, brace, tchar = match.groups()
+        # \x (non-alpha character)
+        if char:
+            if char in '\\{}':
+                tchar = char
+            else:
+                word = char
+        # Flush encoded buffer to output buffer
+        if ebytes and not hex and not tchar:
+            failed = False
+            while True:
+                try:
+                    encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
+                    if not encoding:
+                        return None
+                    dbytes = ebytes.decode(encoding)
+                    # Code 5C is a peculiar case with Windows Codepage 932
+                    if encoding == 'cp932' and '\\' in dbytes:
+                        dbytes = dbytes.replace('\\', '\u00A5')
+                    out.append(dbytes)
+                    ebytes.clear()
+                except UnicodeDecodeError:
+                    failed = True
+                else:
+                    break
+        # {}
         if brace:
             curskip = 0
             if brace == '{':
                 # Push state
                 stack.append((ucskip, ignorable, font))
-            elif brace == '}':
+            elif brace == '}' and len(stack) > 0:
                 # Pop state
                 ucskip, ignorable, font = stack.pop()
-        # \x (not a letter)
-        elif char:
-            curskip = 0
-            if char == '~' and not ignorable:
-                out.append('\xA0')
-            elif char in '{}\\' and not ignorable:
-                out.append(char)
-            elif char == '-' and not ignorable:
-                out.append('\u00AD')
-            elif char == '_' and not ignorable:
-                out.append('\u2011')
-            elif char == '*':
-                ignorable = True
         # \command
         elif word:
             curskip = 0
             if word in DESTINATIONS:
                 ignorable = True
             elif word in SPECIAL_CHARS:
-                out.append(SPECIAL_CHARS[word])
+                if not ignorable:
+                    out.append(SPECIAL_CHARS[word])
             elif word == 'uc':
                 ucskip = int(arg)
-            elif word == ' ':
+            elif word == 'u':
                 c = int(arg)
                 if c < 0:
                     c += 0x10000
-                out.append(chr(c))
+                if not ignorable:
+                    out.append(chr(c))
                 curskip = ucskip
             elif word == 'fonttbl':
                 ignorable = True
@@ -565,31 +595,24 @@
                 font = arg
             elif word == 'ansicpg':
                 font_table[font] = 'cp' + arg
-            elif word == 'fcharset' and font not in font_table and word + arg in CHARSET_MAPPING:
-                # \ansicpg overrides \fcharset, if present.
-                font_table[font] = CHARSET_MAPPING[word + arg]
+            elif word == 'fcharset' and font not in font_table and arg in CHARSET_MAPPING:
+                font_table[font] = CHARSET_MAPPING[arg]
+            elif word == 'fldrslt':
+                pass
+            # \* 'Ignore if not understood' marker
+            elif iinu:
+                ignorable = True
         # \'xx
         elif hex:
             if curskip > 0:
                 curskip -= 1
             elif not ignorable:
-                charcode = int(hex, 16)
-                failed = False
-                while True:
-                    try:
-                        encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
-                        if not encoding:
-                            return None
-                        out.append(chr(charcode).decode(encoding))
-                    except UnicodeDecodeError:
-                        failed = True
-                    else:
-                        break
+                ebytes.append(int(hex, 16))
         elif tchar:
             if curskip > 0:
                 curskip -= 1
             elif not ignorable:
-                out.append(tchar)
+                ebytes += tchar.encode()
     text = ''.join(out)
     return text, default_encoding
 

=== modified file 'tests/functional/openlp_plugins/songs/test_lib.py'
--- tests/functional/openlp_plugins/songs/test_lib.py	2013-08-31 18:17:38 +0000
+++ tests/functional/openlp_plugins/songs/test_lib.py	2013-09-10 20:55:06 +0000
@@ -6,7 +6,7 @@
 
 from mock import patch, MagicMock
 
-from openlp.plugins.songs.lib import VerseType, clean_string, clean_title
+from openlp.plugins.songs.lib import VerseType, clean_string, clean_title, strip_rtf
 from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length
 
 
@@ -215,6 +215,38 @@
         # THEN: The maximum length should be returned.
         assert result == 10, 'The length should be 10.'
 
+    def strip_rtf_charsets_test(self):
+        """
+        Test that the strip_rtf() method properly decodes the supported charsets.
+        """
+        test_charset_table = [
+            ('0', 'weor\\\'F0-myndum \\\'FEah\\par ', 'weorð-myndum þah\n'),
+            ('128', '\\\'83C\\\'83G\\\'83X\\\'A5\\\'83L\\\'83\\\'8A\\\'83X\\\'83g\\\'A1 '
+                    '\\\\ \\\'95\\\\ \\\'8E\\} \\\'8E\\{ \\\'A1\\par ', 'イエス･キリスト｡ ¥ 表 枝 施 ｡\n'),
+            ('129', '\\\'BF\\\'B9\\\'BC\\\'F6 \\\'B1\\\'D7\\\'B8\\\'AE\\\'BD\\\'BA\\\'B5\\\'B5\\par ', '예수 그리스도\n'),
+            ('134', '\\\'D2\\\'AE\\\'F6\\\'D5\\\'BB\\\'F9\\\'B6\\\'BD\\\'CA\\\'C7\\\'D6\\\'F7\\par ', '耶稣基督是主\n'),
+            ('161', '\\\'D7\\\'F1\\\'E9\\\'F3\\\'F4\\\'FC\\\'F2\\par ', 'Χριστός\n'),
+            ('162', 'Hazreti \\\'DDsa\\par ', 'Hazreti İsa\n'),
+            ('163', 'ph\\\'FD\\\'F5ng\\par ', 'phương\n'),
+            ('177', '\\\'E1\\\'F8\\\'E0\\\'F9\\\'E9\\\'FA\\par ', 'בראשית\n'),
+            ('178', '\\\'ED\\\'D3\\\'E6\\\'DA \\\'C7\\\'E1\\\'E3\\\'D3\\\'ED\\\'CD\\par ', 'يسوع المسيح\n'),
+            ('186', 'J\\\'EBzus Kristus yra Vie\\\'F0pats\\par ', 'Jėzus Kristus yra Viešpats\n'),
+            ('204', '\\\'D0\\\'EE\\\'F1\\\'F1\\\'E8\\\'FF\\par ', 'Россия\n'),
+            ('222', '\\\'A4\\\'C3\\\'D4\\\'CA\\\'B5\\\'EC\\par ', 'คริสต์\n'),
+            ('238', 'Z\\\'E1v\\\'ECre\\\'E8n\\\'E1 zkou\\\'9Aka\\par ', 'Závěrečná zkouška\n')
+        ]
+
+        # GIVEN: For each character set and input
+        for charset, input, exp_result in test_charset_table:
+
+            # WHEN: We call strip_rtf on the input RTF
+            result, result_enc = strip_rtf(
+               '{\\rtf1 \\ansi \\ansicpg1252 {\\fonttbl \\f0 \\fswiss \\fcharset%s Helvetica;}' \
+               '{\\colortbl ;\\red0 \\green0 \\blue0 ;}\\pard \\f0 %s}' % (charset, input))
+
+            # THEN: The stripped text matches the expected result
+            assert result == exp_result, 'The result should be %s' % exp_result
+
 
 class TestVerseType(TestCase):
     """