openlp-core team mailing list archive
-
openlp-core team
-
Mailing list archive
-
Message #21714
[Merge] lp:~whydoubt/openlp/enhance_strip_rtf into lp:openlp
Jeffrey Smith has proposed merging lp:~whydoubt/openlp/enhance_strip_rtf into lp:openlp.
Requested reviews:
Andreas Preikschat (googol)
For more details, see:
https://code.launchpad.net/~whydoubt/openlp/enhance_strip_rtf/+merge/184875
--
https://code.launchpad.net/~whydoubt/openlp/enhance_strip_rtf/+merge/184875
Your team OpenLP Core is subscribed to branch lp:openlp.
=== modified file 'openlp/plugins/songs/lib/__init__.py'
--- openlp/plugins/songs/lib/__init__.py 2013-08-31 18:17:38 +0000
+++ openlp/plugins/songs/lib/__init__.py 2013-09-10 20:55:06 +0000
@@ -46,7 +46,13 @@
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
APOSTROPHE = re.compile('[\'`’ʻ′]', re.UNICODE)
-PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
+# PATTERN will look for the next occurrence of one of these symbols:
+# \controlword - optionally preceded by \*, optionally followed by a number
+# \'## - where ## is a pair of hex digits, representing a single character
+# \# - where # is a single non-alpha character, representing a special symbol
+# { or } - marking the beginning/end of a group
+# a run of characters without any \ { } or end-of-line
+PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I)
# RTF control words which specify a "destination" to be ignored.
DESTINATIONS = frozenset((
'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
@@ -57,8 +63,8 @@
'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm',
'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr',
- 'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field',
- 'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype', 'fname',
+ 'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext',
+ 'file', 'filetbl', 'fldinst', 'fldtype', 'fname',
'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr',
'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g',
'generator', 'gridtbl', 'header', 'headerf', 'headerl',
@@ -106,6 +112,11 @@
'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen'))
# Translation of some special characters.
SPECIAL_CHARS = {
+ '\n': '\n',
+ '\r': '\n',
+ '~': '\u00A0',
+ '-': '\u00AD',
+ '_': '\u2011',
'par': '\n',
'sect': '\n\n',
# Required page and column break.
@@ -132,16 +143,19 @@
'zwj': '\u200D',
'zwnj': '\u200C'}
CHARSET_MAPPING = {
- 'fcharset0': 'cp1252',
- 'fcharset161': 'cp1253',
- 'fcharset162': 'cp1254',
- 'fcharset163': 'cp1258',
- 'fcharset177': 'cp1255',
- 'fcharset178': 'cp1256',
- 'fcharset186': 'cp1257',
- 'fcharset204': 'cp1251',
- 'fcharset222': 'cp874',
- 'fcharset238': 'cp1250'}
+ '0': 'cp1252',
+ '128': 'cp932',
+ '129': 'cp949',
+ '134': 'cp936',
+ '161': 'cp1253',
+ '162': 'cp1254',
+ '163': 'cp1258',
+ '177': 'cp1255',
+ '178': 'cp1256',
+ '186': 'cp1257',
+ '204': 'cp1251',
+ '222': 'cp874',
+ '238': 'cp1250'}
class VerseType(object):
@@ -351,7 +365,7 @@
if recommendation == encodings[index][0]:
recommended_index = index
break
- if recommended_index > 0:
+ if recommended_index > -1:
choice = QtGui.QInputDialog.getItem(None,
translate('SongsPlugin', 'Character Encoding'),
translate('SongsPlugin', 'The codepage setting is responsible\n'
@@ -365,7 +379,7 @@
[pair[1] for pair in encodings], 0, False)
if not choice[1]:
return None
- return filter(lambda item: item[1] == choice[0], encodings)[0][0]
+ return next(filter(lambda item: item[1] == choice[0], encodings))[0]
def clean_string(string):
@@ -521,43 +535,59 @@
curskip = 0
# Output buffer.
out = []
+ # Encoded buffer.
+ ebytes = bytearray()
for match in PATTERN.finditer(text):
- word, arg, hex, char, brace, tchar = match.groups()
+ iinu, word, arg, hex, char, brace, tchar = match.groups()
+ # \x (non-alpha character)
+ if char:
+ if char in '\\{}':
+ tchar = char
+ else:
+ word = char
+ # Flush encoded buffer to output buffer
+ if ebytes and not hex and not tchar:
+ failed = False
+ while True:
+ try:
+ encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
+ if not encoding:
+ return None
+ dbytes = ebytes.decode(encoding)
+ # Code 5C is a peculiar case with Windows Codepage 932
+ if encoding == 'cp932' and '\\' in dbytes:
+ dbytes = dbytes.replace('\\', '\u00A5')
+ out.append(dbytes)
+ ebytes.clear()
+ except UnicodeDecodeError:
+ failed = True
+ else:
+ break
+ # {}
if brace:
curskip = 0
if brace == '{':
# Push state
stack.append((ucskip, ignorable, font))
- elif brace == '}':
+ elif brace == '}' and len(stack) > 0:
# Pop state
ucskip, ignorable, font = stack.pop()
- # \x (not a letter)
- elif char:
- curskip = 0
- if char == '~' and not ignorable:
- out.append('\xA0')
- elif char in '{}\\' and not ignorable:
- out.append(char)
- elif char == '-' and not ignorable:
- out.append('\u00AD')
- elif char == '_' and not ignorable:
- out.append('\u2011')
- elif char == '*':
- ignorable = True
# \command
elif word:
curskip = 0
if word in DESTINATIONS:
ignorable = True
elif word in SPECIAL_CHARS:
- out.append(SPECIAL_CHARS[word])
+ if not ignorable:
+ out.append(SPECIAL_CHARS[word])
elif word == 'uc':
ucskip = int(arg)
- elif word == ' ':
+ elif word == 'u':
c = int(arg)
if c < 0:
c += 0x10000
- out.append(chr(c))
+ if not ignorable:
+ out.append(chr(c))
curskip = ucskip
elif word == 'fonttbl':
ignorable = True
@@ -565,31 +595,24 @@
font = arg
elif word == 'ansicpg':
font_table[font] = 'cp' + arg
- elif word == 'fcharset' and font not in font_table and word + arg in CHARSET_MAPPING:
- # \ansicpg overrides \fcharset, if present.
- font_table[font] = CHARSET_MAPPING[word + arg]
+ elif word == 'fcharset' and font not in font_table and arg in CHARSET_MAPPING:
+ font_table[font] = CHARSET_MAPPING[arg]
+ elif word == 'fldrslt':
+ pass
+ # \* 'Ignore if not understood' marker
+ elif iinu:
+ ignorable = True
# \'xx
elif hex:
if curskip > 0:
curskip -= 1
elif not ignorable:
- charcode = int(hex, 16)
- failed = False
- while True:
- try:
- encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
- if not encoding:
- return None
- out.append(chr(charcode).decode(encoding))
- except UnicodeDecodeError:
- failed = True
- else:
- break
+ ebytes.append(int(hex, 16))
elif tchar:
if curskip > 0:
curskip -= 1
elif not ignorable:
- out.append(tchar)
+ ebytes += tchar.encode()
text = ''.join(out)
return text, default_encoding
=== modified file 'tests/functional/openlp_plugins/songs/test_lib.py'
--- tests/functional/openlp_plugins/songs/test_lib.py 2013-08-31 18:17:38 +0000
+++ tests/functional/openlp_plugins/songs/test_lib.py 2013-09-10 20:55:06 +0000
@@ -6,7 +6,7 @@
from mock import patch, MagicMock
-from openlp.plugins.songs.lib import VerseType, clean_string, clean_title
+from openlp.plugins.songs.lib import VerseType, clean_string, clean_title, strip_rtf
from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length
@@ -215,6 +215,38 @@
# THEN: The maximum length should be returned.
assert result == 10, 'The length should be 10.'
+ def strip_rtf_charsets_test(self):
+ """
+ Test that the strip_rtf() method properly decodes the supported charsets.
+ """
+ test_charset_table = [
+ ('0', 'weor\\\'F0-myndum \\\'FEah\\par ', 'weorð-myndum þah\n'),
+ ('128', '\\\'83C\\\'83G\\\'83X\\\'A5\\\'83L\\\'83\\\'8A\\\'83X\\\'83g\\\'A1 '
+ '\\\\ \\\'95\\\\ \\\'8E\\} \\\'8E\\{ \\\'A1\\par ', 'イエス・キリスト。 ¥ 表 枝 施 。\n'),
+ ('129', '\\\'BF\\\'B9\\\'BC\\\'F6 \\\'B1\\\'D7\\\'B8\\\'AE\\\'BD\\\'BA\\\'B5\\\'B5\\par ', '예수 그리스도\n'),
+ ('134', '\\\'D2\\\'AE\\\'F6\\\'D5\\\'BB\\\'F9\\\'B6\\\'BD\\\'CA\\\'C7\\\'D6\\\'F7\\par ', '耶稣基督是主\n'),
+ ('161', '\\\'D7\\\'F1\\\'E9\\\'F3\\\'F4\\\'FC\\\'F2\\par ', 'Χριστός\n'),
+ ('162', 'Hazreti \\\'DDsa\\par ', 'Hazreti İsa\n'),
+ ('163', 'ph\\\'FD\\\'F5ng\\par ', 'phương\n'),
+ ('177', '\\\'E1\\\'F8\\\'E0\\\'F9\\\'E9\\\'FA\\par ', 'בראשית\n'),
+ ('178', '\\\'ED\\\'D3\\\'E6\\\'DA \\\'C7\\\'E1\\\'E3\\\'D3\\\'ED\\\'CD\\par ', 'يسوع المسيح\n'),
+ ('186', 'J\\\'EBzus Kristus yra Vie\\\'F0pats\\par ', 'Jėzus Kristus yra Viešpats\n'),
+ ('204', '\\\'D0\\\'EE\\\'F1\\\'F1\\\'E8\\\'FF\\par ', 'Россия\n'),
+ ('222', '\\\'A4\\\'C3\\\'D4\\\'CA\\\'B5\\\'EC\\par ', 'คริสต์\n'),
+ ('238', 'Z\\\'E1v\\\'ECre\\\'E8n\\\'E1 zkou\\\'9Aka\\par ', 'Závěrečná zkouška\n')
+ ]
+
+ # GIVEN: For each character set and input
+ for charset, input, exp_result in test_charset_table:
+
+ # WHEN: We call strip_rtf on the input RTF
+ result, result_enc = strip_rtf(
+ '{\\rtf1 \\ansi \\ansicpg1252 {\\fonttbl \\f0 \\fswiss \\fcharset%s Helvetica;}' \
+ '{\\colortbl ;\\red0 \\green0 \\blue0 ;}\\pard \\f0 %s}' % (charset, input))
+
+ # THEN: The stripped text matches the expected result
+ assert result == exp_result, 'The result should be %s' % exp_result
+
class TestVerseType(TestCase):
"""