widelands-dev team mailing list archive

Thread
Date

[Merge] lp:~widelands-dev/widelands/japanese into lp:~widelands-dev/widelands/arabic

To: mp+272650@xxxxxxxxxxxxxxxxxx
From: GunChleoc <fios@xxxxxxxxxxxxxxxxxxx>
Date: Mon, 28 Sep 2015 18:55:30 -0000
Reply-to: mp+272650@xxxxxxxxxxxxxxxxxx
Sender: bounces@xxxxxxxxxxxxx

GunChleoc has proposed merging lp:~widelands-dev/widelands/japanese into lp:~widelands-dev/widelands/arabic.

Requested reviews:
  Widelands Developers (widelands-dev)
Related bugs:
  Bug #1311698 in widelands: "Non-spacing sentences could not be break automatically"
  https://bugs.launchpad.net/widelands/+bug/1311698

For more details, see:
https://code.launchpad.net/~widelands-dev/widelands/japanese/+merge/272650

Implemented line wrapping for Japanese.

Some characters must not start or end a line, so I group each of them with its neighbouring character into a single vector entry.

Testing can be done by removing the \n characters from win_conditions/ja.po and looking at the tooltips - rather than a crash or an endless line, we get properly wrapped lines.
-- 
Your team Widelands Developers is requested to review the proposed merge of lp:~widelands-dev/widelands/japanese into lp:~widelands-dev/widelands/arabic.

=== modified file 'src/graphic/text/bidi.cc'
--- src/graphic/text/bidi.cc	2015-09-28 06:41:58 +0000
+++ src/graphic/text/bidi.cc	2015-09-28 18:54:19 +0000
@@ -22,7 +22,6 @@
 #include <map>
 #include <string>
 
-#include <unicode/uchar.h>
 #include <unicode/unistr.h>
 #include <unicode/utypes.h>
 
@@ -32,6 +31,139 @@
 // TODO(GunChleoc): Have a look at the ICU API to see which helper functions can be gained from there.
 // TODO(GunChleoc): Arabic: Turn this into a proper class
 
+// Characters that must not start a line: http://www.w3.org/TR/jlreq/#characters_not_starting_a_line
+const std::set<UChar> kCannottStartLineJapanese = { // NOTE(review): "Cannott" is a typo; cannot_start_line() uses this name
+	{0x2019}, // RIGHT SINGLE QUOTATION MARK
+	{0x201D}, // RIGHT DOUBLE QUOTATION MARK
+	{0x0029}, // RIGHT PARENTHESIS
+	{0x3015}, // RIGHT TORTOISE SHELL BRACKET
+	{0x005D}, // RIGHT SQUARE BRACKET
+	{0x007D}, // RIGHT CURLY BRACKET
+	{0x3009}, // RIGHT ANGLE BRACKET
+	{0x300B}, // RIGHT DOUBLE ANGLE BRACKET
+	{0x300D}, // RIGHT CORNER BRACKET
+	{0x300F}, // RIGHT WHITE CORNER BRACKET
+	{0x3011}, // RIGHT BLACK LENTICULAR BRACKET
+	{0x2986}, // RIGHT WHITE PARENTHESIS
+	{0x3019}, // RIGHT WHITE TORTOISE SHELL BRACKET
+	{0x3017}, // RIGHT WHITE LENTICULAR BRACKET
+	{0xFF09}, // FULLWIDTH RIGHT PARENTHESIS
+	{0x00BB}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+	{0x301F}, // LOW DOUBLE PRIME QUOTATION MARK
+	{0x2010}, // HYPHEN
+	{0x301C}, // WAVE DASH
+	{0x30A0}, // KATAKANA-HIRAGANA DOUBLE HYPHEN
+	{0x2013}, // EN DASH
+	{0x0021}, // EXCLAMATION MARK
+	{0x003F}, // QUESTION MARK
+	{0x203C}, // DOUBLE EXCLAMATION MARK
+	{0x2047}, // DOUBLE QUESTION MARK
+	{0x2048}, // QUESTION EXCLAMATION MARK
+	{0x2049}, // EXCLAMATION QUESTION MARK
+	{0x30FB}, // KATAKANA MIDDLE DOT
+	{0x003A}, // COLON
+	{0x003B}, // SEMICOLON
+	{0x3002}, // IDEOGRAPHIC FULL STOP
+	{0x002E}, // FULL STOP
+	{0x3001}, // IDEOGRAPHIC COMMA
+	{0x002C}, // COMMA
+	{0x30FD}, // KATAKANA ITERATION MARK
+	{0x30FE}, // KATAKANA VOICED ITERATION MARK
+	{0x309D}, // HIRAGANA ITERATION MARK
+	{0x309E}, // HIRAGANA VOICED ITERATION MARK
+	{0x3005}, // IDEOGRAPHIC ITERATION MARK
+	{0x303B}, // VERTICAL IDEOGRAPHIC ITERATION MARK
+	{0x30FC}, // KATAKANA-HIRAGANA PROLONGED SOUND MARK
+	{0x3041}, // HIRAGANA LETTER SMALL A
+	{0x3043}, // HIRAGANA LETTER SMALL I
+	{0x3045}, // HIRAGANA LETTER SMALL U
+	{0x3047}, // HIRAGANA LETTER SMALL E
+	{0x3049}, // HIRAGANA LETTER SMALL O
+	{0x30A1}, // KATAKANA LETTER SMALL A
+	{0x30A3}, // KATAKANA LETTER SMALL I
+	{0x30A5}, // KATAKANA LETTER SMALL U
+	{0x30A7}, // KATAKANA LETTER SMALL E
+	{0x30A9}, // KATAKANA LETTER SMALL O
+	{0x3063}, // HIRAGANA LETTER SMALL TU
+	{0x3083}, // HIRAGANA LETTER SMALL YA
+	{0x3085}, // HIRAGANA LETTER SMALL YU
+	{0x3087}, // HIRAGANA LETTER SMALL YO
+	{0x308E}, // HIRAGANA LETTER SMALL WA
+	{0x3095}, // HIRAGANA LETTER SMALL KA
+	{0x3096}, // HIRAGANA LETTER SMALL KE
+	{0x30C3}, // KATAKANA LETTER SMALL TU
+	{0x30E3}, // KATAKANA LETTER SMALL YA
+	{0x30E5}, // KATAKANA LETTER SMALL YU
+	{0x30E7}, // KATAKANA LETTER SMALL YO
+	{0x30EE}, // KATAKANA LETTER SMALL WA
+	{0x30F5}, // KATAKANA LETTER SMALL KA
+	{0x30F6}, // KATAKANA LETTER SMALL KE
+	{0x31F0}, // KATAKANA LETTER SMALL KU
+	{0x31F1}, // KATAKANA LETTER SMALL SI
+	{0x31F2}, // KATAKANA LETTER SMALL SU
+	{0x31F3}, // KATAKANA LETTER SMALL TO
+	{0x31F4}, // KATAKANA LETTER SMALL NU
+	{0x31F5}, // KATAKANA LETTER SMALL HA
+	{0x31F6}, // KATAKANA LETTER SMALL HI
+	{0x31F7}, // KATAKANA LETTER SMALL HU
+	{0x31F8}, // KATAKANA LETTER SMALL HE
+	{0x31F9}, // KATAKANA LETTER SMALL HO
+	{0x31FA}, // KATAKANA LETTER SMALL MU
+	{0x31FB}, // KATAKANA LETTER SMALL RA
+	{0x31FC}, // KATAKANA LETTER SMALL RI
+	{0x31FD}, // KATAKANA LETTER SMALL RU
+	{0x31FE}, // KATAKANA LETTER SMALL RE
+	{0x31FF}, // KATAKANA LETTER SMALL RO
+};
+
+// Characters that must not end a line: http://www.w3.org/TR/jlreq/#characters_not_ending_a_line
+const std::set<UChar> kCannotEndLineJapanese = {
+	{0x2018}, // LEFT SINGLE QUOTATION MARK
+	{0x201C}, // LEFT DOUBLE QUOTATION MARK
+	{0x0028}, // LEFT PARENTHESIS
+	{0x3014}, // LEFT TORTOISE SHELL BRACKET
+	{0x005B}, // LEFT SQUARE BRACKET
+	{0x007B}, // LEFT CURLY BRACKET
+	{0x3008}, // LEFT ANGLE BRACKET
+	{0x300A}, // LEFT DOUBLE ANGLE BRACKET
+	{0x300C}, // LEFT CORNER BRACKET
+	{0x300E}, // LEFT WHITE CORNER BRACKET
+	{0x3010}, // LEFT BLACK LENTICULAR BRACKET
+	{0x2985}, // LEFT WHITE PARENTHESIS
+	{0x3018}, // LEFT WHITE TORTOISE SHELL BRACKET
+	{0x3016}, // LEFT WHITE LENTICULAR BRACKET
+	{0xFF08}, // FULLWIDTH LEFT PARENTHESIS
+	{0x00AB}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+	{0x301D}, // REVERSED DOUBLE PRIME QUOTATION MARK
+};
+
+
+// Unicode blocks whose characters are treated as CJK when deciding how to wrap a word.
+// See http://unicode.org/faq/blocks_ranges.html and http://unicode-table.com/en/blocks/
+const std::set<UBlockCode> kCJKCodeBlocks = {
+	UBlockCode::UBLOCK_CJK_COMPATIBILITY,
+	UBlockCode::UBLOCK_CJK_COMPATIBILITY_FORMS,
+	UBlockCode::UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS,
+	UBlockCode::UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
+	UBlockCode::UBLOCK_CJK_RADICALS_SUPPLEMENT,
+	UBlockCode::UBLOCK_CJK_STROKES,
+	UBlockCode::UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION,
+	UBlockCode::UBLOCK_CJK_UNIFIED_IDEOGRAPHS,
+	UBlockCode::UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
+	UBlockCode::UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
+	UBlockCode::UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C,
+	UBlockCode::UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
+	UBlockCode::UBLOCK_HIRAGANA,
+	UBlockCode::UBLOCK_KATAKANA,
+	// U+31F0..U+31FF (small Katakana letters), which kCannottStartLineJapanese lists as well.
+	UBlockCode::UBLOCK_KATAKANA_PHONETIC_EXTENSIONS,
+};
+
+bool is_cjk_character(UChar32 c) {  // True if the Unicode block of c is listed in kCJKCodeBlocks
+	return kCJKCodeBlocks.find(ublock_getCode(c)) != kCJKCodeBlocks.end();
+}
+
+
 // Need to mirror () etc. for LTR languages, so we're sticking them in a map.
 const std::map<UChar, UChar> kSymmetricChars = {
 	{0x0028, 0x0029}, // ()
@@ -378,7 +510,7 @@
 }
 
 
-// True if a string does not contain Latin characters
+// True if a string contains a character from an Arabic code block
 bool has_arabic_character(const char* input) {
 	bool result = false;
 	const icu::UnicodeString parseme(input);
@@ -590,4 +722,46 @@
 	return result;
 }
 
+// Reports whether at least one character of 'input' belongs to a CJK code block.
+// The scan stops at the first character for which is_cjk_character() is true.
+bool has_cjk_character(const char* input) {
+	const icu::UnicodeString parseme(input);
+	const int32_t length = parseme.length();
+	int32_t pos = 0;
+	// Skip every non-CJK character; ending before 'length' means a CJK character was hit.
+	while (pos < length && !is_cjk_character(parseme.char32At(pos))) {
+		++pos;
+	}
+	return pos < length;
+}
+
+//  Split a string of CJK characters into units that can have line breaks between them. A unit keeps
+//  growing while c cannot end a line, next cannot start one, or the two form a surrogate pair.
+std::vector<std::string> split_cjk_word(const char* input) {
+	const icu::UnicodeString parseme(input);
+	std::vector<std::string> result;
+	icu::UnicodeString unit;
+	for (int32_t i = 0; i < parseme.length(); ++i) {
+		const UChar c = parseme.charAt(i);
+		unit += c;
+		const bool is_last = i == parseme.length() - 1;
+		const UChar next = is_last ? c : parseme.charAt(i + 1);  // value is ignored when is_last
+		const bool glued = cannot_end_line(c) || cannot_start_line(next) || (U16_IS_LEAD(c) && U16_IS_TRAIL(next));
+		if (is_last || !glued) {
+			std::string utf8;
+			result.push_back(unit.toUTF8String(utf8));
+			unit.remove();
+		}
+	}
+	return result;
+}
+
+bool cannot_start_line(const UChar& c) {  // True if c is listed in kCannottStartLineJapanese
+	return kCannottStartLineJapanese.find(c) != kCannottStartLineJapanese.end();
+}
+
+bool cannot_end_line(const UChar& c) {  // True if c is listed in kCannotEndLineJapanese
+	return kCannotEndLineJapanese.find(c) != kCannotEndLineJapanese.end();
+}
+
 } // namespace UI

=== modified file 'src/graphic/text/bidi.h'
--- src/graphic/text/bidi.h	2015-09-26 09:34:20 +0000
+++ src/graphic/text/bidi.h	2015-09-28 18:54:19 +0000
@@ -23,14 +23,20 @@
 #include <string>
 #include <vector>
 
+#include <unicode/uchar.h>
+
 #include "graphic/text/font_set.h"
 
 // BiDi support for RTL languages
 namespace i18n {
 	std::string make_ligatures(const char* input);
 	std::string line2bidi(const char* input);
+	std::vector<std::string> split_cjk_word(const char* input);
 	bool has_rtl_character(const char* input);
 	bool has_rtl_character(std::vector<std::string> input);
+	bool has_cjk_character(const char* input);
+	bool cannot_start_line(const UChar& c);
+	bool cannot_end_line(const UChar& c);
 
 } // namespace UI
 

=== modified file 'src/graphic/text/rt_render.cc'
--- src/graphic/text/rt_render.cc	2015-09-26 18:04:24 +0000
+++ src/graphic/text/rt_render.cc	2015-09-28 18:54:19 +0000
@@ -754,7 +754,15 @@
 			}
 			word = ts.till_any_or_end(" \t\n\r");
 			if (!word.empty()) {
-				nodes.push_back(new TextNode(font_cache_.get_font(&ns), ns, i18n::make_ligatures(word.c_str())));
+				word = i18n::make_ligatures(word.c_str());
+				if (i18n::has_cjk_character(word.c_str())) {
+					std::vector<std::string> units = i18n::split_cjk_word(word.c_str());
+					for (const std::string& unit: units) {
+						nodes.push_back(new TextNode(font_cache_.get_font(&ns), ns, unit));
+					}
+				} else {
+					nodes.push_back(new TextNode(font_cache_.get_font(&ns), ns, word));
+				}
 			}
 		}
 	}