zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #12330
[Merge] lp:~paul-lucas/zorba/bug-1024448 into lp:zorba
Paul J. Lucas has proposed merging lp:~paul-lucas/zorba/bug-1024448 into lp:zorba.
Requested reviews:
Paul J. Lucas (paul-lucas)
Related bugs:
Bug #1024448 in Zorba: "JSON parser doesn't recognize UTF-16 surrogate pairs"
https://bugs.launchpad.net/zorba/+bug/1024448
For more details, see:
https://code.launchpad.net/~paul-lucas/zorba/bug-1024448/+merge/115248
Now handling UTF-16 surrogate pairs.
--
https://code.launchpad.net/~paul-lucas/zorba/bug-1024448/+merge/115248
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'src/unit_tests/test_json_parser.cpp'
--- src/unit_tests/test_json_parser.cpp 2012-07-12 17:29:55 +0000
+++ src/unit_tests/test_json_parser.cpp 2012-07-16 23:45:28 +0000
@@ -87,12 +87,16 @@
static void test_illegal_codepoint() {
static char const *const sources[] = {
- " \" \\u \" ",
- " \" \\u0 \" ",
- " \" \\u00 \" ",
- " \" \\u000 \" ",
- " \" \\uG \" ",
+ " \" \\u \" ",
+ " \" \\u0 \" ",
+ " \" \\u00 \" ",
+ " \" \\u000 \" ",
+ " \" \\uG \" ",
" \" \\u\" ",
+ " \" \\uD83D \" ",
+ " \" \\uD83D\\u0041 \" ",
+ " \" \\uD83D\\uD83E \" ",
+ " \" \\uDC4A \" ",
0
};
@@ -369,47 +373,54 @@
}
static void test_lexer_array() {
- char const source[] = "[ 1, \"2\", false, true, null ]";
+ char const source[] = "[ 1, \"2\", false, true, null, \"\\uD83D\\uDC4A\" ]";
istringstream iss( source );
lexer lex( iss );
token t;
- ASSERT_TRUE( lex.next( &t ) );
- ASSERT_TRUE( t == token::begin_array );
-
- ASSERT_TRUE( lex.next( &t ) );
- ASSERT_TRUE( t == token::number );
- ASSERT_TRUE( t.get_value() == "1" );
-
- ASSERT_TRUE( lex.next( &t ) );
- ASSERT_TRUE( t == token::value_separator );
-
- ASSERT_TRUE( lex.next( &t ) );
- ASSERT_TRUE( t == token::string );
- ASSERT_TRUE( t.get_value() == "2" );
-
- ASSERT_TRUE( lex.next( &t ) );
- ASSERT_TRUE( t == token::value_separator );
-
- ASSERT_TRUE( lex.next( &t ) );
- ASSERT_TRUE( t == token::json_false );
-
- ASSERT_TRUE( lex.next( &t ) );
- ASSERT_TRUE( t == token::value_separator );
-
- ASSERT_TRUE( lex.next( &t ) );
- ASSERT_TRUE( t == token::json_true );
-
- ASSERT_TRUE( lex.next( &t ) );
- ASSERT_TRUE( t == token::value_separator );
-
- ASSERT_TRUE( lex.next( &t ) );
- ASSERT_TRUE( t == token::json_null );
-
- ASSERT_TRUE( lex.next( &t ) );
- ASSERT_TRUE( t == token::end_array );
-
- ASSERT_TRUE( !lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::begin_array );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::number );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t.get_value() == "1" );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::value_separator );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::string );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t.get_value() == "2" );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::value_separator );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::json_false );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::value_separator );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::json_true );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::value_separator );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::json_null );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::value_separator );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::string );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t.get_value() == "\xF0\x9F\x91\x8A" );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( t == token::end_array );
+
+ ASSERT_TRUE_AND_NO_EXCEPTION( !lex.next( &t ) );
}
static void test_lexer_object() {
@@ -418,34 +429,34 @@
lexer lex( iss );
token t;
- ASSERT_TRUE( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
ASSERT_TRUE( t == token::begin_object );
- ASSERT_TRUE( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
ASSERT_TRUE( t == token::string );
- ASSERT_TRUE( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
ASSERT_TRUE( t == token::name_separator );
- ASSERT_TRUE( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
ASSERT_TRUE( t == token::number );
- ASSERT_TRUE( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
ASSERT_TRUE( t == token::value_separator );
- ASSERT_TRUE( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
ASSERT_TRUE( t == token::string );
- ASSERT_TRUE( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
ASSERT_TRUE( t == token::name_separator );
- ASSERT_TRUE( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
ASSERT_TRUE( t == token::string );
- ASSERT_TRUE( lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( lex.next( &t ) );
ASSERT_TRUE( t == token::end_object );
- ASSERT_TRUE( !lex.next( &t ) );
+ ASSERT_TRUE_AND_NO_EXCEPTION( !lex.next( &t ) );
}
static void test_parser_array() {
=== modified file 'src/util/json_parser.cpp'
--- src/util/json_parser.cpp 2012-07-12 17:29:55 +0000
+++ src/util/json_parser.cpp 2012-07-16 23:45:28 +0000
@@ -259,20 +259,55 @@
unicode::code_point lexer::parse_codepoint() {
static char const hex_digits[] = "0123456789ABCDEF";
+ char c;
zstring cp_string( "\\u" ); // needed only for error message
-
- unicode::code_point cp = 0;
- for ( int i = 1; i <= 4; ++i ) {
- char c;
- if ( !get_char( &c ) || !ascii::is_xdigit( c ) )
+ unicode::code_point high_surrogate = 0;
+
+ while ( true ) {
+ unicode::code_point cp = 0;
+ for ( int i = 1; i <= 4; ++i ) {
+ if ( !get_char( &c ) )
+ throw illegal_codepoint( cur_loc_, cp_string );
+ cp_string += c;
+ if ( !ascii::is_xdigit( c ) )
+ throw illegal_codepoint( cur_loc_, cp_string );
+ c = ascii::to_upper( c );
+ char const *const p = std::strchr( hex_digits, c );
+ assert( p );
+ cp = (cp << 4) | (p - hex_digits);
+ }
+
+ if ( unicode::is_high_surrogate( cp ) ) {
+ if ( high_surrogate )
+ throw illegal_codepoint( cur_loc_, cp_string );
+ //
+ // It's easier to parse the \u for the low surrogate here rather than
+ // trying to manage state in parse_string().
+ //
+ if ( !get_char( &c ) )
+ throw illegal_codepoint( cur_loc_, cp_string );
+ cp_string += c;
+ if ( c != '\\' )
+ throw illegal_codepoint( cur_loc_, cp_string );
+ if ( !get_char( &c ) )
+ throw illegal_codepoint( cur_loc_, cp_string );
+ cp_string += c;
+ if ( c != 'u' )
+ throw illegal_codepoint( cur_loc_, cp_string );
+
+ high_surrogate = cp;
+ continue;
+ }
+ if ( unicode::is_low_surrogate( cp ) ) {
+ if ( !high_surrogate )
+ throw illegal_codepoint( cur_loc_, cp_string );
+ return unicode::convert_surrogate( high_surrogate, cp );
+ }
+ if ( high_surrogate )
throw illegal_codepoint( cur_loc_, cp_string );
- cp_string += c;
- c = ascii::to_upper( c );
- char const *const p = std::strchr( hex_digits, c );
- assert( p );
- cp = (cp << 4) | (p - hex_digits);
+
+ return cp;
}
- return cp;
}
token::type lexer::parse_literal( char first_c, token::value_type *value ) {
=== modified file 'src/util/unicode_util.h'
--- src/util/unicode_util.h 2012-07-12 17:29:55 +0000
+++ src/util/unicode_util.h 2012-07-16 23:45:28 +0000
@@ -93,7 +93,7 @@
////////// code-point checking ////////////////////////////////////////////////
/**
- * Test whether the given character is invalid in an IRI.
+ * Checks whether the given character is invalid in an IRI.
*
* @param c The character.
* @return Returns \c true only if the character is invalid in an IRI.
@@ -102,7 +102,7 @@
bool is_invalid_in_iri( code_point c );
/**
- * Test whether the given character is a "iprivate".
+ * Checks whether the given character is a "iprivate".
*
* @param c The character.
* @return Returns \c true only if the character is a "iprivate".
@@ -111,7 +111,7 @@
bool is_iprivate( code_point c );
/**
- * Unicode version is isspace(3).
+ * Unicode version of isspace(3).
*
* @param c The code-point to check.
* @return Returns \c true only if \a c is a whitespace character.
@@ -127,7 +127,7 @@
}
/**
- * Test whether the given character is a "ucschar".
+ * Checks whether the given character is a "ucschar".
*
* @param c The character.
* @return Returns \c true only if the character is a "ucschar".
@@ -136,6 +136,40 @@
bool is_ucschar( code_point c );
/**
+ * Checks whether the given value is a "high surrogate."
+ *
+ * @param n The value to check.
+ * @return Returns \c true only if \a n is a high surrogate.
+ */
+inline bool is_high_surrogate( unsigned long n ) {
+ return n >= 0xD800 && n <= 0xDBFF;
+}
+
+/**
+ * Checks whether the given value is a "low surrogate."
+ *
+ * @param n The value to check.
+ * @return Returns \c true only if \a n is a low surrogate.
+ */
+inline bool is_low_surrogate( unsigned long n ) {
+ return n >= 0xDC00 && n <= 0xDFFF;
+}
+
+/**
+ * Converts the given high and low surrogate values into the code-point they
+ * represent. Note that no checking is done on the parameters.
+ *
+ * @param high The high surrogate value.
+ * @param low The low surrogate value.
+ * @return Returns the represented code-point.
+ * @see is_high_surrogate()
+ * @see is_low_surrogate()
+ */
+inline code_point convert_surrogate( unsigned high, unsigned low ) {
+ return 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00);
+}
+
+/**
* Checks whether the given code-point is valid.
*
* @param c The code-point to check.
Follow ups