← Back to team overview

zorba-coders team mailing list archive

[Merge] lp:~paul-lucas/zorba/bug-1131990 into lp:zorba

 

Paul J. Lucas has proposed merging lp:~paul-lucas/zorba/bug-1131990 into lp:zorba.

Commit message:
Fixed precedence of ^ and -[].

Requested reviews:
  Paul J. Lucas (paul-lucas)
Related bugs:
  Bug #1131988 in Zorba: "re00732 fails"
  https://bugs.launchpad.net/zorba/+bug/1131988
  Bug #1131990 in Zorba: "re00737 fails"
  https://bugs.launchpad.net/zorba/+bug/1131990

For more details, see:
https://code.launchpad.net/~paul-lucas/zorba/bug-1131990/+merge/159433

Fixed precedence of ^ and -[].
-- 
https://code.launchpad.net/~paul-lucas/zorba/bug-1131990/+merge/159433
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'ChangeLog'
--- ChangeLog	2013-04-17 12:37:36 +0000
+++ ChangeLog	2013-04-17 16:32:27 +0000
@@ -51,6 +51,8 @@
   * Fixed bug #1123163 (fn:format-integer failures)
   * Fixed bug in index join rule (no index join if inner clause has positional var).
   * Fixed bug in index join rule (copy var ids after cloning index domain expr).
+  * Fixed bug #1131988 (re00732 fails)
+  * Fixed bug #1131990 (re00737 fails)
   * Fixed bug #1123162 (FOTS: formatting dates and times failures)
   * Added missing wrapper expressions around some variable references.
   * Fixed optimizer bug: elimination of positional variable sometimes caused

=== modified file 'src/util/icu_regex.cpp'
--- src/util/icu_regex.cpp	2013-04-12 05:26:54 +0000
+++ src/util/icu_regex.cpp	2013-04-17 16:32:27 +0000
@@ -118,6 +118,8 @@
   return ztd::peek_behind( s, &i ) == '?' && ztd::peek_behind( s, &i ) == '(';
 }
 
+//#define DEBUG_CONVERT_REGEX
+
 #define IS_CHAR_RANGE_BEGIN (in_char_class && is_char_range_begin( xq_re, i ))
 #define PEEK_C              ztd::peek( xq_re, i )
 
@@ -142,6 +144,7 @@
   int  in_char_class = 0;               // within [...]
   int  in_char_range = 0;               // within a-b within [...]
   int  is_first_char = 1;               // to check ^ placement
+  bool put_close_bracket = false;       // put another ] for char class
 
   bool in_backref = false;              // '\'[1-9][0-9]*
   unsigned backref_no = 0;              // 1-based
@@ -422,6 +425,13 @@
           if ( in_char_class && !in_char_range ) {
             char const next_c = PEEK_C;
             if ( next_c == '[' ) {
+              if ( put_close_bracket ) {
+                //
+                // See the comment below for the '[' case.
+                //
+                *icu_re += ']';
+                put_close_bracket = false;
+              }
               //
               // ICU uses "--" to indicate range subtraction, e.g.,
               // XQuery [A-Z-[OI]] becomes ICU [A-Z--[OI]].
@@ -449,7 +459,24 @@
           else {
             if ( in_char_class && prev_c_cooked != '-' )
               goto unescaped_char;
-            ++in_char_class;
+            if ( !in_char_class++ && PEEK_C == '^' ) {
+              //
+              // XML Schema Part 2 F.1 [16]: For any positive character group
+              // or negative character group G, and any character class
+              // expression C, G-C is a valid character class subtraction,
+              // identifying the set of all characters in C(G) that are not
+              // also in C(C).
+              //
+              // Hence, in XQuery, [^abcd-[xy]] means "all characters except
+              // abcdxy", i.e., the ^ has a higher precedence than -.
+              //
+              // However, in ICU, the reverse is true.  To make ICU behave like
+              // XQuery, we have to wrap the negative character group in [],
+              // i.e., [[^abcd]--[xy]] (ICU spells subtraction as "--").
+              //
+              *icu_re += '[';
+              put_close_bracket = true;
+            }
             is_first_char = 2;
           }
           break;
@@ -464,6 +491,8 @@
           else {
             if ( !in_char_class )
               goto unbalanced_char;
+            if ( put_close_bracket )
+              *icu_re += ']';
             --in_char_class;
             in_char_range = 0;
           }
@@ -549,6 +578,11 @@
     ascii::replace_all( *icu_re, "\\p{Is", 5, "\\p{In", 5 );
     ascii::replace_all( *icu_re, "\\P{Is", 5, "\\P{In", 5 );
   } // q_flag
+
+#ifdef DEBUG_CONVERT_REGEX
+  cout << "XQ : " << xq_re   << endl;
+  cout << "ICU: " << *icu_re << endl;
+#endif /* DEBUG_CONVERT_REGEX */
   return;
 
 not_single_char_esc:

=== modified file 'test/fots/CMakeLists.txt'
--- test/fots/CMakeLists.txt	2013-04-17 13:10:05 +0000
+++ test/fots/CMakeLists.txt	2013-04-17 16:32:27 +0000
@@ -175,8 +175,6 @@
 EXPECTED_FOTS_FAILURE (fn-matches.re re00288 1131985)
 EXPECTED_FOTS_FAILURE (fn-matches.re re00370 1131985)
 EXPECTED_FOTS_FAILURE (fn-matches.re re00480 1131985)
-EXPECTED_FOTS_FAILURE (fn-matches.re re00732 1131988)
-EXPECTED_FOTS_FAILURE (fn-matches.re re00737 1131990)
 EXPECTED_FOTS_FAILURE (fn-nilled fn-nilled-33 0)
 EXPECTED_FOTS_FAILURE (fn-nilled fn-nilled-35 0)
 EXPECTED_FOTS_FAILURE (fn-nilled fn-nilled-37 0)


Follow ups