zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #02552
[Merge] lp:~zorba-coders/zorba/bug-897800 into lp:zorba
Matthias Brantner has proposed merging lp:~zorba-coders/zorba/bug-897800 into lp:zorba.
Requested reviews:
Matthias Brantner (matthias-brantner)
Paul J. Lucas (paul-lucas)
Related bugs:
Bug #897800 in Zorba: "Full-text regressions"
https://bugs.launchpad.net/zorba/+bug/897800
For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/bug-897800/+merge/84539
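Fixed: the ICU tokenizer now increments the sentence number after the last token is sent (see the change to src/runtime/full_text/icu_tokenizer.cpp below).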
--
https://code.launchpad.net/~zorba-coders/zorba/bug-897800/+merge/84539
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'src/runtime/full_text/icu_tokenizer.cpp'
--- src/runtime/full_text/icu_tokenizer.cpp 2011-12-01 11:02:25 +0000
+++ src/runtime/full_text/icu_tokenizer.cpp 2011-12-06 00:10:31 +0000
@@ -69,7 +69,7 @@
void send( void *payload, Tokenizer::Callback &callback ) {
if ( !empty() ) {
# if DEBUG_TOKENIZER
- cout << "TOKEN: \"" << value_ << "\"\n";
+ cout << "TOKEN: \"" << value_ << "\" (" << pos_ << ',' << sent_ << ',' << para_ << ")\n";
# endif
callback( value_.data(), value_.size(), pos_, sent_, para_, payload );
clear();
@@ -131,7 +131,7 @@
Locale const &icu_locale = get_icu_locale_for( lang );
UErrorCode status = U_ZERO_ERROR;
- word_.reset(
+ word_it_.reset(
dynamic_cast<RuleBasedBreakIterator*>(
BreakIterator::createWordInstance( icu_locale, status )
)
@@ -139,7 +139,7 @@
if ( U_FAILURE( status ) )
throw ZORBA_EXCEPTION( zerr::ZXQP0036_BREAKITERATOR_CREATION_FAILED );
- sent_.reset(
+ sent_it_.reset(
dynamic_cast<RuleBasedBreakIterator*>(
BreakIterator::createSentenceInstance( Locale::getUS(), status )
)
@@ -199,11 +199,12 @@
// This unicode::string wraps the existing buffer: no copy is made.
unicode::string const utf16_s( false, utf16_buf, utf16_len );
- word_->setText( utf16_s );
- unicode::size_type word_start = word_->first(), word_end = word_->next();
+ word_it_->setText( utf16_s );
+ unicode::size_type word_start = word_it_->first();
+ unicode::size_type word_end = word_it_->next();
- sent_->setText( utf16_s );
- unicode::size_type sent_end = sent_->first(); sent_end = sent_->next();
+ sent_it_->setText( utf16_s );
+ unicode::size_type sent_end = sent_it_->first(); sent_end = sent_it_->next();
temp_token t;
@@ -227,10 +228,11 @@
}
unique_ptr<utf8::storage_type[]> const auto_utf8_buf( utf8_buf );
- zstring_b utf8_word;
+ zstring_b utf8_word; // used only for debugging & error reporting
utf8_word.wrap_memory( utf8_buf, utf8_len );
-
- unicode::size_type const rule_status = word_->getRuleStatus();
+# if DEBUG_TOKENIZER
+ cout << "GOT: \"" << utf8_word << "\" ";
+# endif
//
// "Junk" tokens are whitespace and punctuation -- except some punctuation
@@ -238,10 +240,7 @@
//
bool is_junk = false;
-# if DEBUG_TOKENIZER
- cout << "GOT: \"" << utf8_word << "\" ";
-# endif
-
+ int32_t const rule_status = word_it_->getRuleStatus();
if ( IS_WORD_BREAK( NONE, rule_status ) ) {
//
// "NONE" tokens are what ICU calls whitespace and punctuation.
@@ -289,7 +288,7 @@
default:
in_wild = false;
}
- }
+ } // if ( wildcards )
is_junk = true;
}
@@ -350,10 +349,16 @@
t.send( payload, callback );
set_token:
+# if DEBUG_TOKENIZER
+ cout << "at set_token" << endl;
+# endif
if ( !is_junk ) {
if ( in_wild || got_backslash )
t.append( utf8_buf, utf8_len );
else {
+# if DEBUG_TOKENIZER
+ cout << "setting token" << endl;
+# endif
t.set(
utf8_buf, utf8_len, numbers().token, numbers().sent, numbers().para
);
@@ -362,9 +367,14 @@
}
next:
- word_start = word_end, word_end = word_->next();
+# if DEBUG_TOKENIZER
+ cout << "at next" << endl;
+# endif
+ word_start = word_end, word_end = word_it_->next();
if ( word_end >= sent_end && sent_end != BreakIterator::DONE ) {
- sent_end = sent_->next();
+ sent_end = sent_it_->next();
+ // The addition of the "if" fixes:
+ // https://bugs.launchpad.net/bugs/863320
if ( sent_end != BreakIterator::DONE )
++numbers().sent;
}
@@ -375,6 +385,9 @@
err::FTDY0020, ERROR_PARAMS( "", ZED( UnbalancedChar_3 ), '}' )
);
t.send( payload, callback );
+ // Incrementing "sent" here fixes:
+ // https://bugs.launchpad.net/bugs/897800
+ ++numbers().sent;
}
///////////////////////////////////////////////////////////////////////////////
=== modified file 'src/runtime/full_text/icu_tokenizer.h'
--- src/runtime/full_text/icu_tokenizer.h 2011-09-05 02:06:22 +0000
+++ src/runtime/full_text/icu_tokenizer.h 2011-12-06 00:10:31 +0000
@@ -55,8 +55,8 @@
typedef std::unique_ptr<RuleBasedBreakIterator> rbbi_ptr;
locale::iso639_1::type const lang_;
- rbbi_ptr word_;
- rbbi_ptr sent_;
+ rbbi_ptr word_it_;
+ rbbi_ptr sent_it_;
};
///////////////////////////////////////////////////////////////////////////////
=== added file 'test/rbkt/ExpQueryResults/zorba/fulltext/ft-same-sentence-false-2.xml.res'
--- test/rbkt/ExpQueryResults/zorba/fulltext/ft-same-sentence-false-2.xml.res 1970-01-01 00:00:00 +0000
+++ test/rbkt/ExpQueryResults/zorba/fulltext/ft-same-sentence-false-2.xml.res 2011-12-06 00:10:31 +0000
@@ -0,0 +1,1 @@
+false
=== modified file 'test/rbkt/Queries/CMakeLists.txt'
--- test/rbkt/Queries/CMakeLists.txt 2011-10-26 13:43:15 +0000
+++ test/rbkt/Queries/CMakeLists.txt 2011-12-06 00:10:31 +0000
@@ -294,3 +294,4 @@
EXPECTED_FAILURE(test/rbkt/zorba/reference/reference_5 868640)
+EXPECTED_FAILURE(test/rbkt/zorba/fulltext/ft-same-sentence-false-2 897800)
=== added file 'test/rbkt/Queries/zorba/fulltext/ft-same-sentence-false-2.xq'
--- test/rbkt/Queries/zorba/fulltext/ft-same-sentence-false-2.xq 1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/fulltext/ft-same-sentence-false-2.xq 2011-12-06 00:10:31 +0000
@@ -0,0 +1,2 @@
+let $x := <msg>hello. world</msg>
+return $x contains text "hello" ftand "world" same sentence
Follow ups