zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #09726
[Merge] lp:~zorba-coders/zorba/feature-ft_module into lp:zorba
Paul J. Lucas has proposed merging lp:~zorba-coders/zorba/feature-ft_module into lp:zorba.
Requested reviews:
Matthias Brantner (matthias-brantner)
Paul J. Lucas (paul-lucas)
Related bugs:
Bug #944795 in Zorba: "XQDoc doesn't handle & in URLs"
https://bugs.launchpad.net/zorba/+bug/944795
For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/feature-ft_module/+merge/105913
Renamed Tokenizer::Numbers to Tokenizer::State. The rename is being done now (just prior to the 2.5 release) so that the type has a better name ahead of the forthcoming addition of the ability to tokenize using include/exclude Item lists. At that time, State will most likely be expanded to include additional state information beyond just the token, sentence, and paragraph numbers, hence the name change.
--
https://code.launchpad.net/~zorba-coders/zorba/feature-ft_module/+merge/105913
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'include/zorba/tokenizer.h'
--- include/zorba/tokenizer.h 2012-05-03 12:31:51 +0000
+++ include/zorba/tokenizer.h 2012-05-16 00:57:21 +0000
@@ -48,9 +48,10 @@
/////////////////////////////////////////////////////////////////////////////
/**
- * A %Numbers contains the current token, sentence, and paragraph numbers.
+ * A %State contains inter-Tokenizer state, currently the current token,
+ * sentence, and paragraph numbers.
*/
- struct Numbers {
+ struct State {
typedef Tokenizer::size_type value_type;
value_type token; ///< Token number.
@@ -60,7 +61,7 @@
/**
* Default constructor.
*/
- Numbers();
+ State();
};
/////////////////////////////////////////////////////////////////////////////
@@ -125,7 +126,7 @@
/**
* If \c true, XML processing instructions separate tokens. For example,
- * <code>net<?PI pi?>work</code> would be 2 tokens instead of 1.
+ * <code>net&lt;?PI pi?&gt;work</code> would be 2 tokens instead of 1.
*/
bool processing_instructions_separate_tokens;
@@ -162,18 +163,18 @@
virtual void destroy() const = 0;
/**
- * Gets this %Tokenizer's associated Numbers.
+ * Gets this %Tokenizer's associated State.
*
- * @return Returns said Numbers.
+ * @return Returns said State.
*/
- Numbers& numbers();
+ State& state();
/**
- * Gets this %Tokenizer's associated Numbers.
+ * Gets this %Tokenizer's associated State.
*
- * @return Returns said Numbers.
+ * @return Returns said State.
*/
- Numbers const& numbers() const;
+ State const& state() const;
/**
* Tokenizes the given node.
@@ -207,9 +208,9 @@
/**
* Constructs a %Tokenizer.
*
- * @param numbers the Numbers to use.
+ * @param state the State to use.
*/
- Tokenizer( Numbers &numbers );
+ Tokenizer( State &state );
/**
* Destroys a %Tokenizer.
@@ -255,18 +256,18 @@
Callback &callback, bool tokenize_acp );
private:
- Numbers *numbers_;
+ State *state_;
};
-inline Tokenizer::Tokenizer( Numbers &numbers ) : numbers_( &numbers ) {
-}
-
-inline Tokenizer::Numbers& Tokenizer::numbers() {
- return *numbers_;
-}
-
-inline Tokenizer::Numbers const& Tokenizer::numbers() const {
- return *numbers_;
+inline Tokenizer::Tokenizer( State &state ) : state_( &state ) {
+}
+
+inline Tokenizer::State& Tokenizer::state() {
+ return *state_;
+}
+
+inline Tokenizer::State const& Tokenizer::state() const {
+ return *state_;
}
inline void Tokenizer::tokenize_node( Item const &item,
@@ -288,13 +289,13 @@
* Creates a new %Tokenizer.
*
* @param lang The language of the text that the tokenizer will tokenize.
- * @param numbers The Numbers to use. If \c null, \a t is not set.
+ * @param state The State to use. If \c null, \a t is not set.
* @param t If not \c null, set to point to a Tokenizer for \a lang.
* @return Returns \c true only if this provider can provide a tokenizer for
* \a lang.
*/
virtual bool getTokenizer( locale::iso639_1::type lang,
- Tokenizer::Numbers *numbers = 0,
+ Tokenizer::State *state = 0,
Tokenizer::ptr *t = 0 ) const = 0;
};
=== modified file 'src/runtime/full_text/apply.cpp'
--- src/runtime/full_text/apply.cpp 2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/apply.cpp 2012-05-16 00:57:21 +0000
@@ -1251,11 +1251,11 @@
FTTokenSeqIterator::FTTokens synonyms;
thesaurus_callback cb( qt0.pos(), qt0.lang(), synonyms );
- Tokenizer::Numbers t_num;
+ Tokenizer::State t_state;
TokenizerProvider const *const provider = GENV_STORE.getTokenizerProvider();
ZORBA_ASSERT( provider );
Tokenizer::ptr tokenizer;
- if ( !provider->getTokenizer( qt0.lang(), &t_num, &tokenizer ) )
+ if ( !provider->getTokenizer( qt0.lang(), &t_state, &tokenizer ) )
throw XQUERY_EXCEPTION(
err::FTST0009,
ERROR_PARAMS(
=== modified file 'src/runtime/full_text/ft_module_impl.cpp'
--- src/runtime/full_text/ft_module_impl.cpp 2012-05-15 21:13:21 +0000
+++ src/runtime/full_text/ft_module_impl.cpp 2012-05-16 00:57:21 +0000
@@ -552,7 +552,7 @@
zstring base_uri;
store::Item_t item;
iso639_1::type lang;
- Tokenizer::Numbers no;
+ Tokenizer::State t_state;
store::NsBindings const ns_bindings;
TokenizerProvider const *tokenizer_provider;
store::Item_t type_name;
@@ -574,7 +574,7 @@
tokenizer_provider = GENV_STORE.getTokenizerProvider();
ZORBA_ASSERT( tokenizer_provider );
state->doc_tokens_ =
- state->doc_item_->getTokens( *tokenizer_provider, no, lang );
+ state->doc_item_->getTokens( *tokenizer_provider, t_state, lang );
while ( state->doc_tokens_->hasNext() ) {
FTToken const *token;
@@ -667,7 +667,7 @@
store::Item_t element, item, junk, name;
zstring base_uri;
iso639_1::type lang;
- Tokenizer::Numbers no;
+ Tokenizer::State t_state;
store::NsBindings const ns_bindings;
Tokenizer::ptr tokenizer;
store::Item_t type_name;
@@ -689,7 +689,7 @@
tokenizer_provider = GENV_STORE.getTokenizerProvider();
ZORBA_ASSERT( tokenizer_provider );
- if ( !tokenizer_provider->getTokenizer( lang, &no, &tokenizer ) )
+ if ( !tokenizer_provider->getTokenizer( lang, &t_state, &tokenizer ) )
throw XQUERY_EXCEPTION(
err::FTST0009 /* lang not supported */,
ERROR_PARAMS(
@@ -826,9 +826,9 @@
TokenizerProvider const *const tokenizer_provider =
GENV_STORE.getTokenizerProvider();
ZORBA_ASSERT( tokenizer_provider );
- Tokenizer::Numbers no;
+ Tokenizer::State t_state;
Tokenizer::ptr tokenizer;
- if ( !tokenizer_provider->getTokenizer( lang, &no, &tokenizer ) )
+ if ( !tokenizer_provider->getTokenizer( lang, &t_state, &tokenizer ) )
throw XQUERY_EXCEPTION(
err::FTST0009 /* lang not supported */,
ERROR_PARAMS(
=== modified file 'src/runtime/full_text/ftcontains_visitor.cpp'
--- src/runtime/full_text/ftcontains_visitor.cpp 2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/ftcontains_visitor.cpp 2012-05-16 00:57:21 +0000
@@ -426,9 +426,9 @@
// actual query.
//
while ( PlanIterator::consumeNext( item, plan_iter, plan_state_ ) ) {
- Tokenizer::Numbers no;
+ Tokenizer::State t_state;
query_item_t const qi(
- item->getTokens( tokenizer_provider, no, lang, wildcards )
+ item->getTokens( tokenizer_provider, t_state, lang, wildcards )
);
if ( qi->hasNext() )
query_items.push_back( qi );
=== modified file 'src/runtime/full_text/full_text_impl.cpp'
--- src/runtime/full_text/full_text_impl.cpp 2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/full_text_impl.cpp 2012-05-16 00:57:21 +0000
@@ -84,9 +84,9 @@
tokenizer_provider = GENV_STORE.getTokenizerProvider();
while ( !ftcontains && consumeNext( doc_item, search_ctx, plan_state ) ) {
- Tokenizer::Numbers no;
+ Tokenizer::State t_state;
FTTokenIterator_t doc_tokens(
- doc_item->getTokens( *tokenizer_provider, no, lang )
+ doc_item->getTokens( *tokenizer_provider, t_state, lang )
);
store::Item_t ignore_item;
if ( ftignore )
=== modified file 'src/runtime/full_text/icu_tokenizer.cpp'
--- src/runtime/full_text/icu_tokenizer.cpp 2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/icu_tokenizer.cpp 2012-05-16 00:57:21 +0000
@@ -130,8 +130,8 @@
///////////////////////////////////////////////////////////////////////////////
-ICU_Tokenizer::ICU_Tokenizer( iso639_1::type lang, Numbers &no ) :
- Tokenizer( no ),
+ICU_Tokenizer::ICU_Tokenizer( iso639_1::type lang, State &state ) :
+ Tokenizer( state ),
lang_( lang )
{
Locale const &icu_locale = get_icu_locale_for( lang );
@@ -381,9 +381,9 @@
cout << " setting token" << endl;
# endif
t.set(
- utf8_buf, utf8_len, numbers().token, numbers().sent, numbers().para
+ utf8_buf, utf8_len, state().token, state().sent, state().para
);
- ++numbers().token;
+ ++state().token;
}
}
@@ -408,7 +408,7 @@
// The addition of the "if" fixes:
// https://bugs.launchpad.net/bugs/863320
if ( sent_end != BreakIterator::DONE )
- ++numbers().sent;
+ ++state().sent;
}
} // while
@@ -419,7 +419,7 @@
t.send( item, callback );
// Incrementing "sent" here fixes:
// https://bugs.launchpad.net/bugs/897800
- ++numbers().sent;
+ ++state().sent;
#if DEBUG_TOKENIZER
cout << "--------------------\n";
#endif /* DEBUG_TOKENIZER */
@@ -428,13 +428,13 @@
///////////////////////////////////////////////////////////////////////////////
bool ICU_TokenizerProvider::getTokenizer( iso639_1::type lang,
- Tokenizer::Numbers *num,
+ Tokenizer::State *state,
Tokenizer::ptr *t ) const {
for ( int32_t n = ubrk_countAvailable(), i = 0; i < n; ++i ) {
if ( char const *const icu_locale = ubrk_getAvailable( i ) )
if ( lang == find_lang( icu_locale ) ) {
- if ( num && t )
- t->reset( new ICU_Tokenizer( lang, *num ) );
+ if ( state && t )
+ t->reset( new ICU_Tokenizer( lang, *state ) );
return true;
}
}
=== modified file 'src/runtime/full_text/icu_tokenizer.h'
--- src/runtime/full_text/icu_tokenizer.h 2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/icu_tokenizer.h 2012-05-16 00:57:21 +0000
@@ -40,9 +40,9 @@
* Constructs an %ICU_Tokenizer.
*
* @param lang The language of the text that the tokenizer will tokenize.
- * @param no The Numbers to use.
+ * @param state The State to use.
*/
- ICU_Tokenizer( locale::iso639_1::type lang, Numbers &no );
+ ICU_Tokenizer( locale::iso639_1::type lang, State &state );
~ICU_Tokenizer();
@@ -67,7 +67,7 @@
ICU_TokenizerProvider() { } // needed to work-around compiler bug
// inherited
- bool getTokenizer( locale::iso639_1::type, Tokenizer::Numbers* = 0,
+ bool getTokenizer( locale::iso639_1::type, Tokenizer::State* = 0,
Tokenizer::ptr* = 0 ) const;
};
=== modified file 'src/runtime/full_text/latin_tokenizer.cpp'
--- src/runtime/full_text/latin_tokenizer.cpp 2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/latin_tokenizer.cpp 2012-05-16 00:57:21 +0000
@@ -242,12 +242,12 @@
///////////////////////////////////////////////////////////////////////////////
bool LatinTokenizerProvider::getTokenizer( iso639_1::type lang,
- Tokenizer::Numbers *num,
+ Tokenizer::State *state,
Tokenizer::ptr *t ) const {
switch ( lang ) {
case iso639_1::en:
- if ( num && t )
- t->reset( new LatinTokenizer( *num ) );
+ if ( state && t )
+ t->reset( new LatinTokenizer( *state ) );
return true;
default:
return false;
=== modified file 'src/runtime/full_text/latin_tokenizer.h'
--- src/runtime/full_text/latin_tokenizer.h 2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/latin_tokenizer.h 2012-05-16 00:57:21 +0000
@@ -34,7 +34,7 @@
*/
class LatinTokenizer : public Tokenizer {
public:
- LatinTokenizer( Numbers &num ) : Tokenizer( num ) { }
+ LatinTokenizer( State &state ) : Tokenizer( state ) { }
// inherited
void destroy() const;
@@ -66,7 +66,7 @@
class LatinTokenizerProvider : public TokenizerProvider {
public:
// inherited
- bool getTokenizer( locale::iso639_1::type, Tokenizer::Numbers* = 0,
+ bool getTokenizer( locale::iso639_1::type, Tokenizer::State* = 0,
Tokenizer::ptr* = 0 ) const;
};
=== modified file 'src/runtime/full_text/tokenizer.cpp'
--- src/runtime/full_text/tokenizer.cpp 2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/tokenizer.cpp 2012-05-16 00:57:21 +0000
@@ -59,7 +59,7 @@
void Tokenizer::item( Item const &item, bool entering ) {
if ( entering && item.isNode() &&
item.getNodeKind() == store::StoreConsts::elementNode ) {
- ++numbers().para;
+ ++state().para;
}
}
@@ -78,7 +78,7 @@
if ( find_lang_attribute( item, &lang ) ) {
TokenizerProvider const *const p = GENV_STORE.getTokenizerProvider();
ZORBA_ASSERT( p );
- if ( !p->getTokenizer( lang, numbers_, &t_ptr ) )
+ if ( !p->getTokenizer( lang, state_, &t_ptr ) )
break;
t_raw = t_ptr.get();
}
@@ -109,7 +109,7 @@
}
}
-Tokenizer::Numbers::Numbers() {
+Tokenizer::State::State() {
token = para = 0;
sent = 1;
}
=== modified file 'src/store/api/item.h'
--- src/store/api/item.h 2012-05-03 12:31:51 +0000
+++ src/store/api/item.h 2012-05-16 00:57:21 +0000
@@ -838,13 +838,13 @@
* Gets the tokens for this item.
*
* @param provider The TokenizerProvider to use.
- * @param numbers The Tokenizer::Numbers to use.
+ * @param state The Tokenizer::State to use.
* @param lang The language to use for tokenization.
* @param wildcards If \c true, allow XQuery wildcard syntax.
* @return Returns an iterator over the tokens.
*/
virtual FTTokenIterator_t
- getTokens(TokenizerProvider const &provider, Tokenizer::Numbers &numbers,
+ getTokens(TokenizerProvider const &provider, Tokenizer::State &state,
locale::iso639_1::type lang, bool wildcards = false) const;
#endif /* ZORBA_NO_FULL_TEXT */
=== modified file 'src/store/naive/atomic_items.cpp'
--- src/store/naive/atomic_items.cpp 2012-05-15 21:12:27 +0000
+++ src/store/naive/atomic_items.cpp 2012-05-16 00:57:21 +0000
@@ -1651,7 +1651,7 @@
#ifndef ZORBA_NO_FULL_TEXT
FTTokenIterator_t StringItem::getTokens(
TokenizerProvider const &provider,
- Tokenizer::Numbers &numbers,
+ Tokenizer::State &state,
iso639_1::type lang,
bool wildcards ) const
{
@@ -1660,7 +1660,7 @@
AtomicItemTokenizerCallback callback( *tokens );
Tokenizer::ptr tokenizer;
- if ( provider.getTokenizer( lang, &numbers, &tokenizer ) )
+ if ( provider.getTokenizer( lang, &state, &tokenizer ) )
tokenizer->tokenize_string(
theValue.data(), theValue.size(), lang, wildcards, callback
);
=== modified file 'src/store/naive/atomic_items.h'
--- src/store/naive/atomic_items.h 2012-05-08 01:09:52 +0000
+++ src/store/naive/atomic_items.h 2012-05-16 00:57:21 +0000
@@ -852,7 +852,7 @@
#ifndef ZORBA_NO_FULL_TEXT
FTTokenIterator_t getTokens(
TokenizerProvider const&,
- Tokenizer::Numbers&,
+ Tokenizer::State&,
locale::iso639_1::type,
bool = false ) const;
#endif /* ZORBA_NO_FULL_TEXT */
=== modified file 'src/store/naive/item.cpp'
--- src/store/naive/item.cpp 2012-05-03 12:31:51 +0000
+++ src/store/naive/item.cpp 2012-05-16 00:57:21 +0000
@@ -354,7 +354,7 @@
#ifndef ZORBA_NO_FULL_TEXT
FTTokenIterator_t
-Item::getTokens( TokenizerProvider const&, Tokenizer::Numbers&,
+Item::getTokens( TokenizerProvider const&, Tokenizer::State&,
locale::iso639_1::type, bool ) const
{
throw ZORBA_EXCEPTION(
=== modified file 'src/store/naive/node_items.cpp'
--- src/store/naive/node_items.cpp 2012-05-08 23:31:37 +0000
+++ src/store/naive/node_items.cpp 2012-05-16 00:57:21 +0000
@@ -4822,7 +4822,7 @@
FTTokenIterator_t
AttributeNode::getTokens( TokenizerProvider const &provider,
- Tokenizer::Numbers &numbers, iso639_1::type lang,
+ Tokenizer::State &state, iso639_1::type lang,
bool ) const
{
FTTokenStore &token_store = getTree()->getTokenStore();
@@ -4838,7 +4838,7 @@
zorba::Item const api_attr( this );
Tokenizer::ptr tokenizer;
- if ( provider.getTokenizer( lang, &numbers, &tokenizer ) ) {
+ if ( provider.getTokenizer( lang, &state, &tokenizer ) ) {
tokenizer->tokenize_node( api_attr, lang, callback );
token_store.putAttr( this, att_tokens );
}
@@ -4907,7 +4907,7 @@
FTTokenIterator_t
XmlNode::getTokens( TokenizerProvider const &provider,
- Tokenizer::Numbers &numbers, iso639_1::type lang,
+ Tokenizer::State &state, iso639_1::type lang,
bool ) const
{
FTTokenStore &token_store = getTree()->getTokenStore();
@@ -4918,7 +4918,7 @@
zorba::Item const api_root( getRoot() );
XmlNodeTokenizerCallback callback( token_store );
Tokenizer::ptr tokenizer;
- if ( provider.getTokenizer( lang, &numbers, &tokenizer ) )
+ if ( provider.getTokenizer( lang, &state, &tokenizer ) )
tokenizer->tokenize_node( api_root, lang, callback );
}
=== modified file 'src/store/naive/node_items.h'
--- src/store/naive/node_items.h 2012-05-03 12:31:51 +0000
+++ src/store/naive/node_items.h 2012-05-16 00:57:21 +0000
@@ -555,7 +555,7 @@
#ifndef ZORBA_NO_FULL_TEXT
FTTokenIterator_t getTokens(
TokenizerProvider const&,
- Tokenizer::Numbers&,
+ Tokenizer::State&,
locale::iso639_1::type,
bool = false ) const;
#endif /* ZORBA_NO_FULL_TEXT */
@@ -1233,7 +1233,7 @@
isPrecedingSibling(const store::Item_t&) const { return false; }
#ifndef ZORBA_NO_FULL_TEXT
- FTTokenIterator_t getTokens( TokenizerProvider const&, Tokenizer::Numbers&,
+ FTTokenIterator_t getTokens( TokenizerProvider const&, Tokenizer::State&,
locale::iso639_1::type,
bool wildcards = false ) const;
#endif /* ZORBA_NO_FULL_TEXT */
=== modified file 'src/unit_tests/tokenizer.cpp'
--- src/unit_tests/tokenizer.cpp 2012-05-03 12:31:51 +0000
+++ src/unit_tests/tokenizer.cpp 2012-05-16 00:57:21 +0000
@@ -60,7 +60,7 @@
class TestTokenizer : public Tokenizer {
public:
- TestTokenizer( Numbers &num ) : Tokenizer( num ) { }
+ TestTokenizer( State &state ) : Tokenizer( state ) { }
~TestTokenizer();
// inherited
@@ -125,7 +125,7 @@
item.getNodeName( qname );
if ( ::binary_search( block_elements, end, qname.getLocalName().c_str(),
less<char const*>() ) ) {
- ++numbers().para;
+ ++state().para;
}
}
}
@@ -291,7 +291,7 @@
// no break;
case '!':
case '?':
- ++numbers().sent;
+ ++state().sent;
}
} // for
@@ -324,19 +324,19 @@
Callback &callback, Item const *item ) {
if ( !token.empty() ) {
#if PRINT_TOKENS
- cout << "t=" << setw(2) << numbers().token
- << ", s=" << setw(2) << numbers().sent
- << ", p=" << setw(2) << numbers().para
+ cout << "t=" << setw(2) << state().token
+ << ", s=" << setw(2) << state().sent
+ << ", p=" << setw(2) << state().para
<< ": \"" << token << "\"\n";
#endif /* PRINT_TOKENS */
- check_token( token.c_str(), numbers().token );
+ check_token( token.c_str(), state().token );
callback.token(
token.data(), token.size(), lang,
- numbers().token, numbers().sent, numbers().para, item
+ state().token, state().sent, state().para, item
);
- ++numbers().token;
+ ++state().token;
return true;
}
return false;
@@ -347,15 +347,15 @@
class TestTokenizerProvider : public TokenizerProvider {
public:
// inherited
- bool getTokenizer( iso639_1::type, Tokenizer::Numbers* = 0,
+ bool getTokenizer( iso639_1::type, Tokenizer::State* = 0,
Tokenizer::ptr* = 0 ) const;
};
bool TestTokenizerProvider::getTokenizer( iso639_1::type lang,
- Tokenizer::Numbers *num,
+ Tokenizer::State *state,
Tokenizer::ptr *t ) const {
- if ( num && t )
- t->reset( new TestTokenizer( *num ) );
+ if ( state && t )
+ t->reset( new TestTokenizer( *state ) );
return true;
}
Follow ups