zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #03128
[Merge] lp:~zorba-coders/zorba/tokenize into lp:zorba
Matthias Brantner has proposed merging lp:~zorba-coders/zorba/tokenize into lp:zorba.
Requested reviews:
Matthias Brantner (matthias-brantner)
Paul J. Lucas (paul-lucas)
William Candillon (wcandillon)
Related bugs:
Bug #898074 in Zorba: "fn:tokenize() doesn't stream"
https://bugs.launchpad.net/zorba/+bug/898074
For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/tokenize/+merge/86835
Implementation of a string:split function that does not accept regular expressions but allows for streamable processing of the input (resolves bug #898074).
--
https://code.launchpad.net/~zorba-coders/zorba/tokenize/+merge/86835
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'ChangeLog'
--- ChangeLog 2011-12-23 19:38:53 +0000
+++ ChangeLog 2011-12-23 21:56:35 +0000
@@ -12,6 +12,8 @@
set multiple times via the c++ api).
* Fixed bug #905050 (setting and getting the context item type via the c++ api)
* Added createDayTimeDuration, createYearMonthDuration, createDocumentNode, createCommentNode, createPiNode to api's ItemFactory.
+ * Added split function to the string module that allows for streamable tokenization but doesn't have regular expression
+ support.
* zerr is not predeclared anymore to be http://www.zorba-xquery.com/errors
version 2.1
=== modified file 'modules/com/zorba-xquery/www/modules/CMakeLists.txt'
--- modules/com/zorba-xquery/www/modules/CMakeLists.txt 2011-12-21 14:40:33 +0000
+++ modules/com/zorba-xquery/www/modules/CMakeLists.txt 2011-12-23 21:56:35 +0000
@@ -58,7 +58,7 @@
URI "http://www.zorba-xquery.com/modules/reflection")
DECLARE_ZORBA_MODULE(FILE schema.xq VERSION 2.0
URI "http://www.zorba-xquery.com/modules/schema")
-DECLARE_ZORBA_MODULE(FILE string.xq VERSION 2.0
+DECLARE_ZORBA_MODULE(FILE string.xq VERSION 2.1
URI "http://www.zorba-xquery.com/modules/string")
DECLARE_ZORBA_MODULE(FILE xml.xq VERSION 2.0
URI "http://www.zorba-xquery.com/modules/xml")
=== modified file 'modules/com/zorba-xquery/www/modules/string.xq'
--- modules/com/zorba-xquery/www/modules/string.xq 2011-08-03 15:12:40 +0000
+++ modules/com/zorba-xquery/www/modules/string.xq 2011-12-23 21:56:35 +0000
@@ -25,7 +25,7 @@
:)
module namespace string = "http://www.zorba-xquery.com/modules/string";
declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
-declare option ver:module-version "2.0";
+declare option ver:module-version "2.1";
(:~
: This function materializes a streamable string.
@@ -63,3 +63,23 @@
:
:)
declare function string:is-streamable($s as xs:string) as xs:boolean external;
+
+(:~
+ : Returns a sequence of strings constructed by splitting the input wherever the given
+ : separator is found.
+ :
+ : The function is different from fn:tokenize. It doesn't allow
+ : the separator to be a regular expression. This restriction allows for a more
+ : performant implementation. Specifically, the function processes
+ : streamable strings as input in a streamable way, which is particularly useful
+ : for tokenizing huge strings (e.g. if returned by the file module's read-text
+ : function).
+ :
+ : @param $s the input string to split
+ : @param $separator the separator used for splitting the input string $s
+ :
+ : @return a sequence of strings constructed by splitting the input
+ :)
+declare function string:split(
+ $s as xs:string,
+ $separator as xs:string) as xs:string* external;
=== modified file 'src/functions/pregenerated/func_strings.cpp'
--- src/functions/pregenerated/func_strings.cpp 2011-12-21 14:40:33 +0000
+++ src/functions/pregenerated/func_strings.cpp 2011-12-23 21:56:35 +0000
@@ -320,6 +320,16 @@
return new StringIsStreamableIterator(sctx, loc, argv);
}
+PlanIter_t fn_zorba_string_split::codegen(
+ CompilerCB*,
+ static_context* sctx,
+ const QueryLoc& loc,
+ std::vector<PlanIter_t>& argv,
+ AnnotationHolder& ann) const
+{
+ return new StringSplitIterator(sctx, loc, argv);
+}
+
void populate_context_strings(static_context* sctx)
{
{
@@ -890,6 +900,19 @@
}
+
+ {
+
+
+ DECL_WITH_KIND(sctx, fn_zorba_string_split,
+ (createQName("http://www.zorba-xquery.com/modules/string","","split"),
+ GENV_TYPESYSTEM.STRING_TYPE_ONE,
+ GENV_TYPESYSTEM.STRING_TYPE_ONE,
+ GENV_TYPESYSTEM.STRING_TYPE_STAR),
+ FunctionConsts::FN_ZORBA_STRING_SPLIT_2);
+
+ }
+
}
=== modified file 'src/functions/pregenerated/func_strings.h'
--- src/functions/pregenerated/func_strings.h 2011-12-22 14:14:53 +0000
+++ src/functions/pregenerated/func_strings.h 2011-12-23 21:56:35 +0000
@@ -481,6 +481,21 @@
};
+//fn-zorba-string:split
+class fn_zorba_string_split : public function
+{
+public:
+ fn_zorba_string_split(const signature& sig, FunctionConsts::FunctionKind kind)
+ :
+ function(sig, kind)
+ {
+
+ }
+
+ CODEGEN_DECL();
+};
+
+
} //namespace zorba
=== modified file 'src/functions/pregenerated/function_enum.h'
--- src/functions/pregenerated/function_enum.h 2011-12-21 14:40:33 +0000
+++ src/functions/pregenerated/function_enum.h 2011-12-23 21:56:35 +0000
@@ -371,6 +371,7 @@
FN_ANALYZE_STRING_3,
FN_ZORBA_STRING_MATERIALIZE_1,
FN_ZORBA_STRING_IS_STREAMABLE_1,
+ FN_ZORBA_STRING_SPLIT_2,
FN_ZORBA_XQDOC_XQDOC_1,
FN_ZORBA_XQDOC_XQDOC_CONTENT_1,
=== modified file 'src/runtime/spec/strings/strings.xml'
--- src/runtime/spec/strings/strings.xml 2011-12-21 14:40:33 +0000
+++ src/runtime/spec/strings/strings.xml 2011-12-23 21:56:35 +0000
@@ -729,4 +729,35 @@
</zorba:iterator>
+<!--
+/*******************************************************************************
+ * string:split
+********************************************************************************/
+-->
+<zorba:iterator name="StringSplitIterator">
+
+ <zorba:description author="Matthias Brantner">
+ string:split
+ </zorba:description>
+
+ <zorba:function>
+ <zorba:signature localname="split" prefix="fn-zorba-string">
+ <zorba:param>xs:string</zorba:param>
+ <zorba:param>xs:string</zorba:param>
+ <zorba:output>xs:string*</zorba:output>
+ </zorba:signature>
+ </zorba:function>
+
+ <zorba:state>
+ <zorba:member type="zstring" name="theSeparator"
+ brief="separator for the tokenization"/>
+ <zorba:member type="std::istream*" name="theIStream"
+ brief="the remaining string (if the input is streamable)"/>
+ <zorba:member type="zstring" name="theInput"
+ brief="the string to tokenize (if the input is not streamable)"/>
+ <zorba:member type="size_t" name="theNextStartPos" defaultValue="0"/>
+ </zorba:state>
+
+</zorba:iterator>
+
</zorba:iterators>
=== modified file 'src/runtime/strings/pregenerated/strings.cpp'
--- src/runtime/strings/pregenerated/strings.cpp 2011-12-21 14:40:33 +0000
+++ src/runtime/strings/pregenerated/strings.cpp 2011-12-23 21:56:35 +0000
@@ -830,6 +830,48 @@
// </StringIsStreamableIterator>
+// <StringSplitIterator>
+const char* StringSplitIterator::class_name_str = "StringSplitIterator";
+StringSplitIterator::class_factory<StringSplitIterator>
+StringSplitIterator::g_class_factory;
+
+const serialization::ClassVersion
+StringSplitIterator::class_versions[] ={{ 1, 0x000905, false}};
+
+const int StringSplitIterator::class_versions_count =
+sizeof(StringSplitIterator::class_versions)/sizeof(struct serialization::ClassVersion);
+
+void StringSplitIterator::accept(PlanIterVisitor& v) const {
+ v.beginVisit(*this);
+
+ std::vector<PlanIter_t>::const_iterator lIter = theChildren.begin();
+ std::vector<PlanIter_t>::const_iterator lEnd = theChildren.end();
+ for ( ; lIter != lEnd; ++lIter ){
+ (*lIter)->accept(v);
+ }
+
+ v.endVisit(*this);
+}
+
+StringSplitIterator::~StringSplitIterator() {}
+
+StringSplitIteratorState::StringSplitIteratorState() {}
+
+StringSplitIteratorState::~StringSplitIteratorState() {}
+
+
+void StringSplitIteratorState::init(PlanState& planState) {
+ PlanIteratorState::init(planState);
+ theNextStartPos = 0;
+}
+
+void StringSplitIteratorState::reset(PlanState& planState) {
+ PlanIteratorState::reset(planState);
+ theNextStartPos = 0;
+}
+// </StringSplitIterator>
+
+
}
=== modified file 'src/runtime/strings/pregenerated/strings.h'
--- src/runtime/strings/pregenerated/strings.h 2011-12-21 14:40:33 +0000
+++ src/runtime/strings/pregenerated/strings.h 2011-12-23 21:56:35 +0000
@@ -1075,6 +1075,58 @@
};
+/**
+ *
+ * string:split
+ *
+ * Author: Matthias Brantner
+ */
+class StringSplitIteratorState : public PlanIteratorState
+{
+public:
+ zstring theSeparator; //separator for the tokenization
+ std::istream* theIStream; //the remaining string (if the input is streamable)
+ zstring theInput; //the string to tokenize (if the input is not streamable)
+ size_t theNextStartPos; //
+
+ StringSplitIteratorState();
+
+ ~StringSplitIteratorState();
+
+ void init(PlanState&);
+ void reset(PlanState&);
+};
+
+class StringSplitIterator : public NaryBaseIterator<StringSplitIterator, StringSplitIteratorState>
+{
+public:
+ SERIALIZABLE_CLASS(StringSplitIterator);
+
+ SERIALIZABLE_CLASS_CONSTRUCTOR2T(StringSplitIterator,
+ NaryBaseIterator<StringSplitIterator, StringSplitIteratorState>);
+
+ void serialize( ::zorba::serialization::Archiver& ar)
+ {
+ serialize_baseclass(ar,
+ (NaryBaseIterator<StringSplitIterator, StringSplitIteratorState>*)this);
+ }
+
+ StringSplitIterator(
+ static_context* sctx,
+ const QueryLoc& loc,
+ std::vector<PlanIter_t>& children)
+ :
+ NaryBaseIterator<StringSplitIterator, StringSplitIteratorState>(sctx, loc, children)
+ {}
+
+ virtual ~StringSplitIterator();
+
+ void accept(PlanIterVisitor& v) const;
+
+ bool nextImpl(store::Item_t& result, PlanState& aPlanState) const;
+};
+
+
}
#endif
/*
=== modified file 'src/runtime/strings/strings_impl.cpp'
--- src/runtime/strings/strings_impl.cpp 2011-12-23 06:41:43 +0000
+++ src/runtime/strings/strings_impl.cpp 2011-12-23 21:56:35 +0000
@@ -140,6 +140,7 @@
p = ec;
if ( utf8::read( *state->theStream, ec ) == utf8::npos )
+ {
if ( state->theStream->good() ) {
//
// If read() failed but the stream state is good, it means that an
@@ -165,6 +166,7 @@
zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc )
);
}
+ }
state->theResult.clear();
state->theResult.push_back( utf8::next_char( p ) );
@@ -2284,5 +2286,133 @@
STACK_END(state);
}
+/**
+ *______________________________________________________________________
+ *
+ * http://www.zorba-xquery.com/modules/string
+ * string:split
+ */
+bool StringSplitIterator::nextImpl(
+ store::Item_t& result,
+ PlanState& planState) const
+{
+ store::Item_t item;
+ size_t lNewPos = 0;
+ zstring lToken;
+ zstring lPartialMatch;
+
+ StringSplitIteratorState* state;
+ DEFAULT_STACK_INIT(StringSplitIteratorState, state, planState);
+
+ // init phase, get input string and separator
+ consumeNext(item, theChildren[0].getp(), planState);
+
+ if (item->isStreamable())
+ {
+ state->theIStream = &item->getStream();
+ }
+ else
+ {
+ state->theIStream = 0;
+ item->getStringValue2(state->theInput);
+ }
+
+ consumeNext(item, theChildren[1].getp(), planState);
+
+ item->getStringValue2(state->theSeparator);
+
+ // working phase, do the tokenization
+ if (state->theIStream)
+ {
+ while ( !state->theIStream->eof() )
+ {
+ utf8::encoded_char_type ec;
+ memset( ec, '\0' , sizeof(ec) );
+ utf8::storage_type *p;
+ p = ec;
+
+ if ( utf8::read( *state->theIStream, ec ) != utf8::npos )
+ {
+ if (state->theSeparator.compare(lNewPos, 1, ec) == 0)
+ {
+ if (++lNewPos == state->theSeparator.length())
+ {
+ GENV_ITEMFACTORY->createString(result, lToken);
+ STACK_PUSH(true, state);
+ }
+ else
+ {
+ lPartialMatch.append(ec);
+ }
+ }
+ else
+ {
+ lToken.append(lPartialMatch);
+ lToken.append(ec);
+ }
+ }
+ else
+ {
+ if (state->theIStream->good())
+ {
+ char buf[ 6 /* bytes at most */ * 5 /* chars per byte */ ], *b = buf;
+ bool first = true;
+ for ( ; *p; ++p ) {
+ if ( first )
+ first = false;
+ else
+ *b++ = ',';
+ ::strcpy( b, "0x" ); b += 2;
+ ::sprintf( b, "%0hhX", *p ); b += 2;
+ }
+ throw XQUERY_EXCEPTION(
+ zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
+ ERROR_PARAMS( buf ),
+ ERROR_LOC( loc )
+ );
+ }
+ if (!lToken.empty())
+ {
+ GENV_ITEMFACTORY->createString(result, lToken);
+ STACK_PUSH(true, state);
+ }
+ break;
+ }
+ }
+ }
+ else
+ {
+ while (true)
+ {
+ if (state->theNextStartPos == zstring::npos)
+ {
+ break;
+ }
+
+ lNewPos =
+ state->theInput.find(state->theSeparator, state->theNextStartPos);
+ if (lNewPos != zstring::npos)
+ {
+ zstring lSubStr = state->theInput.substr(
+ state->theNextStartPos,
+ lNewPos - state->theNextStartPos);
+ GENV_ITEMFACTORY->createString(result, lSubStr);
+ state->theNextStartPos =
+ lNewPos==state->theInput.length() - state->theSeparator.length()
+ ? zstring::npos
+ : lNewPos + state->theSeparator.length();
+ }
+ else
+ {
+ zstring lSubStr = state->theInput.substr(state->theNextStartPos);
+ GENV_ITEMFACTORY->createString(result, lSubStr);
+ state->theNextStartPos = zstring::npos;
+ }
+ STACK_PUSH(true, state);
+ }
+ }
+
+ STACK_END(state);
+}
} // namespace zorba
/* vim:set et sw=2 ts=2: */
=== modified file 'src/runtime/visitors/pregenerated/planiter_visitor.h'
--- src/runtime/visitors/pregenerated/planiter_visitor.h 2011-12-21 14:40:33 +0000
+++ src/runtime/visitors/pregenerated/planiter_visitor.h 2011-12-23 21:56:35 +0000
@@ -582,6 +582,8 @@
class StringIsStreamableIterator;
+ class StringSplitIterator;
+
class XQDocIterator;
class XQDocContentIterator;
@@ -1423,6 +1425,9 @@
virtual void beginVisit ( const StringIsStreamableIterator& ) = 0;
virtual void endVisit ( const StringIsStreamableIterator& ) = 0;
+ virtual void beginVisit ( const StringSplitIterator& ) = 0;
+ virtual void endVisit ( const StringSplitIterator& ) = 0;
+
virtual void beginVisit ( const XQDocIterator& ) = 0;
virtual void endVisit ( const XQDocIterator& ) = 0;
=== modified file 'src/runtime/visitors/pregenerated/printer_visitor.cpp'
--- src/runtime/visitors/pregenerated/printer_visitor.cpp 2011-12-21 14:40:33 +0000
+++ src/runtime/visitors/pregenerated/printer_visitor.cpp 2011-12-23 21:56:35 +0000
@@ -3961,6 +3961,20 @@
// </StringIsStreamableIterator>
+// <StringSplitIterator>
+void PrinterVisitor::beginVisit ( const StringSplitIterator& a) {
+ thePrinter.startBeginVisit("StringSplitIterator", ++theId);
+ printCommons( &a, theId );
+ thePrinter.endBeginVisit( theId );
+}
+
+void PrinterVisitor::endVisit ( const StringSplitIterator& ) {
+ thePrinter.startEndVisit();
+ thePrinter.endEndVisit();
+}
+// </StringSplitIterator>
+
+
// <XQDocIterator>
void PrinterVisitor::beginVisit ( const XQDocIterator& a) {
thePrinter.startBeginVisit("XQDocIterator", ++theId);
=== modified file 'src/runtime/visitors/pregenerated/printer_visitor.h'
--- src/runtime/visitors/pregenerated/printer_visitor.h 2011-12-21 14:40:33 +0000
+++ src/runtime/visitors/pregenerated/printer_visitor.h 2011-12-23 21:56:35 +0000
@@ -876,6 +876,9 @@
void beginVisit( const StringIsStreamableIterator& );
void endVisit ( const StringIsStreamableIterator& );
+ void beginVisit( const StringSplitIterator& );
+ void endVisit ( const StringSplitIterator& );
+
void beginVisit( const XQDocIterator& );
void endVisit ( const XQDocIterator& );
=== added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize01.xml.res'
--- test/rbkt/ExpQueryResults/zorba/string/tokenize01.xml.res 1970-01-01 00:00:00 +0000
+++ test/rbkt/ExpQueryResults/zorba/string/tokenize01.xml.res 2011-12-23 21:56:35 +0000
@@ -0,0 +1,1 @@
+a d a d
=== added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize02.xml.res'
--- test/rbkt/ExpQueryResults/zorba/string/tokenize02.xml.res 1970-01-01 00:00:00 +0000
+++ test/rbkt/ExpQueryResults/zorba/string/tokenize02.xml.res 2011-12-23 21:56:35 +0000
@@ -0,0 +1,1 @@
+a a
=== added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize03.xml.res'
--- test/rbkt/ExpQueryResults/zorba/string/tokenize03.xml.res 1970-01-01 00:00:00 +0000
+++ test/rbkt/ExpQueryResults/zorba/string/tokenize03.xml.res 2011-12-23 21:56:35 +0000
@@ -0,0 +1,1 @@
+ d d
=== added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize04.xml.res'
--- test/rbkt/ExpQueryResults/zorba/string/tokenize04.xml.res 1970-01-01 00:00:00 +0000
+++ test/rbkt/ExpQueryResults/zorba/string/tokenize04.xml.res 2011-12-23 21:56:35 +0000
@@ -0,0 +1,1 @@
+abcd abcd
=== added file 'test/rbkt/Queries/zorba/string/token01.txt'
--- test/rbkt/Queries/zorba/string/token01.txt 1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/token01.txt 2011-12-23 21:56:35 +0000
@@ -0,0 +1,1 @@
+abcd
\ No newline at end of file
=== added file 'test/rbkt/Queries/zorba/string/token02.txt'
--- test/rbkt/Queries/zorba/string/token02.txt 1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/token02.txt 2011-12-23 21:56:35 +0000
@@ -0,0 +1,1 @@
+abc
\ No newline at end of file
=== added file 'test/rbkt/Queries/zorba/string/token03.txt'
--- test/rbkt/Queries/zorba/string/token03.txt 1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/token03.txt 2011-12-23 21:56:35 +0000
@@ -0,0 +1,1 @@
+bcd
\ No newline at end of file
=== added file 'test/rbkt/Queries/zorba/string/token04.txt'
--- test/rbkt/Queries/zorba/string/token04.txt 1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/token04.txt 2011-12-23 21:56:35 +0000
@@ -0,0 +1,1 @@
+abcd
\ No newline at end of file
=== added file 'test/rbkt/Queries/zorba/string/tokenize01.xq'
--- test/rbkt/Queries/zorba/string/tokenize01.xq 1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/tokenize01.xq 2011-12-23 21:56:35 +0000
@@ -0,0 +1,5 @@
+import module namespace f = "http://expath.org/ns/file";
+import module namespace s = "http://www.zorba-xquery.com/modules/string";
+
+s:split(f:read-text(fn:resolve-uri("token01.txt")), "bc"),
+s:split(s:materialize(f:read-text(fn:resolve-uri("token01.txt"))), "bc")
=== added file 'test/rbkt/Queries/zorba/string/tokenize02.xq'
--- test/rbkt/Queries/zorba/string/tokenize02.xq 1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/tokenize02.xq 2011-12-23 21:56:35 +0000
@@ -0,0 +1,5 @@
+import module namespace f = "http://expath.org/ns/file";
+import module namespace s = "http://www.zorba-xquery.com/modules/string";
+
+s:split(f:read-text(fn:resolve-uri("token02.txt")), "bc"),
+s:split(s:materialize(f:read-text(fn:resolve-uri("token02.txt"))), "bc")
=== added file 'test/rbkt/Queries/zorba/string/tokenize03.xq'
--- test/rbkt/Queries/zorba/string/tokenize03.xq 1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/tokenize03.xq 2011-12-23 21:56:35 +0000
@@ -0,0 +1,5 @@
+import module namespace f = "http://expath.org/ns/file";
+import module namespace s = "http://www.zorba-xquery.com/modules/string";
+
+s:split(f:read-text(fn:resolve-uri("token03.txt")), "bc"),
+s:split(s:materialize(f:read-text(fn:resolve-uri("token03.txt"))), "bc")
=== added file 'test/rbkt/Queries/zorba/string/tokenize04.xq'
--- test/rbkt/Queries/zorba/string/tokenize04.xq 1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/tokenize04.xq 2011-12-23 21:56:35 +0000
@@ -0,0 +1,5 @@
+import module namespace f = "http://expath.org/ns/file";
+import module namespace s = "http://www.zorba-xquery.com/modules/string";
+
+s:split(f:read-text(fn:resolve-uri("token04.txt")), "f"),
+s:split(s:materialize(f:read-text(fn:resolve-uri("token04.txt"))), "f")
Follow ups