zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #02758
[Merge] lp:~paul-lucas/zorba/bug-898075 into lp:zorba
Paul J. Lucas has proposed merging lp:~paul-lucas/zorba/bug-898075 into lp:zorba.
Requested reviews:
Paul J. Lucas (paul-lucas)
Related bugs:
Bug #898075 in Zorba: "fn:string-to-codepoints() doesn't stream"
https://bugs.launchpad.net/zorba/+bug/898075
For more details, see:
https://code.launchpad.net/~paul-lucas/zorba/bug-898075/+merge/85410
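Applied William's patch, then amended it to handle UTF-8 properly: each character is now read from the stream as a complete UTF-8 byte sequence, and an invalid byte sequence is reported as an error.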
--
https://code.launchpad.net/~paul-lucas/zorba/bug-898075/+merge/85410
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'include/zorba/pregenerated/diagnostic_list.h'
--- include/zorba/pregenerated/diagnostic_list.h 2011-11-15 08:23:20 +0000
+++ include/zorba/pregenerated/diagnostic_list.h 2011-12-12 23:19:26 +0000
@@ -458,6 +458,8 @@
extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZXQD0005_INVALID_KEY_FOR_MAP;
+extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE;
+
extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZAPI0002_XQUERY_COMPILATION_FAILED;
extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZAPI0003_XQUERY_NOT_COMPILED;
=== modified file 'modules/com/zorba-xquery/www/modules/pregenerated/errors.xq'
--- modules/com/zorba-xquery/www/modules/pregenerated/errors.xq 2011-11-15 08:23:20 +0000
+++ modules/com/zorba-xquery/www/modules/pregenerated/errors.xq 2011-12-12 23:19:26 +0000
@@ -217,6 +217,10 @@
(:~
:)
+declare variable $zerr:ZXQD0006 as xs:QName := fn:QName($zerr:NS, "zerr:ZXQD0006");
+
+(:~
+:)
declare variable $zerr:ZAPI0002 as xs:QName := fn:QName($zerr:NS, "zerr:ZAPI0002");
(:~
=== modified file 'src/diagnostics/diagnostic_en.xml'
--- src/diagnostics/diagnostic_en.xml 2011-12-07 20:46:23 +0000
+++ src/diagnostics/diagnostic_en.xml 2011-12-12 23:19:26 +0000
@@ -1722,6 +1722,10 @@
<value>key with type $1 not subtype or castable to target type $2 of map ($3)</value>
</diagnostic>
+ <diagnostic code="ZXQD0006" name="INVALID_UTF8_BYTE_SEQUENCE">
+ <value>"$1": invalid UTF-8 byte sequence</value>
+ </diagnostic>
+
<!--////////// Zorba API Errors ////////////////////////////////////////-->
<diagnostic code="ZAPI0002" name="XQUERY_COMPILATION_FAILED">
=== modified file 'src/diagnostics/pregenerated/diagnostic_list.cpp'
--- src/diagnostics/pregenerated/diagnostic_list.cpp 2011-11-15 08:23:20 +0000
+++ src/diagnostics/pregenerated/diagnostic_list.cpp 2011-12-12 23:19:26 +0000
@@ -666,6 +666,9 @@
ZorbaErrorCode ZXQD0005_INVALID_KEY_FOR_MAP( "ZXQD0005" );
+ZorbaErrorCode ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE( "ZXQD0006" );
+
+
ZorbaErrorCode ZAPI0002_XQUERY_COMPILATION_FAILED( "ZAPI0002" );
=== modified file 'src/diagnostics/pregenerated/dict_en.cpp'
--- src/diagnostics/pregenerated/dict_en.cpp 2011-12-01 16:19:52 +0000
+++ src/diagnostics/pregenerated/dict_en.cpp 2011-12-12 23:19:26 +0000
@@ -365,6 +365,7 @@
{ "ZXQD0003", "inconsistent options to the parse-xml-fragment() function: $1" },
{ "ZXQD0004", "invalid parameter: $1" },
{ "ZXQD0005", "key with type $1 not subtype or castable to target type $2 of map ($3)" },
+ { "ZXQD0006", "\"$1\": invalid UTF-8 byte sequence" },
{ "ZXQP0000", "no error" },
{ "ZXQP0001", "dynamic runtime error${: 1}" },
{ "ZXQP0002", "\"$1\": assertion failed" },
=== modified file 'src/runtime/spec/strings/strings.xml'
--- src/runtime/spec/strings/strings.xml 2011-12-01 11:02:25 +0000
+++ src/runtime/spec/strings/strings.xml 2011-12-12 23:19:26 +0000
@@ -57,7 +57,8 @@
<zorba:member type="xs_unsignedInt" name="theIterator"
brief="the current iterator"/>
<zorba:member type="checked_vector<xs_unsignedInt>" name="theResult"
- brief="the resulting vector"/>
+ brief="the resulting vector"/>
+ <zorba:member type="std::istream*" name="theStream" />
</zorba:state>
</zorba:iterator>
=== modified file 'src/runtime/strings/pregenerated/strings.h'
--- src/runtime/strings/pregenerated/strings.h 2011-12-01 11:02:25 +0000
+++ src/runtime/strings/pregenerated/strings.h 2011-12-12 23:19:26 +0000
@@ -82,6 +82,7 @@
public:
xs_unsignedInt theIterator; //the current iterator
checked_vector<xs_unsignedInt> theResult; //the resulting vector
+ std::istream* theStream; //
StringToCodepointsIteratorState();
=== modified file 'src/runtime/strings/strings_impl.cpp'
--- src/runtime/strings/strings_impl.cpp 2011-12-01 16:19:52 +0000
+++ src/runtime/strings/strings_impl.cpp 2011-12-12 23:19:26 +0000
@@ -120,22 +120,76 @@
if (consumeNext(item, theChildren [0].getp(), planState ))
{
- item->getStringValue2(inputStr);
-
- if (!inputStr.empty())
- {
- utf8::to_codepoints(inputStr, &state->theResult);
-
- while (state->theIterator < state->theResult.size())
- {
- GENV_ITEMFACTORY->createInteger(
- result,
- Integer(state->theResult[state->theIterator])
- );
-
- STACK_PUSH(true, state );
- state->theIterator = state->theIterator + 1;
- }
+ if(!item->isStreamable())
+ {
+ item->getStringValue2(inputStr);
+ }
+ else
+ {
+ state->theStream = &item->getStream();
+ }
+ }
+
+ if ( state->theStream )
+ {
+ while ( !state->theStream->eof() )
+ {
+ utf8::encoded_char_type ec;
+ ::bzero( ec, sizeof( ec ) );
+ utf8::storage_type *p;
+ p = ec;
+
+ if ( utf8::read( *state->theStream, ec ) == utf8::npos )
+ if ( state->theStream->good() ) {
+ //
+ // If read() failed but the stream state is good, it means that an
+ // invalid byte was encountered.
+ //
+ char buf[ 6 /* bytes at most */ * 5 /* chars per byte */ ], *b = buf;
+ bool first = true;
+ for ( ; *p; ++p ) {
+ if ( first )
+ first = false;
+ else
+ *b++ = ',';
+ ::strcpy( b, "0x" ); b += 2;
+ ::sprintf( b, "%0hhX", *p ); b += 2;
+ }
+ throw XQUERY_EXCEPTION(
+ zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
+ ERROR_PARAMS( buf ),
+ ERROR_LOC( loc )
+ );
+ } else {
+ throw XQUERY_EXCEPTION(
+ zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc )
+ );
+ }
+ state->theResult.clear();
+ state->theResult.push_back( utf8::next_char( p ) );
+
+ GENV_ITEMFACTORY->createInteger(
+ result,
+ Integer(state->theResult[0])
+ );
+
+ STACK_PUSH(true, state );
+ state->theIterator = state->theIterator + 1;
+ }
+ }
+ else if (!inputStr.empty())
+ {
+ utf8::to_codepoints(inputStr, &state->theResult);
+
+ while (state->theIterator < state->theResult.size())
+ {
+ GENV_ITEMFACTORY->createInteger(
+ result,
+ Integer(state->theResult[state->theIterator])
+ );
+
+ STACK_PUSH(true, state );
+ state->theIterator = state->theIterator + 1;
}
}
STACK_END (state);
@@ -146,6 +200,7 @@
{
PlanIteratorState::init(planState);
theIterator = 0;
+ theStream = 0;
theResult.clear();
}
=== modified file 'src/util/utf8_util.cpp'
--- src/util/utf8_util.cpp 2011-07-17 00:10:56 +0000
+++ src/util/utf8_util.cpp 2011-12-12 23:19:26 +0000
@@ -22,6 +22,7 @@
#include "cxx_util.h"
#include "utf8_util.h"
+using namespace std;
#ifndef ZORBA_NO_UNICODE
U_NAMESPACE_USE
#endif /* ZORBA_NO_UNICODE */
@@ -152,6 +153,22 @@
return len;
}
+size_type read( istream &i, storage_type **ps ) {
+ char c = i.get();
+ if ( !i.good() || !is_start_byte( c ) )
+ return npos;
+ storage_type *&p = *ps;
+ *p++ = c;
+ size_type const len = char_length( c );
+ for ( size_type n = 1; n < len; ++n ) {
+ c = i.get();
+ if ( !i.good() || !is_continuation_byte( c ) )
+ return npos;
+ *p++ = c;
+ }
+ return len;
+}
+
#ifndef ZORBA_NO_UNICODE
bool to_string( unicode::char_type const *in, unicode::size_type in_len,
=== modified file 'src/util/utf8_util_base.h'
--- src/util/utf8_util_base.h 2011-12-01 16:19:52 +0000
+++ src/util/utf8_util_base.h 2011-12-12 23:19:26 +0000
@@ -18,6 +18,7 @@
#define ZORBA_UTF8_UTIL_BASE_H
#include <cstddef>
+#include <iostream>
#include <iterator>
#include <stdexcept>
@@ -164,6 +165,32 @@
template<class OctetIterator>
unicode::code_point prev_char( OctetIterator &i );
+/**
+ * Reads bytes from an istream until an entire UTF-8 character has been read.
+ *
+ * @param i The istream to read from.
+ * @param ps A pointer to a pointer to what will be the first byte of a UTF-8
+ * byte sequence. The pointer is advanced to one byte past the newly read
+ * character.
+ * @return Returns the number of bytes comprising the UTF-8 character (which
+ * equals the number of bytes read) or \c npos if either EOF was reached or the
+ * bytes read are an invalid UTF-8 byte sequence.
+ */
+size_type read( std::istream &i, storage_type **ps );
+
+/**
+ * Reads bytes from an istream until an entire UTF-8 character has been read.
+ *
+ * @param i The istream to read from.
+ * @param p A pointer to what will be the first byte of a UTF-8 byte sequence.
+ * @return Returns the number of bytes comprising the UTF-8 character (which
+ * equals the number of bytes read) or \c npos if either EOF was reached or the
+ * bytes read are an invalid UTF-8 byte sequence.
+ */
+inline size_type read( std::istream &i, storage_type *p ) {
+ return read( i, &p );
+}
+
////////// Character access ///////////////////////////////////////////////////
/**
Follow ups
-
[Merge] lp:~paul-lucas/zorba/bug-898075 into lp:zorba
From: noreply, 2011-12-15
-
[Merge] lp:~paul-lucas/zorba/bug-898075 into lp:zorba
From: Zorba Build Bot, 2011-12-15
-
[Merge] lp:~paul-lucas/zorba/bug-898075 into lp:zorba
From: Zorba Build Bot, 2011-12-15
-
[Merge] lp:~paul-lucas/zorba/bug-898075 into lp:zorba
From: Matthias Brantner, 2011-12-15
-
Re: [Merge] lp:~paul-lucas/zorba/bug-898075 into lp:zorba
From: William Candillon, 2011-12-15
-
[Merge] lp:~paul-lucas/zorba/bug-898075 into lp:zorba
From: Zorba Build Bot, 2011-12-12
-
Re: [Merge] lp:~paul-lucas/zorba/bug-898075 into lp:zorba
From: Zorba Build Bot, 2011-12-12
-
[Merge] lp:~paul-lucas/zorba/bug-898075 into lp:zorba
From: Zorba Build Bot, 2011-12-12
-
[Merge] lp:~paul-lucas/zorba/bug-898075 into lp:zorba
From: Paul J. Lucas, 2011-12-12
-
Re: [Merge] lp:~paul-lucas/zorba/bug-898075 into lp:zorba
From: Paul J. Lucas, 2011-12-12