zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #21114
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
Paul J. Lucas has proposed merging lp:~zorba-coders/zorba/bug-1169908 into lp:zorba.
Commit message:
char_length() and utf8::read() now throw an exception upon encountering an invalid UTF-8 byte.
Requested reviews:
Paul J. Lucas (paul-lucas)
Related bugs:
Bug #1169908 in Zorba: "Zorba hangs with invalid utf-8 input"
https://bugs.launchpad.net/zorba/+bug/1169908
For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/bug-1169908/+merge/160692
char_length() and utf8::read() now throw an exception upon encountering an invalid UTF-8 byte.
--
https://code.launchpad.net/~zorba-coders/zorba/bug-1169908/+merge/160692
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'src/runtime/strings/strings_impl.cpp'
--- src/runtime/strings/strings_impl.cpp 2013-04-12 04:34:41 +0000
+++ src/runtime/strings/strings_impl.cpp 2013-04-24 15:47:29 +0000
@@ -16,6 +16,7 @@
#include "stdafx.h"
#include <iostream>
+#include <sstream>
#include "common/common.h"
@@ -39,11 +40,13 @@
#include "zorbautils/string_util.h"
+#include "util/ascii_util.h"
+#include "util/oseparator.h"
#include "util/regex.h"
-#include "util/utf8_util.h"
-#include "util/utf8_string.h"
#include "util/string_util.h"
#include "util/uri_util.h"
+#include "util/utf8_string.h"
+#include "util/utf8_util.h"
#include "util/xml_util.h"
@@ -137,49 +140,32 @@
{
utf8::encoded_char_type ec;
memset( ec, 0, sizeof( ec ) );
- utf8::storage_type *p;
- p = ec;
- if ( utf8::read( *state->theStream, ec ) == utf8::npos )
- {
- if ( state->theStream->eof() )
+ try {
+ if ( !utf8::read( *state->theStream, ec ) ) {
+ if ( !state->theStream->eof() && !state->theStream->good() )
+ throw XQUERY_EXCEPTION(
+ zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc )
+ );
break;
- if ( state->theStream->good() ) {
- //
- // If read() failed but the stream state is good, it means that an
- // invalid byte was encountered.
- //
- char buf[ 6 /* bytes at most */ * 5 /* chars per byte */ ], *b = buf;
- bool first = true;
- for ( ; *p; ++p ) {
- if ( first )
- first = false;
- else
- *b++ = ',';
- ::strcpy( b, "0x" ); b += 2;
- ::sprintf( b, "%0hhX", *p ); b += 2;
- }
- throw XQUERY_EXCEPTION(
- zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
- ERROR_PARAMS( buf ),
- ERROR_LOC( loc )
- );
- } else {
- throw XQUERY_EXCEPTION(
- zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc )
- );
}
}
- state->theResult.clear();
- state->theResult.push_back( utf8::next_char( p ) );
-
+ catch ( utf8::invalid_byte const& ) {
+ ostringstream oss;
+ oseparator comma( ',' );
+ for ( utf8::storage_type const *c = ec; *c; ++c )
+ oss << comma << ascii::printable_char( *c );
+ throw XQUERY_EXCEPTION(
+ zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
+ ERROR_PARAMS( oss.str() ),
+ ERROR_LOC( loc )
+ );
+ }
GENV_ITEMFACTORY->createInteger(
- result,
- Integer(state->theResult[0])
+ result, xs_integer( utf8::decode( ec ) )
);
-
- STACK_PUSH(true, state );
- state->theIterator = state->theIterator + 1;
+ STACK_PUSH( true, state );
+ ++(state->theIterator);
}
}
else if (!inputStr.empty())
@@ -190,7 +176,7 @@
{
GENV_ITEMFACTORY->createInteger(
result,
- Integer(state->theResult[state->theIterator])
+ xs_integer(state->theResult[state->theIterator])
);
STACK_PUSH(true, state );
@@ -263,7 +249,7 @@
res = (res < 0 ? -1 : (res > 0 ? 1 : 0));
- GENV_ITEMFACTORY->createInteger(result, Integer(res));
+ GENV_ITEMFACTORY->createInteger(result, xs_integer(res));
STACK_PUSH(true, state);
}
@@ -758,13 +744,11 @@
if (consumeNext(item, theChildren [0].getp(), planState))
{
item->getStringValue2(strval);
-
- STACK_PUSH(GENV_ITEMFACTORY->createInteger(result, Integer(utf8::length(strval))),
- state);
+ STACK_PUSH(GENV_ITEMFACTORY->createInteger(result, xs_integer(utf8::length(strval))), state);
}
else
{
- STACK_PUSH(GENV_ITEMFACTORY->createInteger(result, Integer::zero()),
+ STACK_PUSH(GENV_ITEMFACTORY->createInteger(result, xs_integer::zero()),
state);
}
STACK_END(state);
@@ -2350,6 +2334,7 @@
store::Item_t& result,
PlanState& planState) const
{
+ bool read;
store::Item_t item;
size_t lNewPos = 0;
zstring lToken;
@@ -2381,11 +2366,24 @@
while ( !state->theIStream->eof() )
{
utf8::encoded_char_type ec;
- memset( ec, '\0' , sizeof(ec) );
- utf8::storage_type *p;
- p = ec;
-
- if ( utf8::read( *state->theIStream, ec ) != utf8::npos )
+ memset( ec, 0 , sizeof(ec) );
+
+ try {
+ read = !!utf8::read( *state->theIStream, ec );
+ }
+ catch ( utf8::invalid_byte const& ) {
+ ostringstream oss;
+ oseparator comma( ',' );
+ for ( utf8::storage_type const *c = ec; *c; ++c )
+ oss << comma << ascii::printable_char( *c );
+ throw XQUERY_EXCEPTION(
+ zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
+ ERROR_PARAMS( oss.str() ),
+ ERROR_LOC( loc )
+ );
+ }
+
+ if ( read )
{
if (state->theSeparator.compare(lNewPos, 1, ec) == 0)
{
@@ -2407,24 +2405,10 @@
}
else
{
- if (state->theIStream->good())
- {
- char buf[ 6 /* bytes at most */ * 5 /* chars per byte */ ], *b = buf;
- bool first = true;
- for ( ; *p; ++p ) {
- if ( first )
- first = false;
- else
- *b++ = ',';
- ::strcpy( b, "0x" ); b += 2;
- ::sprintf( b, "%0hhX", *p ); b += 2;
- }
+ if ( !state->theIStream->eof() && !state->theIStream->good() )
throw XQUERY_EXCEPTION(
- zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
- ERROR_PARAMS( buf ),
- ERROR_LOC( loc )
+ zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc )
);
- }
if (!lToken.empty())
{
GENV_ITEMFACTORY->createString(result, lToken);
@@ -2432,7 +2416,7 @@
}
break;
}
- }
+ } // while
}
else
{
=== modified file 'src/util/unicode_util.h'
--- src/util/unicode_util.h 2013-04-01 03:33:29 +0000
+++ src/util/unicode_util.h 2013-04-24 15:47:29 +0000
@@ -130,6 +130,16 @@
////////// constants //////////////////////////////////////////////////////////
+/**
+ * Byte Order Mark (BOM).
+ */
+code_point const BOM = 0xFEFF;
+
+/**
+ * An invalid code-point.
+ */
+code_point const invalid = static_cast<code_point>( -1 );
+
//
// Various '1' digits.
//
=== modified file 'src/util/utf8_util.cpp'
--- src/util/utf8_util.cpp 2013-03-12 03:43:11 +0000
+++ src/util/utf8_util.cpp 2013-04-24 15:47:29 +0000
@@ -15,6 +15,7 @@
*/
#include "stdafx.h"
+// standard
#include <algorithm>
#include <cstring>
@@ -22,6 +23,7 @@
#include <unicode/ustring.h>
#endif /* ZORBA_NO_ICU */
+// local
#include "cxx_util.h"
#include "utf8_util.h"
@@ -40,6 +42,24 @@
namespace zorba {
namespace utf8 {
+///////////////////////////////////////////////////////////////////////////////
+
+invalid_byte::invalid_byte( char byte ) :
+ invalid_argument( make_what( byte ) ),
+ byte_( byte )
+{
+}
+
+invalid_byte::~invalid_byte() throw() {
+ // out-of-line since it's virtual
+}
+
+string invalid_byte::make_what( storage_type byte ) {
+ return BUILD_STRING( '\'', byte, "': invalid UTF-8 byte" );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
size_type byte_pos( storage_type const *s, size_type char_pos ) {
if ( char_pos == npos )
return npos;
@@ -52,12 +72,12 @@
return p - s;
}
-size_type byte_pos( storage_type const *s, size_type s_size,
+size_type byte_pos( storage_type const *s, size_type s_len,
size_type char_pos ) {
if ( char_pos == npos )
return npos;
storage_type const *p = s;
- storage_type const *const end = s + s_size;
+ storage_type const *const end = s + s_len;
for ( ; char_pos > 0; --char_pos ) {
if ( p >= end )
return npos;
@@ -86,7 +106,9 @@
/* E */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/* F */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0
};
- return length_table[ static_cast<unsigned char>( start ) ];
+ if ( size_type const c_len = length_table[ static_cast<unsigned>( start ) ] )
+ return c_len;
+ throw invalid_byte( start );
}
size_type char_pos( storage_type const *s, storage_type const *p ) {
@@ -98,8 +120,10 @@
return pos;
}
-size_type encode( unicode::code_point c, storage_type **ps ) {
- unsigned const n = c & 0xFFFFFFFF;
+size_type encode( unicode::code_point cp, storage_type **ps ) {
+ if ( !unicode::is_valid( cp ) )
+ return 0;
+ unsigned const n = cp & 0xFFFFFFFF;
storage_type *&p = *ps, *const p0 = p;
if ( n < 0x80 ) {
// 0xxxxxxx
@@ -138,15 +162,6 @@
return p - p0;
}
-size_type length( storage_type const *s ) {
- size_type len = 0;
- while ( *s ) {
- s += char_length( *s );
- ++len;
- }
- return len;
-}
-
storage_type* itou( unsigned long long n, storage_type *buf,
unicode::code_point zero ) {
storage_type *s = buf;
@@ -176,29 +191,40 @@
return buf;
}
+size_type length( storage_type const *s ) {
+ size_type total_len = 0;
+ while ( *s ) {
+ s += char_length( *s );
+ ++total_len;
+ }
+ return total_len;
+}
+
size_type length( storage_type const *begin, storage_type const *end ) {
- size_type len = 0;
+ size_type total_len = 0;
while ( begin < end && *begin ) {
begin += char_length( *begin );
- ++len;
+ ++total_len;
}
- return len;
+ return total_len;
}
size_type read( istream &i, storage_type **ps ) {
char c = i.get();
- if ( !i.good() || !is_start_byte( c ) )
- return npos;
+ if ( !i.good() )
+ return 0;
storage_type *&p = *ps;
*p++ = c;
- size_type const len = char_length( c );
- for ( size_type n = 1; n < len; ++n ) {
+ size_type const c_len = char_length( c );
+ for ( size_type got = 1; got < c_len; ++got ) {
c = i.get();
- if ( !i.good() || !is_continuation_byte( c ) )
- return npos;
+ if ( !i.good() )
+ return 0;
*p++ = c;
+ if ( !is_continuation_byte( c ) )
+ throw invalid_byte( c );
}
- return len;
+ return c_len;
}
#ifndef ZORBA_NO_ICU
@@ -268,35 +294,34 @@
#endif /* ZORBA_NO_ICU */
storage_type const* validate( storage_type const *s ) {
- while ( *s ) {
- size_type c_len = char_length( *s );
- if ( !c_len )
- return s;
- while ( --c_len ) {
- if ( !is_continuation_byte( *++s ) )
- return s;
- }
- ++s;
- }
- return nullptr;
-}
-
-storage_type const* validate( storage_type const *s, size_type s_size ) {
- while ( s_size ) {
- size_type c_len = char_length( *s );
- if ( !c_len )
- return s;
- while ( --c_len ) {
- if ( !--s_size )
- return s;
- if ( !is_continuation_byte( *++s ) )
- return s;
- }
- ++s;
- --s_size;
- }
- return nullptr;
-}
+ try {
+ for ( ; *s; ++s ) {
+ for ( size_type c_len = char_length( *s ); --c_len; )
+ if ( !is_continuation_byte( *++s ) )
+ return s;
+ }
+ return nullptr;
+ }
+ catch ( invalid_byte const& ) {
+ return s;
+ }
+}
+
+storage_type const* validate( storage_type const *s, size_type s_len ) {
+ try {
+ for ( ; s_len; ++s, --s_len ) {
+ for ( size_type c_len = char_length( *s ); --c_len; )
+ if ( !--s_len || !is_continuation_byte( *++s ) )
+ return s;
+ }
+ return nullptr;
+ }
+ catch ( invalid_byte const& ) {
+ return s;
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////////
} // namespace utf8
} // namespace zorba
=== modified file 'src/util/utf8_util.h'
--- src/util/utf8_util.h 2013-03-14 18:06:31 +0000
+++ src/util/utf8_util.h 2013-04-24 15:47:29 +0000
@@ -298,7 +298,7 @@
* @tparam ContainerType The type of STL container to put the codepoint values.
* @param s The string to get the codepoints for.
* @param c A pointer to the container to put the codepoint values. The
- * containers contents are overwritten.
+ * container's contents are appended to.
*/
template<class StringType,class ContainerType> inline
void to_codepoints( StringType const &s, ContainerType *c ) {
@@ -314,11 +314,14 @@
* ("Number, Decimal Digit") category.
*/
typedef storage_type itou_buf_type[
- (sizeof( encoded_char_type ) - 1 /* subtract null */) * 20 + 1 /* add null */
+ sizeof( encoded_char_type )
+ * 20 // maximum number of digits in a 64-bit unsigned long
+ + 1 // null
];
/**
- * Converts an <code>unsigned long long</code> to a UTF-8 encoded string.
+ * Converts an <code>unsigned long long</code> to a null-terminated UTF-8
+ * encoded string.
*
* @param n The <code>unsigned long long</code> to convert.
* @param buf The buffer for the result. The caller must ensure it's of
@@ -778,7 +781,7 @@
u_type u( *s );
u_size_type const u_size( u.size() );
if ( u_size < width )
- u.insert( static_cast<size_type>( 0 ), width - u_size, cp );
+ u.insert( static_cast<u_size_type>( 0 ), width - u_size, cp );
return *s;
}
=== modified file 'src/util/utf8_util.tcc'
--- src/util/utf8_util.tcc 2013-02-07 17:24:36 +0000
+++ src/util/utf8_util.tcc 2013-04-24 15:47:29 +0000
@@ -73,23 +73,26 @@
template<class OctetIterator>
unicode::code_point next_char( OctetIterator &i ) {
- unicode::code_point c = *i & 0xFFu; // prevents sign-extension
- if ( c < 0x80 ) // special-case ASCII
+ unicode::code_point cp = *i & 0xFFu; // prevents sign-extension
+ if ( cp < 0x80 ) // special-case ASCII
++i;
else {
- size_type const len = char_length( c );
+ size_type const len = char_length( cp );
unsigned m = (0x7F >> len) & 0x1F; // mask
- c = unicode::code_point( 0 );
+ cp = unicode::code_point( 0 );
switch ( len ) {
- case 6: c |= ((*i & m ) << 30); ++i; m = 0x3F;
- case 5: c |= ((*i & m ) << 24); ++i; m = 0x3F;
- case 4: c |= ((*i & m ) << 18); ++i; m = 0x3F;
- case 3: c |= ((*i & m ) << 12); ++i; m = 0x3F;
- case 2: c |= ((*i & m ) << 6); ++i;
- c |= (*i & 0x3F) ; ++i;
+ case 6: cp |= ((*i & m ) << 30); ++i; m = 0x3F;
+ case 5: cp |= ((*i & m ) << 24); ++i; m = 0x3F;
+ case 4: cp |= ((*i & m ) << 18); ++i; m = 0x3F;
+ case 3: cp |= ((*i & m ) << 12); ++i; m = 0x3F;
+ case 2: cp |= ((*i & m ) << 6); ++i;
+ cp |= (*i & 0x3F) ; ++i;
+ break;
+ default:
+ cp = unicode::invalid;
}
}
- return c;
+ return cp;
}
template<class OctetIterator>
=== modified file 'src/util/utf8_util_base.h'
--- src/util/utf8_util_base.h 2013-04-05 01:53:19 +0000
+++ src/util/utf8_util_base.h 2013-04-24 15:47:29 +0000
@@ -17,11 +17,14 @@
#ifndef ZORBA_UTF8_UTIL_BASE_H
#define ZORBA_UTF8_UTIL_BASE_H
+// standard
#include <cstddef>
#include <iostream>
#include <iterator>
#include <stdexcept>
+// local
+#include "string_util.h"
#include "unicode_util.h"
namespace zorba {
@@ -36,6 +39,8 @@
/**
* A type that can hold all the bytes of the largest encoded UTF-8 character.
+ * Note that this is NOT a C string: it is NOT null-terminated (since the first
+ * byte of a UTF-8 byte sequence encodes the number of bytes in the sequence).
*/
typedef storage_type encoded_char_type[6];
@@ -47,11 +52,35 @@
////////// Constants //////////////////////////////////////////////////////////
/**
+ * Byte Order Mark (BOM).
+ */
+storage_type const BOM[] = "\xEF\xBB\xBF";
+
+/**
* The special value used to denote either (a) the maximum possible number as
* input or (b) "not found" as a result.
*/
size_type const npos = static_cast<size_type>( -1 );
+////////// Exceptions /////////////////////////////////////////////////////////
+
+/**
+ * An %invalid_byte is-an invalid_argument for reporting invalid UTF-8 bytes.
+ */
+class invalid_byte : public std::invalid_argument {
+public:
+ invalid_byte( storage_type byte );
+ ~invalid_byte() throw();
+
+ storage_type byte() const throw() {
+ return byte_;
+ }
+
+private:
+ static std::string make_what( storage_type byte );
+ storage_type byte_;
+};
+
////////// Byte/Char position conversion //////////////////////////////////////
/**
@@ -59,7 +88,8 @@
*
* @param s A null-terminated UTF-8 encoded C string.
* @param char_pos The character position.
- * @return Returns the corresponding byte position.
+ * @return Returns the corresponding byte position or \c npos if \a s contains
+ * an invalid UTF-8 byte.
*/
size_type byte_pos( storage_type const *s, size_type char_pos );
@@ -67,12 +97,12 @@
* Converts a character position into a byte position.
*
* @param s A UTF-8 encoded C string.
- * @param s_size The size of \a s in bytes.
+ * @param s_len The length of \a s in bytes.
* @param char_pos The character position.
- * @return Returns the corresponding byte position or \c npos if the result >=
- * \a s_size.
+ * @return Returns the corresponding byte position or \c npos if the result
+ * >= \a s_len or \a s contains an invalid UTF-8 byte.
*/
-size_type byte_pos( storage_type const *s, size_type s_size,
+size_type byte_pos( storage_type const *s, size_type s_len,
size_type char_pos );
/**
@@ -80,7 +110,8 @@
*
* @param s A UTF-8 encoded C string.
* @param p A pointer to somewhere within \a s.
- * @return Returns said offset.
+ * @return Returns said offset or \c npos if \a s contains an invalid UTF-8
+ * byte.
*/
size_type char_pos( storage_type const *s, storage_type const *p );
@@ -89,7 +120,8 @@
*
* @param s A UTF-8 encoded C string.
* @param byte_pos The byte position.
- * @return Returns the corresponding character position.
+ * @return Returns said position or \c npos if \a s contains an invalid UTF-8
+ * byte.
*/
inline size_type char_pos( storage_type const *s, size_type byte_pos ) {
return byte_pos != npos ? char_pos( s, s + byte_pos ) : npos;
@@ -100,36 +132,42 @@
/**
* Encodes a Unicode character into a UTF-8 byte sequence.
*
- * @param c The Unicode code-point to encode.
+ * @param cp The Unicode code-point to encode.
* @param ps A pointer to a pointer to what will be the first byte of a UTF-8
* byte sequence. The pointer is advanced to one byte past the newly encoded
* character.
- * @return Returns the number of bytes required to encode the character.
+ * @return Returns the number of bytes required to encode \a cp or 0 if \a cp
+ * is invalid.
*/
-size_type encode( unicode::code_point c, storage_type **ps );
+size_type encode( unicode::code_point cp, storage_type **ps );
/**
* Encodes a Unicode character into a UTF-8 byte sequence.
*
- * @param c The Unicode code-point to encode.
+ * @param cp The Unicode code-point to encode.
* @param p A pointer to what will be the first byte of a UTF-8 byte sequence.
- * @return Returns the number of bytes required to encode the character.
+ * @return Returns the number of bytes required to encode \a cp or 0 if \a cp
+ * is invalid.
*/
-inline size_type encode( unicode::code_point c, storage_type *p ) {
- return encode( c, &p );
+inline size_type encode( unicode::code_point cp, storage_type *p ) {
+ return encode( cp, &p );
}
/**
* Encodes a Unicode character into a UTF-8 byte sequence and appends it to the
* given string.
*
- * @param c The Unicode code-point to encode.
+ * @param cp The Unicode code-point to encode.
* @param out The string to append to.
+ * @return Returns the number of bytes required to encode \a cp or 0 if \a cp
+ * is invalid.
*/
template<class StringType> inline
-void encode( unicode::code_point c, StringType *out ) {
+size_type encode( unicode::code_point cp, StringType *out ) {
encoded_char_type ec;
- out->append( ec, encode( c, ec ) );
+ size_type const len = encode( cp, ec );
+ out->append( ec, len );
+ return len;
}
/**
@@ -140,7 +178,8 @@
* @param i An iterator pointing to the first byte of a UTF-8 byte sequence
* comprising a Unicode character. The iterator is advanced by the number of
* bytes comprising the UTF-8 byte sequence.
- * @return Returns the Unicode code-point of the next character.
+ * @return Returns the Unicode code-point of the next character or
+ * <code>unicode::invalid</code>.
*/
template<class OctetIterator>
unicode::code_point next_char( OctetIterator &i );
@@ -150,7 +189,8 @@
*
* @param p A pointer to the first byte of a UTF-8 byte sequence comprising a
* Unicode character.
- * @return Returns the Unicode code-point of the next character.
+ * @return Returns the Unicode code-point of the next character or
+ * <code>unicode::invalid</code>.
*/
inline unicode::code_point decode( storage_type const *p ) {
return next_char( p );
@@ -164,7 +204,8 @@
* @param i An iterator pointing to somewhere within a UTF-8 string. It is
* repositioned to the first byte of the UTF-8 byte sequence comprising the
* previous character.
- * @return Returns the Unicode code-point of previous character.
+ * @return Returns the Unicode code-point of previous character or
+ * <code>unicode::invalid</code>.
*/
template<class OctetIterator>
unicode::code_point prev_char( OctetIterator &i );
@@ -174,11 +215,12 @@
*
* @param i The istream to read from.
* @param ps A pointer to a pointer to what will be the first byte of a UTF-8
- * byte sequence. The pointer is advanced to one byte past the newly read
- * character.
+ * byte sequence. The pointer is advanced to one byte beyond all the bytes
+ * comprising the newly read UTF-8 character. All bytes read from the stream
+ * (valid or not) are written to the buffer.
* @return Returns the number of bytes comprising the UTF-8 character (which
- * equals the number of bytes read) or \c npos if either EOF was reached or the
- * bytes read are an invalid UTF-8 byte sequence.
+ * equals the number of bytes read) or 0 if EOF was reached.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
*/
size_type read( std::istream &i, storage_type **ps );
@@ -187,9 +229,10 @@
*
* @param i The istream to read from.
* @param p A pointer to what will be the first byte of a UTF-8 byte sequence.
+ * All bytes read from the stream (valid or not) are written to the buffer.
* @return Returns the number of bytes comprising the UTF-8 character (which
- * equals the number of bytes read) or \c npos if either EOF was reached or the
- * bytes read are an invalid UTF-8 byte sequence.
+ * equals the number of bytes read) or 0 if EOF was reached.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
*/
inline size_type read( std::istream &i, storage_type *p ) {
return read( i, &p );
@@ -203,6 +246,7 @@
* @param s A pointer to the first byte of a UTF-8 string.
* @param char_pos The index of the desired character (not byte).
* @return Returns said character.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
*/
inline unicode::code_point char_at( storage_type const *s,
size_type char_pos ) {
@@ -214,14 +258,15 @@
* Gets the Unicode character at the given position.
*
* @param s A pointer to the first byte of a UTF-8 string.
- * @param s_size The size of \a s in bytes.
+ * @param s_len The length of \a s in bytes.
* @param char_pos The index of the desired character (not byte).
* @return Returns said character.
- * @throws std::out_of_range if \a char_pos >= \a s_size.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
+ * @throws std::out_of_range if \a char_pos >= \a s_len.
*/
-inline unicode::code_point char_at( storage_type const *s, size_type s_size,
+inline unicode::code_point char_at( storage_type const *s, size_type s_len,
size_type char_pos ) {
- size_type const b = byte_pos( s, s_size, char_pos );
+ size_type const b = byte_pos( s, s_len, char_pos );
if ( b == npos )
throw std::out_of_range( "char_at" );
storage_type const *s2 = s + b;
@@ -235,8 +280,8 @@
*
* @param start The start byte of a UTF-8 byte sequence comprising a Unicode
* character.
- * @return Returns a number in the range [1,6] if \a start is valid or 0 if
- * \a start is invalid.
+ * @return Returns a number in the range [1,6].
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
*/
size_type char_length( storage_type start );
@@ -246,6 +291,7 @@
*
* @param s A pointer to the first byte of a NULL-terminated UTF-8 string.
* @return Returns said number of characters.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
*/
size_type length( storage_type const *s );
@@ -257,6 +303,7 @@
* @param end A pointer to one past the last byte of the same UTF-8 byte
* sequence.
* @return Returns said number of characters.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
*/
size_type length( storage_type const *begin, storage_type const *end );
@@ -265,10 +312,14 @@
*
* @tparam StringType The string type.
* @param s The string.
- * @return Returns said number of characters.
+ * @return Returns said number of characters or 0 if any byte is invalid.
*/
-template<class StringType>
-inline size_type length( StringType const &s ) {
+template<class StringType> inline
+typename std::enable_if<
+ ztd::has_c_str<StringType,char const* (StringType::*)() const>::value,
+ size_type
+>::type
+length( StringType const &s ) {
return length( s.c_str() );
}
@@ -328,11 +379,11 @@
* Checks an entire UTF-8 string for validity.
*
* @param s The UTF-8 string to validate.
- * @param s_size The number of bytes (not characters) to check.
+ * @param s_len The number of bytes (not characters) to check.
* @return Returns \c nullptr if the string is valid or a pointer to the first
* invalid byte if invalid.
*/
-storage_type const* validate( storage_type const *s, size_type s_size );
+storage_type const* validate( storage_type const *s, size_type s_len );
////////// iterator ///////////////////////////////////////////////////////////
Follow ups
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: noreply, 2013-04-25
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-25
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-25
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Matthias Brantner, 2013-04-25
-
Re: [Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Matthias Brantner, 2013-04-25
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-25
-
Re: [Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-25
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-25
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-25
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Paul J. Lucas, 2013-04-25
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-24
-
Re: [Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-24
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-24
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Paul J. Lucas, 2013-04-24
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-24
-
Re: [Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-24
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Paul J. Lucas, 2013-04-24
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-24
-
Re: [Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-24
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Zorba Build Bot, 2013-04-24
-
[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Paul J. Lucas, 2013-04-24
-
Re: [Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba
From: Paul J. Lucas, 2013-04-24