← Back to team overview

zorba-coders team mailing list archive

[Merge] lp:~zorba-coders/zorba/bug-1169908 into lp:zorba

 

Paul J. Lucas has proposed merging lp:~zorba-coders/zorba/bug-1169908 into lp:zorba.

Commit message:
char_length() and utf8::read() now throw an exception upon encountering an invalid UTF-8 byte.

Requested reviews:
  Paul J. Lucas (paul-lucas)
Related bugs:
  Bug #1169908 in Zorba: "Zorba hangs with invalid utf-8 input"
  https://bugs.launchpad.net/zorba/+bug/1169908

For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/bug-1169908/+merge/160692

char_length() and utf8::read() now throw an exception upon encountering an invalid UTF-8 byte.
-- 
https://code.launchpad.net/~zorba-coders/zorba/bug-1169908/+merge/160692
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'src/runtime/strings/strings_impl.cpp'
--- src/runtime/strings/strings_impl.cpp	2013-04-12 04:34:41 +0000
+++ src/runtime/strings/strings_impl.cpp	2013-04-24 15:47:29 +0000
@@ -16,6 +16,7 @@
 #include "stdafx.h"
 
 #include <iostream>
+#include <sstream>
 
 #include "common/common.h"
 
@@ -39,11 +40,13 @@
 
 #include "zorbautils/string_util.h"
 
+#include "util/ascii_util.h"
+#include "util/oseparator.h"
 #include "util/regex.h"
-#include "util/utf8_util.h"
-#include "util/utf8_string.h"
 #include "util/string_util.h"
 #include "util/uri_util.h"
+#include "util/utf8_string.h"
+#include "util/utf8_util.h"
 #include "util/xml_util.h"
 
 
@@ -137,49 +140,32 @@
     {
       utf8::encoded_char_type ec;
       memset( ec, 0, sizeof( ec ) );
-      utf8::storage_type *p;
-      p = ec;
 
-      if ( utf8::read( *state->theStream, ec ) == utf8::npos )
-      {
-        if ( state->theStream->eof() )
+      try {
+        if ( !utf8::read( *state->theStream, ec ) ) {
+          if ( !state->theStream->eof() && !state->theStream->good() )
+            throw XQUERY_EXCEPTION(
+              zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc )
+            );
           break;
-        if ( state->theStream->good() ) {
-          //
-          // If read() failed but the stream state is good, it means that an
-          // invalid byte was encountered.
-          //
-          char buf[ 6 /* bytes at most */ * 5 /* chars per byte */ ], *b = buf;
-          bool first = true;
-          for ( ; *p; ++p ) {
-            if ( first )
-              first = false;
-            else
-              *b++ = ',';
-            ::strcpy( b, "0x" );          b += 2;
-            ::sprintf( b, "%0hhX", *p );  b += 2;
-          }
-          throw XQUERY_EXCEPTION(
-            zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
-            ERROR_PARAMS( buf ),
-            ERROR_LOC( loc )
-          );
-        } else {
-          throw XQUERY_EXCEPTION(
-            zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc )
-          );
         }
       }
-      state->theResult.clear();
-      state->theResult.push_back( utf8::next_char( p ) );
-
+      catch ( utf8::invalid_byte const& ) {
+        ostringstream oss;
+        oseparator comma( ',' );
+        for ( utf8::storage_type const *c = ec; *c; ++c )
+          oss << comma << ascii::printable_char( *c );
+        throw XQUERY_EXCEPTION(
+          zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
+          ERROR_PARAMS( oss.str() ),
+          ERROR_LOC( loc )
+        );
+      }
       GENV_ITEMFACTORY->createInteger(
-        result,
-        Integer(state->theResult[0])
+        result, xs_integer( utf8::decode( ec ) )
       );
-
-      STACK_PUSH(true, state );
-      state->theIterator = state->theIterator + 1;
+      STACK_PUSH( true, state );
+      ++(state->theIterator);
     }
   }
   else if (!inputStr.empty())
@@ -190,7 +176,7 @@
     {
       GENV_ITEMFACTORY->createInteger(
         result,
-        Integer(state->theResult[state->theIterator])
+        xs_integer(state->theResult[state->theIterator])
       );
 
       STACK_PUSH(true, state );
@@ -263,7 +249,7 @@
 
       res = (res < 0 ? -1 : (res > 0 ? 1 : 0));
 
-      GENV_ITEMFACTORY->createInteger(result, Integer(res));
+      GENV_ITEMFACTORY->createInteger(result, xs_integer(res));
 
       STACK_PUSH(true, state);
     }
@@ -758,13 +744,11 @@
   if (consumeNext(item, theChildren [0].getp(), planState))
   {
     item->getStringValue2(strval);
-
-    STACK_PUSH(GENV_ITEMFACTORY->createInteger(result, Integer(utf8::length(strval))),
-               state);
+    STACK_PUSH(GENV_ITEMFACTORY->createInteger(result, xs_integer(utf8::length(strval))), state);
   }
   else
   {
-    STACK_PUSH(GENV_ITEMFACTORY->createInteger(result, Integer::zero()),
+    STACK_PUSH(GENV_ITEMFACTORY->createInteger(result, xs_integer::zero()),
                state);
   }
   STACK_END(state);
@@ -2350,6 +2334,7 @@
     store::Item_t& result,
     PlanState& planState) const
 {
+  bool read;
   store::Item_t item;
   size_t lNewPos = 0;
   zstring lToken;
@@ -2381,11 +2366,24 @@
     while ( !state->theIStream->eof() )
     {
       utf8::encoded_char_type ec;
-      memset( ec, '\0' , sizeof(ec) );
-      utf8::storage_type *p;
-      p = ec;
-
-      if ( utf8::read( *state->theIStream, ec ) != utf8::npos )
+      memset( ec, 0 , sizeof(ec) );
+
+      try {
+        read = !!utf8::read( *state->theIStream, ec );
+      }
+      catch ( utf8::invalid_byte const& ) {
+        ostringstream oss;
+        oseparator comma( ',' );
+        for ( utf8::storage_type const *c = ec; *c; ++c )
+          oss << comma << ascii::printable_char( *c );
+        throw XQUERY_EXCEPTION(
+          zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
+          ERROR_PARAMS( oss.str() ),
+          ERROR_LOC( loc )
+        );
+      }
+
+      if ( read )
       {
         if (state->theSeparator.compare(lNewPos, 1, ec) == 0)
         {
@@ -2407,24 +2405,10 @@
       }
       else
       {
-        if (state->theIStream->good())
-        {
-          char buf[ 6 /* bytes at most */ * 5 /* chars per byte */ ], *b = buf;
-          bool first = true;
-          for ( ; *p; ++p ) {
-            if ( first )
-              first = false;
-            else
-              *b++ = ',';
-            ::strcpy( b, "0x" );          b += 2;
-            ::sprintf( b, "%0hhX", *p );  b += 2;
-          }
+        if ( !state->theIStream->eof() && !state->theIStream->good() )
           throw XQUERY_EXCEPTION(
-            zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
-            ERROR_PARAMS( buf ),
-            ERROR_LOC( loc )
+            zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc )
           );
-        }
         if (!lToken.empty())
         {
           GENV_ITEMFACTORY->createString(result, lToken);
@@ -2432,7 +2416,7 @@
         }
         break;
       }
-    }
+    } // while
   }
   else
   {

=== modified file 'src/util/unicode_util.h'
--- src/util/unicode_util.h	2013-04-01 03:33:29 +0000
+++ src/util/unicode_util.h	2013-04-24 15:47:29 +0000
@@ -130,6 +130,16 @@
 
 ////////// constants //////////////////////////////////////////////////////////
 
+/**
+ * Byte Order Mark (BOM).
+ */
+code_point const BOM = 0xFEFF;
+
+/**
+ * An invalid code-point.
+ */
+code_point const invalid = static_cast<code_point>( -1 );
+
 //
 // Various '1' digits.
 //

=== modified file 'src/util/utf8_util.cpp'
--- src/util/utf8_util.cpp	2013-03-12 03:43:11 +0000
+++ src/util/utf8_util.cpp	2013-04-24 15:47:29 +0000
@@ -15,6 +15,7 @@
  */
 #include "stdafx.h"
 
+// standard
 #include <algorithm>
 #include <cstring>
 
@@ -22,6 +23,7 @@
 #include <unicode/ustring.h>
 #endif /* ZORBA_NO_ICU */
 
+// local
 #include "cxx_util.h"
 #include "utf8_util.h"
 
@@ -40,6 +42,24 @@
 namespace zorba {
 namespace utf8 {
 
+///////////////////////////////////////////////////////////////////////////////
+
+invalid_byte::invalid_byte( char byte ) :
+  invalid_argument( make_what( byte ) ),
+  byte_( byte )
+{
+}
+
+invalid_byte::~invalid_byte() throw() {
+  // out-of-line since it's virtual
+}
+
+string invalid_byte::make_what( storage_type byte ) {
+  return BUILD_STRING( '\'', byte, "': invalid UTF-8 byte" );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
 size_type byte_pos( storage_type const *s, size_type char_pos ) {
   if ( char_pos == npos )
     return npos;
@@ -52,12 +72,12 @@
   return p - s;
 }
 
-size_type byte_pos( storage_type const *s, size_type s_size,
+size_type byte_pos( storage_type const *s, size_type s_len,
                     size_type char_pos ) {
   if ( char_pos == npos )
     return npos;
   storage_type const *p = s;
-  storage_type const *const end = s + s_size;
+  storage_type const *const end = s + s_len;
   for ( ; char_pos > 0; --char_pos ) {
     if ( p >= end )
       return npos;
@@ -86,7 +106,9 @@
     /* E */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
     /* F */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0
   };
-  return length_table[ static_cast<unsigned char>( start ) ];
+  if ( size_type const c_len = length_table[ static_cast<unsigned>( start ) ] )
+    return c_len;
+  throw invalid_byte( start );
 }
 
 size_type char_pos( storage_type const *s, storage_type const *p ) {
@@ -98,8 +120,10 @@
   return pos;
 }
 
-size_type encode( unicode::code_point c, storage_type **ps ) {
-  unsigned const n = c & 0xFFFFFFFF;
+size_type encode( unicode::code_point cp, storage_type **ps ) {
+  if ( !unicode::is_valid( cp ) )
+    return 0;
+  unsigned const n = cp & 0xFFFFFFFF;
   storage_type *&p = *ps, *const p0 = p;
   if ( n < 0x80 ) {
     // 0xxxxxxx
@@ -138,15 +162,6 @@
   return p - p0;
 }
 
-size_type length( storage_type const *s ) {
-  size_type len = 0;
-  while ( *s ) {
-    s += char_length( *s );
-    ++len;
-  }
-  return len;
-}
-
 storage_type* itou( unsigned long long n, storage_type *buf,
                     unicode::code_point zero ) {
   storage_type *s = buf;
@@ -176,29 +191,40 @@
   return buf;
 }
 
+size_type length( storage_type const *s ) {
+  size_type total_len = 0;
+  while ( *s ) {
+    s += char_length( *s );
+    ++total_len;
+  }
+  return total_len;
+}
+
 size_type length( storage_type const *begin, storage_type const *end ) {
-  size_type len = 0;
+  size_type total_len = 0;
   while ( begin < end && *begin ) {
     begin += char_length( *begin );
-    ++len;
+    ++total_len;
   }
-  return len;
+  return total_len;
 }
 
 size_type read( istream &i, storage_type **ps ) {
   char c = i.get();
-  if ( !i.good() || !is_start_byte( c ) )
-    return npos;
+  if ( !i.good() )
+    return 0;
   storage_type *&p = *ps;
   *p++ = c;
-  size_type const len = char_length( c );
-  for ( size_type n = 1; n < len; ++n ) {
+  size_type const c_len = char_length( c );
+  for ( size_type got = 1; got < c_len; ++got ) {
     c = i.get();
-    if ( !i.good() || !is_continuation_byte( c ) )
-      return npos;
+    if ( !i.good() )
+      return 0;
     *p++ = c;
+    if ( !is_continuation_byte( c ) )
+      throw invalid_byte( c );
   }
-  return len;
+  return c_len;
 }
 
 #ifndef ZORBA_NO_ICU
@@ -268,35 +294,34 @@
 #endif /* ZORBA_NO_ICU */
 
 storage_type const* validate( storage_type const *s ) {
-  while ( *s ) {
-    size_type c_len = char_length( *s );
-    if ( !c_len )
-      return s;
-    while ( --c_len ) {
-      if ( !is_continuation_byte( *++s ) )
-        return s;
-    }
-    ++s;
-  }
-  return nullptr;
-}
-
-storage_type const* validate( storage_type const *s, size_type s_size ) {
-  while ( s_size ) {
-    size_type c_len = char_length( *s );
-    if ( !c_len )
-      return s;
-    while ( --c_len ) {
-      if ( !--s_size )
-        return s;
-      if ( !is_continuation_byte( *++s ) )
-        return s;
-    }
-    ++s;
-    --s_size;
-  }
-  return nullptr;
-}
+  try {
+    for ( ; *s; ++s ) {
+      for ( size_type c_len = char_length( *s ); --c_len; )
+        if ( !is_continuation_byte( *++s ) )
+          return s;
+    }
+    return nullptr;
+  }
+  catch ( invalid_byte const& ) {
+    return s;
+  }
+}
+
+storage_type const* validate( storage_type const *s, size_type s_len ) {
+  try {
+    for ( ; s_len; ++s, --s_len ) {
+      for ( size_type c_len = char_length( *s ); --c_len; )
+        if ( !--s_len || !is_continuation_byte( *++s ) )
+          return s;
+    }
+    return nullptr;
+  }
+  catch ( invalid_byte const& ) {
+    return s;
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
 
 } // namespace utf8
 } // namespace zorba

=== modified file 'src/util/utf8_util.h'
--- src/util/utf8_util.h	2013-03-14 18:06:31 +0000
+++ src/util/utf8_util.h	2013-04-24 15:47:29 +0000
@@ -298,7 +298,7 @@
  * @tparam ContainerType The type of STL container to put the codepoint values.
  * @param s The string to get the codepoints for.
  * @param c A pointer to the container to put the codepoint values.  The
- * containers contents are overwritten.
+ * container's contents are appended to.
  */
 template<class StringType,class ContainerType> inline
 void to_codepoints( StringType const &s, ContainerType *c ) {
@@ -314,11 +314,14 @@
  * ("Number, Decimal Digit") category.
  */
 typedef storage_type itou_buf_type[
-  (sizeof( encoded_char_type ) - 1 /* subtract null */) * 20 + 1 /* add null */
+  sizeof( encoded_char_type )
+  * 20  // maximum number of digits in a 64-bit unsigned long
+  + 1   // null
 ];
 
 /**
- * Converts an <code>unsigned long long</code> to a UTF-8 encoded string.
+ * Converts an <code>unsigned long long</code> to a null-terminated UTF-8
+ * encoded string.
  *
  * @param n The <code>unsigned long long</code> to convert.
  * @param buf The buffer for the result.  The caller must ensure it's of
@@ -778,7 +781,7 @@
   u_type u( *s );
   u_size_type const u_size( u.size() );
   if ( u_size < width )
-    u.insert( static_cast<size_type>( 0 ), width - u_size, cp );
+    u.insert( static_cast<u_size_type>( 0 ), width - u_size, cp );
   return *s;
 }
 

=== modified file 'src/util/utf8_util.tcc'
--- src/util/utf8_util.tcc	2013-02-07 17:24:36 +0000
+++ src/util/utf8_util.tcc	2013-04-24 15:47:29 +0000
@@ -73,23 +73,26 @@
 
 template<class OctetIterator>
 unicode::code_point next_char( OctetIterator &i ) {
-  unicode::code_point c = *i & 0xFFu;   // prevents sign-extension
-  if ( c < 0x80 )                       // special-case ASCII
+  unicode::code_point cp = *i & 0xFFu;  // prevents sign-extension
+  if ( cp < 0x80 )                      // special-case ASCII
     ++i;
   else {
-    size_type const len = char_length( c );
+    size_type const len = char_length( cp );
     unsigned m = (0x7F >> len) & 0x1F;  // mask
-    c = unicode::code_point( 0 );
+    cp = unicode::code_point( 0 );
     switch ( len ) {
-      case 6: c |= ((*i & m   ) << 30); ++i; m = 0x3F;
-      case 5: c |= ((*i & m   ) << 24); ++i; m = 0x3F;
-      case 4: c |= ((*i & m   ) << 18); ++i; m = 0x3F;
-      case 3: c |= ((*i & m   ) << 12); ++i; m = 0x3F;
-      case 2: c |= ((*i & m   ) <<  6); ++i;
-              c |=  (*i & 0x3F)       ; ++i;
+      case 6: cp |= ((*i & m   ) << 30); ++i; m = 0x3F;
+      case 5: cp |= ((*i & m   ) << 24); ++i; m = 0x3F;
+      case 4: cp |= ((*i & m   ) << 18); ++i; m = 0x3F;
+      case 3: cp |= ((*i & m   ) << 12); ++i; m = 0x3F;
+      case 2: cp |= ((*i & m   ) <<  6); ++i;
+              cp |=  (*i & 0x3F)       ; ++i;
+              break;
+      default:
+        cp = unicode::invalid;
     }
   }
-  return c;
+  return cp;
 }
 
 template<class OctetIterator>

=== modified file 'src/util/utf8_util_base.h'
--- src/util/utf8_util_base.h	2013-04-05 01:53:19 +0000
+++ src/util/utf8_util_base.h	2013-04-24 15:47:29 +0000
@@ -17,11 +17,14 @@
 #ifndef ZORBA_UTF8_UTIL_BASE_H
 #define ZORBA_UTF8_UTIL_BASE_H
 
+// standard
 #include <cstddef>
 #include <iostream>
 #include <iterator>
 #include <stdexcept>
 
+// local
+#include "string_util.h"
 #include "unicode_util.h"
 
 namespace zorba {
@@ -36,6 +39,8 @@
 
 /**
  * A type that can hold all the bytes of the largest encoded UTF-8 character.
+ * Note that this is NOT a C string: it is NOT null-terminated (since the first
+ * byte of a UTF-8 byte sequence encodes the number of bytes in the sequence).
  */
 typedef storage_type encoded_char_type[6];
 
@@ -47,11 +52,35 @@
 ////////// Constants //////////////////////////////////////////////////////////
 
 /**
+ * Byte Order Mark (BOM).
+ */
+storage_type const BOM[] = "\xEF\xBB\xBF";
+
+/**
  * The special value used to denote either (a) the maximum possible number as
  * input or (b) "not found" as a result.
  */
 size_type const npos = static_cast<size_type>( -1 );
 
+////////// Exceptions /////////////////////////////////////////////////////////
+
+/**
+ * An %invalid_byte is-an invalid_argument for reporting invalid UTF-8 bytes.
+ */
+class invalid_byte : public std::invalid_argument {
+public:
+  invalid_byte( storage_type byte );
+  ~invalid_byte() throw();
+
+  storage_type byte() const throw() {
+    return byte_;
+  }
+
+private:
+  static std::string make_what( storage_type byte );
+  storage_type byte_;
+};
+
 ////////// Byte/Char position conversion //////////////////////////////////////
 
 /**
@@ -59,7 +88,8 @@
  *
  * @param s A null-terminated UTF-8 encoded C string.
  * @param char_pos The character position.
- * @return Returns the corresponding byte position.
+ * @return Returns the corresponding byte position or \c npos if \a s contains
+ * an invalid UTF-8 byte.
  */
 size_type byte_pos( storage_type const *s, size_type char_pos );
 
@@ -67,12 +97,12 @@
  * Converts a character position into a byte position.
  *
  * @param s A UTF-8 encoded C string.
- * @param s_size The size of \a s in bytes.
+ * @param s_len The length of \a s in bytes.
  * @param char_pos The character position.
- * @return Returns the corresponding byte position or \c npos if the result >=
- * \a s_size.
+ * @return Returns the corresponding byte position or \c npos if the result
+ * &gt;= \a s_len or \a s contains an invalid UTF-8 byte.
  */
-size_type byte_pos( storage_type const *s, size_type s_size,
+size_type byte_pos( storage_type const *s, size_type s_len,
                     size_type char_pos );
 
 /**
@@ -80,7 +110,8 @@
  *
  * @param s A UTF-8 encoded C string.
  * @param p A pointer to somewhere within \a s.
- * @return Returns said offset.
+ * @return Returns said offset or \c npos if \a s contains an invalid UTF-8
+ * byte.
  */
 size_type char_pos( storage_type const *s, storage_type const *p );
 
@@ -89,7 +120,8 @@
  *
  * @param s A UTF-8 encoded C string.
  * @param byte_pos The byte position.
- * @return Returns the corresponding character position.
+ * @return Returns said position or \c npos if \a s contains an invalid UTF-8
+ * byte.
  */
 inline size_type char_pos( storage_type const *s, size_type byte_pos ) {
   return byte_pos != npos ? char_pos( s, s + byte_pos ) : npos;
@@ -100,36 +132,42 @@
 /**
  * Encodes a Unicode character into a UTF-8 byte sequence.
  *
- * @param c The Unicode code-point to encode.
+ * @param cp The Unicode code-point to encode.
  * @param ps A pointer to a pointer to what will be the first byte of a UTF-8
  * byte sequence.  The pointer is advanced to one byte past the newly encoded
  * character.
- * @return Returns the number of bytes required to encode the character.
+ * @return Returns the number of bytes required to encode \a cp or 0 if \a cp
+ * is invalid.
  */
-size_type encode( unicode::code_point c, storage_type **ps );
+size_type encode( unicode::code_point cp, storage_type **ps );
 
 /**
  * Encodes a Unicode character into a UTF-8 byte sequence.
  *
- * @param c The Unicode code-point to encode.
+ * @param cp The Unicode code-point to encode.
  * @param p A pointer to what will be the first byte of a UTF-8 byte sequence.
- * @return Returns the number of bytes required to encode the character.
+ * @return Returns the number of bytes required to encode \a cp or 0 if \a cp
+ * is invalid.
  */
-inline size_type encode( unicode::code_point c, storage_type *p ) {
-  return encode( c, &p );
+inline size_type encode( unicode::code_point cp, storage_type *p ) {
+  return encode( cp, &p );
 }
 
 /**
  * Encodes a Unicode character into a UTF-8 byte sequence and appends it to the
  * given string.
  *
- * @param c The Unicode code-point to encode.
+ * @param cp The Unicode code-point to encode.
  * @param out The string to append to.
+ * @return Returns the number of bytes required to encode \a cp or 0 if \a cp
+ * is invalid.
  */
 template<class StringType> inline
-void encode( unicode::code_point c, StringType *out ) {
+size_type encode( unicode::code_point cp, StringType *out ) {
   encoded_char_type ec;
-  out->append( ec, encode( c, ec ) );
+  size_type const len = encode( cp, ec );
+  out->append( ec, len );
+  return len;
 }
 
 /**
@@ -140,7 +178,8 @@
  * @param i An iterator pointing to the first byte of a UTF-8 byte sequence
  * comprising a Unicode character.  The iterator is advanced by the number of
  * bytes comprising the UTF-8 byte sequence.
- * @return Returns the Unicode code-point of the next character.
+ * @return Returns the Unicode code-point of the next character or
+ * <code>unicode::invalid</code>.
  */
 template<class OctetIterator>
 unicode::code_point next_char( OctetIterator &i );
@@ -150,7 +189,8 @@
  *
  * @param p A pointer to the first byte of a UTF-8 byte sequence comprising a
  * Unicode character.
- * @return Returns the Unicode code-point of the next character.
+ * @return Returns the Unicode code-point of the next character or
+ * <code>unicode::invalid</code>.
  */
 inline unicode::code_point decode( storage_type const *p ) {
   return next_char( p );
@@ -164,7 +204,8 @@
  * @param i An iterator pointing to somewhere within a UTF-8 string.  It is
  * repositioned to the first byte of the UTF-8 byte sequence comprising e
  * previous character.
- * @return Returns the Unicode code-point of previous character.
+ * @return Returns the Unicode code-point of previous character or
+ * <code>unicode::invalid</code>.
  */
 template<class OctetIterator>
 unicode::code_point prev_char( OctetIterator &i );
@@ -174,11 +215,12 @@
  *
  * @param i The istream to read from.
  * @param ps A pointer to a pointer to what will be the first byte of a UTF-8
- * byte sequence.  The pointer is advanced to one byte past the newly read
- * character.
+ * byte sequence.  The pointer is advanced to one byte beyond all the bytes
+ * comprising the newly read UTF-8 character.  All bytes read from the stream
+ * (valid or not) are written to the buffer.
  * @return Returns the number of bytes comprising the UTF-8 character (which
- * equals the number of bytes read) or \c npos if either EOF was reached or the
- * bytes read are an invalid UTF-8 byte sequence.
+ * equals the number of bytes read) or 0 if EOF was reached.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
  */
 size_type read( std::istream &i, storage_type **ps );
 
@@ -187,9 +229,10 @@
  *
  * @param i The istream to read from.
  * @param p A pointer to what will be the first byte of a UTF-8 byte sequence.
+ * All bytes read from the stream (valid or not) are written to the buffer.
  * @return Returns the number of bytes comprising the UTF-8 character (which
- * equals the number of bytes read) or \c npos if either EOF was reached or the
- * bytes read are an invalid UTF-8 byte sequence.
+ * equals the number of bytes read) or 0 if EOF was reached.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
  */
 inline size_type read( std::istream &i, storage_type *p ) {
   return read( i, &p );
@@ -203,6 +246,7 @@
  * @param s A pointer to the first byte of a UTF-8 string.
  * @param char_pos The index of the desired character (not byte).
  * @return Returns said character.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
  */
 inline unicode::code_point char_at( storage_type const *s,
                                     size_type char_pos ) {
@@ -214,14 +258,15 @@
  * Gets the Unicode character at the given position.
  *
  * @param s A pointer to the first byte of a UTF-8 string.
- * @param s_size The size of \a s in bytes.
+ * @param s_len The length of \a s in bytes.
  * @param char_pos The index of the desired character (not byte).
  * @return Returns said character.
- * @throws std::out_of_range if \a char_pos >= \a s_size.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
+ * @throws std::out_of_range if \a char_pos >= \a s_len.
  */
-inline unicode::code_point char_at( storage_type const *s, size_type s_size,
+inline unicode::code_point char_at( storage_type const *s, size_type s_len,
                                     size_type char_pos ) {
-  size_type const b = byte_pos( s, s_size, char_pos );
+  size_type const b = byte_pos( s, s_len, char_pos );
   if ( b == npos )
     throw std::out_of_range( "char_at" );
   storage_type const *s2 = s + b;
@@ -235,8 +280,8 @@
  *
  * @param start The start byte of a UTF-8 byte sequence comprising a Unicode
  * character.
- * @return Returns a number in the range [1,6] if \a start is valid or 0 if
- * \a start is invalid.
+ * @return Returns a number in the range [1,6].
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
  */
 size_type char_length( storage_type start );
 
@@ -246,6 +291,7 @@
  *
  * @param s A pointer to the first byte of a NULL-terminated UTF-8 string.
  * @return Returns said number of characters.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
  */
 size_type length( storage_type const *s );
 
@@ -257,6 +303,7 @@
  * @param end A pointer to one past the last byte of the same UTF-8 byte
  * sequence.
  * @return Returns said number of characters.
+ * @throws invalid_byte if an invalid UTF-8 byte is encountered.
  */
 size_type length( storage_type const *begin, storage_type const *end );
 
@@ -265,10 +312,14 @@
  *
  * @tparam StringType The string type.
  * @param s The string.
- * @return Returns said number of characters.
+ * @return Returns said number of characters (throws invalid_byte on an invalid byte).
  */
-template<class StringType>
-inline size_type length( StringType const &s ) {
+template<class StringType> inline
+typename std::enable_if<
+  ztd::has_c_str<StringType,char const* (StringType::*)() const>::value,
+  size_type
+>::type
+length( StringType const &s ) {
   return length( s.c_str() );
 }
 
@@ -328,11 +379,11 @@
  * Checks an entire UTF-8 string for validity.
  *
  * @param s The UTF-8 string to validate.
- * @param s_size The number of bytes (not characters) to check.
+ * @param s_len The number of bytes (not characters) to check.
  * @return Returns \c nullptr if the string is valid or a pointer to the first
  * invalid byte if invalid.
  */
-storage_type const* validate( storage_type const *s, size_type s_size );
+storage_type const* validate( storage_type const *s, size_type s_len );
 
 ////////// iterator ///////////////////////////////////////////////////////////
 


Follow ups