← Back to team overview

zorba-coders team mailing list archive

[Merge] lp:~paul-lucas/zorba/feature-utf8_streambuf into lp:zorba

 

Paul J. Lucas has proposed merging lp:~paul-lucas/zorba/feature-utf8_streambuf into lp:zorba.

Commit message:
Streambuf for validating UTF-8 on-the-fly.

Requested reviews:
  Paul J. Lucas (paul-lucas)

For more details, see:
https://code.launchpad.net/~paul-lucas/zorba/feature-utf8_streambuf/+merge/142440

Streambuf for validating UTF-8 on-the-fly.
-- 
https://code.launchpad.net/~paul-lucas/zorba/feature-utf8_streambuf/+merge/142440
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'src/unit_tests/CMakeLists.txt'
--- src/unit_tests/CMakeLists.txt	2013-01-05 00:57:49 +0000
+++ src/unit_tests/CMakeLists.txt	2013-01-09 05:11:31 +0000
@@ -23,6 +23,8 @@
   test_uri.cpp
   test_uuid.cpp
   unit_tests.cpp
+  test_uri.cpp
+  test_utf8_streambuf.cpp
 )
 
 IF (NOT ZORBA_NO_FULL_TEXT)

=== added file 'src/unit_tests/test_utf8_streambuf.cpp'
--- src/unit_tests/test_utf8_streambuf.cpp	1970-01-01 00:00:00 +0000
+++ src/unit_tests/test_utf8_streambuf.cpp	2013-01-09 05:11:31 +0000
@@ -0,0 +1,166 @@
+/*
+ * Copyright 2006-2008 The FLWOR Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdafx.h"
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include <zorba/zorba_exception.h>
+
+#include "util/utf8_streambuf.h"
+
+using namespace std;
+using namespace zorba;
+
+#define SMILEY_FACE           "\xF0\x9F\x98\x8A"
+#define COPYRIGHT_UTF8        "\xC2\xA9"
+#define ONE_THIRD_UTF8        "\xE2\x85\x93"
+
+#define BAD_COPYRIGHT_1_UTF8  "\x42\xA9"
+#define BAD_COPYRIGHT_2_UTF8  "\xC2\x79"
+
+static char const *const tests_good[] = {
+  "Hello, world!",
+  "Copyright " COPYRIGHT_UTF8 " 2012",
+  ONE_THIRD_UTF8 " cup sugar",
+  "Smiley " SMILEY_FACE,
+  "Smiley 2 " SMILEY_FACE SMILEY_FACE,
+  SMILEY_FACE " Smiley",
+  SMILEY_FACE SMILEY_FACE " 2 Smiley",
+  0
+};
+
+static char const *const tests_bad[] = {
+  "Copyright " BAD_COPYRIGHT_1_UTF8 " 2012",
+  "Copyright " BAD_COPYRIGHT_2_UTF8 " 2012",
+  0
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+static int failures;
+
+static bool assert_true( int no, char const *expr, int line, bool result ) {
+  if ( !result ) {
+    cout << '#' << no << " FAILED, line " << line << ": " << expr << endl;
+    ++failures;
+  }
+  return result;
+}
+
+static void print_exception( int no, char const *expr, int line,
+                             std::exception const &e ) {
+  assert_true( no, expr, line, false );
+  cout << "+ exception: " << e.what() << endl;
+}
+
+#define ASSERT_TRUE( NO, EXPR ) assert_true( NO, #EXPR, __LINE__, !!(EXPR) )
+
+#define ASSERT_TRUE_AND_NO_EXCEPTION( NO, EXPR ) \
+  try { ASSERT_TRUE( NO, EXPR ); } \
+  catch ( exception const &e ) { print_exception( NO, #EXPR, __LINE__, e ); } \
+  catch ( ... ) { assert_true( NO, #EXPR, __LINE__, false ); }
+
+#define ASSERT_EXCEPTION( NO, EXPR ) \
+  try { EXPR; assert_true( NO, #EXPR, __LINE__, false ); } \
+  catch ( ZorbaException const &e ) { } \
+  catch ( ... ) { assert_true( NO, #EXPR, __LINE__, false ); }
+
+///////////////////////////////////////////////////////////////////////////////
+
+static bool test_getline( char const *test ) {
+  istringstream iss( test );
+  utf8::streambuf utf_buf( iss.rdbuf() );
+  iss.ios::rdbuf( &utf_buf );
+  iss.exceptions( ios::badbit );
+
+  char buf[ 1024 ];
+  iss.getline( buf, sizeof buf );
+  if ( iss.gcount() ) {
+    string const s( buf, iss.gcount() );
+    return s == test;
+  }
+  return false;
+}
+
+static bool test_read( char const *test ) {
+  istringstream iss( test );
+  utf8::streambuf utf_buf( iss.rdbuf() );
+  iss.ios::rdbuf( &utf_buf );
+  iss.exceptions( ios::badbit );
+
+  char buf[ 1024 ];
+  iss.read( buf, sizeof buf );
+  if ( iss.gcount() ) {
+    string const s( buf, iss.gcount() );
+    return s == test;
+  }
+  return false;
+}
+
+static bool test_insertion( char const *test ) {
+  ostringstream oss;
+  utf8::streambuf utf_buf( oss.rdbuf(), true );
+  oss.ios::rdbuf( &utf_buf );
+  oss.exceptions( ios::badbit );
+
+  oss << test << flush;
+  string const s( oss.str() );
+  return s == test;
+}
+
+static bool test_put( char const *test ) {
+  ostringstream oss;
+  utf8::streambuf utf_buf( oss.rdbuf(), true );
+  oss.ios::rdbuf( &utf_buf );
+  oss.exceptions( ios::badbit );
+
+  for ( char const *c = test; *c; ++c )
+    oss.put( *c );
+
+  string const s( oss.str() );
+  return s == test;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace zorba {
+namespace UnitTests {
+
+int test_utf8_streambuf( int, char*[] ) {
+  int test_no = 0;
+  for ( char const *const *s = tests_good; *s; ++s, ++test_no ) {
+    ASSERT_TRUE_AND_NO_EXCEPTION( test_no, test_getline( *s ) );
+    ASSERT_TRUE_AND_NO_EXCEPTION( test_no, test_read( *s ) );
+    ASSERT_TRUE_AND_NO_EXCEPTION( test_no, test_insertion( *s ) );
+    ASSERT_TRUE_AND_NO_EXCEPTION( test_no, test_put( *s ) );
+  }
+  for ( char const *const *s = tests_bad; *s; ++s, ++test_no ) {
+    ASSERT_EXCEPTION( test_no, test_getline( *s ) );
+    ASSERT_EXCEPTION( test_no, test_read( *s ) );
+    ASSERT_EXCEPTION( test_no, test_insertion( *s ) );
+    ASSERT_EXCEPTION( test_no, test_put( *s ) );
+  }
+  cout << failures << " test(s) failed\n";
+  return failures ? 1 : 0;
+}
+
+} // namespace UnitTests
+} // namespace zorba
+/* vim:set et sw=2 ts=2: */

=== modified file 'src/unit_tests/unit_test_list.h'
--- src/unit_tests/unit_test_list.h	2012-09-19 21:16:15 +0000
+++ src/unit_tests/unit_test_list.h	2013-01-09 05:11:31 +0000
@@ -56,6 +56,7 @@
   int test_unique_ptr( int, char*[] );
 #endif /* ZORBA_HAVE_UNIQUE_PTR */
 
+  int test_utf8_streambuf( int, char*[] );
   int test_uuid( int, char*[] );
 
 #ifndef ZORBA_HAVE_UNORDERED_MAP

=== modified file 'src/unit_tests/unit_tests.cpp'
--- src/unit_tests/unit_tests.cpp	2012-11-12 21:17:32 +0000
+++ src/unit_tests/unit_tests.cpp	2013-01-09 05:11:31 +0000
@@ -61,6 +61,7 @@
   libunittests["unique_ptr"] = test_unique_ptr;
 #endif /* ZORBA_HAVE_UNIQUE_PTR */
 
+  libunittests["utf8_streambuf"] = test_utf8_streambuf;
   libunittests["uuid"] = test_uuid;
 
 #ifndef ZORBA_HAVE_UNORDERED_MAP

=== modified file 'src/util/CMakeLists.txt'
--- src/util/CMakeLists.txt	2012-12-12 01:04:54 +0000
+++ src/util/CMakeLists.txt	2013-01-09 05:11:31 +0000
@@ -29,6 +29,7 @@
   unicode_categories.cpp
   uri_util.cpp
   utf8_util.cpp
+  utf8_streambuf.cpp
   xml_util.cpp
   fx/fxcharheap.cpp
   string/empty_rep_base.cpp

=== modified file 'src/util/icu_streambuf.h'
--- src/util/icu_streambuf.h	2012-12-27 18:50:25 +0000
+++ src/util/icu_streambuf.h	2013-01-09 05:11:31 +0000
@@ -46,12 +46,12 @@
  *    try {
  *      os.ios::rdbuf( &xbuf );
  *      // ...
+ *      os.ios::rdbuf( xbuf.original() );
  *    }
  *    catch ( ... ) {
  *      os.ios::rdbuf( xbuf.original() );
  *      throw;
  *    }
- *    os.ios::rdbuf( xbuf.original() );
  *  }
  * \endcode
  *

=== added file 'src/util/utf8_streambuf.cpp'
--- src/util/utf8_streambuf.cpp	1970-01-01 00:00:00 +0000
+++ src/util/utf8_streambuf.cpp	2013-01-09 05:11:31 +0000
@@ -0,0 +1,259 @@
+/*
+ * Copyright 2006-2008 The FLWOR Foundation.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdafx.h"
+
+//#define ZORBA_DEBUG_UTF8_STREAMBUF
+#ifdef ZORBA_DEBUG_UTF8_STREAMBUF
+# include <stdio.h>
+#endif
+
+#include <iomanip>
+#include <stdexcept>
+
+#include <zorba/config.h>
+#include <zorba/diagnostic_list.h>
+
+#include "diagnostics/diagnostic.h"
+#include "diagnostics/zorba_exception.h"
+#include "util/cxx_util.h"
+#include "util/oseparator.h"
+#include "util/string_util.h"
+#include "util/utf8_util.h"
+
+#include "utf8_streambuf.h"
+
+using namespace std;
+
+namespace zorba {
+namespace utf8 {
+
+///////////////////////////////////////////////////////////////////////////////
+
+inline void streambuf::buf_type::clear() {
+  char_len_ = 0;
+}
+
+void streambuf::buf_type::throw_invalid_utf8( storage_type *buf,
+                                              size_type len ) {
+  ostringstream oss;
+  oss << hex << setfill('0') << uppercase;
+  oseparator comma( ',' );
+
+  for ( size_type i = 0; i < len; ++i )  // setw() lasts for one insertion only
+    oss << comma << "0x" << setw(2) << (static_cast<unsigned>( buf[i] ) & 0xFF);
+
+  clear();
+  throw ZORBA_EXCEPTION(
+    zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
+    ERROR_PARAMS( oss.str() )
+  );
+}
+
+void streambuf::buf_type::validate( storage_type c, bool bump ) {
+  size_type char_len_copy = char_len_, cur_len_copy = cur_len_;
+
+  if ( !char_len_copy ) {
+    //
+    // This means we're (hopefully) at the first byte of a UTF-8 byte sequence
+    // comprising a character.
+    //
+    if ( !(char_len_copy = char_length( c )) )
+      throw_invalid_utf8( &c, 1 );
+    cur_len_copy = 0;
+  }
+
+  storage_type *const cur_byte_ptr = utf8_char_ + cur_len_copy;
+  storage_type const old_byte = *cur_byte_ptr;
+  *cur_byte_ptr = c;
+
+  if ( cur_len_copy++ && !is_continuation_byte( c ) )
+    throw_invalid_utf8( utf8_char_, cur_len_copy );
+
+  if ( bump ) {
+    char_len_ = (cur_len_copy == char_len_copy ? 0 : char_len_copy);
+    cur_len_ = cur_len_copy;
+  } else {
+    *cur_byte_ptr = old_byte;
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+inline void streambuf::clear() {
+  gbuf_.clear();
+  pbuf_.clear();
+}
+
+streambuf::streambuf( std::streambuf *orig, bool validate_put ) :
+  internal::proxy_streambuf( orig ),
+  validate_put_( validate_put )
+{
+  if ( !orig )
+    throw invalid_argument( "null streambuf" );
+  clear();
+}
+
+void streambuf::imbue( std::locale const &loc ) {
+  original()->pubimbue( loc );
+}
+
+void streambuf::resync() {
+  // snextc() advances and then peeks, so the start byte that ends the scan is
+  // left unread (sbumpc() returned the byte just skipped and ate the next).
+  int_type c = original()->sgetc();
+  while ( !traits_type::eq_int_type( c, traits_type::eof() ) &&
+          !is_start_byte( traits_type::to_char_type( c ) ) )
+    c = original()->snextc();
+}
+
+streambuf::pos_type streambuf::seekoff( off_type o, ios_base::seekdir d,
+                                        ios_base::openmode m ) {
+  clear();
+  return original()->pubseekoff( o, d, m );
+}
+
+streambuf::pos_type streambuf::seekpos( pos_type p, ios_base::openmode m ) {
+  clear();
+  return original()->pubseekpos( p, m );
+}
+
+std::streambuf* streambuf::setbuf( char_type *p, streamsize s ) {
+  original()->pubsetbuf( p, s );
+  return this;
+}
+
+streamsize streambuf::showmanyc() {
+  return original()->in_avail();
+}
+
+int streambuf::sync() {
+  return original()->pubsync();
+}
+
+streambuf::int_type streambuf::overflow( int_type c ) {
+#ifdef ZORBA_DEBUG_UTF8_STREAMBUF
+  printf( "overflow()\n" );
+#endif
+  if ( traits_type::eq_int_type( c, traits_type::eof() ) )
+    return traits_type::eof();
+  if ( validate_put_ )
+    pbuf_.validate( traits_type::to_char_type( c ), true );
+  // Propagate sputc()'s result so a failed write is reported as eof().
+  return original()->sputc( traits_type::to_char_type( c ) );
+}
+
+streambuf::int_type streambuf::pbackfail( int_type c ) {
+  int_type const eof = traits_type::eof(); // sputbackc() returns it on failure
+  if ( traits_type::eq_int_type( c, eof ) || !gbuf_.cur_len_ ||
+       traits_type::eq_int_type(
+         original()->sputbackc( traits_type::to_char_type( c ) ), eof ) )
+    return eof;
+  --gbuf_.cur_len_;
+  return c;
+}
+
+streambuf::int_type streambuf::uflow() {
+#ifdef ZORBA_DEBUG_UTF8_STREAMBUF
+  printf( "uflow()\n" );
+#endif
+  int_type const c = original()->sbumpc();
+  if ( traits_type::eq_int_type( c, traits_type::eof() ) )
+    return traits_type::eof();
+  gbuf_.validate( traits_type::to_char_type( c ) );
+  return c;
+}
+
+streambuf::int_type streambuf::underflow() {
+#ifdef ZORBA_DEBUG_UTF8_STREAMBUF
+  printf( "underflow()\n" );
+#endif
+  int_type const c = original()->sgetc();
+  if ( traits_type::eq_int_type( c, traits_type::eof() ) )
+    return traits_type::eof();
+  gbuf_.validate( traits_type::to_char_type( c ), false );
+  return c;
+}
+
+streamsize streambuf::xsgetn( char_type *to, streamsize size ) {
+#ifdef ZORBA_DEBUG_UTF8_STREAMBUF
+  printf( "xsgetn()\n" );
+#endif
+  streamsize return_size = 0;
+
+  if ( gbuf_.char_len_ ) {
+    streamsize const want = gbuf_.char_len_ - gbuf_.cur_len_;
+    streamsize const get = min( want, size );
+    streamsize const got = original()->sgetn( to, get );
+    for ( streamsize i = 0; i < got; ++i )
+      gbuf_.validate( to[i] );
+    to += got;
+    size -= got, return_size += got;
+  }
+
+  while ( size > 0 ) {
+    if ( streamsize const got = original()->sgetn( to, size ) ) {
+      for ( streamsize i = 0; i < got; ++i )
+        gbuf_.validate( to[i] );
+      to += got;
+      size -= got, return_size += got;
+    } else
+      break;
+  }
+  return return_size;
+}
+
+streamsize streambuf::xsputn( char_type const *from, streamsize size ) {
+#ifdef ZORBA_DEBUG_UTF8_STREAMBUF
+  printf( "xsputn()\n" );
+#endif
+  if ( validate_put_ )
+    for ( streamsize i = 0; i < size; ++i )
+      pbuf_.validate( from[i] );
+  return original()->sputn( from, size );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Both new & delete are done inside Zorba rather than in the header to
+// guarantee that they're cross-DLL-boundary safe on Windows.
+
+std::streambuf* alloc_streambuf( std::streambuf *orig ) {
+  return new utf8::streambuf( orig );
+}
+
+int get_streambuf_index() {
+  //
+  // This function is out-of-line because it has a static constant within it.
+  // It has a static constant within it to guarantee (1) initialization before
+  // use and (2) initialization happens exactly once.
+  //
+  // See: "Standard C++ IOStreams and Locales: Advanced Programmer's Guide and
+  // Reference," Angelika Langer and Klaus Kreft, Addison-Wesley, 2000, section
+  // 3.3.1.1: "Initializing and Maintaining the iword/pword Index."
+  //
+  // See: "The C++ Programming Language," Bjarne Stroustrup, Addison-Wesley,
+  // 2000, section 10.4.8: "Local Static Store."
+  //
+  static int const index = ios_base::xalloc();
+  return index;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace utf8
+} // namespace zorba
+/* vim:set et sw=2 ts=2: */

=== added file 'src/util/utf8_streambuf.h'
--- src/util/utf8_streambuf.h	1970-01-01 00:00:00 +0000
+++ src/util/utf8_streambuf.h	2013-01-09 05:11:31 +0000
@@ -0,0 +1,322 @@
+/*
+ * Copyright 2006-2008 The FLWOR Foundation.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ZORBA_UTF8_STREAMBUF_H
+#define ZORBA_UTF8_STREAMBUF_H
+
+#include <zorba/internal/streambuf.h>
+
+#include "util/utf8_util.h"
+
+namespace zorba {
+namespace utf8 {
+
+///////////////////////////////////////////////////////////////////////////////
+
+/**
+ * A %utf8::streambuf is-a std::streambuf for validating UTF-8 on-the-fly.
+ * To use it, replace a stream's streambuf:
+ * \code
+ *  istream is;
+ *  // ...
+ *  utf8::streambuf xbuf( is.rdbuf() );
+ *  is.ios::rdbuf( &xbuf );
+ * \endcode
+ * Note that the %utf8::streambuf must exist for as long as it's being used by
+ * the stream.  If you are replacing the streambuf for a stream you did not
+ * create, you should set it back to the original streambuf:
+ * \code
+ *  void f( ostream &os ) {
+ *    utf8::streambuf xbuf( os.rdbuf() );
+ *    try {
+ *      os.ios::rdbuf( &xbuf );
+ *      // ...
+ *      os.ios::rdbuf( xbuf.original() );
+ *    }
+ *    catch ( ... ) {
+ *      os.ios::rdbuf( xbuf.original() );
+ *      throw;
+ *    }
+ *  }
+ * \endcode
+ *
+ * If an invalid UTF-8 byte sequence is read, then the stream's \c badbit is
+ * set.  Hence using a %utf8::streambuf requires rigorous error-checking.
+ *
+ * However, if exceptions are enabled for the stream, then
+ * \c ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE is thrown.  (When enabling exceptions
+ * for a stream you didn't create, you should set the exception mask back to
+ * the original mask.)
+ * \code
+ *  istream is;
+ *  std::ios::iostate const orig_exceptions = is.exceptions();
+ *  try {
+ *    is.exceptions( orig_exceptions | ios::badbit );
+ *    // ...
+ *    is.exceptions( orig_exceptions );
+ *  }
+ *  catch ( ... ) {
+ *    is.exceptions( orig_exceptions );
+ *    throw;
+ *  }
+ * \endcode
+ *
+ * While %utf8::streambuf does support seeking, the positions must always be on
+ * the first byte of a UTF-8 character.
+ */
+class streambuf : public internal::proxy_streambuf {
+public:
+  /**
+   * Constructs a %streambuf.
+   *
+   * @param orig The original streambuf to read/write from/to.
+   * @param validate_put If \c true, characters written are validated;
+   * if \c false, characters are written without validation, i.e., it's assumed
+   * that you're writing valid UTF-8.
+   * @throws std::invalid_argument if \a orig is \c null.
+   */
+  streambuf( std::streambuf *orig, bool validate_put = false );
+
+  /**
+   * If an invalid UTF-8 byte sequence was read, resynchronizes by skipping
+   * bytes until a new UTF-8 start byte is encountered.
+   */
+  void resync();
+
+protected:
+  void imbue( std::locale const& );
+  pos_type seekoff( off_type, std::ios_base::seekdir, std::ios_base::openmode );
+  pos_type seekpos( pos_type, std::ios_base::openmode );
+  std::streambuf* setbuf( char_type*, std::streamsize );
+  std::streamsize showmanyc();
+  int sync();
+  int_type overflow( int_type );
+  int_type pbackfail( int_type );
+  int_type uflow();
+  int_type underflow();
+  std::streamsize xsgetn( char_type*, std::streamsize );
+  std::streamsize xsputn( char_type const*, std::streamsize );
+
+private:
+  struct buf_type {
+    encoded_char_type utf8_char_;
+    size_type char_len_;
+    size_type cur_len_;
+
+    void clear();
+    void throw_invalid_utf8( storage_type *buf, size_type len );
+    void validate( storage_type, bool bump = true );
+  };
+
+  buf_type gbuf_, pbuf_;
+  bool const validate_put_;
+
+  void clear();
+
+  // forbid
+  streambuf( streambuf const& );
+  streambuf& operator=( streambuf const& );
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+std::streambuf* alloc_streambuf( std::streambuf *orig );
+
+int get_streambuf_index();
+
+///////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Attaches a utf8::streambuf to a stream.  Unlike using a
+ * utf8::streambuf directly, this function will create the streambuf,
+ * attach it to the stream, and manage it for the lifetime of the stream
+ * automatically.
+ *
+ * @param ios The stream to attach the utf8::streambuf to.  If the stream
+ * already has a utf8::streambuf attached to it, this function does
+ * nothing.
+ */
+template<typename charT,typename Traits> inline
+void attach( std::basic_ios<charT,Traits> &ios ) {
+  int const index = get_streambuf_index();
+  void *&pword = ios.pword( index );
+  if ( !pword ) {
+    std::streambuf *const buf = alloc_streambuf( ios.rdbuf() );
+    ios.rdbuf( buf );
+    pword = buf;
+    ios.register_callback( internal::stream_callback, index );
+  }
+}
+
+/**
+ * Detaches a previously attached utf8::streambuf from a stream.  The streambuf
+ * is destroyed and the stream's original streambuf is restored.
+ *
+ * @param ios The stream to detach the utf8::streambuf from.  If the stream
+ * doesn't have a utf8::streambuf attached to it, this function does nothing.
+ */
+template<typename charT,typename Traits> inline
+void detach( std::basic_ios<charT,Traits> &ios ) {
+  int const index = get_streambuf_index();
+  if ( streambuf *const buf = static_cast<streambuf*>( ios.pword( index ) ) ) {
+    ios.pword( index ) = 0;
+    ios.rdbuf( buf->original() );
+    internal::dealloc_streambuf( buf );
+  }
+}
+
+/**
+ * Checks whether the given stream has a utf8::streambuf attached.
+ *
+ * @param ios The stream to check.
+ * @return \c true only if a utf8::streambuf is attached.
+ */
+template<typename charT,typename Traits> inline
+bool is_attached( std::basic_ios<charT,Traits> &ios ) {
+  return !!ios.pword( get_streambuf_index() );
+}
+
+/**
+ * A %utf8::auto_attach is a class that attaches a utf8::streambuf to a stream
+ * and automatically detaches it when the %auto_attach object is destroyed.
+ * \code
+ *  void f( ostream &os ) {
+ *    utf8::auto_attach<ostream> const raii( os );
+ *    // ...
+ *  }
+ * \endcode
+ * A %utf8::auto_attach is useful for streams not created by you.
+ *
+ * @see http://en.wikipedia.org/wiki/Resource_Acquisition_Is_Initialization
+ */
+template<class StreamType>
+class auto_attach {
+public:
+  /**
+   * Constructs an %auto_attach object calling attach() on the given stream.
+   *
+   * @param stream The stream to attach the utf8::streambuf to.  If the stream
+   * already has a utf8::streambuf attached to it, this constructor does
+   * nothing.
+   */
+  auto_attach( StreamType &stream ) : stream_( stream ) {
+    attach( stream );
+  }
+
+  /**
+   * Destroys this %auto_attach object calling detach() on the previously
+   * attached stream.
+   */
+  ~auto_attach() {
+    detach( stream_ );
+  }
+
+private:
+  StreamType &stream_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+/**
+ * A %utf8::stream is used to wrap a C++ standard I/O stream with a
+ * utf8::streambuf so that UTF-8 validation and the management of the
+ * streambuf happens automatically.
+ *
+ * A %utf8::stream is useful for streams created by you.
+ *
+ * @tparam StreamType The I/O stream class type to wrap. It must be a concrete
+ * stream class.
+ */
+template<class StreamType>
+class stream : public StreamType {
+public:
+  /**
+   * Constructs a %utf8::stream.
+   */
+  stream() :
+#ifdef WIN32
+# pragma warning( push )
+# pragma warning( disable : 4355 )
+#endif /* WIN32 */
+    utf8_buf_( this->rdbuf() )
+#ifdef WIN32
+# pragma warning( pop )
+#endif /* WIN32 */
+  {
+    init();
+  }
+
+  /**
+   * Constructs a %utf8::stream.
+   *
+   * @tparam StreamArgType The type of the first argument of \a StreamType's
+   * constructor.
+   * @param stream_arg The argument to pass as the first argument to
+   * \a StreamType's constructor.
+   */
+  template<typename StreamArgType>
+  stream( StreamArgType stream_arg ) :
+    StreamType( stream_arg ),
+#ifdef WIN32
+# pragma warning( push )
+# pragma warning( disable : 4355 )
+#endif /* WIN32 */
+    utf8_buf_( this->rdbuf() )
+#ifdef WIN32
+# pragma warning( pop )
+#endif /* WIN32 */
+  {
+    init();
+  }
+
+  /**
+   * Constructs a %utf8::stream.
+   *
+   * @tparam StreamArgType The type of the first argument of \a StreamType's
+   * constructor.
+   * @param stream_arg The argument to pass as the first argument to
+   * \a StreamType's constructor.
+   * @param mode The open-mode to pass to \a StreamType's constructor.
+   */
+  template<typename StreamArgType>
+  stream( StreamArgType stream_arg, std::ios_base::openmode mode ) :
+    StreamType( stream_arg, mode ),
+#ifdef WIN32
+# pragma warning( push )
+# pragma warning( disable : 4355 )
+#endif /* WIN32 */
+    utf8_buf_( this->rdbuf() )
+#ifdef WIN32
+# pragma warning( pop )
+#endif /* WIN32 */
+  {
+    init();
+  }
+
+private:
+  streambuf utf8_buf_;
+
+  void init() {
+    this->std::ios::rdbuf( &utf8_buf_ );
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace utf8
+} // namespace zorba
+#endif  /* ZORBA_UTF8_STREAMBUF_H */
+/* vim:set et sw=2 ts=2: */

=== modified file 'test/unit/CMakeLists.txt'
--- test/unit/CMakeLists.txt	2012-09-19 21:16:15 +0000
+++ test/unit/CMakeLists.txt	2013-01-09 05:11:31 +0000
@@ -153,5 +153,6 @@
   ZORBA_ADD_TEST("test/libunit/thesaurus" LibUnitTest thesaurus)
   ZORBA_ADD_TEST("test/libunit/tokenizer" LibUnitTest tokenizer)
 ENDIF (NOT ZORBA_NO_FULL_TEXT)
+ZORBA_ADD_TEST("test/libunit/utf8_streambuf" LibUnitTest utf8_streambuf)
 
 # vim:set et sw=2 ts=2:


Follow ups