← Back to team overview

maria-developers team mailing list archive

MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character

 

Hello Sergei,

Please review a patch for MDEV-11343.


Thanks!
commit 3b2701d33630857908d236686ad3df18b5ffddc6
Author: Alexander Barkov <bar@xxxxxxxxxxx>
Date:   Fri Nov 25 17:16:30 2016 +0400

    MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character
    
    Partially backporting MDEV-9874 from 10.2 to 10.0
    
    READ_INFO::read_field() raised the ER_INVALID_CHARACTER_STRING error
    when reading an escape character followed by a multi-byte character.
    
    Raising wellformedness errors in READ_INFO::read_field() was wrong,
    because the main goal of READ_INFO::read_field() is to *unescape* the
    data which was presumably escaped using mysql_real_escape_string(),
    using the same character set with the one specified in
    "LOAD DATA INFILE ... CHARACTER SET ..." (or assumed by default).
    
    During LOAD DATA, multi-byte characters are not always scanned as a single
    entity! In case of escaped data, parts of a multi-byte character can be
    scanned on different loop iterations. So the old code erroneously tested
    welformedness in the middle of a multi-byte character.
    
    Moreover, the data after unescaping can go into a BLOB field, not a text field.
    Wellformedness tests are meaningless in this case.
    
    Ater this patch, wellformedness is only checked later, during
    Field::store(str,length,cs) time. The loop that scans bytes only
    makes sure to revert the changes made by mysql_real_escape_string().
    
    Note, in some cases users can supply data which did not really go through
    mysql_real_escape_string() and was escaped by some other means,
    or was not escaped at all. The file reported in this MDEV contains
    the string "\ä", which is an example of such improperly escaped data, as
    - either there should be two backslashes:   "\\ä"
    - or there should be no backslashes at all: "ä"
    mysql_real_escape_string() could not generate "\ä".

diff --git a/include/m_ctype.h b/include/m_ctype.h
index 5994816..8d9838f 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -180,6 +180,9 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
 /* A helper macros for "need at least n bytes" */
 #define MY_CS_TOOSMALLN(n)    (-100-(n))
 
+#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL)
+
+
 #define MY_SEQ_INTTAIL	1
 #define MY_SEQ_SPACES	2
 
diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result
index 38814bc..50382b5 100644
--- a/mysql-test/r/ctype_utf8mb4.result
+++ b/mysql-test/r/ctype_utf8mb4.result
@@ -3356,5 +3356,28 @@ DFFFFFDFFFFF9CFFFF9DFFFF9EFFFF
 # End of 5.6 tests
 #
 #
+# Start of 10.0 tests
+#
+#
+# MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character
+#
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4);
+LOAD DATA INFILE '../../std_data/loaddata/mdev-11343.txt' INTO TABLE t1 CHARACTER SET utf8mb4;
+SELECT HEX(a) FROM t1;
+HEX(a)
+C3A4
+C3A478
+78C3A4
+78C3A478
+EA99A0
+EA99A078
+78EA99A0
+78EA99A078
+F09F988E
+F09F988E78
+78F09F988E
+78F09F988E78
+DROP TABLE t1;
+#
 # End of tests
 #
diff --git a/mysql-test/r/loaddata.result b/mysql-test/r/loaddata.result
index 2f2a357..1246230 100644
--- a/mysql-test/r/loaddata.result
+++ b/mysql-test/r/loaddata.result
@@ -552,7 +552,8 @@ CREATE DATABASE d2 CHARSET utf8;
 USE d2;
 CREATE TABLE t1 (val TEXT);
 LOAD DATA INFILE '../../std_data/bug20683959loaddata.txt' INTO TABLE t1;
-ERROR HY000: Invalid utf8 character string: 'Ã"RT @niouzechun: \9058\221A'
+Warnings:
+Warning	1366	Incorrect string value: '\xF5\x80\x81\xAE\xE7\xB9...' for column 'val' at row 1
 DROP TABLE d1.t1, d2.t1;
 DROP DATABASE d1;
 DROP DATABASE d2;
diff --git a/mysql-test/std_data/loaddata/mdev-11343.txt b/mysql-test/std_data/loaddata/mdev-11343.txt
new file mode 100644
index 0000000..dded121
--- /dev/null
+++ b/mysql-test/std_data/loaddata/mdev-11343.txt
@@ -0,0 +1,12 @@
+\ä
+\äx
+x\ä
+x\äx
+\ê™ 
+\ê™ x
+x\ê™ 
+x\ê™ x
+\😎
+\😎x
+x\😎
+x\😎x
diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test
index 6b876cc..cf1c103 100644
--- a/mysql-test/t/ctype_utf8mb4.test
+++ b/mysql-test/t/ctype_utf8mb4.test
@@ -1864,6 +1864,17 @@ set @@collation_connection=utf8mb4_bin;
 --echo # End of 5.6 tests
 --echo #
 
+--echo #
+--echo # Start of 10.0 tests
+--echo #
+
+--echo #
+--echo # MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character
+--echo #
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4);
+LOAD DATA INFILE '../../std_data/loaddata/mdev-11343.txt' INTO TABLE t1 CHARACTER SET utf8mb4;
+SELECT HEX(a) FROM t1;
+DROP TABLE t1;
 
 --echo #
 --echo # End of tests
diff --git a/mysql-test/t/loaddata.test b/mysql-test/t/loaddata.test
index 7d0f385..9f2aafc 100644
--- a/mysql-test/t/loaddata.test
+++ b/mysql-test/t/loaddata.test
@@ -675,7 +675,6 @@ SELECT HEX(val) FROM t1;
 CREATE DATABASE d2 CHARSET utf8;
 USE d2;
 CREATE TABLE t1 (val TEXT);
---error ER_INVALID_CHARACTER_STRING
 LOAD DATA INFILE '../../std_data/bug20683959loaddata.txt' INTO TABLE t1;
 
 DROP TABLE d1.t1, d2.t1;
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index af4b251..6a473ce 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -79,6 +79,81 @@ class READ_INFO {
   NET *io_net;
   int level; /* for load xml */
 
+
+#if MYSQL_VERSION_ID >= 100200
+#error This 10.0 and 10.1 specific fix should be removed in 10.2.
+#error Fix read_mbtail() to use my_charlen() instead of my_charlen_tmp()
+#else
+  int my_charlen_tmp(CHARSET_INFO *cs, const char *str, const char *end)
+  {
+    my_wc_t wc;
+    return cs->cset->mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end);
+  }
+#endif
+
+  /**
+    Read a tail of a multi-byte character.
+    The first byte of the character is assumed to be already
+    read from the file and appended to "str".
+
+    @returns  true  - if EOF happened unexpectedly
+    @returns  false - no EOF happened: found a good multi-byte character,
+                                       or a bad byte sequence
+
+    Note:
+    The return value depends only on EOF:
+    - read_mbtail() returns "false" is a good character was read, but also
+    - read_mbtail() returns "false" if an incomplete byte sequence was found
+      and no EOF happened.
+
+    For example, suppose we have an ujis file with bytes 0x8FA10A, where:
+    - 0x8FA1 is an incomplete prefix of a 3-byte character
+      (it should be [8F][A1-FE][A1-FE] to make a full 3-byte character)
+    - 0x0A is a line demiliter
+    This file has some broken data, the trailing [A1-FE] is missing.
+
+    In this example it works as follows:
+    - 0x8F is read from the file and put into "data" before the call
+      for read_mbtail()
+    - 0xA1 is read from the file and put into "data" by read_mbtail()
+    - 0x0A is kept in the read queue, so the next read iteration after
+      the current read_mbtail() call will normally find it and recognize as
+      a line delimiter
+    - the current call for read_mbtail() returns "false",
+      because no EOF happened
+  */
+  bool read_mbtail(String *str)
+  {
+    int chlen;
+    if ((chlen= my_charlen_tmp(read_charset, str->end() - 1, str->end())) == 1)
+      return false; // Single byte character found
+    for (uint32 length0= str->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); )
+    {
+      int chr= GET;
+      if (chr == my_b_EOF)
+      {
+        DBUG_PRINT("info", ("read_mbtail: chlen=%d; unexpected EOF", chlen));
+        return true; // EOF
+      }
+      str->append(chr);
+      chlen= my_charlen_tmp(read_charset, str->ptr() + length0, str->end());
+      if (chlen == MY_CS_ILSEQ)
+      {
+        /**
+          It has been an incomplete (but a valid) sequence so far,
+          but the last byte turned it into a bad byte sequence.
+          Unget the very last byte.
+        */
+        str->length(str->length() - 1);
+        PUSH(chr);
+        DBUG_PRINT("info", ("read_mbtail: ILSEQ"));
+        return false; // Bad byte sequence
+      }
+    }
+    DBUG_PRINT("info", ("read_mbtail: chlen=%d", chlen));
+    return false; // Good multi-byte character
+  }
+
 public:
   bool error,line_cuted,found_null,enclosed;
   uchar	*row_start,			/* Found row starts here */
@@ -1510,7 +1585,8 @@ int READ_INFO::read_field()
 
   for (;;)
   {
-    while ( to < end_of_buff)
+    // Make sure we have enough space for the longest multi-byte character.
+    while ( to + read_charset->mbmaxlen < end_of_buff)
     {
       chr = GET;
       if (chr == my_b_EOF)
@@ -1598,52 +1674,27 @@ int READ_INFO::read_field()
 	}
       }
 #ifdef USE_MB
-        uint ml= my_mbcharlen(read_charset, chr);
-        if (ml == 0)
-        {
-          *to= '\0';
-          my_error(ER_INVALID_CHARACTER_STRING, MYF(0),
-                   read_charset->csname, buffer);
-          error= true;
-          return 1;
-        }
-
-        if (ml > 1 &&
-            to + ml <= end_of_buff)
-        {
-          uchar* p= to;
-          *to++ = chr;
-
-          for (uint i= 1; i < ml; i++)
-          {
-            chr= GET;
-            if (chr == my_b_EOF)
-            {
-              /*
-                Need to back up the bytes already ready from illformed
-                multi-byte char 
-              */
-              to-= i;
-              goto found_eof;
-            }
-            *to++ = chr;
-          }
-          if (my_ismbchar(read_charset,
-                        (const char *)p,
-                        (const char *)to))
-            continue;
-          for (uint i= 0; i < ml; i++)
-            PUSH(*--to);
-          chr= GET;
-        }
-        else if (ml > 1)
-        {
-          // Buffer is too small, exit while loop, and reallocate.
-          PUSH(chr);
-          break;
-        }
 #endif
       *to++ = (uchar) chr;
+#if MYSQL_VERSION_ID >= 100200
+#error This 10.0 and 10.1 specific fix should be removed in 10.2
+#else
+      if (my_mbcharlen(read_charset, (uchar) chr) > 1)
+      {
+        /*
+          A known MBHEAD found. Try to scan the full multi-byte character.
+          Otherwise, a possible following second byte 0x5C would be
+          mis-interpreted as an escape on the next iteration.
+          (Important for big5, gbk, sjis, cp932).
+        */
+        String tmp((char *) to - 1, read_charset->mbmaxlen, read_charset);
+        tmp.length(1);
+        bool eof= read_mbtail(&tmp);
+        to+= tmp.length() - 1;
+        if (eof)
+          goto found_eof;
+      }
+#endif
     }
     /*
     ** We come here if buffer is too small. Enlarge it and continue
diff --git a/sql/sql_string.h b/sql/sql_string.h
index c287f05..557d14a 100644
--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@@ -136,6 +136,7 @@ class String
   inline bool is_empty() const { return (str_length == 0); }
   inline void mark_as_const() { Alloced_length= 0;}
   inline const char *ptr() const { return Ptr; }
+  inline const char *end() const { return Ptr + str_length; }
   inline char *c_ptr()
   {
     DBUG_ASSERT(!alloced || !Ptr || !Alloced_length || 

Follow ups