maria-developers team mailing list archive

Thread
Date

Please review MDEV-9823 LOAD DATA INFILE silently truncates incomplete byte sequences

To: Sergei Golubchik <serg@xxxxxxxxxxx>, maria-developers <maria-developers@xxxxxxxxxxxxxxxxxxx>
From: Alexander Barkov <bar@xxxxxxxxxxx>
Date: Mon, 4 Apr 2016 14:05:39 +0400
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.6.0

Hi Sergei,

Please review a patch for MDEV-9823.

This is a prerequisite for the current sprint task:
MDEV-6353 my_ismbchar() and my_mbcharlen() refactoring


Thanks.

commit 9b6bcea8894474bb0d660a11ac21c1f64d12099f
Author: Alexander Barkov <bar@xxxxxxxxxxx>
Date:   Mon Apr 4 14:03:42 2016 +0400

    MDEV-9823 LOAD DATA INFILE silently truncates incomplete byte sequences

diff --git a/mysql-test/r/ctype_eucjpms.result b/mysql-test/r/ctype_eucjpms.result
index f9cb4f1..8d4d8f6 100644
--- a/mysql-test/r/ctype_eucjpms.result
+++ b/mysql-test/r/ctype_eucjpms.result
@@ -33913,3 +33913,24 @@ DROP TABLE t1;
 #
 # End of 10.1 tests
 #
+#
+# End of 10.2 tests
+#
+#
+# MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis
+#
+CREATE TABLE t1 (a TEXT CHARACTER SET eucjpms);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9823.ujis.txt' INTO TABLE t1 CHARACTER SET eucjpms IGNORE 4 LINES;
+SELECT HEX(a) FROM t1;
+HEX(a)
+3F
+78787831
+3F3F
+78787832
+8FA1A1
+78787833
+3F3F
+DROP TABLE t1;
+#
+# End of 10.2 tests
+#
diff --git a/mysql-test/r/ctype_ujis.result b/mysql-test/r/ctype_ujis.result
index 61541ec..5eb9a3e 100644
--- a/mysql-test/r/ctype_ujis.result
+++ b/mysql-test/r/ctype_ujis.result
@@ -26218,3 +26218,24 @@ DROP TABLE t1;
 #
 # End of 10.1 tests
 #
+#
+# End of 10.2 tests
+#
+#
+# MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis
+#
+CREATE TABLE t1 (a TEXT CHARACTER SET ujis);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9823.ujis.txt' INTO TABLE t1 CHARACTER SET ujis IGNORE 4 LINES;
+SELECT HEX(a) FROM t1;
+HEX(a)
+3F
+78787831
+3F3F
+78787832
+8FA1A1
+78787833
+3F3F
+DROP TABLE t1;
+#
+# End of 10.2 tests
+#
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index f52e08a..af85841 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -10426,5 +10426,27 @@ b
 c
 DROP TABLE t1;
 #
+# MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis
+#
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9823.utf8mb4.txt' INTO TABLE t1 CHARACTER SET utf8 IGNORE 4 LINES;
+Warnings:
+Warning	1366	Incorrect string value: '\xD0' for column 'a' at row 1
+Warning	1366	Incorrect string value: '\xE1\x80' for column 'a' at row 3
+Warning	1366	Incorrect string value: '\xF0\x9F\x98' for column 'a' at row 5
+Warning	1366	Incorrect string value: '\xF0\x9F\x98\x8E' for column 'a' at row 7
+Warning	1366	Incorrect string value: '\xF0\x9F\x98' for column 'a' at row 8
+SELECT HEX(a) FROM t1;
+HEX(a)
+3F
+78787831
+3F3F
+78787832
+3F3F3F
+78787833
+3F3F3F3F
+3F3F3F
+DROP TABLE t1;
+#
 # End of 10.2 tests
 #
diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result
index 10d77ae..558aba9 100644
--- a/mysql-test/r/ctype_utf8mb4.result
+++ b/mysql-test/r/ctype_utf8mb4.result
@@ -3398,3 +3398,30 @@ DROP FUNCTION f1;
 #
 # End of 10.1 tests
 #
+#
+# End of 10.2 tests
+#
+#
+# MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis
+#
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9823.utf8mb4.txt' INTO TABLE t1 CHARACTER SET utf8mb4 IGNORE 4 LINES;
+Warnings:
+Warning	1366	Incorrect string value: '\xD0' for column 'a' at row 1
+Warning	1366	Incorrect string value: '\xE1\x80' for column 'a' at row 3
+Warning	1366	Incorrect string value: '\xF0\x9F\x98' for column 'a' at row 5
+Warning	1366	Incorrect string value: '\xF0\x9F\x98' for column 'a' at row 8
+SELECT HEX(a) FROM t1;
+HEX(a)
+3F
+78787831
+3F3F
+78787832
+3F3F3F
+78787833
+F09F988E
+3F3F3F
+DROP TABLE t1;
+#
+# End of 10.2 tests
+#
diff --git a/mysql-test/std_data/loaddata/mdev9823.ujis.txt b/mysql-test/std_data/loaddata/mdev9823.ujis.txt
new file mode 100644
index 0000000..5468c99
--- /dev/null
+++ b/mysql-test/std_data/loaddata/mdev9823.ujis.txt
@@ -0,0 +1,11 @@
+# This file has incomplete UJIS sequences {8F}, {8FA1},
+# has a valid UJIS sequence {8FA1A1},
+# and has no NL at the end:
+# {8F} \n xxx1 {8FA1} \n xxx2 {8FA1A1} \n xxx3 \n {8FA1} EOF
+
+xxx1
+¡
+xxx2
+¡¡
+xxx3
+¡
\ No newline at end of file
diff --git a/mysql-test/std_data/loaddata/mdev9823.utf8mb4.txt b/mysql-test/std_data/loaddata/mdev9823.utf8mb4.txt
new file mode 100644
index 0000000..8773956
--- /dev/null
+++ b/mysql-test/std_data/loaddata/mdev9823.utf8mb4.txt
@@ -0,0 +1,12 @@
+# This file has incomplete utf8mb4 sequences {D0}, {E180}, {F09F98},
+# has a valid utf8mb4 sequence {F09F988E}
+# and has no NL at the end:
+# {D0} \n xxx1 {E180} xxx2 \n {F09F98} \n xxx3 {F09F988E} {F09F98} EOF
+Ð
+xxx1
+á€
+xxx2
+ðŸ˜
+xxx3
+ðŸ˜Ž
+ðŸ˜
\ No newline at end of file
diff --git a/mysql-test/t/ctype_eucjpms.test b/mysql-test/t/ctype_eucjpms.test
index d533e38..b5bd92d 100644
--- a/mysql-test/t/ctype_eucjpms.test
+++ b/mysql-test/t/ctype_eucjpms.test
@@ -566,3 +566,19 @@ DROP TABLE t1;
 --echo #
 --echo # End of 10.1 tests
 --echo #
+
+--echo #
+--echo # End of 10.2 tests
+--echo #
+
+--echo #
+--echo # MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis
+--echo #
+CREATE TABLE t1 (a TEXT CHARACTER SET eucjpms);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9823.ujis.txt' INTO TABLE t1 CHARACTER SET eucjpms IGNORE 4 LINES;
+SELECT HEX(a) FROM t1;
+DROP TABLE t1;
+
+--echo #
+--echo # End of 10.2 tests
+--echo #
diff --git a/mysql-test/t/ctype_ujis.test b/mysql-test/t/ctype_ujis.test
index 3f44458..db85585 100644
--- a/mysql-test/t/ctype_ujis.test
+++ b/mysql-test/t/ctype_ujis.test
@@ -1396,3 +1396,20 @@ SELECT HEX(a) FROM t1 ORDER BY a;DROP TABLE t1;
 --echo #
 --echo # End of 10.1 tests
 --echo #
+
+
+--echo #
+--echo # End of 10.2 tests
+--echo #
+
+--echo #
+--echo # MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis
+--echo #
+CREATE TABLE t1 (a TEXT CHARACTER SET ujis);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9823.ujis.txt' INTO TABLE t1 CHARACTER SET ujis IGNORE 4 LINES;
+SELECT HEX(a) FROM t1;
+DROP TABLE t1;
+
+--echo #
+--echo # End of 10.2 tests
+--echo #
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index f3a9e63..edf66f8 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -1967,5 +1967,13 @@ SELECT c1 FROM t1;
 DROP TABLE t1;
 
 --echo #
+--echo # MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis
+--echo #
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9823.utf8mb4.txt' INTO TABLE t1 CHARACTER SET utf8 IGNORE 4 LINES;
+SELECT HEX(a) FROM t1;
+DROP TABLE t1;
+
+--echo #
 --echo # End of 10.2 tests
 --echo #
diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test
index 2fe9b5e..74e39a8 100644
--- a/mysql-test/t/ctype_utf8mb4.test
+++ b/mysql-test/t/ctype_utf8mb4.test
@@ -1919,3 +1919,20 @@ DROP FUNCTION f1;
 --echo #
 --echo # End of 10.1 tests
 --echo #
+
+
+--echo #
+--echo # End of 10.2 tests
+--echo #
+
+--echo #
+--echo # MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis
+--echo #
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9823.utf8mb4.txt' INTO TABLE t1 CHARACTER SET utf8mb4 IGNORE 4 LINES;
+SELECT HEX(a) FROM t1;
+DROP TABLE t1;
+
+--echo #
+--echo # End of 10.2 tests
+--echo #
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index e2d579b..a2cc7e0 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -119,6 +119,64 @@ class READ_INFO {
     *to= chr;
     return false;
   }
+
+  /**
+    Read a tail of a multi-byte character.
+    The first byte is assumed to be already read from the file
+    and appended to "data".
+
+    @returns  true  - if EOF happened unexpectedly
+    @returns  false - no EOF happened: found a good multi-byte character,
+                                       or a bad byte sequence
+
+    Note:
+    read_mbtail() returns "false" if an incomplete byte sequence was found.
+    For example, suppose we have an ujis file with bytes 0x8FA10A, where:
+    - 0x8FA1 is an incomplete prefix of a 3-byte character
+      (it should be [8F][A1-FE][A1-FE] to make a full 3-byte character)
+    - 0x0A is a line delimiter
+    This file has some broken data, the trailing 0xA1 got lost for some reason.
+
+    In this example it will work as follows:
+    - 0x8F is read from the file and put into "data" before the call
+      to read_mbtail()
+    - 0xA1 is read from the file and put into "data" by read_mbtail()
+    - 0x0A is kept in the read queue, so the next read iteration after
+      the current read_mbtail() call will normally find it and recognize it as
+      a line delimiter
+    - the current call to read_mbtail() returns "false",
+      because no EOF happened
+  */
+  bool read_mbtail(String *data)
+  {
+    DBUG_ENTER("READ_INFO::read_mbtail");
+    int chlen;
+    if ((chlen= my_charlen(read_charset, data->end() - 1,
+                                         data->end())) != 1)
+    {
+      for (uint32 length0= data->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); )
+      {
+        int chr= GET;
+        if (chr == my_b_EOF)
+          DBUG_RETURN(true);
+        data->append(chr);
+        chlen= my_charlen(read_charset, data->ptr() + length0, data->end());
+        if (chlen == MY_CS_ILSEQ)
+        {
+          /**
+            It has been an incomplete (but a valid) sequence so far,
+            but the last byte turned it into a bad byte sequence.
+            Unget the very last byte.
+          */
+          data->length(data->length() - 1);
+          PUSH(chr);
+          break;
+        }
+      }
+    }
+    DBUG_RETURN(false);
+  }
+
 public:
   bool error,line_cuted,found_null,enclosed;
   uchar	*row_start,			/* Found row starts here */
@@ -1589,38 +1647,9 @@ int READ_INFO::read_field()
 	  return 0;
 	}
       }
-#ifdef USE_MB
-      if (my_mbcharlen(read_charset, chr) > 1)
-      {
-        uint32 length0= data.length();
-        int ml= my_mbcharlen(read_charset, chr);
-        data.append(chr);
-
-        for (int i= 1; i < ml; i++)
-        {
-          chr= GET;
-          if (chr == my_b_EOF)
-          {
-            /*
-             Need to back up the bytes already ready from illformed
-             multi-byte char 
-            */
-            data.length(length0);
-            goto found_eof;
-          }
-          data.append(chr);
-        }
-        if (my_ismbchar(read_charset,
-                        (const char *) data.ptr() + length0,
-                        (const char *) data.end()))
-          continue;
-        for (int i= 0; i < ml; i++)
-          PUSH(data.end()[-1 - i]);
-        data.length(length0);
-        chr= GET;
-      }
-#endif
       data.append(chr);
+      if (use_mb(read_charset) && read_mbtail(&data))
+        goto found_eof;
     }
     /*
     ** We come here if buffer is too small. Enlarge it and continue
diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c
index 52494b7..469d3a5 100644
--- a/strings/ctype-eucjpms.c
+++ b/strings/ctype-eucjpms.c
@@ -199,6 +199,7 @@ static const uchar sort_order_eucjpms[]=
 #define IS_MB2_KATA(x,y)      (iseucjpms_ss2(x) && iskata(y))
 #define IS_MB2_CHAR(x,y)      (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y))
 #define IS_MB3_CHAR(x,y,z)    (iseucjpms_ss3(x) && IS_MB2_JIS(y,z))
+#define IS_MB_PREFIX2(x,y)    (iseucjpms_ss3(x) && iseucjpms(y))
 #define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
 
diff --git a/strings/ctype-mb.ic b/strings/ctype-mb.ic
index 6fc4d6e..2df9c9d 100644
--- a/strings/ctype-mb.ic
+++ b/strings/ctype-mb.ic
@@ -75,7 +75,13 @@ MY_FUNCTION_NAME(charlen)(CHARSET_INFO *cs __attribute__((unused)),
 
 #ifdef IS_MB3_CHAR
   if (b + 3 > e)
+  {
+#ifdef IS_MB_PREFIX2
+    if (!IS_MB_PREFIX2(b[0], b[1]))
+      return MY_CS_ILSEQ;
+#endif
     return MY_CS_TOOSMALLN(3);
+  }
   if (IS_MB3_CHAR(b[0], b[1], b[2]))
     return 3; /* Three-byte character */
 #endif
diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c
index 67e6890..b24fdb3 100644
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@@ -198,6 +198,7 @@ static const uchar sort_order_ujis[]=
 #define IS_MB2_KATA(x,y)      (isujis_ss2(x)    && iskata(y))
 #define IS_MB2_CHAR(x, y)     (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y))
 #define IS_MB3_CHAR(x, y, z)  (isujis_ss3(x)    && IS_MB2_JIS(y,z))
+#define IS_MB_PREFIX2(x,y)    (isujis_ss3(x)    && isujis(y))
 #define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"

Follow ups

Re: Please review MDEV-9823 LOAD DATA INFILE silently truncates incomplete byte sequences
From: Sergei Golubchik, 2016-04-05