maria-developers team mailing list archive

Thread
Date

Please review MDEV-9811 and MDEV-9824 (dependencies for MDEV-6353)

To: Sergei Golubchik <serg@xxxxxxxxxxx>, maria-developers <maria-developers@xxxxxxxxxxxxxxxxxxx>
From: Alexander Barkov <bar@xxxxxxxxxxx>
Date: Tue, 29 Mar 2016 15:44:48 +0400
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.6.0

Hi Sergei,

Please review a patch fixing MDEV-9811 and MDEV-9824.

They are prerequisites for:

MDEV-6353 my_ismbchar() and my_mbcharlen() refactoring

Thanks.

commit afe22406280634cb97ce4c0943db24033fb7d3bc
Author: Alexander Barkov <bar@xxxxxxxxxxx>
Date:   Tue Mar 29 15:39:15 2016 +0400

    MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
    MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified

diff --git a/include/m_ctype.h b/include/m_ctype.h
index 615ee6a..5eb71b6 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -180,6 +180,10 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
 /* A helper macros for "need at least n bytes" */
 #define MY_CS_TOOSMALLN(n)    (-100-(n))
 
+#define MY_CS_MBMAXLEN  6     /* Maximum supported mbmaxlen */
+#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL)
+
+
 #define MY_SEQ_INTTAIL	1
 #define MY_SEQ_SPACES	2
 
diff --git a/mysql-test/r/ctype_gbk.result b/mysql-test/r/ctype_gbk.result
index b577454..e454347 100644
--- a/mysql-test/r/ctype_gbk.result
+++ b/mysql-test/r/ctype_gbk.result
@@ -5926,3 +5926,24 @@ Warning	1300	Invalid gb2312 character string: '\xA3A'
 #
 # End of 10.1 tests
 #
+#
+# Start of 10.2 tests
+#
+#
+# MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
+#
+CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk);
+LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@';
+SELECT HEX(a) FROM t1;
+HEX(a)
+B04061B041
+B042
+DELETE FROM t1;
+LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES;
+SELECT HEX(a) FROM t1;
+HEX(a)
+B042
+DROP TABLE t1;
+#
+# End of 10.2 tests
+#
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index 816fe65..f52e08a 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -10401,3 +10401,30 @@ SET @@SQL_MODE=default;
 #
 # End of 10.1 tests
 #
+#
+# Start of 10.2 tests
+#
+#
+# MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
+#
+CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё';
+Warnings:
+Warning	1638	Non-ASCII separator arguments are not fully supported
+SELECT c1 FROM t1;
+c1
+a
+b
+c
+DELETE FROM t1;
+LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё' IGNORE 1 LINES;
+Warnings:
+Warning	1638	Non-ASCII separator arguments are not fully supported
+SELECT c1 FROM t1;
+c1
+b
+c
+DROP TABLE t1;
+#
+# End of 10.2 tests
+#
diff --git a/mysql-test/std_data/loaddata/mdev8711.txt b/mysql-test/std_data/loaddata/mdev8711.txt
new file mode 100644
index 0000000..49296a7
--- /dev/null
+++ b/mysql-test/std_data/loaddata/mdev8711.txt
@@ -0,0 +1 @@
+°@a°A@°B@
\ No newline at end of file
diff --git a/mysql-test/std_data/loaddata/mdev9824.txt b/mysql-test/std_data/loaddata/mdev9824.txt
new file mode 100644
index 0000000..7050e08
--- /dev/null
+++ b/mysql-test/std_data/loaddata/mdev9824.txt
@@ -0,0 +1 @@
+aÑ‘Ñ‘bÑ‘Ñ‘cÑ‘Ñ‘
\ No newline at end of file
diff --git a/mysql-test/t/ctype_gbk.test b/mysql-test/t/ctype_gbk.test
index 07e73cd..ae66dbb 100644
--- a/mysql-test/t/ctype_gbk.test
+++ b/mysql-test/t/ctype_gbk.test
@@ -435,3 +435,22 @@ SELECT HEX(CONVERT(CAST(0xA341 AS CHAR CHARACTER SET gb2312) USING utf8));
 --echo #
 --echo # End of 10.1 tests
 --echo #
+
+--echo #
+--echo # Start of 10.2 tests
+--echo #
+
+--echo #
+--echo # MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
+--echo #
+CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk);
+LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@';
+SELECT HEX(a) FROM t1;
+DELETE FROM t1;
+LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES;
+SELECT HEX(a) FROM t1;
+DROP TABLE t1;
+
+--echo #
+--echo # End of 10.2 tests
+--echo #
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index 85ffed9..f3a9e63 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -1950,3 +1950,22 @@ SET @@SQL_MODE=default;
 --echo #
 --echo # End of 10.1 tests
 --echo #
+
+--echo #
+--echo # Start of 10.2 tests
+--echo #
+
+--echo #
+--echo # MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
+--echo #
+CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё';
+SELECT c1 FROM t1;
+DELETE FROM t1;
+LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё' IGNORE 1 LINES;
+SELECT c1 FROM t1;
+DROP TABLE t1;
+
+--echo #
+--echo # End of 10.2 tests
+--echo #
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index d43eb88..f6104f1 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -1707,32 +1707,93 @@ int READ_INFO::next_line()
   for (;;)
   {
     int chr = GET;
-#ifdef USE_MB
-   if (my_mbcharlen(read_charset, chr) > 1)
-   {
-       for (uint i=1;
-            chr != my_b_EOF && i<my_mbcharlen(read_charset, chr);
-            i++)
-	   chr = GET;
-       if (chr == escape_char)
-	   continue;
-   }
-#endif
-   if (chr == my_b_EOF)
-   {
-      eof=1;
+    if (chr == my_b_EOF)
+    {
+      eof= true;
       return 1;
     }
+    if (use_mb(read_charset))
+    {
+      char buf[MY_CS_MBMAXLEN];
+      buf[0]= chr;
+      for (uint i= 1; ; buf[i++]= chr)
+      {
+        DBUG_ASSERT(i < sizeof(buf));
+        int chlen= my_charlen(read_charset, buf, buf + i);
+        if (chlen == 1)
+        {
+          /*
+            A single byte character was found,
+            proceed to check escape_char and line_term_char.
+          */
+          DBUG_ASSERT(i == 1);
+          goto check_single_byte;
+        }
+        if (MY_CS_IS_TOOSMALL(chlen))
+        {
+          // buf[] is a prefix of a multi-byte character
+          chr= GET;
+          if (chr == my_b_EOF)
+          {
+            eof= true;
+            return 1;
+          }
+          continue; // Collect more bytes to buf[].
+        }
+        /*
+          Either a complete multi-byte sequence,
+          or a broken byte sequence was found.
+          Check if the sequence is a prefix of the "LINES TERMINATED BY"
+          string.
+        */
+        if ((uchar) buf[0] == line_term_char && i <= line_term_length &&
+            !memcmp(buf, line_term_ptr, i))
+        {
+          if (line_term_length == i)
+          {
+            /*
+              We found a "LINES TERMINATED BY" string that consists
+              of a single multi-byte character.
+            */
+            return 0;
+          }
+          /*
+            Our sequence is a prefix of "LINES TERMINATED BY".
+            Now check the suffix. Length of the suffix of line_term_ptr
+            that still needs to be checked is (line_term_length - i).
+            Note, READ_INFO::terminator() assumes that the leftmost byte of the
+            argument is already scanned from the file and is checked
+            (e.g. against line_term_char). So we need to pass one extra byte.
+          */
+          if (terminator(line_term_ptr + i - 1, line_term_length - i + 1))
+            return 0;
+        }
+        /*
+          Here we have a good multi-byte character or a broken byte sequence,
+          and the sequence is not equal to "LINES TERMINATED BY".
+          No need to check for escape_char, because:
+          - multi-byte escape characters in "FIELDS ESCAPED BY" are not
+            supported and are rejected at parse time.
+          - broken single-byte sequences are not recognized as escapes,
+            they are considered to be part of the data and are converted to
+            question marks.
+        */
+        goto fin;
+      }
+      DBUG_ASSERT(0); // Should not get to here
+    }
+check_single_byte:
     if (chr == escape_char)
     {
-      line_cuted=1;
+      line_cuted= true;
       if (GET == my_b_EOF)
-	return 1;
+        return 1;
       continue;
     }
     if (chr == line_term_char && terminator(line_term_ptr,line_term_length))
       return 0;
-    line_cuted=1;
+fin:
+    line_cuted= true;
   }
 }

Follow ups

Re: Please review MDEV-9811 and MDEV-9824 (dependencies for MDEV-6353)
From: Sergei Golubchik, 2016-03-31