maria-developers team mailing list archive
-
maria-developers team
-
Mailing list archive
-
Message #09447
Please review MDEV-9811 and MDEV-9824 (dependencies for MDEV-6353)
Hi Sergei,
Please review a patch fixing MDEV-9811 and MDEV-9824.
They are prerequisites for:
MDEV-6353 my_ismbchar() and my_mbcharlen() refactoring
Thanks.
commit afe22406280634cb97ce4c0943db24033fb7d3bc
Author: Alexander Barkov <bar@xxxxxxxxxxx>
Date: Tue Mar 29 15:39:15 2016 +0400
MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
diff --git a/include/m_ctype.h b/include/m_ctype.h
index 615ee6a..5eb71b6 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -180,6 +180,10 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
/* A helper macros for "need at least n bytes" */
#define MY_CS_TOOSMALLN(n) (-100-(n))
+#define MY_CS_MBMAXLEN 6 /* Maximum supported mbmaxlen */
+#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL)
+
+
#define MY_SEQ_INTTAIL 1
#define MY_SEQ_SPACES 2
diff --git a/mysql-test/r/ctype_gbk.result b/mysql-test/r/ctype_gbk.result
index b577454..e454347 100644
--- a/mysql-test/r/ctype_gbk.result
+++ b/mysql-test/r/ctype_gbk.result
@@ -5926,3 +5926,24 @@ Warning 1300 Invalid gb2312 character string: '\xA3A'
#
# End of 10.1 tests
#
+#
+# Start of 10.2 tests
+#
+#
+# MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
+#
+CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk);
+LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@';
+SELECT HEX(a) FROM t1;
+HEX(a)
+B04061B041
+B042
+DELETE FROM t1;
+LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES;
+SELECT HEX(a) FROM t1;
+HEX(a)
+B042
+DROP TABLE t1;
+#
+# End of 10.2 tests
+#
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index 816fe65..f52e08a 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -10401,3 +10401,30 @@ SET @@SQL_MODE=default;
#
# End of 10.1 tests
#
+#
+# Start of 10.2 tests
+#
+#
+# MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
+#
+CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ÑÑ';
+Warnings:
+Warning 1638 Non-ASCII separator arguments are not fully supported
+SELECT c1 FROM t1;
+c1
+a
+b
+c
+DELETE FROM t1;
+LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ÑÑ' IGNORE 1 LINES;
+Warnings:
+Warning 1638 Non-ASCII separator arguments are not fully supported
+SELECT c1 FROM t1;
+c1
+b
+c
+DROP TABLE t1;
+#
+# End of 10.2 tests
+#
diff --git a/mysql-test/std_data/loaddata/mdev8711.txt b/mysql-test/std_data/loaddata/mdev8711.txt
new file mode 100644
index 0000000..49296a7
--- /dev/null
+++ b/mysql-test/std_data/loaddata/mdev8711.txt
@@ -0,0 +1 @@
+°@a°A@°B@
\ No newline at end of file
diff --git a/mysql-test/std_data/loaddata/mdev9824.txt b/mysql-test/std_data/loaddata/mdev9824.txt
new file mode 100644
index 0000000..7050e08
--- /dev/null
+++ b/mysql-test/std_data/loaddata/mdev9824.txt
@@ -0,0 +1 @@
+aÑÑbÑÑcÑÑ
\ No newline at end of file
diff --git a/mysql-test/t/ctype_gbk.test b/mysql-test/t/ctype_gbk.test
index 07e73cd..ae66dbb 100644
--- a/mysql-test/t/ctype_gbk.test
+++ b/mysql-test/t/ctype_gbk.test
@@ -435,3 +435,22 @@ SELECT HEX(CONVERT(CAST(0xA341 AS CHAR CHARACTER SET gb2312) USING utf8));
--echo #
--echo # End of 10.1 tests
--echo #
+
+--echo #
+--echo # Start of 10.2 tests
+--echo #
+
+--echo #
+--echo # MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
+--echo #
+CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk);
+LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@';
+SELECT HEX(a) FROM t1;
+DELETE FROM t1;
+LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES;
+SELECT HEX(a) FROM t1;
+DROP TABLE t1;
+
+--echo #
+--echo # End of 10.2 tests
+--echo #
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index 85ffed9..f3a9e63 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -1950,3 +1950,22 @@ SET @@SQL_MODE=default;
--echo #
--echo # End of 10.1 tests
--echo #
+
+--echo #
+--echo # Start of 10.2 tests
+--echo #
+
+--echo #
+--echo # MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
+--echo #
+CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8);
+LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ÑÑ';
+SELECT c1 FROM t1;
+DELETE FROM t1;
+LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ÑÑ' IGNORE 1 LINES;
+SELECT c1 FROM t1;
+DROP TABLE t1;
+
+--echo #
+--echo # End of 10.2 tests
+--echo #
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index d43eb88..f6104f1 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -1707,32 +1707,93 @@ int READ_INFO::next_line()
for (;;)
{
int chr = GET;
-#ifdef USE_MB
- if (my_mbcharlen(read_charset, chr) > 1)
- {
- for (uint i=1;
- chr != my_b_EOF && i<my_mbcharlen(read_charset, chr);
- i++)
- chr = GET;
- if (chr == escape_char)
- continue;
- }
-#endif
- if (chr == my_b_EOF)
- {
- eof=1;
+ if (chr == my_b_EOF)
+ {
+ eof= true;
return 1;
}
+ if (use_mb(read_charset))
+ {
+ char buf[MY_CS_MBMAXLEN];
+ buf[0]= chr;
+ for (uint i= 1; ; buf[i++]= chr)
+ {
+ DBUG_ASSERT(i < sizeof(buf));
+ int chlen= my_charlen(read_charset, buf, buf + i);
+ if (chlen == 1)
+ {
+ /*
+ A single byte character was found,
+ proceed to check escape_char and line_term_char.
+ */
+ DBUG_ASSERT(i == 1);
+ goto check_single_byte;
+ }
+ if (MY_CS_IS_TOOSMALL(chlen))
+ {
+ // buf[] is a prefix of a multi-byte character
+ chr= GET;
+ if (chr == my_b_EOF)
+ {
+ eof= true;
+ return 1;
+ }
+ continue; // Collect more bytes to buf[].
+ }
+ /*
+ Either a complete multi-byte sequence,
+ or a broken byte sequence was found.
+ Check if the sequence is a prefix of the "LINES TERMINATED BY"
+ string.
+ */
+ if ((uchar) buf[0] == line_term_char && i <= line_term_length &&
+ !memcmp(buf, line_term_ptr, i))
+ {
+ if (line_term_length == i)
+ {
+ /*
+ We found a "LINES TERMINATED BY" string that consists
+ of a single multi-byte character.
+ */
+ return 0;
+ }
+ /*
+ Our sequence is a prefix of "LINES TERMINATED BY".
+ Now check the suffix. Length of the suffix of line_term_ptr
+ that still needs to be checked is (line_term_length - i).
+ Note, READ_INFO::terminator() assumes that the leftmost byte of the
+ argument is already scanned from the file and is checked
+ (e.g. against line_term_char). So we need to pass one extra byte.
+ */
+ if (terminator(line_term_ptr + i - 1, line_term_length - i + 1))
+ return 0;
+ }
+ /*
+ Here we have a good multi-byte character or a broken byte sequence,
+ and the sequence is not equal to "LINES TERMINATED BY".
+ No need to check for escape_char, because:
+ - multi-byte escape characters in "FIELDS ESCAPED BY" are not
+ supported and are rejected at parse time.
+ - broken single-byte sequences are not recognized as escapes,
+ they are considered to be part of the data and are converted to
+ question marks.
+ */
+ goto fin;
+ }
+ DBUG_ASSERT(0); // Should not get to here
+ }
+check_single_byte:
if (chr == escape_char)
{
- line_cuted=1;
+ line_cuted= true;
if (GET == my_b_EOF)
- return 1;
+ return 1;
continue;
}
if (chr == line_term_char && terminator(line_term_ptr,line_term_length))
return 0;
- line_cuted=1;
+fin:
+ line_cuted= true;
}
}
Follow ups