maria-developers team mailing list archive
-
maria-developers team
-
Mailing list archive
-
Message #09459
Re: Please review MDEV-9811 and MDEV-9824 (dependencies for MDEV-6353)
Hi, Alexander!
Looks ok to push. A couple of sugestions below.
On Mar 29, Alexander Barkov wrote:
> commit afe22406280634cb97ce4c0943db24033fb7d3bc
> Author: Alexander Barkov <bar@xxxxxxxxxxx>
> Date: Tue Mar 29 15:39:15 2016 +0400
>
> MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
> MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
>
> diff --git a/include/m_ctype.h b/include/m_ctype.h
> index 615ee6a..5eb71b6 100644
> --- a/include/m_ctype.h
> +++ b/include/m_ctype.h
> @@ -180,6 +180,10 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
> /* A helper macros for "need at least n bytes" */
> #define MY_CS_TOOSMALLN(n) (-100-(n))
>
> +#define MY_CS_MBMAXLEN 6 /* Maximum supported mbmaxlen */
I'd add in init_available_charsets():
DBUG_ASSERT(cs[0]->mbmaxlen <= MY_CS_MBMAXLEN);
> +#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL)
> +
> +
> #define MY_SEQ_INTTAIL 1
> #define MY_SEQ_SPACES 2
>
> diff --git a/sql/sql_load.cc b/sql/sql_load.cc
> index d43eb88..f6104f1 100644
> --- a/sql/sql_load.cc
> +++ b/sql/sql_load.cc
> @@ -1707,32 +1707,93 @@ int READ_INFO::next_line()
> for (;;)
> {
> int chr = GET;
> -#ifdef USE_MB
> - if (my_mbcharlen(read_charset, chr) > 1)
> - {
> - for (uint i=1;
> - chr != my_b_EOF && i<my_mbcharlen(read_charset, chr);
> - i++)
> - chr = GET;
> - if (chr == escape_char)
> - continue;
> - }
> -#endif
> if (chr == my_b_EOF)
> {
> eof= true;
> return 1;
> }
> + if (use_mb(read_charset))
> + {
> + char buf[MY_CS_MBMAXLEN];
> + buf[0]= chr;
> + for (uint i= 1; ; buf[i++]= chr)
> + {
> + DBUG_ASSERT(i < sizeof(buf));
> + int chlen= my_charlen(read_charset, buf, buf + i);
> + if (chlen == 1)
I would've moved it to the first if():
if (use_mb(read_charset) && my_charlen(read_charset, &chr, 1) != 1)
> + {
> + /*
> + A single byte character was found,
> + proceed to check escape_char and line_term_char.
> + */
> + DBUG_ASSERT(i == 1);
> + goto check_single_byte;
> + }
> + if (MY_CS_IS_TOOSMALL(chlen))
> + {
> + // buf[] is a prefix of a multi-byte character
> + chr= GET;
> + if (chr == my_b_EOF)
> + {
> + eof= true;
> + return 1;
> + }
> + continue; // Collect more bytes to buf[].
> + }
> + /*
> + Either a complete multi-byte sequence,
> + or a broken byte sequence was found.
> + Check if the sequence is a prefix of the "LINES TERMINATED BY"
> + string.
> + */
> + if ((uchar) buf[0] == line_term_char && i <= line_term_length &&
> + !memcmp(buf, line_term_ptr, i))
> + {
> + if (line_term_length == i)
> + {
> + /*
> + We found a "LINES TERMINATED BY" string that consists
> + of a single multi-byte character.
> + */
> + return 0;
> + }
> + /*
> + Our sequence is a prefix of "LINES TERMINATED BY".
> + Now check the suffix. Length of the suffix of line_term_ptr
> + that still needs to be checked is (line_term_length - i).
> + Note, READ_INFO::terminator() assumes that the leftmost byte of the
> + argument is already scanned from the file and is checked
> + (e.g. against line_term_char). So we need to pass one extra byte.
> + */
> + if (terminator(line_term_ptr + i - 1, line_term_length - i + 1))
> + return 0;
> + }
> + /*
> + Here we have a good multi-byte character or a broken byte sequence,
> + and the sequence is not equal to "LINES TERMINATED BY".
> + No needs to check for escape_char, because:
> + - multi-byte escape characters in "FIELDS ESCAPED BY" are not
> + supported and are rejected at parse time.
> + - broken single-byte sequences are not recognized as escapes,
> + they are considered to be part of the data and are converted to
> + question marks.
> + */
> + goto fin;
> + }
> + DBUG_ASSERT(0); // Should not get to here
> + }
> +check_single_byte:
> if (chr == escape_char)
> {
> line_cuted= true;
> if (GET == my_b_EOF)
> return 1;
> continue;
> }
> if (chr == line_term_char && terminator(line_term_ptr,line_term_length))
> return 0;
> +fin:
> line_cuted= true;
> }
> }
Regards,
Sergei
Chief Architect MariaDB
and security@xxxxxxxxxxx
References