← Back to team overview

maria-developers team mailing list archive

Re: MDEV-7649 wrong result when comparing utf8 column with an invalid literal

 

Hi, Bar!

On Apr 07, Alexander Barkov wrote:
> Hi Sergei,
> 
> Please review a patch for MDEV-7649 for 5.5.
> It fixes the case #1 to return empty set.
> 
> (by treating bad bytes as greater than any valid character).
> 
> Thanks.

The fix is good.
A couple of comments about tests, see below.

> diff --git a/mysql-test/include/ctype_utf8_ilseq.inc b/mysql-test/include/ctype_utf8_ilseq.inc
> new file mode 100644
> index 0000000..700fcbf
> --- /dev/null
> +++ b/mysql-test/include/ctype_utf8_ilseq.inc
> @@ -0,0 +1,112 @@
> +#
> +# Compare a field to an utf8 string literal with illegal byte sequences
> +#
> +
> +--echo #
> +--echo # Start of ctype_utf8_ilseq.inc
> +--echo #
> +
> +--eval CREATE TABLE t1 ENGINE=$ENGINE AS SELECT REPEAT(' ', 60) AS ch LIMIT 0;
> +ALTER TABLE t1
> +  ADD id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
> +  ADD KEY(ch);
> +SHOW CREATE TABLE t1;
> +
> +INSERT INTO t1 (ch) VALUES ('admin');
> +SELECT ch FROM t1 WHERE ch='admin𝌆';
> +DELETE FROM t1;
> +INSERT INTO t1 (ch) VALUES ('a'), ('a?'), ('a??'), ('a???'), ('a????');
> +INSERT INTO t1 (ch) VALUES ('ab'),('a?b'),('a??b'),('a???b'),('a????b');
> +INSERT INTO t1 (ch) VALUES ('az'),('a?z'),('a??z'),('a???z'),('a????z');
> +INSERT INTO t1 (ch) VALUES ('z');
> +# LATIN SMALL LETTER A + LATIN CAPITAL LETTER E WITH GRAVE
> +INSERT INTO t1 (ch) VALUES (_utf8 0x61D080);
> +# LATIN SMALL LETTER A + ARMENIAN SMALL LETTER REH
> +INSERT INTO t1 (ch) VALUES (_utf8 0x61D680);
> +
> +SELECT ch FROM t1 IGNORE KEY (ch) WHERE ch='a𝌆' ORDER BY ch;
> +SELECT ch FROM t1 IGNORE KEY (ch) WHERE ch='a𝌆b' ORDER BY ch;
> +SELECT ch FROM t1 WHERE ch='a𝌆' ORDER BY ch;
> +SELECT ch FROM t1 WHERE ch='a𝌆b' ORDER BY ch;

would be good to have an explain or force key. otherwise we
don't know whether the key was used or not
(this applies everywhere)

> +
> +SELECT ch FROM t1 IGNORE KEY (ch) WHERE ch<'a𝌆' ORDER BY ch;
> +SELECT ch FROM t1 IGNORE KEY (ch) WHERE ch<'a𝌆b' ORDER BY ch;
> +SELECT ch FROM t1 WHERE ch<'a𝌆' ORDER BY ch;
> +SELECT ch FROM t1 WHERE ch<'a𝌆b' ORDER BY ch;
> +
> +SELECT ch FROM t1 IGNORE KEY (ch) WHERE ch>'a𝌆' ORDER BY ch;
> +SELECT ch FROM t1 IGNORE KEY (ch) WHERE ch>'a𝌆b' ORDER BY ch;
> +SELECT ch FROM t1 WHERE ch>'a𝌆' ORDER BY ch;
> +SELECT ch FROM t1 WHERE ch>'a𝌆b' ORDER BY ch;
> +
> +ALTER TABLE t1 DROP KEY ch;
> +
> +--echo # 0xD18F would be a good 2-byte character, 0xD1 is an incomplete sequence
> +SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xD1,'''');
> +PREPARE stmt FROM @query;
> +EXECUTE stmt;
> +SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xD1,'b''');
> +PREPARE stmt FROM @query;
> +EXECUTE stmt;
> +
> +#
> +# Non-equality comparison currently work differently depending on collation:
> +#
> +# - utf8_general_ci falls back to memcmp() on bad byte
> +# - utf8_unicode_ci treats bad bytes greater than any valid character
> +#
> +#  For example, these two characters:
> +#    _utf8 0xD080 (U+00C8 LATIN CAPITAL LETTER E WITH GRAVE)
> +#    _utf8 0xD680 (U+0580 ARMENIAN SMALL LETTER REH)
> +#
> +#  will give different results (depending on collation) when compared 
> +#  to an incomplete byte sequence 0xD1 (mb2head not followed by mb2tail).
> +#
> +# For utf8_general_ci the result depends on the valid side:
> +# - 0xD080 is smaller than 0xD1, because 0xD0 < 0xD1
> +# - 0xD680 is greater than 0xD1, because 0xD6 > 0xD1
> +#
> +# For utf8_unicode_ci the result does not depend on the valid side:
> +# - 0xD080 is smaller than 0xD1, because 0xD1 is greater than any valid character
> +# - 0xD680 is smaller than 0xD1, because 0xD1 is greater than any valid character
> +#
> +# utf8_general_ci should be eventually fixed to treat bad bytes greater
> +# than any valid character, similar to utf8_unicode_ci.
> +#
> +
> +SET @query=CONCAT('SELECT ch FROM t1 WHERE ch<''a', 0xD1,'''');
> +PREPARE stmt FROM @query;
> +EXECUTE stmt;
> +SET @query=CONCAT('SELECT ch FROM t1 WHERE ch>''a', 0xD1,'''');
> +PREPARE stmt FROM @query;
> +EXECUTE stmt;
> +
> +--echo # 0xEA9A96 would be a good 3-byte character, 0xEA9A is an incomplete sequence
> +SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xEA9A,'''');
> +PREPARE stmt FROM @query;
> +EXECUTE stmt;
> +SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xEA9A,'b''');
> +PREPARE stmt FROM @query;
> +EXECUTE stmt;
> +
> +--echo # 0x8F is a bad byte sequence (an mb2tail without mb2head)
> +SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0x8F,'''');
> +PREPARE stmt FROM @query;
> +EXECUTE stmt;
> +SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0x8F,'b''');
> +PREPARE stmt FROM @query;
> +EXECUTE stmt;
> +
> +--echo # 0x8F8F is a bad byte sequence (an mb2tail without mb2head, two times)
> +SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0x8F8F,'''');
> +PREPARE stmt FROM @query;
> +EXECUTE stmt;
> +SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0x8F8F,'b''');
> +PREPARE stmt FROM @query;
> +EXECUTE stmt;
> +
> +DROP TABLE t1;
> +
> +--echo #
> +--echo # End of ctype_utf8_ilseq.inc
> +--echo #
> diff --git a/mysql-test/r/ctype_uca.result b/mysql-test/r/ctype_uca.result
> index 1d5dd4e..974c097 100644
> --- a/mysql-test/r/ctype_uca.result
> +++ b/mysql-test/r/ctype_uca.result
> @@ -3216,5 +3216,604 @@ a	c1
>  10	=> DŽ
>  drop table t1;
>  #
> +# MDEV-7649 wrong result when comparing utf8 column with an invalid literal
> +#
> +SET NAMES utf8 COLLATE utf8_unicode_ci;
> +#
> +# Start of ctype_utf8_ilseq.inc
> +#
> +CREATE TABLE t1 ENGINE=InnoDB AS SELECT REPEAT(' ', 60) AS ch LIMIT 0;;
> +Warnings:
> +Warning	1286	Unknown storage engine 'InnoDB'
> +Warning	1266	Using storage engine MyISAM for table 't1'

Eh?

> +ALTER TABLE t1
> +ADD id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
> +ADD KEY(ch);
> +SHOW CREATE TABLE t1;
> +Table	Create Table
> +t1	CREATE TABLE `t1` (
> +  `ch` varchar(60) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL DEFAULT '',
> +  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
> +  PRIMARY KEY (`id`),
> +  KEY `ch` (`ch`)
> +) ENGINE=MyISAM DEFAULT CHARSET=latin1
> +INSERT INTO t1 (ch) VALUES ('admin');
...
> diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
> index 9a6dadb..d979592 100644
> --- a/mysql-test/r/ctype_utf8.result
> +++ b/mysql-test/r/ctype_utf8.result
> @@ -5134,5 +5134,603 @@ a	1024
>  Warnings:
>  Warning	1260	Row 2 was cut by GROUP_CONCAT()
>  #
> +# MDEV-7649 wrong result when comparing utf8 column with an invalid literal
> +#
> +SET NAMES utf8 COLLATE utf8_general_ci;
> +#
> +# Start of ctype_utf8_ilseq.inc
> +#
> +CREATE TABLE t1 ENGINE=InnoDB AS SELECT REPEAT(' ', 60) AS ch LIMIT 0;;
> +ALTER TABLE t1
> +ADD id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
> +ADD KEY(ch);
> +SHOW CREATE TABLE t1;
> +Table	Create Table
> +t1	CREATE TABLE `t1` (
> +  `ch` varchar(60) CHARACTER SET utf8 NOT NULL DEFAULT '',
> +  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
> +  PRIMARY KEY (`id`),
> +  KEY `ch` (`ch`)
> +) ENGINE=InnoDB DEFAULT CHARSET=latin1
> +INSERT INTO t1 (ch) VALUES ('admin');
> +SELECT ch FROM t1 WHERE ch='admin𝌆';
> +ch
> +Warnings:
> +Warning	1366	Incorrect string value: '\xF0\x9D\x8C\x86' for column 'ch' at row 1
> +DELETE FROM t1;
> +INSERT INTO t1 (ch) VALUES ('a'), ('a?'), ('a??'), ('a???'), ('a????');
> +INSERT INTO t1 (ch) VALUES ('ab'),('a?b'),('a??b'),('a???b'),('a????b');
> +INSERT INTO t1 (ch) VALUES ('az'),('a?z'),('a??z'),('a???z'),('a????z');
> +INSERT INTO t1 (ch) VALUES ('z');
> +INSERT INTO t1 (ch) VALUES (_utf8 0x61D080);
> +INSERT INTO t1 (ch) VALUES (_utf8 0x61D680);
> +SELECT ch FROM t1 IGNORE KEY (ch) WHERE ch='a𝌆' ORDER BY ch;
> +ch
> +SELECT ch FROM t1 IGNORE KEY (ch) WHERE ch='a𝌆b' ORDER BY ch;
> +ch
> +SELECT ch FROM t1 WHERE ch='a𝌆' ORDER BY ch;
> +ch
> +Warnings:
> +Warning	1366	Incorrect string value: '\xF0\x9D\x8C\x86' for column 'ch' at row 1
> +SELECT ch FROM t1 WHERE ch='a𝌆b' ORDER BY ch;
> +ch
> +Warnings:
> +Warning	1366	Incorrect string value: '\xF0\x9D\x8C\x86b' for column 'ch' at row 1
> +SELECT ch FROM t1 IGNORE KEY (ch) WHERE ch<'a𝌆' ORDER BY ch;
> +ch
> +a
> +a?
> +a??
> +a???
> +a????
> +a????b
> +a????z
> +a???b
> +a???z
> +a??b
> +a??z
> +a?b
> +a?z
> +ab
> +az
> +aЀ
> +aր

hm. seems that according to this test the bad character is larger than others
even in utf8_general_ci case. Why?

> +SELECT ch FROM t1 IGNORE KEY (ch) WHERE ch<'a𝌆b' ORDER BY ch;
> +ch
> +a
> +a?
> +a??
> +a???
> +a????
> +a????b
> +a????z
> +a???b
> +a???z
> +a??b
> +a??z
> +a?b
> +a?z
> +ab
> +az
> +aЀ
> +aր

Regards,
Sergei


References