maria-developers team mailing list archive
-
maria-developers team
-
Mailing list archive
-
Message #09105
MDEV-8844 Unreadable control characters printed as is in warnings
Hi Sergei,
Please review a patch for MDEV-8844.
Thanks!
diff --git a/include/m_ctype.h b/include/m_ctype.h
index a552226..b059258 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -582,6 +582,7 @@ struct charset_info_st
extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_bin;
extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_latin1;
extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_filename;
+extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_errmsg;
extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_utf8_general_ci;
extern struct charset_info_st my_charset_big5_bin;
diff --git a/mysql-test/r/ctype_latin1.result b/mysql-test/r/ctype_latin1.result
index 4847592..fce1a07 100644
--- a/mysql-test/r/ctype_latin1.result
+++ b/mysql-test/r/ctype_latin1.result
@@ -8181,5 +8181,44 @@ Warnings:
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c`,`test`.`t1`.`d` AS `d` from `test`.`t1` where ((coalesce(`test`.`t1`.`c`,0) = '3 ') and (coalesce(`test`.`t1`.`d`,0) = '3 '))
DROP TABLE t1;
#
+# MDEV-8844 Unreadable control characters printed as is in warnings
+#
+SET NAMES latin1;
+SELECT CAST(_latin1 0x610062 AS INT);
+CAST(_latin1 0x610062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\0000b'
+SELECT CAST(_latin1 0x610162 AS INT);
+CAST(_latin1 0x610162 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\0001b'
+SELECT CAST(_latin1 0x611F62 AS INT);
+CAST(_latin1 0x611F62 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\001Fb'
+SELECT CAST(_latin1 0x617F62 AS INT);
+CAST(_latin1 0x617F62 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\007Fb'
+SELECT CAST(_latin1 0x612062 AS INT);
+CAST(_latin1 0x612062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a b'
+SELECT CAST(_latin1 0x617E62 AS INT);
+CAST(_latin1 0x617E62 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a~b'
+SELECT CAST(_latin1 0x61FF62 AS INT);
+CAST(_latin1 0x61FF62 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'aÿb'
+#
# End of 10.1 tests
#
diff --git a/mysql-test/r/ctype_ucs.result b/mysql-test/r/ctype_ucs.result
index 5617431..82e4784 100644
--- a/mysql-test/r/ctype_ucs.result
+++ b/mysql-test/r/ctype_ucs.result
@@ -5649,5 +5649,38 @@ CAST(CONVERT('1IJ3' USING ucs2) AS SIGNED)
Warnings:
Warning 1292 Truncated incorrect INTEGER value: '1IJ3'
#
+# MDEV-8844 Unreadable control characters printed as is in warnings
+#
+SELECT CAST(_ucs2 0x006100000062 AS INT);
+CAST(_ucs2 0x006100000062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\0000b'
+SELECT CAST(_ucs2 0x006100010062 AS INT);
+CAST(_ucs2 0x006100010062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\0001b'
+SELECT CAST(_ucs2 0x0061D8000062 AS INT);
+CAST(_ucs2 0x0061D8000062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\D800b'
+SELECT CAST(_ucs2 0x0061DFFF0062 AS INT);
+CAST(_ucs2 0x0061DFFF0062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\DFFFb'
+SELECT CAST(_ucs2 0x0061D7000062 AS INT);
+CAST(_ucs2 0x0061D7000062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'aíb'
+SELECT CAST(_ucs2 0x0061E0030062 AS INT);
+CAST(_ucs2 0x0061E0030062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'aîb'
+#
# End of 10.1 tests
#
diff --git a/mysql-test/r/ctype_utf16.result b/mysql-test/r/ctype_utf16.result
index 3bd3725..303fa48 100644
--- a/mysql-test/r/ctype_utf16.result
+++ b/mysql-test/r/ctype_utf16.result
@@ -2199,5 +2199,14 @@ CAST(CONVERT('1IJ3' USING utf16) AS SIGNED)
Warnings:
Warning 1292 Truncated incorrect INTEGER value: '1IJ3'
#
+# MDEV-8844 Unreadable control characters printed as is in warnings
+#
+SET NAMES utf8;
+SELECT CAST(_utf16 0x0061D83DDE0E0062 AS INT);
+CAST(_utf16 0x0061D83DDE0E0062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a?b'
+#
# End of 10.1 tests
#
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index 66db7df..ab340e0 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -10213,5 +10213,69 @@ Warnings:
Note 1003 select `test`.`t1`.`c` AS `c` from `test`.`t1` where (`test`.`t1`.`c` = 'A')
DROP TABLE t1;
#
+# MDEV-8844 Unreadable control characters printed as is in warnings
+#
+SET NAMES utf8;
+SELECT CAST(_utf8 0x610062 AS INT);
+CAST(_utf8 0x610062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\0000b'
+SELECT CAST(_utf8 0x610162 AS INT);
+CAST(_utf8 0x610162 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\0001b'
+SELECT CAST(_utf8 0x611F62 AS INT);
+CAST(_utf8 0x611F62 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\001Fb'
+SELECT CAST(_utf8 0x617F62 AS INT);
+CAST(_utf8 0x617F62 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\007Fb'
+SELECT CAST(_utf8 0x61C28062 AS INT);
+CAST(_utf8 0x61C28062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\0080b'
+SELECT CAST(_utf8 0x61C29F62 AS INT);
+CAST(_utf8 0x61C29F62 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a\009Fb'
+SELECT CAST(_utf8 0x612062 AS INT);
+CAST(_utf8 0x612062 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a b'
+SELECT CAST(_utf8 0x617E62 AS INT);
+CAST(_utf8 0x617E62 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a~b'
+SELECT CAST(_utf8 0x61C2BF62 AS INT);
+CAST(_utf8 0x61C2BF62 AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'a¿b'
+SELECT CAST(_utf8 'ëëë' AS INT);
+CAST(_utf8 'ëëë' AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'ëëë'
+SELECT CAST(_utf8 'ÅÅÅ' AS INT);
+CAST(_utf8 'ÅÅÅ' AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'ÅÅÅ'
+SELECT CAST(_utf8 'ÑÑÑ' AS INT);
+CAST(_utf8 'ÑÑÑ' AS INT)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: 'ÑÑÑ'
+#
# End of 10.1 tests
#
diff --git a/mysql-test/t/ctype_latin1.test b/mysql-test/t/ctype_latin1.test
index a30c7ae..7478ca6 100644
--- a/mysql-test/t/ctype_latin1.test
+++ b/mysql-test/t/ctype_latin1.test
@@ -374,5 +374,19 @@ SELECT * FROM t1 WHERE COALESCE(c,0)='3 ' AND COALESCE(d,0)=COALESCE(c,0);
DROP TABLE t1;
--echo #
+--echo # MDEV-8844 Unreadable control characters printed as is in warnings
+--echo #
+SET NAMES latin1;
+# control
+SELECT CAST(_latin1 0x610062 AS INT);
+SELECT CAST(_latin1 0x610162 AS INT);
+SELECT CAST(_latin1 0x611F62 AS INT);
+SELECT CAST(_latin1 0x617F62 AS INT);
+# normal characters
+SELECT CAST(_latin1 0x612062 AS INT);
+SELECT CAST(_latin1 0x617E62 AS INT);
+SELECT CAST(_latin1 0x61FF62 AS INT);
+
+--echo #
--echo # End of 10.1 tests
--echo #
diff --git a/mysql-test/t/ctype_ucs.test b/mysql-test/t/ctype_ucs.test
index 2f48062..d6341fb 100644
--- a/mysql-test/t/ctype_ucs.test
+++ b/mysql-test/t/ctype_ucs.test
@@ -955,5 +955,18 @@ SET NAMES utf8;
SELECT CAST(CONVERT('1IJ3' USING ucs2) AS SIGNED);
--echo #
+--echo # MDEV-8844 Unreadable control characters printed as is in warnings
+--echo #
+# control
+SELECT CAST(_ucs2 0x006100000062 AS INT);
+SELECT CAST(_ucs2 0x006100010062 AS INT);
+# surrogate halves
+SELECT CAST(_ucs2 0x0061D8000062 AS INT);
+SELECT CAST(_ucs2 0x0061DFFF0062 AS INT);
+# normal characters
+SELECT CAST(_ucs2 0x0061D7000062 AS INT);
+SELECT CAST(_ucs2 0x0061E0030062 AS INT);
+
+--echo #
--echo # End of 10.1 tests
--echo #
diff --git a/mysql-test/t/ctype_utf16.test b/mysql-test/t/ctype_utf16.test
index bb7eb8c..9e15961 100644
--- a/mysql-test/t/ctype_utf16.test
+++ b/mysql-test/t/ctype_utf16.test
@@ -893,5 +893,14 @@ SELECT CAST(CONVERT('1IJ3' USING utf16) AS SIGNED);
--echo #
+--echo # MDEV-8844 Unreadable control characters printed as is in warnings
+--echo #
+SET NAMES utf8;
+# Make sure surrogate halves (when a part of a full utf16 character)
+# are not escaped and the entire utf16 character consisting of two
+# surrogate halves is replaced with a single question mark.
+SELECT CAST(_utf16 0x0061D83DDE0E0062 AS INT);
+
+--echo #
--echo # End of 10.1 tests
--echo #
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index 639f6d4..ac7bc95 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -1843,6 +1843,26 @@ EXPLAIN EXTENDED
SELECT * FROM t1 WHERE c>=_utf8'a' COLLATE utf8_general_ci AND c='A';
DROP TABLE t1;
+--echo #
+--echo # MDEV-8844 Unreadable control characters printed as is in warnings
+--echo #
+SET NAMES utf8;
+# control, part1
+SELECT CAST(_utf8 0x610062 AS INT);
+SELECT CAST(_utf8 0x610162 AS INT);
+SELECT CAST(_utf8 0x611F62 AS INT);
+# control, part2: U+007F..U+009F
+SELECT CAST(_utf8 0x617F62 AS INT);
+SELECT CAST(_utf8 0x61C28062 AS INT);
+SELECT CAST(_utf8 0x61C29F62 AS INT);
+# normal characters
+SELECT CAST(_utf8 0x612062 AS INT);
+SELECT CAST(_utf8 0x617E62 AS INT);
+SELECT CAST(_utf8 0x61C2BF62 AS INT);
+SELECT CAST(_utf8 'ëëë' AS INT);
+SELECT CAST(_utf8 'ÅÅÅ' AS INT);
+SELECT CAST(_utf8 'ÑÑÑ' AS INT);
+
--echo #
--echo # End of 10.1 tests
diff --git a/sql/sql_error.cc b/sql/sql_error.cc
index b72d642..1ed3547 100644
--- a/sql/sql_error.cc
+++ b/sql/sql_error.cc
@@ -931,7 +931,7 @@ char *err_conv(char *buff, uint to_length, const char *from,
else
{
uint errors;
- res= copy_and_convert(to, to_length, system_charset_info,
+ res= copy_and_convert(to, to_length, &my_charset_errmsg,
from, from_length, from_cs, &errors);
to[res]= 0;
}
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 3c2c812..b1a7427 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -7953,3 +7953,163 @@ struct charset_info_st my_charset_utf8mb4_bin=
};
#endif /* HAVE_CHARSET_utf8mb4 */
+
+
+/**
+ A special version of utf8 for error handling.
+
+ In error messages let's use the SQL standard Unicode escape sequence
+ notation to display non-printable characters, which is:
+ 1. \hhhh (for BMP)
+ 2. \+hhhhhh (for non-BMP)
+
+ Note, non-BMP characters are replaced with QUESTION MARK.
+ Perhaps we need to change the DIAGNOSTICS-related code to use utf8mb4.
+
+ As non-BMP characters are not replaced with escape sequences for now,
+ we need only 5 bytes to display a non-printable character, e.g. "\007F".
+*/
+#define MY_CS_ERROR_MB_MAXLEN 5
+
+
+/**
+ Detect if a Unicode code point is printable (TAB, NL and CR count as printable).
+*/
+static inline my_bool
+my_is_printable(my_wc_t wc)
+{
+ /*
+ Blocks:
+ U+0000 .. U+001F control
+ U+0020 .. U+007E printable
+ U+007F .. U+009F control
+ U+00A0 .. U+00FF printable
+ U+0100 .. U+10FFFF As of Unicode-6.1.0, this range does not have any
+ characters of the "Cc" (Other, control) category.
+ Should be mostly safe to print.
+ Except for the surrogate halves,
+ which are encoding components, not real characters.
+ */
+ if (wc >= 0x20 && wc <= 0x7E) /* Quickly detect ASCII printable */
+ return TRUE;
+ if (wc <= 0x9F) /* The rest of U+0000..U+009F are control characters */
+ {
+ /* NL, CR, TAB are the only control characters passed through as is */
+ return (wc == '\r' || wc == '\n' || wc == '\t');
+ }
+ /*
+ Surrogate halves (when alone) print badly in gnome-terminal:
+ SELECT _ucs2 0xD800;
+ Let's escape them as well.
+ */
+ if (wc >= 0xD800 && wc <= 0xDFFF)
+ return FALSE;
+ return TRUE;
+}
+
+
+/**
+ Non-printable code points are written as the 5-byte escape \hhhh.
+ Printable code points are encoded as utf8 by my_uni_utf8().
+*/
+static int
+my_wc_mb_errmsg(CHARSET_INFO *cs, my_wc_t wc, uchar *r, uchar *e)
+{
+ if (my_is_printable(wc))
+ return my_uni_utf8(cs, wc, r, e);
+
+ if (r + MY_CS_ERROR_MB_MAXLEN > e) /* no room left for a full escape */
+ return MY_CS_TOOSMALLN(MY_CS_ERROR_MB_MAXLEN);
+ DBUG_ASSERT(wc < 0x10000); /* my_is_printable() rejects only BMP code points */
+ *r++= '\\';
+ *r++= _dig_vec_upper[(wc >> 12) & 0x0F];
+ *r++= _dig_vec_upper[(wc >> 8) & 0x0F];
+ *r++= _dig_vec_upper[(wc >> 4) & 0x0F];
+ *r++= _dig_vec_upper[wc & 0x0F];
+ return 5; /* bytes written: backslash + 4 hex digits */
+}
+
+
+/**
+ A version of utf8 handler for error messages.
+ It only has a special wc_mb() implementation,
+ and is equal to my_charset_utf8_handler otherwise.
+*/
+static MY_CHARSET_HANDLER my_charset_errmsg_handler=
+{
+ NULL, /* init */
+ my_ismbchar_utf8,
+ my_mbcharlen_utf8,
+ my_numchars_mb,
+ my_charpos_mb,
+ my_well_formed_len_utf8,
+ my_lengthsp_8bit,
+ my_numcells_mb,
+ my_utf8_uni,
+ my_wc_mb_errmsg, /* A special wc_mb() implementation */
+ my_mb_ctype_mb,
+ my_caseup_str_utf8,
+ my_casedn_str_utf8,
+ my_caseup_utf8,
+ my_casedn_utf8,
+ my_snprintf_8bit,
+ my_long10_to_str_8bit,
+ my_longlong10_to_str_8bit,
+ my_fill_8bit,
+ my_strntol_8bit,
+ my_strntoul_8bit,
+ my_strntoll_8bit,
+ my_strntoull_8bit,
+ my_strntod_8bit,
+ my_strtoll10_8bit,
+ my_strntoull10rnd_8bit,
+ my_scan_8bit,
+ my_charlen_utf8,
+ my_well_formed_char_length_utf8,
+ my_copy_fix_mb,
+ my_wc_mb_errmsg, /* NOTE(review): presumably the native_to_mb slot - confirm against MY_CHARSET_HANDLER */
+};
+
+
+/**
+ A special version of utf8:
+ - uses my_charset_errmsg_handler
+ - defines mbmaxlen as MY_CS_ERROR_MB_MAXLEN
+ - has the MY_CS_NONASCII flag to avoid optimizations in the conversion
+ routines which would bypass wc_mb().
+ - has unique charset and collation names, for easier debugging purposes.
+ Otherwise, equal to my_charset_utf8.
+ Note, as we don't expose it to the SQL level, it's ok to have the same ID=33.
+*/
+struct charset_info_st my_charset_errmsg=
+{
+ 33,0,0, /* number */
+ MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|
+ MY_CS_NONASCII, /* state */
+ "errmsg", /* cs name */
+ "errmsg", /* name */
+ "", /* comment */
+ NULL, /* tailoring */
+ ctype_utf8, /* ctype */
+ to_lower_utf8, /* to_lower */
+ to_upper_utf8, /* to_upper */
+ to_upper_utf8, /* sort_order */
+ NULL, /* uca */
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ &my_unicase_default, /* caseinfo */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 1, /* strxfrm_multiply */
+ 1, /* caseup_multiply */
+ 1, /* casedn_multiply */
+ 1, /* mbminlen */
+ MY_CS_ERROR_MB_MAXLEN, /* mbmaxlen */
+ 0, /* min_sort_char */
+ 0xFFFF, /* max_sort_char */
+ ' ', /* pad char */
+ 0, /* escape_with_backslash_is_dangerous */
+ 1, /* levels_for_order */
+ &my_charset_errmsg_handler,
+ &my_collation_utf8_general_ci_handler
+};
Follow ups