← Back to team overview

maria-developers team mailing list archive

MDEV-8844 Unreadable control characters printed as is in warnings

 

Hi Sergei,

Please review a patch for MDEV-8844.

Thanks!

diff --git a/include/m_ctype.h b/include/m_ctype.h
index a552226..b059258 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -582,6 +582,7 @@ struct charset_info_st
 extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_bin;
 extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_latin1;
 extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_filename;
+extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_errmsg;
 extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_utf8_general_ci;
 
 extern struct charset_info_st my_charset_big5_bin;
diff --git a/mysql-test/r/ctype_latin1.result b/mysql-test/r/ctype_latin1.result
index 4847592..fce1a07 100644
--- a/mysql-test/r/ctype_latin1.result
+++ b/mysql-test/r/ctype_latin1.result
@@ -8181,5 +8181,44 @@ Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c`,`test`.`t1`.`d` AS `d` from `test`.`t1` where ((coalesce(`test`.`t1`.`c`,0) = '3 ') and (coalesce(`test`.`t1`.`d`,0) = '3 '))
 DROP TABLE t1;
 #
+# MDEV-8844 Unreadable control characters printed as is in warnings
+#
+SET NAMES latin1;
+SELECT CAST(_latin1 0x610062 AS INT);
+CAST(_latin1 0x610062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\0000b'
+SELECT CAST(_latin1 0x610162 AS INT);
+CAST(_latin1 0x610162 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\0001b'
+SELECT CAST(_latin1 0x611F62 AS INT);
+CAST(_latin1 0x611F62 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\001Fb'
+SELECT CAST(_latin1 0x617F62 AS INT);
+CAST(_latin1 0x617F62 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\007Fb'
+SELECT CAST(_latin1 0x612062 AS INT);
+CAST(_latin1 0x612062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a b'
+SELECT CAST(_latin1 0x617E62 AS INT);
+CAST(_latin1 0x617E62 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a~b'
+SELECT CAST(_latin1 0x61FF62 AS INT);
+CAST(_latin1 0x61FF62 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'aÿb'
+#
 # End of 10.1 tests
 #
diff --git a/mysql-test/r/ctype_ucs.result b/mysql-test/r/ctype_ucs.result
index 5617431..82e4784 100644
--- a/mysql-test/r/ctype_ucs.result
+++ b/mysql-test/r/ctype_ucs.result
@@ -5649,5 +5649,38 @@ CAST(CONVERT('1IJ3' USING ucs2) AS SIGNED)
 Warnings:
 Warning	1292	Truncated incorrect INTEGER value: '1IJ3'
 #
+# MDEV-8844 Unreadable control characters printed as is in warnings
+#
+SELECT CAST(_ucs2 0x006100000062 AS INT);
+CAST(_ucs2 0x006100000062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\0000b'
+SELECT CAST(_ucs2 0x006100010062 AS INT);
+CAST(_ucs2 0x006100010062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\0001b'
+SELECT CAST(_ucs2 0x0061D8000062 AS INT);
+CAST(_ucs2 0x0061D8000062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\D800b'
+SELECT CAST(_ucs2 0x0061DFFF0062 AS INT);
+CAST(_ucs2 0x0061DFFF0062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\DFFFb'
+SELECT CAST(_ucs2 0x0061D7000062 AS INT);
+CAST(_ucs2 0x0061D7000062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a휀b'
+SELECT CAST(_ucs2 0x0061E0030062 AS INT);
+CAST(_ucs2 0x0061E0030062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'ab'
+#
 # End of 10.1 tests
 #
diff --git a/mysql-test/r/ctype_utf16.result b/mysql-test/r/ctype_utf16.result
index 3bd3725..303fa48 100644
--- a/mysql-test/r/ctype_utf16.result
+++ b/mysql-test/r/ctype_utf16.result
@@ -2199,5 +2199,14 @@ CAST(CONVERT('1IJ3' USING utf16) AS SIGNED)
 Warnings:
 Warning	1292	Truncated incorrect INTEGER value: '1IJ3'
 #
+# MDEV-8844 Unreadable control characters printed as is in warnings
+#
+SET NAMES utf8;
+SELECT CAST(_utf16 0x0061D83DDE0E0062 AS INT);
+CAST(_utf16 0x0061D83DDE0E0062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a?b'
+#
 # End of 10.1 tests
 #
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index 66db7df..ab340e0 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -10213,5 +10213,69 @@ Warnings:
 Note	1003	select `test`.`t1`.`c` AS `c` from `test`.`t1` where (`test`.`t1`.`c` = 'A')
 DROP TABLE t1;
 #
+# MDEV-8844 Unreadable control characters printed as is in warnings
+#
+SET NAMES utf8;
+SELECT CAST(_utf8 0x610062 AS INT);
+CAST(_utf8 0x610062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\0000b'
+SELECT CAST(_utf8 0x610162 AS INT);
+CAST(_utf8 0x610162 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\0001b'
+SELECT CAST(_utf8 0x611F62 AS INT);
+CAST(_utf8 0x611F62 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\001Fb'
+SELECT CAST(_utf8 0x617F62 AS INT);
+CAST(_utf8 0x617F62 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\007Fb'
+SELECT CAST(_utf8 0x61C28062 AS INT);
+CAST(_utf8 0x61C28062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\0080b'
+SELECT CAST(_utf8 0x61C29F62 AS INT);
+CAST(_utf8 0x61C29F62 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a\009Fb'
+SELECT CAST(_utf8 0x612062 AS INT);
+CAST(_utf8 0x612062 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a b'
+SELECT CAST(_utf8 0x617E62 AS INT);
+CAST(_utf8 0x617E62 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a~b'
+SELECT CAST(_utf8 0x61C2BF62 AS INT);
+CAST(_utf8 0x61C2BF62 AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'a¿b'
+SELECT CAST(_utf8 'ëëë' AS INT);
+CAST(_utf8 'ëëë' AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'ëëë'
+SELECT CAST(_utf8 'Å“Å“Å“' AS INT);
+CAST(_utf8 'Å“Å“Å“' AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'Å“Å“Å“'
+SELECT CAST(_utf8 'яяя' AS INT);
+CAST(_utf8 'яяя' AS INT)
+0
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'яяя'
+#
 # End of 10.1 tests
 #
diff --git a/mysql-test/t/ctype_latin1.test b/mysql-test/t/ctype_latin1.test
index a30c7ae..7478ca6 100644
--- a/mysql-test/t/ctype_latin1.test
+++ b/mysql-test/t/ctype_latin1.test
@@ -374,5 +374,19 @@ SELECT * FROM t1 WHERE COALESCE(c,0)='3 ' AND COALESCE(d,0)=COALESCE(c,0);
 DROP TABLE t1;
 
 --echo #
+--echo # MDEV-8844 Unreadable control characters printed as is in warnings
+--echo #
+SET NAMES latin1;
+# control
+SELECT CAST(_latin1 0x610062 AS INT);
+SELECT CAST(_latin1 0x610162 AS INT);
+SELECT CAST(_latin1 0x611F62 AS INT);
+SELECT CAST(_latin1 0x617F62 AS INT);
+# normal characters
+SELECT CAST(_latin1 0x612062 AS INT);
+SELECT CAST(_latin1 0x617E62 AS INT);
+SELECT CAST(_latin1 0x61FF62 AS INT);
+
+--echo #
 --echo # End of 10.1 tests
 --echo #
diff --git a/mysql-test/t/ctype_ucs.test b/mysql-test/t/ctype_ucs.test
index 2f48062..d6341fb 100644
--- a/mysql-test/t/ctype_ucs.test
+++ b/mysql-test/t/ctype_ucs.test
@@ -955,5 +955,18 @@ SET NAMES utf8;
 SELECT CAST(CONVERT('1IJ3' USING ucs2) AS SIGNED);
 
 --echo #
+--echo # MDEV-8844 Unreadable control characters printed as is in warnings
+--echo #
+# control
+SELECT CAST(_ucs2 0x006100000062 AS INT);
+SELECT CAST(_ucs2 0x006100010062 AS INT);
+# surrogate halves
+SELECT CAST(_ucs2 0x0061D8000062 AS INT);
+SELECT CAST(_ucs2 0x0061DFFF0062 AS INT);
+# normal characters
+SELECT CAST(_ucs2 0x0061D7000062 AS INT);
+SELECT CAST(_ucs2 0x0061E0030062 AS INT);
+
+--echo #
 --echo # End of 10.1 tests
 --echo #
diff --git a/mysql-test/t/ctype_utf16.test b/mysql-test/t/ctype_utf16.test
index bb7eb8c..9e15961 100644
--- a/mysql-test/t/ctype_utf16.test
+++ b/mysql-test/t/ctype_utf16.test
@@ -893,5 +893,14 @@ SELECT CAST(CONVERT('1IJ3' USING utf16) AS SIGNED);
 
 
 --echo #
+--echo # MDEV-8844 Unreadable control characters printed as is in warnings
+--echo #
+SET NAMES utf8;
+# Make sure surrogate halves (when a part of a full utf16 character)
+# are not escaped and the entire utf16 character consisting of two
+# surrogate halves is replaced with a single question mark.
+SELECT CAST(_utf16 0x0061D83DDE0E0062 AS INT);
+
+--echo #
 --echo # End of 10.1 tests
 --echo #
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index 639f6d4..ac7bc95 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -1843,6 +1843,26 @@ EXPLAIN EXTENDED
 SELECT * FROM t1 WHERE c>=_utf8'a' COLLATE utf8_general_ci AND c='A';
 DROP TABLE t1;
 
+--echo #
+--echo # MDEV-8844 Unreadable control characters printed as is in warnings
+--echo #
+SET NAMES utf8;
+# control, part1
+SELECT CAST(_utf8 0x610062 AS INT);
+SELECT CAST(_utf8 0x610162 AS INT);
+SELECT CAST(_utf8 0x611F62 AS INT);
+# control, part2: U+007F..U+009F
+SELECT CAST(_utf8 0x617F62 AS INT);
+SELECT CAST(_utf8 0x61C28062 AS INT);
+SELECT CAST(_utf8 0x61C29F62 AS INT);
+# normal characters
+SELECT CAST(_utf8 0x612062 AS INT);
+SELECT CAST(_utf8 0x617E62 AS INT);
+SELECT CAST(_utf8 0x61C2BF62 AS INT);
+SELECT CAST(_utf8 'ëëë' AS INT);
+SELECT CAST(_utf8 'Å“Å“Å“' AS INT);
+SELECT CAST(_utf8 'яяя' AS INT);
+
 
 --echo #
 --echo # End of 10.1 tests
diff --git a/sql/sql_error.cc b/sql/sql_error.cc
index b72d642..1ed3547 100644
--- a/sql/sql_error.cc
+++ b/sql/sql_error.cc
@@ -931,7 +931,7 @@ char *err_conv(char *buff, uint to_length, const char *from,
   else
   {
     uint errors;
-    res= copy_and_convert(to, to_length, system_charset_info,
+    res= copy_and_convert(to, to_length, &my_charset_errmsg,
                           from, from_length, from_cs, &errors);
     to[res]= 0;
   }
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 3c2c812..b1a7427 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -7953,3 +7953,163 @@ struct charset_info_st my_charset_utf8mb4_bin=
 };
 
 #endif /* HAVE_CHARSET_utf8mb4 */
+
+
+/**
+  A special version of utf8 for error handling.
+
+  In error messages we use the SQL standard Unicode escape sequence
+  notation to display non-printable characters, which is:
+  1. \hhhh        (for BMP)
+  2. \+hhhhhh     (for non-BMP)
+
+  Note, non-BMP characters are currently replaced with QUESTION MARK.
+  Perhaps we need to change the DIAGNOSTICS-related code to use utf8mb4.
+
+  As non-BMP characters are not converted to escape sequences for now,
+  we need only 5 bytes to display a non-printable character, e.g. "\007F".
+*/
+#define MY_CS_ERROR_MB_MAXLEN 5
+
+
+/**
+  Detect if a Unicode code point wc is printable: TRUE if yes, FALSE if not.
+*/
+static inline my_bool
+my_is_printable(my_wc_t wc)
+{
+  /*
+    Blocks:
+      U+0000 .. U+001F     control
+      U+0020 .. U+007E     printable
+      U+007F .. U+009F     control
+      U+00A0 .. U+00FF     printable
+      U+0100 .. U+10FFFF   As of Unicode-6.1.0, this range does not have any
+                           characters of the "Cc" (Other, control) category.
+                           Should be mostly safe to print.
+                           Except for the surrogate halves,
+                           which are encoding components, not real characters.
+  */
+  if (wc >= 0x20 && wc <= 0x7E) /* Quickly detect ASCII printable */
+    return TRUE;
+  if (wc <= 0x9F)    /* The rest of U+0000..U+009F are control characters */
+  {
+    /* Of these control characters, only NL, CR and TAB count as printable */
+    return (wc == '\r' || wc == '\n' || wc == '\t');
+  }
+  /*
+    Surrogate halves (when alone) print badly in gnome-terminal:
+      SELECT _ucs2 0xD800;
+    Report them as non-printable, so that they get escaped as well.
+  */
+  if (wc >= 0xD800 && wc <= 0xDFFF)
+    return FALSE;
+  return TRUE;
+}
+
+
+/**
+  Non-printable code points are written to [r,e) as \hhhh (5 bytes, returns 5).
+  Printable code points are encoded by my_uni_utf8(), whose result is returned.
+*/
+static int
+my_wc_mb_errmsg(CHARSET_INFO *cs, my_wc_t wc, uchar *r, uchar *e)
+{
+  if (my_is_printable(wc))
+    return my_uni_utf8(cs, wc, r, e);
+
+  if (r + MY_CS_ERROR_MB_MAXLEN > e)  /* No room for "\hhhh" before e */
+    return MY_CS_TOOSMALLN(MY_CS_ERROR_MB_MAXLEN);
+  DBUG_ASSERT(wc < 0x10000);  /* Only four hex digits are emitted below */
+  *r++= '\\';
+  *r++= _dig_vec_upper[(wc >> 12) & 0x0F];  /* Most significant digit first */
+  *r++= _dig_vec_upper[(wc >> 8) & 0x0F];
+  *r++= _dig_vec_upper[(wc >> 4) & 0x0F];
+  *r++= _dig_vec_upper[wc & 0x0F];
+  return 5;  /* Bytes written: the backslash plus four hex digits */
+}
+
+
+/**
+  A version of utf8 handler for error messages.
+  It only has a special wc_mb() implementation,
+  and is equal to my_charset_utf8_handler otherwise.
+*/
+static MY_CHARSET_HANDLER my_charset_errmsg_handler=
+{
+  NULL,               /* init */
+  my_ismbchar_utf8,
+  my_mbcharlen_utf8,
+  my_numchars_mb,
+  my_charpos_mb,
+  my_well_formed_len_utf8,
+  my_lengthsp_8bit,
+  my_numcells_mb,
+  my_utf8_uni,
+  my_wc_mb_errmsg,         /* A special wc_mb() implementation */
+  my_mb_ctype_mb,
+  my_caseup_str_utf8,
+  my_casedn_str_utf8,
+  my_caseup_utf8,
+  my_casedn_utf8,
+  my_snprintf_8bit,
+  my_long10_to_str_8bit,
+  my_longlong10_to_str_8bit,
+  my_fill_8bit,
+  my_strntol_8bit,
+  my_strntoul_8bit,
+  my_strntoll_8bit,
+  my_strntoull_8bit,
+  my_strntod_8bit,
+  my_strtoll10_8bit,
+  my_strntoull10rnd_8bit,
+  my_scan_8bit,
+  my_charlen_utf8,
+  my_well_formed_char_length_utf8,
+  my_copy_fix_mb,
+  my_wc_mb_errmsg,  /* NOTE(review): escaping encoder reused in the last slot - confirm intended */
+};
+
+
+/**
+  A special version of utf8:
+  - uses my_charset_errmsg_handler
+  - defines mbmaxlen as MY_CS_ERROR_MB_MAXLEN
+  - has the MY_CS_NONASCII flag to avoid an optimization in the conversion
+    routines, which would go around wc_mb().
+  - has unique charset and collation names, for easier debugging purposes.
+  Otherwise, equal to my_charset_utf8.
+  Note, as we don't expose it to the SQL level, it's ok to have the same ID=33.
+*/
+struct charset_info_st my_charset_errmsg=
+{
+  33,0,0,                /* number       */
+  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|
+                 MY_CS_NONASCII,  /* state  */
+  "errmsg",              /* cs name      */
+  "errmsg",              /* name         */
+  "",                    /* comment      */
+  NULL,                  /* tailoring    */
+  ctype_utf8,            /* ctype        */
+  to_lower_utf8,         /* to_lower     */
+  to_upper_utf8,         /* to_upper     */
+  to_upper_utf8,         /* sort_order   */
+  NULL,                  /* uca          */
+  NULL,                  /* tab_to_uni   */
+  NULL,                  /* tab_from_uni */
+  &my_unicase_default,   /* caseinfo     */
+  NULL,                  /* state_map    */
+  NULL,                  /* ident_map    */
+  1,                     /* strxfrm_multiply */
+  1,                     /* caseup_multiply  */
+  1,                     /* casedn_multiply  */
+  1,                     /* mbminlen     */
+  MY_CS_ERROR_MB_MAXLEN, /* mbmaxlen     */
+  0,                     /* min_sort_char */
+  0xFFFF,                /* max_sort_char */
+  ' ',                   /* pad char      */
+  0,                     /* escape_with_backslash_is_dangerous */
+  1,                     /* levels_for_order   */
+  &my_charset_errmsg_handler,
+  &my_collation_utf8_general_ci_handler
+};

Follow ups