← Back to team overview

maria-developers team mailing list archive

Re: MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.

 

  Hi Sergei,

On 02/18/2016 07:02 PM, Alexander Barkov wrote:
Hi Sergei,


On 02/18/2016 06:45 PM, Sergei Golubchik wrote:
Hi, Alexander!

On Feb 18, Alexander Barkov wrote:
Hi Sergei,

Please review a patch for MDEV-7231.

Sorry, I cannot understand what parts of this patch are fixing MDEV-7231
and what is fixing charset issue.

Could you please split this patch in two - one for MDEV-7231 and the
other for charsets?

Here's the first part, fixing only the problem reported in MDEV-7231.

I did not remove the test parts covering "SELECT _binary'...'"
and "SET NAMES binary" and just re-recorded the relevant parts
of ctype_utf8.result with misleading results.

The parts of ctype_utf8.result not related to binary are the same as in
the full version.

ctype_utf8mb4.result is also the same as in the full version.

Thanks.


Hmm. I'm not fixing any charset issues.
Which parts do you think are not related?

The problem was that the ROUTINE_DEFINITION:

- Did not escape special characters and quotes in text and binary
literals. Now it does.

- Also, it now converts some binary strings to X'hhhh' notation,
   when conversion to utf8 would produce confusing-looking
   constants. It's another side of the same problem.


Regards,
Sergei
Chief Architect MariaDB
and security@xxxxxxxxxxx

diff --git a/include/m_ctype.h b/include/m_ctype.h
index a552226..ee49e94 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -533,6 +533,7 @@ struct my_charset_handler_st
 
 extern MY_CHARSET_HANDLER my_charset_8bit_handler;
 extern MY_CHARSET_HANDLER my_charset_ucs2_handler;
+extern MY_CHARSET_HANDLER my_charset_utf8_handler;
 
 
 /*
@@ -889,6 +890,18 @@ uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
                   const char *from, uint32 from_length,
                   CHARSET_INFO *from_cs, uint *errors);
 
+/**
+  An extended version of my_convert(), to pass non-default mb_wc() and wc_mb().
+  For example, String::copy_printable() which is used in
+  Protocol::store_warning() uses this to escape control
+  and non-convertible characters.
+*/
+uint32 my_convert_using_func(char *to, uint32 to_length, CHARSET_INFO *to_cs,
+                             my_charset_conv_wc_mb wc_mb,
+                             const char *from, uint32 from_length,
+                             CHARSET_INFO *from_cs,
+                             my_charset_conv_mb_wc mb_wc,
+                             uint *errors);
 /*
   Convert a string between two character sets.
   Bad byte sequences as well as characters that cannot be
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index 90bc6b5..3420413 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -10259,5 +10259,169 @@ Warnings:
 Note	1003	select `test`.`t1`.`c` AS `c` from `test`.`t1` where (`test`.`t1`.`c` = 'A')
 DROP TABLE t1;
 #
+# MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+#
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '\'', "\"";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'tëst';
+SELECT _binary'test\0';
+SELECT _binary'tëst\0';
+SELECT N'''', N"""", N'\'', N"\"";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+ROUTINE_DEFINITION
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '''', """";
+SELECT '<tab>\t<tab>\t<tab>';
+SELECT '<nl>\n<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT N'''', N"""", N'''', N"""";
+SELECT N'<tab>\t<tab>\t<tab>';
+SELECT N'<nl>\n<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+body_utf8
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '''', """";
+SELECT '<tab>\t<tab>\t<tab>';
+SELECT '<nl>\n<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT N'''', N"""", N'''', N"""";
+SELECT N'<tab>\t<tab>\t<tab>';
+SELECT N'<nl>\n<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+DROP PROCEDURE p1;
+SET NAMES binary;
+CREATE FUNCTION f1() RETURNS TEXT RETURN CONCAT('i','й');
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1';
+ROUTINE_DEFINITION
+RETURN CONCAT('i','й')
+SELECT body_utf8 FROM mysql.proc WHERE name='f1';
+body_utf8
+RETURN CONCAT('i','й')
+DROP FUNCTION f1;
+SET NAMES utf8;
+SET @@SQL_MODE='NO_BACKSLASH_ESCAPES';
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'tëst';
+SELECT _binary'test\0';
+SELECT _binary'tëst\0';
+SELECT N'''', N"""";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+ROUTINE_DEFINITION
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT N'''', N"""";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+body_utf8
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT N'''', N"""";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+DROP PROCEDURE p1;
+SET @@SQL_MODE=default;
+#
 # End of 10.1 tests
 #
diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result
index ac53a7e..f969369 100644
--- a/mysql-test/r/ctype_utf8mb4.result
+++ b/mysql-test/r/ctype_utf8mb4.result
@@ -3382,5 +3382,19 @@ SET NAMES utf8mb4;
 SELECT * FROM `test😁😁test`;
 ERROR HY000: Invalid utf8mb4 character string: 'test\xF0\x9F\x98\x81\xF0\x9F\x98\x81test'
 #
+# MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+#
+SET NAMES utf8mb4;
+CREATE FUNCTION f1() RETURNS TEXT CHARACTER SET utf8mb4
+RETURN CONCAT('😎','x😎','😎y','x😎y');
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1';
+ROUTINE_DEFINITION
+RETURN CONCAT('?','x?','?y','x?y')
+SELECT body_utf8 FROM mysql.proc WHERE name='f1';
+body_utf8
+RETURN CONCAT('?','x?','?y','x?y')
+DROP FUNCTION f1;
+#
 # End of 10.1 tests
 #
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index 014194d..879629e0 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -1872,5 +1872,80 @@ DROP TABLE t1;
 
 
 --echo #
+--echo # MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+--echo #
+DELIMITER $$;
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '\'', "\"";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'tëst';
+SELECT _binary'test\0';
+SELECT _binary'tëst\0';
+SELECT N'''', N"""", N'\'', N"\"";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+DELIMITER ;$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+DROP PROCEDURE p1;
+
+SET NAMES binary;
+CREATE FUNCTION f1() RETURNS TEXT RETURN CONCAT('i','й');
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1';
+SELECT body_utf8 FROM mysql.proc WHERE name='f1';
+DROP FUNCTION f1;
+SET NAMES utf8;
+
+SET @@SQL_MODE='NO_BACKSLASH_ESCAPES';
+DELIMITER $$;
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'tëst';
+SELECT _binary'test\0';
+SELECT _binary'tëst\0';
+SELECT N'''', N"""";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+DELIMITER ;$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+DROP PROCEDURE p1;
+SET @@SQL_MODE=default;
+
+--echo #
 --echo # End of 10.1 tests
 --echo #
diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test
index 3f2b600..2fe9b5e 100644
--- a/mysql-test/t/ctype_utf8mb4.test
+++ b/mysql-test/t/ctype_utf8mb4.test
@@ -1905,5 +1905,17 @@ SET NAMES utf8mb4;
 SELECT * FROM `test😁😁test`;
 
 --echo #
+--echo # MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+--echo #
+# Non-BMP characters should be replaced with '?' in ROUTINE_DEFINITION/body_utf8
+SET NAMES utf8mb4;
+CREATE FUNCTION f1() RETURNS TEXT CHARACTER SET utf8mb4
+RETURN CONCAT('😎','x😎','😎y','x😎y');
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1';
+SELECT body_utf8 FROM mysql.proc WHERE name='f1';
+DROP FUNCTION f1;
+
+--echo #
 --echo # End of 10.1 tests
 --echo #
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index 898e3ae..231f67e 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -324,9 +324,7 @@ void Lex_input_stream::body_utf8_start(THD *thd, const char *begin_ptr)
   DBUG_ASSERT(begin_ptr);
   DBUG_ASSERT(m_cpp_buf <= begin_ptr && begin_ptr <= m_cpp_buf + m_buf_length);
 
-  uint body_utf8_length=
-    (m_buf_length / thd->variables.character_set_client->mbminlen) *
-    my_charset_utf8_bin.mbmaxlen;
+  uint body_utf8_length= get_body_utf8_maximum_length(thd);
 
   m_body_utf8= (char *) thd->alloc(body_utf8_length + 1);
   m_body_utf8_ptr= m_body_utf8;
@@ -335,6 +333,22 @@ void Lex_input_stream::body_utf8_start(THD *thd, const char *begin_ptr)
   m_cpp_utf8_processed_ptr= begin_ptr;
 }
 
+
+uint Lex_input_stream::get_body_utf8_maximum_length(THD *thd)
+{
+  /*
+    String literals can grow during escaping:
+    1a. Character string '<TAB>' can grow to '\t', 3 bytes to 4 bytes growth.
+    1b. Character string '1000 times <TAB>' grows from
+        1002 to 2002 bytes (including quotes), which gives a little bit
+        less than 2 times growth.
+    "2" should be a reasonable multiplier that safely covers escaping needs.
+  */
+  return (m_buf_length / thd->variables.character_set_client->mbminlen) *
+          my_charset_utf8_bin.mbmaxlen * 2/*for escaping*/;
+}
+
+
 /**
   @brief The operation appends unprocessed part of pre-processed buffer till
   the given pointer (ptr) and sets m_cpp_utf8_processed_ptr to end_ptr.
@@ -434,6 +448,196 @@ void Lex_input_stream::body_utf8_append_literal(THD *thd,
   m_cpp_utf8_processed_ptr= end_ptr;
 }
 
+
+
+
+extern "C" {
+
+/**
+  Escape a character. Consecutively puts "escape" and "wc" characters into
+  the destination utf8 string.
+  @param cs     - the character set (utf8)
+  @param escape - the escape character (backslash, single quote, double quote)
+  @param wc     - the character to be escaped
+  @param str    - the destination string
+  @param end    - the end of the destination string
+  @returns      - a code according to the wc_mb() convention.
+*/
+int my_wc_mb_utf8_with_escape(CHARSET_INFO *cs, my_wc_t escape, my_wc_t wc,
+                              uchar *str, uchar *end)
+{
+  DBUG_ASSERT(escape > 0);
+  if (str + 1 >= end)
+    return MY_CS_TOOSMALL2;  // Not enough space, need at least two bytes.
+  *str= escape;
+  int cnvres= my_charset_utf8_handler.wc_mb(cs, wc, str + 1, end);
+  if (cnvres > 0)
+    return cnvres + 1;       // The character was normally put
+  if (cnvres == MY_CS_ILUNI)
+    return MY_CS_ILUNI;      // Could not encode "wc" (e.g. non-BMP character)
+  DBUG_ASSERT(cnvres <= MY_CS_TOOSMALL);
+  return cnvres - 1;         // Not enough space
+}
+
+
+/**
+  Optionally escape a character.
+  If "escape" is non-zero, then both "escape" and "wc" are put to
+  the destination string. Otherwise, only "wc" is put.
+  @param cs     - the character set (utf8)
+  @param wc     - the character to be optionally escaped
+  @param escape - the escape character, or 0
+  @param ewc    - the escaped replacement of "wc" (e.g. 't' for '\t')
+  @param str    - the destination string
+  @param end    - the end of the destination string
+  @returns      - a code according to the wc_mb() convention.
+*/
+int my_wc_mb_utf8_opt_escape(CHARSET_INFO *cs,
+                             my_wc_t wc, my_wc_t escape, my_wc_t ewc,
+                             uchar *str, uchar *end)
+{
+  return escape ? my_wc_mb_utf8_with_escape(cs, escape, ewc, str, end) :
+                  my_charset_utf8_handler.wc_mb(cs, wc, str, end);
+}
+
+/**
+  Encode a character with optional backslash escaping and quote escaping.
+  Quote marks are escaped using another quote mark.
+  Additionally, if "escape" is non-zero, then special characters are
+  also escaped using "escape".
+  Otherwise (if "escape" is zero, e.g. in case of MODE_NO_BACKSLASH_ESCAPES),
+  then special characters are not escaped and handled as normal characters.
+
+  @param cs        - the character set (utf8)
+  @param wc        - the character to be encoded
+  @param str       - the destination string
+  @param end       - the end of the destination string
+  @param quotation - the string delimiter (e.g. ' or ")
+  @param escape    - the escape character (backslash, or 0)
+  @returns         - a code according to the wc_mb() convention.
+*/
+int my_wc_mb_utf8_escape(CHARSET_INFO *cs, my_wc_t wc, uchar *str, uchar *end,
+                         my_wc_t quotation, my_wc_t escape)
+{
+  DBUG_ASSERT(escape == 0 || escape == '\\');
+  switch (wc) {
+  case 0:      return my_wc_mb_utf8_opt_escape(cs, wc, escape, '0', str, end);
+  case '\t':   return my_wc_mb_utf8_opt_escape(cs, wc, escape, 't', str, end);
+  case '\r':   return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'r', str, end);
+  case '\n':   return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'n', str, end);
+  case '\032': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'Z', str, end);
+  case '\'':
+  case '\"':
+    if (wc == quotation)
+      return my_wc_mb_utf8_with_escape(cs, wc, wc, str, end);
+  }
+  return my_charset_utf8_handler.wc_mb(cs, wc, str, end); // No escaping needed
+}
+
+
+/** wc_mb() compatible routines for all sql_mode and delimiter combinations */
+int my_wc_mb_utf8_escape_single_quote_and_backslash(CHARSET_INFO *cs,
+                                                    my_wc_t wc,
+                                                    uchar *str, uchar *end)
+{
+  return my_wc_mb_utf8_escape(cs, wc, str, end, '\'', '\\');
+}
+
+
+int my_wc_mb_utf8_escape_double_quote_and_backslash(CHARSET_INFO *cs,
+                                                    my_wc_t wc,
+                                                    uchar *str, uchar *end)
+{
+  return my_wc_mb_utf8_escape(cs, wc, str, end, '"', '\\');
+}
+
+
+int my_wc_mb_utf8_escape_single_quote(CHARSET_INFO *cs, my_wc_t wc,
+                                      uchar *str, uchar *end)
+{
+  return my_wc_mb_utf8_escape(cs, wc, str, end, '\'', 0);
+}
+
+
+int my_wc_mb_utf8_escape_double_quote(CHARSET_INFO *cs, my_wc_t wc,
+                                      uchar *str, uchar *end)
+{
+  return my_wc_mb_utf8_escape(cs, wc, str, end, '"', 0);
+}
+
+}; // End of extern "C"
+
+
+/**
+  Get an escaping function, depending on the current sql_mode and the
+  string separator.
+*/
+my_charset_conv_wc_mb
+Lex_input_stream::get_escape_func(THD *thd, my_wc_t quotation) const
+{
+  return thd->backslash_escapes() ?
+         (quotation == '"' ? my_wc_mb_utf8_escape_double_quote_and_backslash:
+                             my_wc_mb_utf8_escape_single_quote_and_backslash) :
+         (quotation == '"' ? my_wc_mb_utf8_escape_double_quote:
+                             my_wc_mb_utf8_escape_single_quote);
+}
+
+
+/**
+  Append a text literal to the end of m_body_utf8.
+  The string is escaped according to the current sql_mode and the
+  string delimiter (e.g. ' or ").
+
+  @param thd       - current THD
+  @param txt       - the string to be appended to m_body_utf8.
+                     Note, the string must be already unescaped.
+  @param cs        - the character set of the string
+  @param end_ptr   - m_cpp_utf8_processed_ptr will be set to this value
+                     (see body_utf8_append_literal for details)
+  @param quotation - the string delimiter (single or double quotation)
+*/
+void Lex_input_stream::body_utf8_append_escape(THD *thd,
+                                               const LEX_STRING *txt,
+                                               CHARSET_INFO *cs,
+                                               const char *end_ptr,
+                                               my_wc_t quotation)
+{
+  DBUG_ASSERT(quotation == '\'' || quotation == '"');
+  if (!m_cpp_utf8_processed_ptr)
+    return;
+  /**
+    In case of "SET NAMES binary; SELECT 'aaa';" or "SELECT _binary'xxx';",
+    we reinterpret the string as utf8, as calling wc_mb() for "binary" is
+    not reliable.
+  */
+  if (cs == &my_charset_bin)
+    cs= &my_charset_utf8_general_ci;
+  uint errors;
+  /**
+    We previously allocated m_body_utf8 to be able to store the query with all
+    strings properly escaped. See get_body_utf8_maximum_length().
+    So here we are guaranteed enough space to append any string literal
+    with escaping. Passing txt->length*2 as "available space" should be good
+    enough.
+    For better safety purposes we could calculate get_body_utf8_maximum_length()
+    every time we append a string, but this would affect performance negatively,
+    so let's check that we don't get beyond the allocated buffer in
+    debug build only.
+  */
+  DBUG_ASSERT(m_body_utf8 + get_body_utf8_maximum_length(thd) >=
+              m_body_utf8_ptr + txt->length * 2);
+  uint32 cnv_length= my_convert_using_func(m_body_utf8_ptr, txt->length * 2,
+                                           &my_charset_utf8_general_ci,
+                                           get_escape_func(thd, quotation),
+                                           txt->str, txt->length,
+                                           cs, cs->cset->mb_wc,
+                                           &errors);
+  m_body_utf8_ptr+= cnv_length;
+  *m_body_utf8_ptr= 0;
+  m_cpp_utf8_processed_ptr= end_ptr;
+}
+
+
 void Lex_input_stream::add_digest_token(uint token, LEX_YYSTYPE yylval)
 {
   if (m_digest != NULL)
@@ -797,14 +1001,14 @@ Lex_input_stream::unescape(CHARSET_INFO *cs, char *to,
   Fix sometimes to do only one scan of the string
 */
 
-bool Lex_input_stream::get_text(LEX_STRING *dst, int pre_skip, int post_skip)
+bool Lex_input_stream::get_text(LEX_STRING *dst, uint sep,
+                                int pre_skip, int post_skip)
 {
-  reg1 uchar c,sep;
+  reg1 uchar c;
   uint found_escape=0;
   CHARSET_INFO *cs= m_thd->charset();
 
   tok_bitmap= 0;
-  sep= yyGetLast();                        // String should end with this
   while (! eof())
   {
     c= yyGet();
@@ -1169,6 +1373,8 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)
       return((int) c);
 
     case MY_LEX_IDENT_OR_NCHAR:
+    {
+      uint sep;
       if (lip->yyPeek() != '\'')
       {
 	state= MY_LEX_IDENT;
@@ -1176,14 +1382,20 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)
       }
       /* Found N'string' */
       lip->yySkip();                         // Skip '
-      if (lip->get_text(&yylval->lex_str, 2, 1))
+      if (lip->get_text(&yylval->lex_str, (sep= lip->yyGetLast()), 2, 1))
       {
 	state= MY_LEX_CHAR;             // Read char by char
 	break;
       }
+
+      lip->body_utf8_append(lip->m_cpp_text_start);
+      lip->body_utf8_append_escape(thd, &yylval->lex_str,
+                                   national_charset_info,
+                                   lip->m_cpp_text_end, sep);
+
       lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
       return(NCHAR_STRING);
-
+    }
     case MY_LEX_IDENT_OR_HEX:
       if (lip->yyPeek() == '\'')
       {					// Found x'hex-number'
@@ -1541,23 +1753,23 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)
       }
       /* " used for strings */
     case MY_LEX_STRING:			// Incomplete text string
-      if (lip->get_text(&yylval->lex_str, 1, 1))
+    {
+      uint sep;
+      if (lip->get_text(&yylval->lex_str, (sep= lip->yyGetLast()), 1, 1))
       {
 	state= MY_LEX_CHAR;		// Read char by char
 	break;
       }
-
+      CHARSET_INFO *strcs= lip->m_underscore_cs ? lip->m_underscore_cs : cs;
       lip->body_utf8_append(lip->m_cpp_text_start);
 
-      lip->body_utf8_append_literal(thd, &yylval->lex_str,
-        lip->m_underscore_cs ? lip->m_underscore_cs : cs,
-        lip->m_cpp_text_end);
-
+      lip->body_utf8_append_escape(thd, &yylval->lex_str, strcs,
+                                   lip->m_cpp_text_end, sep);
       lip->m_underscore_cs= NULL;
 
       lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
       return(TEXT_STRING);
-
+    }
     case MY_LEX_COMMENT:			//  Comment
       lex->select_lex.options|= OPTION_FOUND_COMMENT;
       while ((c = lip->yyGet()) != '\n' && c) ;
diff --git a/sql/sql_lex.h b/sql/sql_lex.h
index 644d1a1..af5c12d 100644
--- a/sql/sql_lex.h
+++ b/sql/sql_lex.h
@@ -1807,6 +1807,7 @@ class Lex_input_stream
 {
   size_t unescape(CHARSET_INFO *cs, char *to,
                   const char *str, const char *end, int sep);
+  my_charset_conv_wc_mb get_escape_func(THD *thd, my_wc_t quotation) const;
 public:
   Lex_input_stream()
   {
@@ -2077,6 +2078,12 @@ class Lex_input_stream
     return (uint) (m_body_utf8_ptr - m_body_utf8);
   }
 
+  /**
+    Get the maximum length of the utf8-body buffer.
+    The utf8 body can grow because of the character set conversion and escaping.
+  */
+  uint get_body_utf8_maximum_length(THD *thd);
+
   void body_utf8_start(THD *thd, const char *begin_ptr);
   void body_utf8_append(const char *ptr);
   void body_utf8_append(const char *ptr, const char *end_ptr);
@@ -2084,7 +2091,11 @@ class Lex_input_stream
                                 const LEX_STRING *txt,
                                 CHARSET_INFO *txt_cs,
                                 const char *end_ptr);
-
+  void body_utf8_append_escape(THD *thd,
+                               const LEX_STRING *txt,
+                               CHARSET_INFO *txt_cs,
+                               const char *end_ptr,
+                               my_wc_t quotation);
   /** Current thread. */
   THD *m_thd;
 
@@ -2105,7 +2116,7 @@ class Lex_input_stream
   /** LALR(2) resolution, value of the look ahead token.*/
   LEX_YYSTYPE lookahead_yylval;
 
-  bool get_text(LEX_STRING *to, int pre_skip, int post_skip);
+  bool get_text(LEX_STRING *to, uint sep, int pre_skip, int post_skip);
 
   void add_digest_token(uint token, LEX_YYSTYPE yylval);
 
diff --git a/strings/ctype.c b/strings/ctype.c
index f871a21..620c7e1 100644
--- a/strings/ctype.c
+++ b/strings/ctype.c
@@ -1030,19 +1030,18 @@ my_charset_is_ascii_compatible(CHARSET_INFO *cs)
   @return Number of bytes copied to 'to' string
 */
 
-static uint32
-my_convert_internal(char *to, uint32 to_length,
-                    CHARSET_INFO *to_cs,
-                    const char *from, uint32 from_length,
-                    CHARSET_INFO *from_cs, uint *errors)
+uint32
+my_convert_using_func(char *to, uint32 to_length,
+                      CHARSET_INFO *to_cs, my_charset_conv_wc_mb wc_mb,
+                      const char *from, uint32 from_length,
+                      CHARSET_INFO *from_cs, my_charset_conv_mb_wc mb_wc,
+                      uint *errors)
 {
   int         cnvres;
   my_wc_t     wc;
   const uchar *from_end= (const uchar*) from + from_length;
   char *to_start= to;
   uchar *to_end= (uchar*) to + to_length;
-  my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
-  my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
   uint error_count= 0;
 
   while (1)
@@ -1119,8 +1118,11 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
     immediately switch to slow mb_wc->wc_mb method.
   */
   if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
-    return my_convert_internal(to, to_length, to_cs,
-                               from, from_length, from_cs, errors);
+    return my_convert_using_func(to, to_length,
+                                 to_cs, to_cs->cset->wc_mb,
+                                 from, from_length,
+                                 from_cs, from_cs->cset->mb_wc,
+                                 errors);
 
   length= length2= MY_MIN(to_length, from_length);
 
@@ -1152,9 +1154,11 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
       uint32 copied_length= length2 - length;
       to_length-= copied_length;
       from_length-= copied_length;
-      return copied_length + my_convert_internal(to, to_length, to_cs,
-                                                 from, from_length, from_cs,
-                                                 errors);
+      return copied_length + my_convert_using_func(to, to_length, to_cs,
+                                                   to_cs->cset->wc_mb,
+                                                   from, from_length, from_cs,
+                                                   from_cs->cset->mb_wc,
+                                                   errors);
     }
   }
 

Follow ups

References