maria-developers team mailing list archive
-
maria-developers team
-
Mailing list archive
-
Message #09276
MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
Hi Sergei,
Please review a patch for MDEV-7231.
Thanks.
diff --git a/include/m_ctype.h b/include/m_ctype.h
index a552226..ee49e94 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -533,6 +533,7 @@ struct my_charset_handler_st
extern MY_CHARSET_HANDLER my_charset_8bit_handler;
extern MY_CHARSET_HANDLER my_charset_ucs2_handler;
+extern MY_CHARSET_HANDLER my_charset_utf8_handler;
/*
@@ -889,6 +890,18 @@ uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
const char *from, uint32 from_length,
CHARSET_INFO *from_cs, uint *errors);
+/**
+ An extended version of my_convert(), to pass non-default wc_mb() (paired
+ with "to_cs") and mb_wc() (paired with "from_cs"). For example,
+ String::copy_printable() which is used in Protocol::store_warning()
+ uses this to escape control and non-convertible characters.
+*/
+uint32 my_convert_using_func(char *to, uint32 to_length, CHARSET_INFO *to_cs,
+ my_charset_conv_wc_mb wc_mb,
+ const char *from, uint32 from_length,
+ CHARSET_INFO *from_cs,
+ my_charset_conv_mb_wc mb_wc,
+ uint *errors);
/*
Convert a string between two character sets.
Bad byte sequences as well as characters that cannot be
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index 90bc6b5..bf9e444 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -10259,5 +10259,169 @@ Warnings:
Note 1003 select `test`.`t1`.`c` AS `c` from `test`.`t1` where (`test`.`t1`.`c` = 'A')
DROP TABLE t1;
#
+# MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+#
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '\'', "\"";
+SELECT '<tab> <tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'tëst';
+SELECT _binary'test\0';
+SELECT _binary'tëst\0';
+SELECT N'''', N"""", N'\'', N"\"";
+SELECT N'<tab> <tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+ROUTINE_DEFINITION
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '''', """";
+SELECT '<tab>\t<tab>\t<tab>';
+SELECT '<nl>\n<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT X'74C3AB7374';
+SELECT 'test\0';
+SELECT X'74C3AB737400';
+SELECT N'''', N"""", N'''', N"""";
+SELECT N'<tab>\t<tab>\t<tab>';
+SELECT N'<nl>\n<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+body_utf8
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '''', """";
+SELECT '<tab>\t<tab>\t<tab>';
+SELECT '<nl>\n<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT X'74C3AB7374';
+SELECT 'test\0';
+SELECT X'74C3AB737400';
+SELECT N'''', N"""", N'''', N"""";
+SELECT N'<tab>\t<tab>\t<tab>';
+SELECT N'<nl>\n<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+DROP PROCEDURE p1;
+SET NAMES binary;
+CREATE FUNCTION f1() RETURNS TEXT RETURN CONCAT('i','й');
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1';
+ROUTINE_DEFINITION
+RETURN CONCAT('i',X'D0B9')
+SELECT body_utf8 FROM mysql.proc WHERE name='f1';
+body_utf8
+RETURN CONCAT('i',X'D0B9')
+DROP FUNCTION f1;
+SET NAMES utf8;
+SET @@SQL_MODE='NO_BACKSLASH_ESCAPES';
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab> <tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'tëst';
+SELECT _binary'test\0';
+SELECT _binary'tëst\0';
+SELECT N'''', N"""";
+SELECT N'<tab> <tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+ROUTINE_DEFINITION
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab> <tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT X'74C3AB7374';
+SELECT 'test\0';
+SELECT X'74C3AB73745C30';
+SELECT N'''', N"""";
+SELECT N'<tab> <tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+body_utf8
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab> <tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT X'74C3AB7374';
+SELECT 'test\0';
+SELECT X'74C3AB73745C30';
+SELECT N'''', N"""";
+SELECT N'<tab> <tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+DROP PROCEDURE p1;
+SET @@SQL_MODE=default;
+#
# End of 10.1 tests
#
diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result
index ac53a7e..f969369 100644
--- a/mysql-test/r/ctype_utf8mb4.result
+++ b/mysql-test/r/ctype_utf8mb4.result
@@ -3382,5 +3382,19 @@ SET NAMES utf8mb4;
SELECT * FROM `test😁😁test`;
ERROR HY000: Invalid utf8mb4 character string: 'test\xF0\x9F\x98\x81\xF0\x9F\x98\x81test'
#
+# MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+#
+SET NAMES utf8mb4;
+CREATE FUNCTION f1() RETURNS TEXT CHARACTER SET utf8mb4
+RETURN CONCAT('ð','xð','ðy','xðy');
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1';
+ROUTINE_DEFINITION
+RETURN CONCAT('?','x?','?y','x?y')
+SELECT body_utf8 FROM mysql.proc WHERE name='f1';
+body_utf8
+RETURN CONCAT('?','x?','?y','x?y')
+DROP FUNCTION f1;
+#
# End of 10.1 tests
#
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index 014194d..879629e0 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -1872,5 +1872,80 @@ DROP TABLE t1;
--echo #
+--echo # MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+--echo #
+DELIMITER $$;
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '\'', "\"";
+SELECT '<tab> <tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'tëst';
+SELECT _binary'test\0';
+SELECT _binary'tëst\0';
+SELECT N'''', N"""", N'\'', N"\"";
+SELECT N'<tab> <tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+DELIMITER ;$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+DROP PROCEDURE p1;
+
+SET NAMES binary;
+CREATE FUNCTION f1() RETURNS TEXT RETURN CONCAT('i','й');
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1';
+SELECT body_utf8 FROM mysql.proc WHERE name='f1';
+DROP FUNCTION f1;
+SET NAMES utf8;
+
+SET @@SQL_MODE='NO_BACKSLASH_ESCAPES';
+DELIMITER $$;
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab> <tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'tëst';
+SELECT _binary'test\0';
+SELECT _binary'tëst\0';
+SELECT N'''', N"""";
+SELECT N'<tab> <tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+DELIMITER ;$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+DROP PROCEDURE p1;
+SET @@SQL_MODE=default;
+
+--echo #
--echo # End of 10.1 tests
--echo #
diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test
index 3f2b600..2fe9b5e 100644
--- a/mysql-test/t/ctype_utf8mb4.test
+++ b/mysql-test/t/ctype_utf8mb4.test
@@ -1905,5 +1905,17 @@ SET NAMES utf8mb4;
SELECT * FROM `test😁😁test`;
--echo #
+--echo # MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+--echo #
+# Non-BMP characters should be replaced with '?' in ROUTINE_DEFINITION/body_utf8
+SET NAMES utf8mb4;
+CREATE FUNCTION f1() RETURNS TEXT CHARACTER SET utf8mb4
+RETURN CONCAT('ð','xð','ðy','xðy');
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1';
+SELECT body_utf8 FROM mysql.proc WHERE name='f1';
+DROP FUNCTION f1;
+
+--echo #
--echo # End of 10.1 tests
--echo #
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index 898e3ae..75df23a 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -324,9 +324,7 @@ void Lex_input_stream::body_utf8_start(THD *thd, const char *begin_ptr)
DBUG_ASSERT(begin_ptr);
DBUG_ASSERT(m_cpp_buf <= begin_ptr && begin_ptr <= m_cpp_buf + m_buf_length);
- uint body_utf8_length=
- (m_buf_length / thd->variables.character_set_client->mbminlen) *
- my_charset_utf8_bin.mbmaxlen;
+ uint body_utf8_length= get_body_utf8_maximum_length(thd);
m_body_utf8= (char *) thd->alloc(body_utf8_length + 1);
m_body_utf8_ptr= m_body_utf8;
@@ -335,6 +333,25 @@ void Lex_input_stream::body_utf8_start(THD *thd, const char *begin_ptr)
m_cpp_utf8_processed_ptr= begin_ptr;
}
+
+uint Lex_input_stream::get_body_utf8_maximum_length(THD *thd)
+{
+ /*
+ String literals can grow during escaping:
+ 1a. Binary string '<FF>' grows to X'FF', 3 bytes to 5 bytes growth.
+ 1b. Binary string '1000 times <FF>' grows to X'<1000 times "FF">', which
+ gives 1002 to 2003 growth, a little bit less than 2 times.
+ 2a. Character string '<TAB>' can grow to '\t', 3 bytes to 4 bytes growth.
+ 2b. Character string '1000 times <TAB>' grows from
+ 1002 to 2002 bytes (including quotes), which gives a little bit
+ less than 2 times growth.
+ "2" should be a reasonable multiplier that safely covers escaping needs.
+ */
+ return (m_buf_length / thd->variables.character_set_client->mbminlen) *
+ my_charset_utf8_bin.mbmaxlen * 2/*for escaping*/;
+}
+
+
/**
@brief The operation appends unprocessed part of pre-processed buffer till
the given pointer (ptr) and sets m_cpp_utf8_processed_ptr to end_ptr.
@@ -434,6 +451,220 @@ void Lex_input_stream::body_utf8_append_literal(THD *thd,
m_cpp_utf8_processed_ptr= end_ptr;
}
+
+
+
+extern "C" {
+
+/**
+ Escape a character. Consecutively puts "escape" and "wc" characters into
+ the destination utf8 string.
+ @param cs - the character set (utf8)
+ @param escape - the escape character (backslash, single quote, double quote)
+ @param wc - the character to be escaped
+ @param str - the destination string
+ @param end - the end of the destination string
+ @returns - a code according to the wc_mb() convention.
+*/
+int my_wc_mb_utf8_with_escape(CHARSET_INFO *cs, my_wc_t escape, my_wc_t wc,
+ uchar *str, uchar *end)
+{
+ DBUG_ASSERT(escape > 0);
+ if (str + 1 >= end)
+ return MY_CS_TOOSMALL2; // Not enough space, need at least two bytes.
+ *str= escape;
+ int cnvres= my_charset_utf8_handler.wc_mb(cs, wc, str + 1, end);
+ if (cnvres > 0)
+ return cnvres + 1; // Success: the "escape" byte plus the bytes of "wc"
+ if (cnvres == MY_CS_ILUNI)
+ return MY_CS_ILUNI; // Could not encode "wc" (e.g. non-BMP character)
+ DBUG_ASSERT(cnvres <= MY_CS_TOOSMALL);
+ return cnvres - 1; // Not enough space
+}
+
+
+/**
+ Optionally escape a character.
+ If "escape" is non-zero, then both "escape" and "ewc" are put to
+ the destination string. Otherwise, only "wc" is put.
+ @param cs - the character set (utf8)
+ @param wc - the character to be optionally escaped
+ @param escape - the escape character, or 0
+ @param ewc - the escaped replacement of "wc" (e.g. 't' for '\t')
+ @param str - the destination string
+ @param end - the end of the destination string
+ @returns - a code according to the wc_mb() convention.
+*/
+int my_wc_mb_utf8_opt_escape(CHARSET_INFO *cs,
+ my_wc_t wc, my_wc_t escape, my_wc_t ewc,
+ uchar *str, uchar *end)
+{
+ return escape ? my_wc_mb_utf8_with_escape(cs, escape, ewc, str, end) :
+ my_charset_utf8_handler.wc_mb(cs, wc, str, end);
+}
+
+/**
+ Encode a character with optional backslash escaping and quote escaping.
+ The quote mark equal to "quotation" is escaped using another quote mark.
+ Additionally, if "escape" is non-zero, then special characters (including
+ the backslash itself) are also escaped using "escape".
+ Otherwise (if "escape" is zero, e.g. in case of MODE_NO_BACKSLASH_ESCAPES),
+ then special characters are not escaped and handled as normal characters.
+ @param cs - the character set (utf8)
+ @param wc - the character to be encoded
+ @param str - the destination string
+ @param end - the end of the destination string
+ @param quotation - the string delimiter (e.g. ' or ")
+ @param escape - the escape character (backslash, or 0)
+ @returns - a code according to the wc_mb() convention.
+*/
+int my_wc_mb_utf8_escape(CHARSET_INFO *cs, my_wc_t wc, uchar *str, uchar *end,
+ my_wc_t quotation, my_wc_t escape)
+{
+ DBUG_ASSERT(escape == 0 || escape == '\\');
+ switch (wc) {
+ case 0: return my_wc_mb_utf8_opt_escape(cs, wc, escape, '0', str, end);
+ case '\t': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 't', str, end);
+ case '\r': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'r', str, end);
+ case '\n': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'n', str, end);
+ case '\032': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'Z', str, end);
+ case '\\': return my_wc_mb_utf8_opt_escape(cs, wc, escape, '\\', str, end);
+ case '\'':
+ case '\"':
+ if (wc == quotation)
+ return my_wc_mb_utf8_with_escape(cs, wc, wc, str, end);
+ }
+ return my_charset_utf8_handler.wc_mb(cs, wc, str, end); // No escaping needed
+}
+
+
+/** wc_mb() compatible routines for all sql_mode and delimiter combinations */
+int my_wc_mb_utf8_escape_single_quote_and_backslash(CHARSET_INFO *cs,
+ my_wc_t wc,
+ uchar *str, uchar *end)
+{
+ return my_wc_mb_utf8_escape(cs, wc, str, end, '\'', '\\');
+}
+
+
+int my_wc_mb_utf8_escape_double_quote_and_backslash(CHARSET_INFO *cs,
+ my_wc_t wc,
+ uchar *str, uchar *end)
+{
+ return my_wc_mb_utf8_escape(cs, wc, str, end, '"', '\\');
+}
+
+
+int my_wc_mb_utf8_escape_single_quote(CHARSET_INFO *cs, my_wc_t wc,
+ uchar *str, uchar *end)
+{
+ return my_wc_mb_utf8_escape(cs, wc, str, end, '\'', 0);
+}
+
+
+int my_wc_mb_utf8_escape_double_quote(CHARSET_INFO *cs, my_wc_t wc,
+ uchar *str, uchar *end)
+{
+ return my_wc_mb_utf8_escape(cs, wc, str, end, '"', 0);
+}
+
+}; // End of extern "C"
+
+
+/**
+ Get an escaping function, depending on the current sql_mode and the
+ string separator.
+*/
+my_charset_conv_wc_mb
+Lex_input_stream::get_escape_func(THD *thd, my_wc_t quotation) const
+{
+ return thd->backslash_escapes() ?
+ (quotation == '"' ? my_wc_mb_utf8_escape_double_quote_and_backslash:
+ my_wc_mb_utf8_escape_single_quote_and_backslash) :
+ (quotation == '"' ? my_wc_mb_utf8_escape_double_quote:
+ my_wc_mb_utf8_escape_single_quote);
+}
+
+
+/**
+ Append a text literal to the end of m_body_utf8.
+ The string is escaped according to the current sql_mode and the
+ string delimiter (e.g. ' or ").
+
+ @param thd - current THD
+ @param txt - the string to be appended to m_body_utf8.
+ Note, the string must be already unescaped.
+ @param cs - the character set of the string
+ @param end_ptr - m_cpp_utf8_processed_ptr will be set to this value
+ (see body_utf8_append_literal for details)
+ @param quotation - the string delimiter (single or double quotation)
+*/
+void Lex_input_stream::body_utf8_append_escape(THD *thd,
+ const LEX_STRING *txt,
+ CHARSET_INFO *cs,
+ const char *end_ptr,
+ my_wc_t quotation)
+{
+ DBUG_ASSERT(quotation == '\'' || quotation == '"');
+ if (!m_cpp_utf8_processed_ptr)
+ return;
+ /**
+ In case of "SET NAMES binary; SELECT 'aaa';" or "SELECT _binary'xxx';",
+ we reinterpret the string as utf8, as calling mb_wc() for "binary" is
+ not reliable.
+ */
+ if (cs == &my_charset_bin)
+ cs= &my_charset_utf8_general_ci;
+ uint errors;
+ /**
+ One source byte gives at most mbmaxlen(utf8) result bytes: either a
+ non-ASCII character (e.g. a 1-byte tis620 letter makes 3 bytes in utf8),
+ or an ASCII character with an escape (2 bytes). Passing txt->length*2
+ as "available space" would silently truncate literals that grow more.
+ m_body_utf8 was previously allocated to store the query with all strings
+ converted and escaped, see get_body_utf8_maximum_length(), so this much
+ space is expected to be available. To avoid extra work in release
+ builds, the buffer boundary is checked in debug build only.
+ */
+ size_t max_length= txt->length * my_charset_utf8_bin.mbmaxlen;
+ DBUG_ASSERT(m_body_utf8 + get_body_utf8_maximum_length(thd) >=
+ m_body_utf8_ptr + max_length);
+ uint32 cnv_length= my_convert_using_func(m_body_utf8_ptr, (uint32) max_length,
+ &my_charset_utf8_general_ci,
+ get_escape_func(thd, quotation),
+ txt->str, txt->length,
+ cs, cs->cset->mb_wc,
+ &errors);
+ m_body_utf8_ptr+= cnv_length;
+ *m_body_utf8_ptr= 0;
+ m_cpp_utf8_processed_ptr= end_ptr;
+}
+
+
+/**
+ Append a text literal to the end of m_body_utf8 using the X'hhhh' notation.
+
+ @param txt - the string to be appended to m_body_utf8.
+ @param end_ptr - m_cpp_utf8_processed_ptr will be set to this value
+ (see body_utf8_append_literal for details)
+*/
+void Lex_input_stream::body_utf8_append_hex_literal(const LEX_STRING *txt,
+ const char *end_ptr)
+{
+ if (!m_cpp_utf8_processed_ptr)
+ return;
+ // See the comment in body_utf8_append_escape()
+ DBUG_ASSERT(m_body_utf8 + get_body_utf8_maximum_length(m_thd) >=
+ m_body_utf8_ptr + txt->length * 2 + 3);
+ *m_body_utf8_ptr++= 'X';
+ *m_body_utf8_ptr++= '\'';
+ m_body_utf8_ptr= octet2hex(m_body_utf8_ptr, txt->str, txt->length);
+ *m_body_utf8_ptr++= '\'';
+ *m_body_utf8_ptr= '\0';
+ m_cpp_utf8_processed_ptr= end_ptr;
+}
+
+
void Lex_input_stream::add_digest_token(uint token, LEX_YYSTYPE yylval)
{
if (m_digest != NULL)
@@ -783,32 +1014,22 @@ my_unescape(CHARSET_INFO *cs, char *to, const char *str, const char *end,
}
-size_t
-Lex_input_stream::unescape(CHARSET_INFO *cs, char *to,
- const char *str, const char *end,
- int sep)
-{
- return my_unescape(cs, to, str, end, sep, m_thd->backslash_escapes());
-}
-
-
/*
Return an unescaped text literal without quotes
Fix sometimes to do only one scan of the string
*/
-bool Lex_input_stream::get_text(LEX_STRING *dst, int pre_skip, int post_skip)
+bool Lex_input_stream::get_text(Lip_string_st *dst, uint sep,
+ int pre_skip, int post_skip)
{
- reg1 uchar c,sep;
- uint found_escape=0;
+ reg1 uchar c;
CHARSET_INFO *cs= m_thd->charset();
+ uint flags= 0;
- tok_bitmap= 0;
- sep= yyGetLast(); // String should end with this
while (! eof())
{
- c= yyGet();
- tok_bitmap|= c;
+ if ((c= yyGet()) & 0x80)
+ flags|= Lip_string_st::string_flag_8bit;
#ifdef USE_MB
{
int l;
@@ -817,6 +1038,7 @@ bool Lex_input_stream::get_text(LEX_STRING *dst, int pre_skip, int post_skip)
get_ptr() -1,
get_end_of_query()))) {
skip_binary(l-1);
+ flags|= Lip_string_st::string_flag_mb;
continue;
}
}
@@ -824,7 +1046,7 @@ bool Lex_input_stream::get_text(LEX_STRING *dst, int pre_skip, int post_skip)
if (c == '\\' &&
!(m_thd->variables.sql_mode & MODE_NO_BACKSLASH_ESCAPES))
{ // Escaped character
- found_escape=1;
+ flags|= Lip_string_st::string_flag_backslash_escape;
if (eof())
return true;
yySkip();
@@ -833,41 +1055,18 @@ bool Lex_input_stream::get_text(LEX_STRING *dst, int pre_skip, int post_skip)
{
if (c == yyGet()) // Check if two separators in a row
{
- found_escape=1; // duplicate. Remember for delete
+ flags|= Lip_string_st::string_flag_sep_escape;
continue;
}
else
yyUnget();
- /* Found end. Unescape and return string */
- const char *str, *end;
-
- str= get_tok_start();
- end= get_ptr();
- /* Extract the text from the token */
- str += pre_skip;
- end -= post_skip;
- DBUG_ASSERT(end >= str);
-
- if (!(dst->str= (char*) m_thd->alloc((uint) (end - str) + 1)))
- {
- dst->str= (char*) ""; // Sql_alloc has set error flag
- dst->length= 0;
- return true;
- }
+ /* Found end. */
+ dst->set(get_tok_start() + pre_skip, get_ptr() - post_skip, flags);
m_cpp_text_start= get_cpp_tok_start() + pre_skip;
m_cpp_text_end= get_cpp_ptr() - post_skip;
- if (!found_escape)
- {
- memcpy(dst->str, str, dst->length= (end - str));
- dst->str[dst->length]= 0;
- }
- else
- {
- dst->length= unescape(cs, dst->str, str, end, sep);
- }
return false;
}
}
@@ -875,6 +1074,29 @@ bool Lex_input_stream::get_text(LEX_STRING *dst, int pre_skip, int post_skip)
}
+bool Lip_string_st::copy_or_unescape(THD *thd, LEX_STRING *dst,
+ CHARSET_INFO *cs, uint sep) const
+{
+ if (!(dst->str= (char*) thd->alloc((uint) length() + 1)))
+ {
+ dst->str= (char*) ""; // Sql_alloc has set error flag
+ dst->length= 0;
+ return true;
+ }
+ if (!(m_flags & (string_flag_sep_escape | string_flag_backslash_escape)))
+ {
+ memcpy(dst->str, m_str, dst->length= length());
+ dst->str[dst->length]= 0;
+ }
+ else
+ {
+ dst->length= my_unescape(cs, dst->str, m_str, m_end,
+ sep, m_flags & string_flag_backslash_escape);
+ }
+ return false;
+}
+
+
/*
** Calc type of integer; long integer, longlong integer or real.
** Returns smallest type that match the string.
@@ -1169,6 +1391,9 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)
return((int) c);
case MY_LEX_IDENT_OR_NCHAR:
+ {
+ uint sep;
+ Lip_string_st tmp;
if (lip->yyPeek() != '\'')
{
state= MY_LEX_IDENT;
@@ -1176,14 +1401,21 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)
}
/* Found N'string' */
lip->yySkip(); // Skip '
- if (lip->get_text(&yylval->lex_str, 2, 1))
+ if (lip->get_text(&tmp, (sep= lip->yyGetLast()), 2, 1) ||
+ tmp.copy_or_unescape(thd, &yylval->lex_str, cs, sep))
{
- state= MY_LEX_CHAR; // Read char by char
- break;
+ state= MY_LEX_CHAR; // Read char by char
+ break;
}
- lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
- return(NCHAR_STRING);
+ lip->body_utf8_append(lip->m_cpp_text_start);
+ lip->body_utf8_append_escape(thd, &yylval->lex_str,
+ national_charset_info,
+ lip->m_cpp_text_end, sep);
+
+ lex->text_string_is_7bit= tmp.is_7bit();
+ return(NCHAR_STRING);
+ }
case MY_LEX_IDENT_OR_HEX:
if (lip->yyPeek() == '\'')
{ // Found x'hex-number'
@@ -1541,23 +1773,33 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)
}
/* " used for strings */
case MY_LEX_STRING: // Incomplete text string
- if (lip->get_text(&yylval->lex_str, 1, 1))
+ {
+ uint sep;
+ Lip_string_st tmp;
+ if (lip->get_text(&tmp, (sep= lip->yyGetLast()), 1, 1) ||
+ tmp.copy_or_unescape(thd, &yylval->lex_str, cs, sep))
{
- state= MY_LEX_CHAR; // Read char by char
- break;
+ state= MY_LEX_CHAR; // Read char by char
+ break;
+ }
+ CHARSET_INFO *strcs= lip->m_underscore_cs ? lip->m_underscore_cs : cs;
+ if (strcs == &my_charset_bin && tmp.is_unsafe_binary_literal())
+ {
+ lip->body_utf8_append(lip->m_cpp_text_start - 1);
+ lip->body_utf8_append_hex_literal(&yylval->lex_str,
+ lip->m_cpp_text_end + 1);
+ }
+ else
+ {
+ lip->body_utf8_append(lip->m_cpp_text_start);
+ lip->body_utf8_append_escape(thd, &yylval->lex_str, strcs,
+ lip->m_cpp_text_end, sep);
}
-
- lip->body_utf8_append(lip->m_cpp_text_start);
-
- lip->body_utf8_append_literal(thd, &yylval->lex_str,
- lip->m_underscore_cs ? lip->m_underscore_cs : cs,
- lip->m_cpp_text_end);
-
lip->m_underscore_cs= NULL;
- lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
+ lex->text_string_is_7bit= tmp.is_7bit();
return(TEXT_STRING);
-
+ }
case MY_LEX_COMMENT: // Comment
lex->select_lex.options|= OPTION_FOUND_COMMENT;
while ((c = lip->yyGet()) != '\n' && c) ;
diff --git a/sql/sql_lex.h b/sql/sql_lex.h
index 644d1a1..e9994dd 100644
--- a/sql/sql_lex.h
+++ b/sql/sql_lex.h
@@ -1789,6 +1789,49 @@ enum enum_comment_state
};
+struct Lip_string_st
+{
+private:
+ const char *m_str;
+ const char *m_end;
+ uint m_flags;
+public:
+ enum string_flags
+ {
+ string_flag_8bit= 0x01,
+ string_flag_mb= 0x02,
+ string_flag_sep_escape= 0x04,
+ string_flag_backslash_escape= 0x08,
+ string_flag_ascii_special_not_escaped= 0x10,
+ };
+ void set(const char *str, const char *end, uint flags)
+ {
+ m_str= str;
+ m_end= end;
+ m_flags= flags;
+ DBUG_ASSERT(end >= str);
+ }
+ const char *ptr() const { return m_str; }
+ size_t length() const { return m_end - m_str; }
+ bool is_7bit() const { return !(m_flags & string_flag_8bit); }
+ /**
+ Test if a binary literal needs X'hhhh' notation,
+ or is safe to be printed as a regular character string.
+ This method is used during construction of a stored routine body
+ for presentation in INFORMATION_SCHEMA.ROUTINE_DEFINITION.
+ 8bit and multi-byte characters are not safe, as they would be
+ converted from character_set_client to utf8, and the result
+ would be different from what the user typed.
+ */
+ bool is_unsafe_binary_literal() const
+ {
+ return m_flags & (string_flag_8bit | string_flag_mb);
+ }
+ bool copy_or_unescape(THD *thd, LEX_STRING *dst,
+ CHARSET_INFO *cs, uint sep) const;
+};
+
+
/**
@brief This class represents the character input stream consumed during
lexical analysis.
@@ -1805,8 +1848,7 @@ enum enum_comment_state
class Lex_input_stream
{
- size_t unescape(CHARSET_INFO *cs, char *to,
- const char *str, const char *end, int sep);
+ my_charset_conv_wc_mb get_escape_func(THD *thd, my_wc_t quotation) const;
public:
Lex_input_stream()
{
@@ -2077,6 +2119,12 @@ class Lex_input_stream
return (uint) (m_body_utf8_ptr - m_body_utf8);
}
+ /**
+ Get the maximum length of the utf8-body buffer.
+ The utf8 body can grow because of the character set conversion and escaping.
+ */
+ uint get_body_utf8_maximum_length(THD *thd);
+
void body_utf8_start(THD *thd, const char *begin_ptr);
void body_utf8_append(const char *ptr);
void body_utf8_append(const char *ptr, const char *end_ptr);
@@ -2084,6 +2132,13 @@ class Lex_input_stream
const LEX_STRING *txt,
CHARSET_INFO *txt_cs,
const char *end_ptr);
+ void body_utf8_append_escape(THD *thd,
+ const LEX_STRING *txt,
+ CHARSET_INFO *txt_cs,
+ const char *end_ptr,
+ my_wc_t quotation);
+ void body_utf8_append_hex_literal(const LEX_STRING *txt,
+ const char *end_ptr);
/** Current thread. */
THD *m_thd;
@@ -2105,7 +2160,7 @@ class Lex_input_stream
/** LALR(2) resolution, value of the look ahead token.*/
LEX_YYSTYPE lookahead_yylval;
- bool get_text(LEX_STRING *to, int pre_skip, int post_skip);
+ bool get_text(Lip_string_st *to, uint sep, int pre_skip, int post_skip);
void add_digest_token(uint token, LEX_YYSTYPE yylval);
@@ -2184,9 +2239,6 @@ class Lex_input_stream
*/
const char *found_semicolon;
- /** Token character bitmaps, to detect 7bit strings. */
- uchar tok_bitmap;
-
/** SQL_MODE = IGNORE_SPACE. */
bool ignore_space;
diff --git a/strings/ctype.c b/strings/ctype.c
index f871a21..620c7e1 100644
--- a/strings/ctype.c
+++ b/strings/ctype.c
@@ -1030,19 +1030,18 @@ my_charset_is_ascii_compatible(CHARSET_INFO *cs)
@return Number of bytes copied to 'to' string
*/
-static uint32
-my_convert_internal(char *to, uint32 to_length,
- CHARSET_INFO *to_cs,
- const char *from, uint32 from_length,
- CHARSET_INFO *from_cs, uint *errors)
+uint32
+my_convert_using_func(char *to, uint32 to_length,
+ CHARSET_INFO *to_cs, my_charset_conv_wc_mb wc_mb,
+ const char *from, uint32 from_length,
+ CHARSET_INFO *from_cs, my_charset_conv_mb_wc mb_wc,
+ uint *errors)
{
int cnvres;
my_wc_t wc;
const uchar *from_end= (const uchar*) from + from_length;
char *to_start= to;
uchar *to_end= (uchar*) to + to_length;
- my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
- my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
uint error_count= 0;
while (1)
@@ -1119,8 +1118,11 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
immediately switch to slow mb_wc->wc_mb method.
*/
if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
- return my_convert_internal(to, to_length, to_cs,
- from, from_length, from_cs, errors);
+ return my_convert_using_func(to, to_length,
+ to_cs, to_cs->cset->wc_mb,
+ from, from_length,
+ from_cs, from_cs->cset->mb_wc,
+ errors);
length= length2= MY_MIN(to_length, from_length);
@@ -1152,9 +1154,11 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
uint32 copied_length= length2 - length;
to_length-= copied_length;
from_length-= copied_length;
- return copied_length + my_convert_internal(to, to_length, to_cs,
- from, from_length, from_cs,
- errors);
+ return copied_length + my_convert_using_func(to, to_length, to_cs,
+ to_cs->cset->wc_mb,
+ from, from_length, from_cs,
+ from_cs->cset->mb_wc,
+ errors);
}
}
Follow ups