maria-developers team mailing list archive
-
maria-developers team
-
Mailing list archive
-
Message #12581
Re: 0fc221a012d: MDEV-8334: Rename utf8 to utf8mb3
Hi, Rucha!
Thanks for combining all commits in one and a cleanup.
See comments below.
On Mar 18, Rucha Deodhar wrote:
> revision-id: 0fc221a012d (mariadb-10.5.2-402-g0fc221a012d)
> parent(s): a1542f8a573
> author: Rucha Deodhar <rucha.deodhar@xxxxxxxxxxx>
> committer: Rucha Deodhar <rucha.deodhar@xxxxxxxxxxx>
> timestamp: 2021-03-16 23:31:12 +0530
> message:
>
> MDEV-8334: Rename utf8 to utf8mb3
>
> This patch changes the main name of 3 byte character set from utf8 to
> utf8mb3. New old_mode UTF8_IS_UTF8MB3 is added and set TRUE by default,
> so that utf8 would mean utf8mb3. If not set, utf8 would mean utf8mb4.
> diff --git a/client/mysqlcheck.c b/client/mysqlcheck.c
> index fb3103a318d..a8990d8cb6b 100644
> --- a/client/mysqlcheck.c
> +++ b/client/mysqlcheck.c
> @@ -437,7 +437,7 @@ static int get_options(int *argc, char ***argv)
> if (!default_charset)
> {
> if (opt_fix_db_names || opt_fix_table_names)
> - default_charset= (char*) "utf8";
> + default_charset= (char*) "utf8mb3";
why not keep it utf8?
> else
> default_charset= (char*) MYSQL_AUTODETECT_CHARSET_NAME;
> }
> diff --git a/client/mysqldump.c b/client/mysqldump.c
> index 7c363973da2..900456b31b2 100644
> --- a/client/mysqldump.c
> +++ b/client/mysqldump.c
> @@ -3235,7 +3235,7 @@ static uint get_table_structure(const char *table, const char *db, char *table_t
> {
> fprintf(sql_file,
> "/*!40101 SET @saved_cs_client = @@character_set_client */;\n"
> - "/*!40101 SET character_set_client = utf8 */;\n"
> + "/*!40101 SET character_set_client = utf8mb3 */;\n"
why not keep it utf8?
> "%s%s;\n"
> "/*!40101 SET character_set_client = @saved_cs_client */;\n",
> is_log_table ? "CREATE TABLE IF NOT EXISTS " : "",
> diff --git a/extra/mariabackup/backup_mysql.cc b/extra/mariabackup/backup_mysql.cc
> index 3083326a7e0..c62252257b9 100644
> --- a/extra/mariabackup/backup_mysql.cc
> +++ b/extra/mariabackup/backup_mysql.cc
> @@ -117,7 +117,7 @@ xb_mysql_connect()
> mysql_options(connection, MYSQL_PLUGIN_DIR, xb_plugin_dir);
> }
> mysql_options(connection, MYSQL_OPT_PROTOCOL, &opt_protocol);
> - mysql_options(connection,MYSQL_SET_CHARSET_NAME, "utf8");
> + mysql_options(connection,MYSQL_SET_CHARSET_NAME, "utf8mb3");
why not keep it utf8?
>
> msg("Connecting to MySQL server host: %s, user: %s, password: %s, "
> "port: %s, socket: %s", opt_host ? opt_host : "localhost",
> @@ -1506,7 +1506,7 @@ write_xtrabackup_info(MYSQL *connection, const char * filename, bool history,
> "incremental ENUM('Y', 'N') DEFAULT NULL,"
> "format ENUM('file', 'tar', 'xbstream') DEFAULT NULL,"
> "compressed ENUM('Y', 'N') DEFAULT NULL"
> - ") CHARACTER SET utf8 ENGINE=innodb", false);
> + ") CHARACTER SET utf8mb3 ENGINE=innodb", false);
why not keep it utf8?
>
>
> #define ESCAPE_BOOL(expr) ((expr)?"'Y'":"'N'")
> diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc
> index 62fdace654d..0f500c35ff6 100644
> --- a/extra/mariabackup/xtrabackup.cc
> +++ b/extra/mariabackup/xtrabackup.cc
> @@ -1716,7 +1716,7 @@ static int create_bootstrap_file()
> if(!f)
> return -1;
>
> - fputs("SET NAMES UTF8;\n",f);
> + fputs("SET NAMES UTF8MB3;\n",f);
why not keep it utf8?
> enumerate_ibd_files(append_export_table);
> for (std::set<std::string>::iterator it = tables_for_export.begin();
> it != tables_for_export.end(); it++)
> diff --git a/mysql-test/main/create-uca.test b/mysql-test/main/create-uca.test
> index 0acb51f7286..f73f6114962 100644
> --- a/mysql-test/main/create-uca.test
> +++ b/mysql-test/main/create-uca.test
> @@ -1,5 +1,5 @@
> # Prerequisites
> -let collation=utf8_unicode_ci;
> +let collation=utf8mb3_unicode_ci;
that's fine, as we generally don't want tests to depend on the old_mode.
> --source include/have_collation.inc
>
> # Initial cleanup
> diff --git a/mysql-test/main/ctype_ldml.result b/mysql-test/main/ctype_ldml.result
> index 22b7a316111..7c284520733 100644
> --- a/mysql-test/main/ctype_ldml.result
> +++ b/mysql-test/main/ctype_ldml.result
> @@ -8,7 +8,6 @@ Variable_name Value
> character_sets_dir MYSQL_TEST_DIR/std_data/ldml/
> show collation like 'utf8_phone_ci';
> Collation Charset Id Default Compiled Sortlen
> -utf8_phone_ci utf8 352 8
I suppose the test should show there is a _phone_ci collation.
As you've renamed it, you need to adjust the test to do
show collation like 'utf8mb3_phone_ci';
> CREATE TABLE t1 (
> name VARCHAR(64),
> phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci
> @@ -37,7 +36,6 @@ Bar +7-912-800-80-01
> DROP TABLE t1;
> show collation like 'utf8_test_ci';
> Collation Charset Id Default Compiled Sortlen
> -utf8_test_ci utf8 353 8
same
> create table t1 (c1 char(1) character set utf8 collate utf8_test_ci);
> insert into t1 values ('a');
> select * from t1 where c1='b';
> @@ -526,7 +524,6 @@ DROP TABLE t1;
> SET NAMES utf8 COLLATE utf8_phone_ci;
> SHOW COLLATION LIKE 'utf8_phone_ci';
> Collation Charset Id Default Compiled Sortlen
> -utf8_phone_ci utf8 352 8
and here
> SET NAMES utf8;
> SELECT hex(weight_string(_utf8mb4'a' collate utf8mb4_test_400_ci));
> hex(weight_string(_utf8mb4'a' collate utf8mb4_test_400_ci))
> diff --git a/mysql-test/main/show_check.result b/mysql-test/main/show_check.result
> index d031c792922..1c85ef596c9 100644
> --- a/mysql-test/main/show_check.result
> +++ b/mysql-test/main/show_check.result
> @@ -874,12 +874,11 @@ set names utf8;
> ----------------------------------------------------------------
> SHOW CHARACTER SET LIKE 'utf8';
> Catalog Database Table Table_alias Column Column_alias Type Length Max length Is_null Flags Decimals Charsetnr
> -def information_schema CHARACTER_SETS CHARACTER_SETS CHARACTER_SET_NAME Charset 253 96 4 N 1 0 33
> -def information_schema CHARACTER_SETS CHARACTER_SETS DESCRIPTION Description 253 180 13 N 1 0 33
> -def information_schema CHARACTER_SETS CHARACTER_SETS DEFAULT_COLLATE_NAME Default collation 253 96 15 N 1 0 33
> -def information_schema CHARACTER_SETS CHARACTER_SETS MAXLEN Maxlen 8 3 1 N 32769 0 63
> +def information_schema CHARACTER_SETS CHARACTER_SETS CHARACTER_SET_NAME Charset 253 96 0 N 1 0 33
> +def information_schema CHARACTER_SETS CHARACTER_SETS DESCRIPTION Description 253 180 0 N 1 0 33
> +def information_schema CHARACTER_SETS CHARACTER_SETS DEFAULT_COLLATE_NAME Default collation 253 96 0 N 1 0 33
> +def information_schema CHARACTER_SETS CHARACTER_SETS MAXLEN Maxlen 8 3 0 N 32769 0 63
> Charset Description Default collation Maxlen
> -utf8 UTF-8 Unicode utf8_general_ci 3
again, I suspect this test should now do `SHOW CHARACTER SET LIKE 'utf8mb3';`
> ----------------------------------------------------------------
> SHOW COLLATION LIKE 'latin1_bin';
> Catalog Database Table Table_alias Column Column_alias Type Length Max length Is_null Flags Decimals Charsetnr
> diff --git a/mysql-test/suite/funcs_1/r/charset_collation.result b/mysql-test/suite/funcs_1/r/charset_collation.result
> index 31bd30c5acf..6b52e80d6ba 100644
> --- a/mysql-test/suite/funcs_1/r/charset_collation.result
> +++ b/mysql-test/suite/funcs_1/r/charset_collation.result
> @@ -9,7 +9,6 @@ ORDER BY character_set_name;
> CHARACTER_SET_NAME DEFAULT_COLLATE_NAME DESCRIPTION MAXLEN
> binary binary Binary pseudo charset 1
> latin1 latin1_swedish_ci cp1252 West European 1
> -utf8 utf8_general_ci UTF-8 Unicode 3
and here too, changing the test is in order
>
> SELECT *
> FROM information_schema.collations
> diff --git a/mysql-test/suite/funcs_1/r/is_column_privileges.result b/mysql-test/suite/funcs_1/r/is_column_privileges.result
> index b6be9118048..46b2d515041 100644
> --- a/mysql-test/suite/funcs_1/r/is_column_privileges.result
> +++ b/mysql-test/suite/funcs_1/r/is_column_privileges.result
> @@ -45,7 +45,7 @@ COLUMN_PRIVILEGES CREATE TEMPORARY TABLE `COLUMN_PRIVILEGES` (
> `COLUMN_NAME` varchar(64) NOT NULL DEFAULT '',
> `PRIVILEGE_TYPE` varchar(64) NOT NULL DEFAULT '',
> `IS_GRANTABLE` varchar(3) NOT NULL DEFAULT ''
> -) ENGINE=MEMORY DEFAULT CHARSET=utf8
> +) ENGINE=MEMORY DEFAULT CHARSET=utf8mb3
Just a thought. Did you also fix all --embedded, all --ps, and all --big --big tests to pass?
> SHOW COLUMNS FROM information_schema.COLUMN_PRIVILEGES;
> Field Type Null Key Default Extra
> GRANTEE varchar(190) NO
> diff --git a/mysql-test/suite/innodb/r/innodb_ctype_ldml.result b/mysql-test/suite/innodb/r/innodb_ctype_ldml.result
> index 502f57156c3..8a96c023134 100644
> --- a/mysql-test/suite/innodb/r/innodb_ctype_ldml.result
> +++ b/mysql-test/suite/innodb/r/innodb_ctype_ldml.result
> @@ -8,7 +8,6 @@ Variable_name Value
> character_sets_dir MYSQL_TEST_DIR/std_data/ldml/
> show collation like 'utf8_phone_ci';
> Collation Charset Id Default Compiled Sortlen
> -utf8_phone_ci utf8 352 8
deja vu. I remember I've commented on this very thing already :)
please, update this test too.
> CREATE TABLE t1 (
> name VARCHAR(64),
> phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci
> Binary files a/mysql-test/suite/sys_vars/r/character_set_results_basic.result and b/mysql-test/suite/sys_vars/r/character_set_results_basic.result differ
> diff --git a/mysql-test/suite/sys_vars/r/old_mode_basic.result b/mysql-test/suite/sys_vars/r/old_mode_basic.result
> index 39c8e554be2..a6b95f1c60c 100644
> --- a/mysql-test/suite/sys_vars/r/old_mode_basic.result
> +++ b/mysql-test/suite/sys_vars/r/old_mode_basic.result
> @@ -167,8 +167,100 @@ NO_PROGRESS_INFO
> SET @@global.old_mode = @global_start_value;
> SELECT @@global.old_mode;
> @@global.old_mode
> -
> +UTF8_IS_UTF8MB3
> SET @@session.old_mode = @session_start_value;
> SELECT @@session.old_mode;
> @@session.old_mode
> -
> +UTF8_IS_UTF8MB3
> +#
> +# Beginning of 10.6 test
> +#
> +# MDEV-8334: Rename utf8 to utf8mb3
> +#
Ah! Great.
> +# Save and display old values
> +SET @save_old_mode = @@OLD_MODE;
> +SET @save_character_set_server = @@character_set_server;
> +SET @save_character_set_client = @@character_set_client;
> +SET @save_character_set_results = @@character_set_results;
> +SET @save_character_set_connection = @@character_set_connection;
> +SET @save_character_set_filesystem = @@character_set_filesystem;
> +SET @save_character_set_database = @@character_set_database;
> +SET @save_collation_connection = @@collation_connection;
> +SET @save_collation_server = @@collation_server;
> +SET @save_collation_database = @@collation_database;
> +SELECT @@OLD_MODE;
> +@@OLD_MODE
> +UTF8_IS_UTF8MB3
> +SELECT @@character_set_server,@@character_set_client,@@character_set_results,
> +@@character_set_connection, @@character_set_filesystem, @@character_set_database,
> +@@collation_connection, @@collation_server, @@collation_database;
> +@@character_set_server @@character_set_client @@character_set_results @@character_set_connection @@character_set_filesystem @@character_set_database @@collation_connection @@collation_server @@collation_database
> +latin1 latin1 latin1 latin1 binary latin1 latin1_swedish_ci latin1_swedish_ci latin1_swedish_ci
> +#
> +# UTF8MB3 alias for UTF8
> +#
> +SET @@character_set_server = utf8;
> +SET @@character_set_client = utf8;
> +SET @@character_set_results = utf8;
> +SET @@character_set_connection = utf8;
> +SET @@character_set_filesystem = utf8;
> +SET @@character_set_database = utf8;
> +SET @@collation_connection = utf8_general_ci;
> +SET @@collation_server = utf8_unicode_ci;
> +SET @@collation_database = utf8_bin;
> +SELECT @@character_set_server, @@character_set_client, @@character_set_results,
> +@@character_set_connection, @@character_set_filesystem, @@character_set_database,
> +@@collation_connection, @@collation_server, @@collation_database;
> +@@character_set_server @@character_set_client @@character_set_results @@character_set_connection @@character_set_filesystem @@character_set_database @@collation_connection @@collation_server @@collation_database
> +utf8mb3 utf8mb3 utf8mb3 utf8mb3 utf8mb3 utf8mb3 utf8mb3_general_ci utf8mb3_unicode_ci utf8mb3_bin
> +CREATE DATABASE db1 CHARACTER SET = 'utf8' COLLATE = 'utf8_general_ci';
> +ALTER DATABASE db1 CHARACTER SET = 'utf8' COLLATE = 'utf8_unicode_ci';
> +CREATE TABLE tb1 (id1 INT) CHARACTER SET 'utf8' COLLATE 'utf8_bin';
> +SHOW CREATE TABLE tb1;
> +Table Create Table
> +tb1 CREATE TABLE `tb1` (
> + `id1` int(11) DEFAULT NULL
> +) ENGINE=MyISAM DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_bin
> +DROP TABLE tb1;
> +DROP DATABASE db1;
> +#
> +# UTF8MB4 is alias for UTF8
> +#
> +SET @@OLD_MODE=0;
> +SET @@character_set_server = utf8;
> +SET @@character_set_client = utf8;
> +SET @@character_set_results = utf8;
> +SET @@character_set_connection = utf8;
> +SET @@character_set_filesystem = utf8;
> +SET @@character_set_database = utf8;
> +SET @@collation_connection = utf8_general_ci;
> +SET @@collation_server = utf8_unicode_ci;
> +SET @@collation_database = utf8_bin;
> +SELECT @@character_set_server, @@character_set_client, @@character_set_results,
> +@@character_set_connection, @@character_set_filesystem, @@character_set_database,
> +@@collation_connection, @@collation_server, @@collation_database;
> +@@character_set_server @@character_set_client @@character_set_results @@character_set_connection @@character_set_filesystem @@character_set_database @@collation_connection @@collation_server @@collation_database
> +utf8mb4 utf8mb4 utf8mb4 utf8mb4 utf8mb4 utf8mb4 utf8mb4_general_ci utf8mb4_unicode_ci utf8mb4_bin
> +CREATE DATABASE db1 CHARACTER SET = 'utf8' COLLATE = 'utf8_general_ci';
> +ALTER DATABASE db1 CHARACTER SET = 'utf8' COLLATE = 'utf8_unicode_ci';
> +CREATE TABLE tb1 (id1 INT) CHARACTER SET 'utf8' COLLATE 'utf8_bin';
> +SHOW CREATE TABLE tb1;
> +Table Create Table
> +tb1 CREATE TABLE `tb1` (
> + `id1` int(11) DEFAULT NULL
> +) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin
> +DROP TABLE tb1;
> +DROP DATABASE db1;
> +SET @@OLD_MODE = @save_old_mode;
> +SET @@character_set_server = @save_character_set_server;
> +SET @@character_set_client = @save_character_set_client;
> +SET @@character_set_results = @save_character_set_results;
> +SET @@character_set_connection = @save_character_set_connection;
> +SET @@character_set_filesystem = @save_character_set_filesystem;
> +SET @@character_set_database = @save_character_set_database;
> +SET @@collation_connection = @save_collation_connection;
> +SET @@collation_server = @save_collation_server;
> +SET @@collation_database = @save_collation_database;
> +#
> +# End of 10.6 test
> +#
> diff --git a/mysys/charset.c b/mysys/charset.c
> index 32cfeb56e2d..dbb2749d217 100644
> --- a/mysys/charset.c
> +++ b/mysys/charset.c
> @@ -763,22 +764,13 @@ get_charset_number_internal(const char *charset_name, uint cs_flags)
> }
>
>
> -static const char*
> -get_charset_name_alias(const char *name)
> -{
> - if (!my_strcasecmp(&my_charset_latin1, name, "utf8mb3"))
> - return "utf8";
> - return NULL;
> -}
> -
> -
> uint get_charset_number(const char *charset_name, uint cs_flags)
> {
> uint id;
> my_pthread_once(&charsets_initialized, init_available_charsets);
> if ((id= get_charset_number_internal(charset_name, cs_flags)))
> return id;
> - if ((charset_name= get_charset_name_alias(charset_name)))
> + if ((charset_name= !my_strcasecmp(&my_charset_latin1, charset_name, "utf8") ? "utf8mb3" : NULL))
Huh? Why do you not check MY_UTF8_IS_UTF8MB3 here?
> return get_charset_number_internal(charset_name, cs_flags);
> return 0;
> }
> @@ -820,7 +812,7 @@ static CHARSET_INFO *find_collation_data_inheritance_source(CHARSET_INFO *cs)
> char name[MY_CS_NAME_SIZE + 1];
> memcpy(name, beg, end - beg);
> name[end - beg]= '\0';
> - return inheritance_source_by_id(cs, get_collation_number(name));
> + return inheritance_source_by_id(cs, get_collation_number(name,MYF(0)));
and not here?
> }
> return NULL;
> }
> @@ -961,7 +953,28 @@ my_collation_get_by_name(MY_CHARSET_LOADER *loader,
> CHARSET_INFO *get_charset_by_name(const char *cs_name, myf flags)
> {
> MY_CHARSET_LOADER loader;
> + my_bool utf8_is_utf8mb3= flags & MY_UTF8_IS_UTF8MB3 ? 1 : 0;
> + char *copy_of_name= (char*)cs_name;
> + char start[6], result[64];
> + char *temp_cs_name;
> +
> my_charset_loader_init_mysys(&loader);
> +
> + if (!strcasecmp("utf8",copy_of_name))
> + cs_name = (const char*)(utf8_is_utf8mb3 ? "utf8mb3" : "utf8mb4");
> +
> + strncpy(start, cs_name, 5);
> + temp_cs_name= (char *)(utf8_is_utf8mb3 ? "utf8mb3_":"utf8mb4_");
> +
> + if (!strncasecmp("utf8_", start,5))
> + {
> + copy_of_name+= 5;
> + result[63]='\0';
> + strcpy(result, temp_cs_name);
> + strcat(result, copy_of_name);
> + result[strlen(copy_of_name)+strlen(temp_cs_name)]='\0';
> + cs_name= (const char *) result;
> + }
And why do you do all that ^^^ ? Old code didn't try to change utf8mb3 to
utf8 here, because, I suppose, my_collation_get_by_name() below did all that.
Why did you add alias resolution where none was?
> return my_collation_get_by_name(&loader, cs_name, flags);
> }
>
> @@ -1005,12 +1018,16 @@ get_charset_by_csname(const char *cs_name, uint cs_flags, myf flags)
> {
> MY_CHARSET_LOADER loader;
> my_charset_loader_init_mysys(&loader);
> +
> + if (!strcasecmp("utf8",cs_name))
> + cs_name= (const char*)(flags & MY_UTF8_IS_UTF8MB3 ? "utf8mb3" : "utf8mb4");
same here
> +
> return my_charset_get_by_name(&loader, cs_name, cs_flags, flags);
> }
>
>
> /**
> - Resolve character set by the character set name (utf8, latin1, ...).
> + Resolve character set by the character set name (utf8mb3, latin1, ...).
>
> The function tries to resolve character set by the specified name. If
> there is character set with the given name, it is assigned to the "cs"
> @@ -1453,8 +1472,8 @@ static const MY_CSET_OS_NAME charsets[] =
>
> {"US-ASCII", "latin1", my_cs_approx},
>
> - {"utf8", "utf8", my_cs_exact},
> - {"utf-8", "utf8", my_cs_exact},
> + {"utf8mb3", "utf8mb3", my_cs_exact},
eh, no. Try to understand what this array is for.
> + {"utf-8", "utf8mb3", my_cs_exact},
> #endif
> {NULL, NULL, 0}
> };
> diff --git a/plugin/handler_socket/client/hslongrun.cpp b/plugin/handler_socket/client/hslongrun.cpp
> index b7c02951340..7f88d48fff2 100644
> --- a/plugin/handler_socket/client/hslongrun.cpp
> +++ b/plugin/handler_socket/client/hslongrun.cpp
> @@ -897,7 +897,7 @@ hs_longrun_init_table(const config& conf, int num_prepare,
> "v1 varchar(32) not null,"
> "v2 varchar(32) not null,"
> "v3 varchar(32) not null"
> - ") character set utf8 collate utf8_bin engine = innodb");
> + ") character set utf8mb3 collate utf8_bin engine = innodb");
just keep it utf8
> for (int i = 0; i < num_prepare; ++i) {
> const std::string i_str = to_stdstring(i);
> const std::string v1 = "pv1_" + i_str;
> diff --git a/plugin/handler_socket/client/hstest.pl b/plugin/handler_socket/client/hstest.pl
> index 1363e153c44..5924d8a0ce5 100755
> --- a/plugin/handler_socket/client/hstest.pl
> +++ b/plugin/handler_socket/client/hstest.pl
> @@ -52,7 +52,7 @@ for my $action (@actions) {
> "k $keytype primary key" .
> ",v varchar(32) not null" .
> $moreflds .
> - ") character set utf8 collate utf8_bin " .
> + ") character set utf8mb3 collate utf8_bin " .
and here. forget about handlersocket
> "engine = $engine");
> } elsif ($action eq "insert") {
> print("INSERT $db.$table tablesize=$tablesize\n");
> diff --git a/plugin/win_auth_client/common.cc b/plugin/win_auth_client/common.cc
> index 8b7319252ac..ddd34aec7da 100644
> --- a/plugin/win_auth_client/common.cc
> +++ b/plugin/win_auth_client/common.cc
> @@ -384,7 +384,7 @@ char* wchar_to_utf8(const wchar_t *string, size_t *len)
> buf= (char*)malloc(buf_len + 1);
> if (!buf)
> {
> - DBUG_PRINT("error",("Out of memory when converting string '%S' to utf8",
> + DBUG_PRINT("error",("Out of memory when converting string '%S' to utf8mb3",
Nope, see what this function is doing.
> string));
> return NULL;
> }
> @@ -408,7 +408,7 @@ char* wchar_to_utf8(const wchar_t *string, size_t *len)
>
> #ifndef DBUG_OFF
> Error_message_buf error_buf;
> - DBUG_PRINT("error", ("Could not convert string '%S' to utf8"
> + DBUG_PRINT("error", ("Could not convert string '%S' to utf8mb3"
same
> ", WideCharToMultiByte() failed with error %X (%s)",
> string, GetLastError(),
> get_last_error_message(error_buf)));
> @@ -451,7 +451,7 @@ wchar_t* utf8_to_wchar(const char *string, size_t *len)
>
> if (!buf)
> {
> - DBUG_PRINT("error",("Out of memory when converting utf8 string '%s'"
> + DBUG_PRINT("error",("Out of memory when converting utf8mb3 string '%s'"
same
> " to wide-char representation", string));
> return NULL;
> }
> diff --git a/scripts/fill_help_tables.sql b/scripts/fill_help_tables.sql
> index d0efb750330..ad7c4fce9a4 100644
> --- a/scripts/fill_help_tables.sql
> +++ b/scripts/fill_help_tables.sql
> @@ -22,7 +22,7 @@
don't change help tables, please.
they'll be regenerated from the documentation
>
> -- mysql -u root -p mysql < file_name
>
> -set names 'utf8';
> +set names 'utf8mb3';
>
> set sql_log_bin = 0;
>
> diff --git a/scripts/mysql_system_tables.sql b/scripts/mysql_system_tables.sql
> index e390f36a98b..7c8532577a1 100644
> --- a/scripts/mysql_system_tables.sql
> +++ b/scripts/mysql_system_tables.sql
> @@ -209,7 +209,7 @@ SET @create_transaction_registry="CREATE TABLE IF NOT EXISTS transaction_registr
> UNIQUE KEY (commit_id),
> INDEX (begin_timestamp),
> INDEX (commit_timestamp, transaction_id)
> -) ENGINE=INNODB DEFAULT CHARSET=utf8 COLLATE=utf8_bin STATS_PERSISTENT=0";
> +) ENGINE=INNODB DEFAULT CHARSET=utf8mb3 COLLATE=utf8_bin STATS_PERSISTENT=0";
here and in all other places in this file and other .sql files: you need to
change the collation too, not just the charset.
>
> SET @str=IF(@have_innodb <> 0, @create_innodb_table_stats, "SET @dummy = 0");
> PREPARE stmt FROM @str;
> diff --git a/sql/mysqld.cc b/sql/mysqld.cc
> index 0bf21e02002..cc7568990b4 100644
> --- a/sql/mysqld.cc
> +++ b/sql/mysqld.cc
> @@ -4039,7 +4039,10 @@ static int init_common_variables()
> *next_character_set_name++= '\0';
> if (!(default_charset_info=
> get_charset_by_csname(default_character_set_name,
> - MY_CS_PRIMARY, MYF(MY_WME))))
> + MY_CS_PRIMARY,
> + global_system_variables.old_behavior &
> + OLD_MODE_UTF8_IS_UTF8MB3 ?
> + MYF(MY_UTF8_IS_UTF8MB3 | MY_WME) : MYF(MY_WME))))
maybe you'd better get this multi-line assignment out of if() ?
e.g.
myf utf8_flag= global_system_variables.old_behavior & OLD_MODE_UTF8_IS_UTF8MB3 ? MY_UTF8_IS_UTF8MB3 : 0;
default_charset_info= get_charset_by_csname(default_character_set_name, MY_CS_PRIMARY, MYF(utf8_flag | MY_WME));
if (!default_charset_info) ...
or add a helper thd->utf8_alias()
as I suggested below
> {
> if (next_character_set_name)
> {
> @@ -4056,7 +4059,10 @@ static int init_common_variables()
> if (default_collation_name)
> {
> CHARSET_INFO *default_collation;
> - default_collation= get_charset_by_name(default_collation_name, MYF(0));
> + default_collation= get_charset_by_name(default_collation_name,
> + global_system_variables.old_behavior &
> + OLD_MODE_UTF8_IS_UTF8MB3 ?
> + MYF(MY_UTF8_IS_UTF8MB3) : MYF(0));
and here you'll be able to use utf8_flag without the conditional operator
> if (!default_collation)
> {
> #ifdef WITH_PERFSCHEMA_STORAGE_ENGINE
> @@ -4097,7 +4103,10 @@ static int init_common_variables()
>
> if (!(character_set_filesystem=
> get_charset_by_csname(character_set_filesystem_name,
> - MY_CS_PRIMARY, MYF(MY_WME))))
> + MY_CS_PRIMARY,
> + global_system_variables.old_behavior &
> + OLD_MODE_UTF8_IS_UTF8MB3 ?
> + MYF(MY_UTF8_IS_UTF8MB3 | MY_WME) : MYF(MY_WME))))
and here
> return 1;
> global_system_variables.character_set_filesystem= character_set_filesystem;
>
> @@ -7415,7 +7424,9 @@ static void usage(void)
> DBUG_ENTER("usage");
> if (!(default_charset_info= get_charset_by_csname(default_character_set_name,
> MY_CS_PRIMARY,
> - MYF(MY_WME))))
> + global_system_variables.old_behavior &
> + OLD_MODE_UTF8_IS_UTF8MB3 ?
> + MYF(MY_UTF8_IS_UTF8MB3 | MY_WME) : MYF(MY_WME))))
split this one too?
> exit(1);
> if (!default_collation_name)
> default_collation_name= (char*) default_charset_info->name;
> diff --git a/sql/sql_class.h b/sql/sql_class.h
> index 50b746fe514..15aa9ca8199 100644
> --- a/sql/sql_class.h
> +++ b/sql/sql_class.h
> @@ -1007,6 +1008,36 @@ inline void update_global_memory_status(int64 size)
> my_atomic_add64_explicit(ptr, size, MY_MEMORY_ORDER_RELAXED);
> }
>
> +inline const char* get_alias_collation_or_charset_name(const char* name,
> + bool utf8_is_utf8mb3)
> +{
> + char *copy_of_name= (char*)name;
> + char start[6], result[64];
> + char *temp_cs_name;
> +
> + if (!strchr(name,'_'))
> + {
> + if (!strcasecmp("utf8",name))
> + name = utf8_is_utf8mb3 ? "utf8mb3" : "utf8mb4";
> + return name;
> + }
> + else
> + {
> + strncpy(start, name, 5);
> + temp_cs_name= (char *)(utf8_is_utf8mb3 ? "utf8mb3_":"utf8mb4_");
> + if (!strncasecmp("utf8_", start,5))
> + {
> + copy_of_name+= 5;
> + result[63]='\0';
> + strcpy(result, temp_cs_name);
> + strcat(result, copy_of_name);
> + result[strlen(copy_of_name)+strlen(temp_cs_name)]='\0';
> + strcpy((char*)name,result);
> + }
> + }
> + return name;
> +}
Please, no.
First, you cannot just copy `result` into `name`, because `result` is longer,
you'll overwrite whatever was in memory after name's value.
Second, you don't need to resolve aliases here, charset code already does it,
don't duplicate that. Just pass MY_UTF8_IS_UTF8MB3 down to
my_collation_get_by_name() below.
> +
> /**
> Get collation by name, send error to client on failure.
> @param name Collation name
> diff --git a/sql/sql_db.cc b/sql/sql_db.cc
> index 9bf16220535..f471d8edc66 100644
> --- a/sql/sql_db.cc
> +++ b/sql/sql_db.cc
> @@ -583,9 +583,14 @@ bool load_db_opt(THD *thd, const char *path, Schema_specification_st *create)
> default-collation commands.
> */
> if (!(create->default_table_charset=
> - get_charset_by_csname(pos+1, MY_CS_PRIMARY, MYF(0))) &&
> + get_charset_by_csname(pos+1, MY_CS_PRIMARY,
> + thd->variables.old_behavior &
> + OLD_MODE_UTF8_IS_UTF8MB3 ?
> + MYF(MY_UTF8_IS_UTF8MB3) : MYF(0))) &&
> !(create->default_table_charset=
> - get_charset_by_name(pos+1, MYF(0))))
> + get_charset_by_name(pos+1, thd->variables.old_behavior &
> + OLD_MODE_UTF8_IS_UTF8MB3 ?
> + MYF(MY_UTF8_IS_UTF8MB3) : MYF(0))))
again, please move myf flags manipulations out of if()
> {
> sql_print_error("Error while loading database options: '%s':",path);
> sql_print_error(ER_THD(thd, ER_UNKNOWN_CHARACTER_SET),pos+1);
> @@ -595,7 +600,9 @@ bool load_db_opt(THD *thd, const char *path, Schema_specification_st *create)
> else if (!strncmp(buf,"default-collation", (pos-buf)))
> {
> if (!(create->default_table_charset= get_charset_by_name(pos+1,
> - MYF(0))))
> + thd->variables.old_behavior &
> + OLD_MODE_UTF8_IS_UTF8MB3 ?
> + MYF(MY_UTF8_IS_UTF8MB3) : MYF(0))))
and here
> {
> sql_print_error("Error while loading database options: '%s':",path);
> sql_print_error(ER_THD(thd, ER_UNKNOWN_COLLATION),pos+1);
> diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
> index 6871699dc5b..ad322eda097 100644
> --- a/sql/sql_lex.cc
> +++ b/sql/sql_lex.cc
> @@ -2789,7 +2789,10 @@ int Lex_input_stream::scan_ident_middle(THD *thd, Lex_ident_cli_st *str,
> body_utf8_append(m_cpp_text_start, m_cpp_tok_start + length);
> ErrConvString csname(str->str + 1, str->length - 1, &my_charset_bin);
> CHARSET_INFO *cs= get_charset_by_csname(csname.ptr(),
> - MY_CS_PRIMARY, MYF(0));
> + MY_CS_PRIMARY,
> + thd->variables.old_behavior &
> + OLD_MODE_UTF8_IS_UTF8MB3 ?
> + MYF(MY_UTF8_IS_UTF8MB3) : MYF(0));
maybe it could be a helper in thd, like
THD::utf8_alias() {
return variables.old_behavior & OLD_MODE_UTF8_IS_UTF8MB3
? MY_UTF8_IS_UTF8MB3 : 0;
}
> if (cs)
> {
> *introducer= cs;
> diff --git a/storage/connect/mysql-test/connect/my.cnf b/storage/connect/mysql-test/connect/my.cnf
> index 6310772d01f..83f0aa8ab30 100644
> --- a/storage/connect/mysql-test/connect/my.cnf
> +++ b/storage/connect/mysql-test/connect/my.cnf
> @@ -14,4 +14,4 @@ MASTER_MYSOCK= @mysqld.1.socket
> SLAVE_MYPORT= @mysqld.2.port
> SLAVE_MYSOCK= @mysqld.2.socket
>
> -PGCLIENTENCODING= UTF8
> +PGCLIENTENCODING= UTF8MB3
eh... really? Are you sure you've tested it and it worked?
by the name of it I suspect it's a postgresql client encoding.
> diff --git a/storage/connect/mysql-test/connect/t/odbc_postgresql.sql b/storage/connect/mysql-test/connect/t/odbc_postgresql.sql
> index 1c302294393..3c78120a7a2 100644
> --- a/storage/connect/mysql-test/connect/t/odbc_postgresql.sql
> +++ b/storage/connect/mysql-test/connect/t/odbc_postgresql.sql
> @@ -4,13 +4,13 @@
> -- Run this script as a admin user:
> -- psql -U postgres < odbc_postgresql.sql
>
> -SET NAMES 'UTF8';
> +SET NAMES 'UTF8MB3';
same, postgresql
>
> DROP DATABASE IF EXISTS mtr;
> DROP USER IF EXISTS mtr;
>
> CREATE USER mtr WITH PASSWORD 'mtr';
> -CREATE DATABASE mtr OWNER=mtr ENCODING='UTF8';
> +CREATE DATABASE mtr OWNER=mtr ENCODING='UTF8MB3';
same
> GRANT ALL ON DATABASE mtr TO mtr;
> \c mtr
> SET role mtr;
> diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
> index e5164fcc4fa..a26a05c2cf9 100644
> --- a/storage/innobase/fts/fts0opt.cc
> +++ b/storage/innobase/fts/fts0opt.cc
> @@ -330,7 +330,7 @@ fts_word_t*
> fts_word_init(
> /*==========*/
> fts_word_t* word, /*!< in: word to initialize */
> - byte* utf8, /*!< in: UTF-8 string */
> + byte* utf8mb3, /*!< in: UTF-8 string */
don't rename variables, please
> ulint len) /*!< in: length of string in bytes */
> {
> mem_heap_t* heap = mem_heap_create(sizeof(fts_node_t));
> diff --git a/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_french.result b/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_french.result
> index 3f24de87035..3659aa5aee8 100644
> --- a/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_french.result
> +++ b/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_french.result
> @@ -7,5 +7,4 @@ FULLTEXT INDEX (content)
> INSERT INTO diaries VALUES ("Je suis un garçon.");
> SELECT * FROM diaries WHERE MATCH (content) AGAINST ("garcon");
> content
> -Je suis un garçon.
looks like a bug
> DROP TABLE diaries;
> diff --git a/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_japanese.result b/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_japanese.result
> index 94ef2608b81..79dac1e63a7 100644
> --- a/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_japanese.result
> +++ b/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_japanese.result
> @@ -7,5 +7,4 @@ FULLTEXT INDEX (content)
> INSERT INTO diaries VALUES ("ひらがなとカタカナを覚えました。");
> SELECT * FROM diaries WHERE MATCH (content) AGAINST ("かたかな");
> content
> -ひらがなとカタカナを覚えました。
that too
> DROP TABLE diaries;
> diff --git a/storage/mroonga/vendor/groonga/CMakeLists.txt b/storage/mroonga/vendor/groonga/CMakeLists.txt
> index d271d4c4eb9..fc134b81cde 100644
> --- a/storage/mroonga/vendor/groonga/CMakeLists.txt
> +++ b/storage/mroonga/vendor/groonga/CMakeLists.txt
> @@ -268,7 +268,7 @@ if(UNIX)
> ac_check_funcs(pthread_condattr_setpshared)
> endif()
>
> -option(GRN_WITH_NFKC "use NFKC based UTF8 normalization." ON)
> +option(GRN_WITH_NFKC "use NFKC based UTF8MB3 normalization." ON)
not here, please
>
> if(WIN32)
> ac_check_headers(winsock2.h)
> diff --git a/storage/mroonga/vendor/groonga/benchmark/bench-nfkc.c b/storage/mroonga/vendor/groonga/benchmark/bench-nfkc.c
> index ebae95b273b..3997b933e87 100644
> --- a/storage/mroonga/vendor/groonga/benchmark/bench-nfkc.c
> +++ b/storage/mroonga/vendor/groonga/benchmark/bench-nfkc.c
> @@ -83,11 +83,11 @@ static void
> bench_char_type(gpointer user_data)
> {
> uint64_t code_point;
> - char utf8[7];
> + char utf8mb3[7];
don't rename variables
>
> for (code_point = 1; code_point < MAX_UNICODE; code_point++) {
> - ucs2utf8(code_point, (unsigned char *)utf8);
> - grn_nfkc50_char_type(utf8);
> + ucs2utf8(code_point, (unsigned char *)utf8mb3);
> + grn_nfkc50_char_type(utf8mb3);
> }
> }
>
> diff --git a/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/dump_difference_utf8.rb b/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/dump_difference_utf8.rb
> index 4b6fde8c7b0..ce20a2c5b40 100644
> --- a/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/dump_difference_utf8.rb
> +++ b/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/dump_difference_utf8.rb
> @@ -36,8 +36,8 @@ parser.sorted_pages.each do |page, characters|
> next if base == sort
> n_differences += 1
> utf8s = [base, upper, lower, sort]
> - formatted_code_points = utf8s.collect do |utf8|
> - "%#07x" % Unicode.from_utf8(utf8)
> + formatted_code_points = utf8s.collect do |utf8mb3|
> + "%#07x" % Unicode.from_utf8(utf8mb3)
just revert all changes under storage/mroonga/vendor/*
> end
> if sort.bytesize > base.bytesize
> n_expanded_sort_characters += 1
> diff --git a/storage/sphinx/mysql-test/sphinx/my.cnf b/storage/sphinx/mysql-test/sphinx/my.cnf
> index f60380b7171..22cc06914f4 100644
> --- a/storage/sphinx/mysql-test/sphinx/my.cnf
> +++ b/storage/sphinx/mysql-test/sphinx/my.cnf
> @@ -7,7 +7,7 @@ xmlpipe_command = cat @ENV.MTR_SUITE_DIR/testdata.xml
> [index test1]
> source = src1
> docinfo = extern
> -charset_type = utf-8
> +charset_type = utf-8mb3
revert
> path = @ENV.MYSQLTEST_VARDIR/searchd/test1
>
> [indexer]
> diff --git a/storage/spider/mysql-test/spider/bg/my.cnf b/storage/spider/mysql-test/spider/bg/my.cnf
> index 246099c623e..39f5bd01c67 100644
> --- a/storage/spider/mysql-test/spider/bg/my.cnf
> +++ b/storage/spider/mysql-test/spider/bg/my.cnf
> @@ -75,15 +75,15 @@ MASTER_1_MYSOCK= @mysqld.1.1.socket
> MASTER_1_ENGINE_TYPE= Spider
> #MASTER_1_ENGINE_TYPE= MyISAM
> MASTER_1_ENGINE= ENGINE=Spider
> -MASTER_1_CHARSET= DEFAULT CHARSET=utf8
> +MASTER_1_CHARSET= DEFAULT CHARSET=utf8mb3
Make sure to update both charset and collation
here and in all other .cnf files in the spider suite
> MASTER_1_ENGINE2= ENGINE=MyISAM
> -MASTER_1_CHARSET2= DEFAULT CHARSET=utf8
> -MASTER_1_CHARSET3= DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci
> +MASTER_1_CHARSET2= DEFAULT CHARSET=utf8mb3
> +MASTER_1_CHARSET3= DEFAULT CHARSET=utf8mb3 COLLATE=utf8_unicode_ci
> SLAVE1_1_MYPORT= @mysqld.4.1.port
> SLAVE1_1_MYSOCK= @mysqld.4.1.socket
> SLAVE1_1_ENGINE_TYPE= MyISAM
> SLAVE1_1_ENGINE= ENGINE=MyISAM
> -SLAVE1_1_CHARSET= DEFAULT CHARSET=utf8
> +SLAVE1_1_CHARSET= DEFAULT CHARSET=utf8mb3
> USE_CHILD_GROUP2= 1
> OUTPUT_CHILD_GROUP2= 0
> CHILD2_1_MYPORT= @mysqld.2.1.port
> diff --git a/storage/spider/spd_init_query.h b/storage/spider/spd_init_query.h
> index 19b04d50b82..f12cef377e2 100644
> --- a/storage/spider/spd_init_query.h
> +++ b/storage/spider/spd_init_query.h
> @@ -559,7 +559,7 @@ static LEX_STRING spider_init_queries[] = {
> " table_name char(64) not null default '',"
> " primary key (table_id),"
> " unique uk1(db_name, table_name)"
> - " ) engine=Aria transactional=1 default charset=utf8 collate=utf8_bin;"
> + " ) engine=Aria transactional=1 default charset=utf8mb3 collate=utf8_bin;"
always change both charset and collation
(everywhere in this file)
> " create table if not exists mysql.spider_rewrite_table_tables("
> " table_id bigint unsigned not null,"
> " partition_id bigint unsigned not null auto_increment,"
> diff --git a/tests/mysql_client_test.c b/tests/mysql_client_test.c
> index 0043786d477..5d9213591fb 100644
> --- a/tests/mysql_client_test.c
> +++ b/tests/mysql_client_test.c
> @@ -19236,7 +19236,7 @@ static void test_bug12337762()
> rc= mysql_query(mysql, "create table charset_tab("\
> "txt1 varchar(32) character set Latin1,"\
> "txt2 varchar(32) character set Latin1 collate latin1_bin,"\
> - "txt3 varchar(32) character set utf8 collate utf8_bin"\
> + "txt3 varchar(32) character set utf8mb3 collate utf8_bin"\
both charset and collation
> ")");
>
> DIE_UNLESS(rc == 0);
Regards,
Sergei
VP of MariaDB Server Engineering
and security@xxxxxxxxxxx