← Back to team overview

maria-developers team mailing list archive

Re: 0fc221a012d: MDEV-8334: Rename utf8 to utf8mb3

 

Hi, Rucha!

Thanks for combining all commits in one and a cleanup.
See comments below.

On Mar 18, Rucha Deodhar wrote:
> revision-id: 0fc221a012d (mariadb-10.5.2-402-g0fc221a012d)
> parent(s): a1542f8a573
> author: Rucha Deodhar <rucha.deodhar@xxxxxxxxxxx>
> committer: Rucha Deodhar <rucha.deodhar@xxxxxxxxxxx>
> timestamp: 2021-03-16 23:31:12 +0530
> message:
> 
> MDEV-8334: Rename utf8 to utf8mb3
> 
> This patch changes the main name of 3 byte character set from utf8 to
> utf8mb3. New old_mode UTF8_IS_UTF8MB3 is added and set TRUE by default,
> so that utf8 would mean utf8mb3. If not set, utf8 would mean utf8mb4.

> diff --git a/client/mysqlcheck.c b/client/mysqlcheck.c
> index fb3103a318d..a8990d8cb6b 100644
> --- a/client/mysqlcheck.c
> +++ b/client/mysqlcheck.c
> @@ -437,7 +437,7 @@ static int get_options(int *argc, char ***argv)
>    if (!default_charset)
>    {
>      if (opt_fix_db_names || opt_fix_table_names)
> -      default_charset= (char*) "utf8";
> +      default_charset= (char*) "utf8mb3";

why not keep it utf8?

>      else
>        default_charset= (char*) MYSQL_AUTODETECT_CHARSET_NAME;
>    }
> diff --git a/client/mysqldump.c b/client/mysqldump.c
> index 7c363973da2..900456b31b2 100644
> --- a/client/mysqldump.c
> +++ b/client/mysqldump.c
> @@ -3235,7 +3235,7 @@ static uint get_table_structure(const char *table, const char *db, char *table_t
>        {
>          fprintf(sql_file,
>                  "/*!40101 SET @saved_cs_client     = @@character_set_client */;\n"
> -                "/*!40101 SET character_set_client = utf8 */;\n"
> +                "/*!40101 SET character_set_client = utf8mb3 */;\n"

why not keep it utf8?

>                  "%s%s;\n"
>                  "/*!40101 SET character_set_client = @saved_cs_client */;\n",
>                  is_log_table ? "CREATE TABLE IF NOT EXISTS " : "",
> diff --git a/extra/mariabackup/backup_mysql.cc b/extra/mariabackup/backup_mysql.cc
> index 3083326a7e0..c62252257b9 100644
> --- a/extra/mariabackup/backup_mysql.cc
> +++ b/extra/mariabackup/backup_mysql.cc
> @@ -117,7 +117,7 @@ xb_mysql_connect()
>  		mysql_options(connection, MYSQL_PLUGIN_DIR, xb_plugin_dir);
>  	}
>  	mysql_options(connection, MYSQL_OPT_PROTOCOL, &opt_protocol);
> -	mysql_options(connection,MYSQL_SET_CHARSET_NAME, "utf8");
> +	mysql_options(connection,MYSQL_SET_CHARSET_NAME, "utf8mb3");

why not keep it utf8?

>  
>  	msg("Connecting to MySQL server host: %s, user: %s, password: %s, "
>  	       "port: %s, socket: %s", opt_host ? opt_host : "localhost",
> @@ -1506,7 +1506,7 @@ write_xtrabackup_info(MYSQL *connection, const char * filename, bool history,
>  		"incremental ENUM('Y', 'N') DEFAULT NULL,"
>  		"format ENUM('file', 'tar', 'xbstream') DEFAULT NULL,"
>  		"compressed ENUM('Y', 'N') DEFAULT NULL"
> -		") CHARACTER SET utf8 ENGINE=innodb", false);
> +		") CHARACTER SET utf8mb3 ENGINE=innodb", false);

why not keep it utf8?

>  
>  
>  #define ESCAPE_BOOL(expr) ((expr)?"'Y'":"'N'")
> diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc
> index 62fdace654d..0f500c35ff6 100644
> --- a/extra/mariabackup/xtrabackup.cc
> +++ b/extra/mariabackup/xtrabackup.cc
> @@ -1716,7 +1716,7 @@ static int create_bootstrap_file()
>    if(!f)
>     return -1;
>  
> -  fputs("SET NAMES UTF8;\n",f);
> +  fputs("SET NAMES UTF8MB3;\n",f);

why not keep it utf8?

>    enumerate_ibd_files(append_export_table);
>    for (std::set<std::string>::iterator it = tables_for_export.begin();
>         it != tables_for_export.end(); it++)
> diff --git a/mysql-test/main/create-uca.test b/mysql-test/main/create-uca.test
> index 0acb51f7286..f73f6114962 100644
> --- a/mysql-test/main/create-uca.test
> +++ b/mysql-test/main/create-uca.test
> @@ -1,5 +1,5 @@
>  # Prerequisites
> -let collation=utf8_unicode_ci;
> +let collation=utf8mb3_unicode_ci;

that's fine, as we generally don't want tests to depend on the old_mode.

>  --source include/have_collation.inc
>  
>  # Initial cleanup
> diff --git a/mysql-test/main/ctype_ldml.result b/mysql-test/main/ctype_ldml.result
> index 22b7a316111..7c284520733 100644
> --- a/mysql-test/main/ctype_ldml.result
> +++ b/mysql-test/main/ctype_ldml.result
> @@ -8,7 +8,6 @@ Variable_name	Value
>  character_sets_dir	MYSQL_TEST_DIR/std_data/ldml/
>  show collation like 'utf8_phone_ci';
>  Collation	Charset	Id	Default	Compiled	Sortlen
> -utf8_phone_ci	utf8	352			8

I suppose the test should show there is a _phone_ci collation.
As you've renamed it, you need to adjust the test to do

   show collation like 'utf8mb3_phone_ci';

>  CREATE TABLE t1 (
>  name VARCHAR(64),
>  phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci
> @@ -37,7 +36,6 @@ Bar	+7-912-800-80-01
>  DROP TABLE t1;
>  show collation like 'utf8_test_ci';
>  Collation	Charset	Id	Default	Compiled	Sortlen
> -utf8_test_ci	utf8	353			8

same

>  create table t1 (c1 char(1) character set utf8 collate utf8_test_ci);
>  insert into t1 values ('a');
>  select * from t1 where c1='b';
> @@ -526,7 +524,6 @@ DROP TABLE t1;
>  SET NAMES utf8 COLLATE utf8_phone_ci;
>  SHOW COLLATION LIKE 'utf8_phone_ci';
>  Collation	Charset	Id	Default	Compiled	Sortlen
> -utf8_phone_ci	utf8	352			8

and here

>  SET NAMES utf8;
>  SELECT hex(weight_string(_utf8mb4'a' collate utf8mb4_test_400_ci));
>  hex(weight_string(_utf8mb4'a' collate utf8mb4_test_400_ci))
> diff --git a/mysql-test/main/show_check.result b/mysql-test/main/show_check.result
> index d031c792922..1c85ef596c9 100644
> --- a/mysql-test/main/show_check.result
> +++ b/mysql-test/main/show_check.result
> @@ -874,12 +874,11 @@ set names utf8;
>  ----------------------------------------------------------------
>  SHOW CHARACTER SET LIKE 'utf8';
>  Catalog	Database	Table	Table_alias	Column	Column_alias	Type	Length	Max length	Is_null	Flags	Decimals	Charsetnr
> -def	information_schema	CHARACTER_SETS	CHARACTER_SETS	CHARACTER_SET_NAME	Charset	253	96	4	N	1	0	33
> -def	information_schema	CHARACTER_SETS	CHARACTER_SETS	DESCRIPTION	Description	253	180	13	N	1	0	33
> -def	information_schema	CHARACTER_SETS	CHARACTER_SETS	DEFAULT_COLLATE_NAME	Default collation	253	96	15	N	1	0	33
> -def	information_schema	CHARACTER_SETS	CHARACTER_SETS	MAXLEN	Maxlen	8	3	1	N	32769	0	63
> +def	information_schema	CHARACTER_SETS	CHARACTER_SETS	CHARACTER_SET_NAME	Charset	253	96	0	N	1	0	33
> +def	information_schema	CHARACTER_SETS	CHARACTER_SETS	DESCRIPTION	Description	253	180	0	N	1	0	33
> +def	information_schema	CHARACTER_SETS	CHARACTER_SETS	DEFAULT_COLLATE_NAME	Default collation	253	96	0	N	1	0	33
> +def	information_schema	CHARACTER_SETS	CHARACTER_SETS	MAXLEN	Maxlen	8	3	0	N	32769	0	63
>  Charset	Description	Default collation	Maxlen
> -utf8	UTF-8 Unicode	utf8_general_ci	3

again, I suspect this test should now do `SHOW CHARACTER SET LIKE 'utf8mb3';`

>  ----------------------------------------------------------------
>  SHOW COLLATION LIKE 'latin1_bin';
>  Catalog	Database	Table	Table_alias	Column	Column_alias	Type	Length	Max length	Is_null	Flags	Decimals	Charsetnr
> diff --git a/mysql-test/suite/funcs_1/r/charset_collation.result b/mysql-test/suite/funcs_1/r/charset_collation.result
> index 31bd30c5acf..6b52e80d6ba 100644
> --- a/mysql-test/suite/funcs_1/r/charset_collation.result
> +++ b/mysql-test/suite/funcs_1/r/charset_collation.result
> @@ -9,7 +9,6 @@ ORDER BY character_set_name;
>  CHARACTER_SET_NAME	DEFAULT_COLLATE_NAME	DESCRIPTION	MAXLEN
>  binary	binary	Binary pseudo charset	1
>  latin1	latin1_swedish_ci	cp1252 West European	1
> -utf8	utf8_general_ci	UTF-8 Unicode	3

and here too, changing the test is in order

>  
>  SELECT *
>  FROM information_schema.collations
> diff --git a/mysql-test/suite/funcs_1/r/is_column_privileges.result b/mysql-test/suite/funcs_1/r/is_column_privileges.result
> index b6be9118048..46b2d515041 100644
> --- a/mysql-test/suite/funcs_1/r/is_column_privileges.result
> +++ b/mysql-test/suite/funcs_1/r/is_column_privileges.result
> @@ -45,7 +45,7 @@ COLUMN_PRIVILEGES	CREATE TEMPORARY TABLE `COLUMN_PRIVILEGES` (
>    `COLUMN_NAME` varchar(64) NOT NULL DEFAULT '',
>    `PRIVILEGE_TYPE` varchar(64) NOT NULL DEFAULT '',
>    `IS_GRANTABLE` varchar(3) NOT NULL DEFAULT ''
> -) ENGINE=MEMORY DEFAULT CHARSET=utf8
> +) ENGINE=MEMORY DEFAULT CHARSET=utf8mb3

Just a thought. Did you also fix all --embedded, all --ps, and all --big --big tests to pass?

>  SHOW COLUMNS FROM information_schema.COLUMN_PRIVILEGES;
>  Field	Type	Null	Key	Default	Extra
>  GRANTEE	varchar(190)	NO			
> diff --git a/mysql-test/suite/innodb/r/innodb_ctype_ldml.result b/mysql-test/suite/innodb/r/innodb_ctype_ldml.result
> index 502f57156c3..8a96c023134 100644
> --- a/mysql-test/suite/innodb/r/innodb_ctype_ldml.result
> +++ b/mysql-test/suite/innodb/r/innodb_ctype_ldml.result
> @@ -8,7 +8,6 @@ Variable_name	Value
>  character_sets_dir	MYSQL_TEST_DIR/std_data/ldml/
>  show collation like 'utf8_phone_ci';
>  Collation	Charset	Id	Default	Compiled	Sortlen
> -utf8_phone_ci	utf8	352			8

deja vu. I remember I've commented on this very thing already :)
please, update this test too.

>  CREATE TABLE t1 (
>  name VARCHAR(64),
>  phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci
> Binary files a/mysql-test/suite/sys_vars/r/character_set_results_basic.result and b/mysql-test/suite/sys_vars/r/character_set_results_basic.result differ
> diff --git a/mysql-test/suite/sys_vars/r/old_mode_basic.result b/mysql-test/suite/sys_vars/r/old_mode_basic.result
> index 39c8e554be2..a6b95f1c60c 100644
> --- a/mysql-test/suite/sys_vars/r/old_mode_basic.result
> +++ b/mysql-test/suite/sys_vars/r/old_mode_basic.result
> @@ -167,8 +167,100 @@ NO_PROGRESS_INFO
>  SET @@global.old_mode = @global_start_value;
>  SELECT @@global.old_mode;
>  @@global.old_mode
> -
> +UTF8_IS_UTF8MB3
>  SET @@session.old_mode = @session_start_value;
>  SELECT @@session.old_mode;
>  @@session.old_mode
> -
> +UTF8_IS_UTF8MB3
> +#
> +# Beginning of 10.6 test
> +#
> +# MDEV-8334: Rename utf8 to utf8mb3
> +#

Ah! Great.

> +# Save and display old values
> +SET @save_old_mode = @@OLD_MODE;
> +SET @save_character_set_server = @@character_set_server;
> +SET @save_character_set_client = @@character_set_client;
> +SET @save_character_set_results = @@character_set_results;
> +SET @save_character_set_connection = @@character_set_connection;
> +SET @save_character_set_filesystem = @@character_set_filesystem;
> +SET @save_character_set_database = @@character_set_database;
> +SET @save_collation_connection = @@collation_connection;
> +SET @save_collation_server = @@collation_server;
> +SET @save_collation_database = @@collation_database;
> +SELECT @@OLD_MODE;
> +@@OLD_MODE
> +UTF8_IS_UTF8MB3
> +SELECT @@character_set_server,@@character_set_client,@@character_set_results,
> +@@character_set_connection, @@character_set_filesystem, @@character_set_database,
> +@@collation_connection, @@collation_server, @@collation_database;
> +@@character_set_server	@@character_set_client	@@character_set_results	@@character_set_connection	@@character_set_filesystem	@@character_set_database	@@collation_connection	@@collation_server	@@collation_database
> +latin1	latin1	latin1	latin1	binary	latin1	latin1_swedish_ci	latin1_swedish_ci	latin1_swedish_ci
> +#
> +# UTF8MB3 alias for UTF8
> +#
> +SET @@character_set_server = utf8;
> +SET @@character_set_client = utf8;
> +SET @@character_set_results = utf8;
> +SET @@character_set_connection = utf8;
> +SET @@character_set_filesystem = utf8;
> +SET @@character_set_database = utf8;
> +SET @@collation_connection = utf8_general_ci;
> +SET @@collation_server = utf8_unicode_ci;
> +SET @@collation_database = utf8_bin;
> +SELECT @@character_set_server, @@character_set_client, @@character_set_results,
> +@@character_set_connection, @@character_set_filesystem, @@character_set_database,
> +@@collation_connection, @@collation_server, @@collation_database;
> +@@character_set_server	@@character_set_client	@@character_set_results	@@character_set_connection	@@character_set_filesystem	@@character_set_database	@@collation_connection	@@collation_server	@@collation_database
> +utf8mb3	utf8mb3	utf8mb3	utf8mb3	utf8mb3	utf8mb3	utf8mb3_general_ci	utf8mb3_unicode_ci	utf8mb3_bin
> +CREATE DATABASE db1 CHARACTER SET = 'utf8' COLLATE = 'utf8_general_ci';
> +ALTER DATABASE db1 CHARACTER SET = 'utf8' COLLATE = 'utf8_unicode_ci';
> +CREATE TABLE tb1 (id1 INT) CHARACTER SET 'utf8' COLLATE 'utf8_bin';
> +SHOW CREATE TABLE tb1;
> +Table	Create Table
> +tb1	CREATE TABLE `tb1` (
> +  `id1` int(11) DEFAULT NULL
> +) ENGINE=MyISAM DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_bin
> +DROP TABLE tb1;
> +DROP DATABASE db1;
> +#
> +# UTF8MB4 is alias for UTF8
> +#
> +SET @@OLD_MODE=0;
> +SET @@character_set_server = utf8;
> +SET @@character_set_client = utf8;
> +SET @@character_set_results = utf8;
> +SET @@character_set_connection = utf8;
> +SET @@character_set_filesystem = utf8;
> +SET @@character_set_database = utf8;
> +SET @@collation_connection = utf8_general_ci;
> +SET @@collation_server = utf8_unicode_ci;
> +SET @@collation_database = utf8_bin;
> +SELECT @@character_set_server, @@character_set_client, @@character_set_results,
> +@@character_set_connection, @@character_set_filesystem, @@character_set_database,
> +@@collation_connection, @@collation_server, @@collation_database;
> +@@character_set_server	@@character_set_client	@@character_set_results	@@character_set_connection	@@character_set_filesystem	@@character_set_database	@@collation_connection	@@collation_server	@@collation_database
> +utf8mb4	utf8mb4	utf8mb4	utf8mb4	utf8mb4	utf8mb4	utf8mb4_general_ci	utf8mb4_unicode_ci	utf8mb4_bin
> +CREATE DATABASE db1 CHARACTER SET = 'utf8' COLLATE = 'utf8_general_ci';
> +ALTER DATABASE db1 CHARACTER SET = 'utf8' COLLATE = 'utf8_unicode_ci';
> +CREATE TABLE tb1 (id1 INT) CHARACTER SET 'utf8' COLLATE 'utf8_bin';
> +SHOW CREATE TABLE tb1;
> +Table	Create Table
> +tb1	CREATE TABLE `tb1` (
> +  `id1` int(11) DEFAULT NULL
> +) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin
> +DROP TABLE tb1;
> +DROP DATABASE db1;
> +SET @@OLD_MODE = @save_old_mode;
> +SET @@character_set_server = @save_character_set_server;
> +SET @@character_set_client = @save_character_set_client;
> +SET @@character_set_results = @save_character_set_results;
> +SET @@character_set_connection = @save_character_set_connection;
> +SET @@character_set_filesystem = @save_character_set_filesystem;
> +SET @@character_set_database = @save_character_set_database;
> +SET @@collation_connection = @save_collation_connection;
> +SET @@collation_server = @save_collation_server;
> +SET @@collation_database = @save_collation_database;
> +#
> +# End of 10.6 test
> +#
> diff --git a/mysys/charset.c b/mysys/charset.c
> index 32cfeb56e2d..dbb2749d217 100644
> --- a/mysys/charset.c
> +++ b/mysys/charset.c
> @@ -763,22 +764,13 @@ get_charset_number_internal(const char *charset_name, uint cs_flags)
>  }
>  
>  
> -static const char*
> -get_charset_name_alias(const char *name)
> -{
> -  if (!my_strcasecmp(&my_charset_latin1, name, "utf8mb3"))
> -    return "utf8";
> -  return NULL;
> -}
> -
> -
>  uint get_charset_number(const char *charset_name, uint cs_flags)
>  {
>    uint id;
>    my_pthread_once(&charsets_initialized, init_available_charsets);
>    if ((id= get_charset_number_internal(charset_name, cs_flags)))
>      return id;
> -  if ((charset_name= get_charset_name_alias(charset_name)))
> +  if ((charset_name= !my_strcasecmp(&my_charset_latin1, charset_name, "utf8") ? "utf8mb3" : NULL))

Huh? Why do you not check MY_UTF8_IS_UTF8MB3 here?

>      return get_charset_number_internal(charset_name, cs_flags);
>    return 0;
>  }
> @@ -820,7 +812,7 @@ static CHARSET_INFO *find_collation_data_inheritance_source(CHARSET_INFO *cs)
>      char name[MY_CS_NAME_SIZE + 1];
>      memcpy(name, beg, end - beg);
>      name[end - beg]= '\0';
> -    return inheritance_source_by_id(cs, get_collation_number(name));
> +    return inheritance_source_by_id(cs, get_collation_number(name,MYF(0)));

and not here?

>    }
>    return NULL;
>  }
> @@ -961,7 +953,28 @@ my_collation_get_by_name(MY_CHARSET_LOADER *loader,
>  CHARSET_INFO *get_charset_by_name(const char *cs_name, myf flags)
>  {
>    MY_CHARSET_LOADER loader;
> +  my_bool utf8_is_utf8mb3= flags & MY_UTF8_IS_UTF8MB3 ? 1 : 0;
> +  char *copy_of_name= (char*)cs_name;
> +  char start[6], result[64];
> +  char *temp_cs_name;
> +  
>    my_charset_loader_init_mysys(&loader);
> +
> +  if (!strcasecmp("utf8",copy_of_name))
> +      cs_name = (const char*)(utf8_is_utf8mb3 ? "utf8mb3" : "utf8mb4");
> +  
> +  strncpy(start, cs_name, 5);
> +  temp_cs_name= (char *)(utf8_is_utf8mb3 ? "utf8mb3_":"utf8mb4_");
> +
> +  if (!strncasecmp("utf8_", start,5))
> +  {
> +    copy_of_name+= 5;
> +    result[63]='\0';
> +    strcpy(result, temp_cs_name);
> +    strcat(result, copy_of_name);
> +    result[strlen(copy_of_name)+strlen(temp_cs_name)]='\0';
> +    cs_name= (const char *) result;
> +  }

And why do you do all that ^^^ ? Old code didn't try to change utf8mb3 to
utf8 here, because, I suppose, my_collation_get_by_name() below did all that.
Why did you add alias resolution where none was?

>    return my_collation_get_by_name(&loader, cs_name, flags);
>  }
>  
> @@ -1005,12 +1018,16 @@ get_charset_by_csname(const char *cs_name, uint cs_flags, myf flags)
>  {
>    MY_CHARSET_LOADER loader;
>    my_charset_loader_init_mysys(&loader);
> +
> +  if (!strcasecmp("utf8",cs_name))
> +    cs_name= (const char*)(flags & MY_UTF8_IS_UTF8MB3 ? "utf8mb3" : "utf8mb4");

same here

> +
>    return my_charset_get_by_name(&loader, cs_name, cs_flags, flags);
>  }
>  
>  
>  /**
> -  Resolve character set by the character set name (utf8, latin1, ...).
> +  Resolve character set by the character set name (utf8mb3, latin1, ...).
>  
>    The function tries to resolve character set by the specified name. If
>    there is character set with the given name, it is assigned to the "cs"
> @@ -1453,8 +1472,8 @@ static const MY_CSET_OS_NAME charsets[] =
>  
>    {"US-ASCII",       "latin1",   my_cs_approx},
>  
> -  {"utf8",           "utf8",     my_cs_exact},
> -  {"utf-8",          "utf8",     my_cs_exact},
> +  {"utf8mb3",           "utf8mb3",     my_cs_exact},

eh, no. Try to understand what this array is for.

> +  {"utf-8",          "utf8mb3",     my_cs_exact},
>  #endif
>    {NULL,             NULL,       0}
>  };
> diff --git a/plugin/handler_socket/client/hslongrun.cpp b/plugin/handler_socket/client/hslongrun.cpp
> index b7c02951340..7f88d48fff2 100644
> --- a/plugin/handler_socket/client/hslongrun.cpp
> +++ b/plugin/handler_socket/client/hslongrun.cpp
> @@ -897,7 +897,7 @@ hs_longrun_init_table(const config& conf, int num_prepare,
>      "v1 varchar(32) not null,"
>      "v2 varchar(32) not null,"
>      "v3 varchar(32) not null"
> -    ") character set utf8 collate utf8_bin engine = innodb");
> +    ") character set utf8mb3 collate utf8_bin engine = innodb");

just keep it utf8

>    for (int i = 0; i < num_prepare; ++i) {
>      const std::string i_str = to_stdstring(i);
>      const std::string v1 = "pv1_" + i_str;
> diff --git a/plugin/handler_socket/client/hstest.pl b/plugin/handler_socket/client/hstest.pl
> index 1363e153c44..5924d8a0ce5 100755
> --- a/plugin/handler_socket/client/hstest.pl
> +++ b/plugin/handler_socket/client/hstest.pl
> @@ -52,7 +52,7 @@ for my $action (@actions) {
>  			"k $keytype primary key" .
>  			",v varchar(32) not null" .
>  			$moreflds .
> -			") character set utf8 collate utf8_bin " .
> +			") character set utf8mb3 collate utf8_bin " .

and here. forget about handlersocket

>  			"engine = $engine");
>  	} elsif ($action eq "insert") {
>  		print("INSERT $db.$table tablesize=$tablesize\n");
> diff --git a/plugin/win_auth_client/common.cc b/plugin/win_auth_client/common.cc
> index 8b7319252ac..ddd34aec7da 100644
> --- a/plugin/win_auth_client/common.cc
> +++ b/plugin/win_auth_client/common.cc
> @@ -384,7 +384,7 @@ char* wchar_to_utf8(const wchar_t *string, size_t *len)
>    buf= (char*)malloc(buf_len + 1);
>    if (!buf)
>    {
> -    DBUG_PRINT("error",("Out of memory when converting string '%S' to utf8",
> +    DBUG_PRINT("error",("Out of memory when converting string '%S' to utf8mb3",

Nope, see what this function is doing.

>                          string));
>      return NULL;
>    }
> @@ -408,7 +408,7 @@ char* wchar_to_utf8(const wchar_t *string, size_t *len)
>  
>  #ifndef DBUG_OFF
>    Error_message_buf error_buf;
> -  DBUG_PRINT("error", ("Could not convert string '%S' to utf8"
> +  DBUG_PRINT("error", ("Could not convert string '%S' to utf8mb3"

same

>                         ", WideCharToMultiByte() failed with error %X (%s)",
>                         string, GetLastError(), 
>                         get_last_error_message(error_buf)));
> @@ -451,7 +451,7 @@ wchar_t* utf8_to_wchar(const char *string, size_t *len)
>  
>    if (!buf)
>    {
> -    DBUG_PRINT("error",("Out of memory when converting utf8 string '%s'"
> +    DBUG_PRINT("error",("Out of memory when converting utf8mb3 string '%s'"

same

>                          " to wide-char representation", string));
>      return NULL;
>    }
> diff --git a/scripts/fill_help_tables.sql b/scripts/fill_help_tables.sql
> index d0efb750330..ad7c4fce9a4 100644
> --- a/scripts/fill_help_tables.sql
> +++ b/scripts/fill_help_tables.sql
> @@ -22,7 +22,7 @@

don't change help tables, please.
they'll be regenerated from the documentation

>  
>  --     mysql -u root -p mysql < file_name
>  
> -set names 'utf8';
> +set names 'utf8mb3';
>  
>  set sql_log_bin = 0;
>  
> diff --git a/scripts/mysql_system_tables.sql b/scripts/mysql_system_tables.sql
> index e390f36a98b..7c8532577a1 100644
> --- a/scripts/mysql_system_tables.sql
> +++ b/scripts/mysql_system_tables.sql
> @@ -209,7 +209,7 @@ SET @create_transaction_registry="CREATE TABLE IF NOT EXISTS transaction_registr
>  	UNIQUE KEY (commit_id),
>  	INDEX (begin_timestamp),
>  	INDEX (commit_timestamp, transaction_id)
> -) ENGINE=INNODB DEFAULT CHARSET=utf8 COLLATE=utf8_bin STATS_PERSISTENT=0";
> +) ENGINE=INNODB DEFAULT CHARSET=utf8mb3 COLLATE=utf8_bin STATS_PERSISTENT=0";

here and in all other places in this file and other .sql files: you need to
change the collation too, not just the charset.

>  
>  SET @str=IF(@have_innodb <> 0, @create_innodb_table_stats, "SET @dummy = 0");
>  PREPARE stmt FROM @str;
> diff --git a/sql/mysqld.cc b/sql/mysqld.cc
> index 0bf21e02002..cc7568990b4 100644
> --- a/sql/mysqld.cc
> +++ b/sql/mysqld.cc
> @@ -4039,7 +4039,10 @@ static int init_common_variables()
>        *next_character_set_name++= '\0';
>      if (!(default_charset_info=
>            get_charset_by_csname(default_character_set_name,
> -                                MY_CS_PRIMARY, MYF(MY_WME))))
> +                                MY_CS_PRIMARY,
> +                                global_system_variables.old_behavior &
> +                                OLD_MODE_UTF8_IS_UTF8MB3 ?
> +                                MYF(MY_UTF8_IS_UTF8MB3 | MY_WME) : MYF(MY_WME))))

maybe you'd better get this multi-line assignment out of the if()?
e.g.

   myf utf8_flag= global_system_variables.old_behavior & OLD_MODE_UTF8_IS_UTF8MB3 ? MY_UTF8_IS_UTF8MB3 : 0;
   default_charset_info= get_charset_by_csname(default_character_set_name, MY_CS_PRIMARY, MYF(utf8_flag | MY_WME));
   if (!default_charset_info) ...

or add a helper thd->utf8_alias()
as I suggested below

>      {
>        if (next_character_set_name)
>        {
> @@ -4056,7 +4059,10 @@ static int init_common_variables()
>    if (default_collation_name)
>    {
>      CHARSET_INFO *default_collation;
> -    default_collation= get_charset_by_name(default_collation_name, MYF(0));
> +    default_collation= get_charset_by_name(default_collation_name,
> +                                           global_system_variables.old_behavior &
> +                                           OLD_MODE_UTF8_IS_UTF8MB3 ?
> +                                           MYF(MY_UTF8_IS_UTF8MB3) : MYF(0));

and here you'll be able to use utf8_flag without the conditional operator

>      if (!default_collation)
>      {
>  #ifdef WITH_PERFSCHEMA_STORAGE_ENGINE
> @@ -4097,7 +4103,10 @@ static int init_common_variables()
>  
>    if (!(character_set_filesystem=
>          get_charset_by_csname(character_set_filesystem_name,
> -                              MY_CS_PRIMARY, MYF(MY_WME))))
> +                              MY_CS_PRIMARY,
> +                              global_system_variables.old_behavior &
> +                              OLD_MODE_UTF8_IS_UTF8MB3 ?
> +                              MYF(MY_UTF8_IS_UTF8MB3 | MY_WME) : MYF(MY_WME))))

and here

>      return 1;
>    global_system_variables.character_set_filesystem= character_set_filesystem;
>  
> @@ -7415,7 +7424,9 @@ static void usage(void)
>    DBUG_ENTER("usage");
>    if (!(default_charset_info= get_charset_by_csname(default_character_set_name,
>  					           MY_CS_PRIMARY,
> -						   MYF(MY_WME))))
> +						         global_system_variables.old_behavior &
> +                     OLD_MODE_UTF8_IS_UTF8MB3 ?
> +                     MYF(MY_UTF8_IS_UTF8MB3 | MY_WME) : MYF(MY_WME))))

split this one too?

>      exit(1);
>    if (!default_collation_name)
>      default_collation_name= (char*) default_charset_info->name;
> diff --git a/sql/sql_class.h b/sql/sql_class.h
> index 50b746fe514..15aa9ca8199 100644
> --- a/sql/sql_class.h
> +++ b/sql/sql_class.h
> @@ -1007,6 +1008,36 @@ inline void update_global_memory_status(int64 size)
>    my_atomic_add64_explicit(ptr, size, MY_MEMORY_ORDER_RELAXED);
>  }
>  
> +inline const char* get_alias_collation_or_charset_name(const char* name,
> +                                                       bool utf8_is_utf8mb3)
> +{
> +  char *copy_of_name= (char*)name;
> +  char start[6], result[64];
> +  char *temp_cs_name;
> +
> +  if (!strchr(name,'_'))
> +  {
> +    if (!strcasecmp("utf8",name))
> +      name = utf8_is_utf8mb3 ? "utf8mb3" : "utf8mb4";
> +    return name;
> +  }
> +  else
> +  {
> +    strncpy(start, name, 5);
> +    temp_cs_name= (char *)(utf8_is_utf8mb3 ? "utf8mb3_":"utf8mb4_");
> +    if (!strncasecmp("utf8_", start,5))
> +    {
> +      copy_of_name+= 5;
> +      result[63]='\0';
> +      strcpy(result, temp_cs_name);
> +      strcat(result, copy_of_name);
> +      result[strlen(copy_of_name)+strlen(temp_cs_name)]='\0';
> +      strcpy((char*)name,result);
> +    }
> +  }
> +  return name;
> +}

Please, no.
First, you cannot just copy `result` into `name`: `result` is longer, so
you'll overwrite whatever was in memory after name's value.

Second, you don't need to resolve aliases here, charset code already does it,
don't duplicate that. Just pass MY_UTF8_IS_UTF8MB3 down to
my_collation_get_by_name() below.

> +
>  /**
>    Get collation by name, send error to client on failure.
>    @param name     Collation name
> diff --git a/sql/sql_db.cc b/sql/sql_db.cc
> index 9bf16220535..f471d8edc66 100644
> --- a/sql/sql_db.cc
> +++ b/sql/sql_db.cc
> @@ -583,9 +583,14 @@ bool load_db_opt(THD *thd, const char *path, Schema_specification_st *create)
>             default-collation commands.
>          */
>          if (!(create->default_table_charset=
> -        get_charset_by_csname(pos+1, MY_CS_PRIMARY, MYF(0))) &&
> +        get_charset_by_csname(pos+1, MY_CS_PRIMARY,
> +                              thd->variables.old_behavior &
> +                                   OLD_MODE_UTF8_IS_UTF8MB3 ?
> +                                   MYF(MY_UTF8_IS_UTF8MB3) : MYF(0))) &&
>              !(create->default_table_charset=
> -              get_charset_by_name(pos+1, MYF(0))))
> +              get_charset_by_name(pos+1, thd->variables.old_behavior &
> +                                              OLD_MODE_UTF8_IS_UTF8MB3 ?
> +                                              MYF(MY_UTF8_IS_UTF8MB3) : MYF(0))))

again, please move myf flags manipulations out of if()

>          {
>            sql_print_error("Error while loading database options: '%s':",path);
>            sql_print_error(ER_THD(thd, ER_UNKNOWN_CHARACTER_SET),pos+1);
> @@ -595,7 +600,9 @@ bool load_db_opt(THD *thd, const char *path, Schema_specification_st *create)
>        else if (!strncmp(buf,"default-collation", (pos-buf)))
>        {
>          if (!(create->default_table_charset= get_charset_by_name(pos+1,
> -                                                           MYF(0))))
> +                                                           thd->variables.old_behavior &
> +                                                                OLD_MODE_UTF8_IS_UTF8MB3 ?
> +                                                                MYF(MY_UTF8_IS_UTF8MB3) : MYF(0))))

and here

>          {
>            sql_print_error("Error while loading database options: '%s':",path);
>            sql_print_error(ER_THD(thd, ER_UNKNOWN_COLLATION),pos+1);
> diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
> index 6871699dc5b..ad322eda097 100644
> --- a/sql/sql_lex.cc
> +++ b/sql/sql_lex.cc
> @@ -2789,7 +2789,10 @@ int Lex_input_stream::scan_ident_middle(THD *thd, Lex_ident_cli_st *str,
>      body_utf8_append(m_cpp_text_start, m_cpp_tok_start + length);
>      ErrConvString csname(str->str + 1, str->length - 1, &my_charset_bin);
>      CHARSET_INFO *cs= get_charset_by_csname(csname.ptr(),
> -                                            MY_CS_PRIMARY, MYF(0));
> +                                                 MY_CS_PRIMARY,
> +                                            thd->variables.old_behavior &
> +                                            OLD_MODE_UTF8_IS_UTF8MB3 ?
> +                                            MYF(MY_UTF8_IS_UTF8MB3) : MYF(0));

maybe it could be a helper in thd, like

  THD::utf8_alias() {
    return variables.old_behavior & OLD_MODE_UTF8_IS_UTF8MB3
           ? MY_UTF8_IS_UTF8MB3 : 0;
  }

>      if (cs)
>      {
>        *introducer= cs;
> diff --git a/storage/connect/mysql-test/connect/my.cnf b/storage/connect/mysql-test/connect/my.cnf
> index 6310772d01f..83f0aa8ab30 100644
> --- a/storage/connect/mysql-test/connect/my.cnf
> +++ b/storage/connect/mysql-test/connect/my.cnf
> @@ -14,4 +14,4 @@ MASTER_MYSOCK=           @mysqld.1.socket
>  SLAVE_MYPORT=            @mysqld.2.port
>  SLAVE_MYSOCK=            @mysqld.2.socket
>  
> -PGCLIENTENCODING=        UTF8
> +PGCLIENTENCODING=        UTF8MB3

eh... really? Are you sure you've tested it and it worked?
by the name of it I suspect it's a postgresql client encoding.

> diff --git a/storage/connect/mysql-test/connect/t/odbc_postgresql.sql b/storage/connect/mysql-test/connect/t/odbc_postgresql.sql
> index 1c302294393..3c78120a7a2 100644
> --- a/storage/connect/mysql-test/connect/t/odbc_postgresql.sql
> +++ b/storage/connect/mysql-test/connect/t/odbc_postgresql.sql
> @@ -4,13 +4,13 @@
>  -- Run this script as a admin user:
>  -- psql -U postgres < odbc_postgresql.sql
>  
> -SET NAMES 'UTF8';
> +SET NAMES 'UTF8MB3';

same, postgresql

>  
>  DROP DATABASE IF EXISTS mtr;
>  DROP USER IF EXISTS mtr;
>  
>  CREATE USER mtr WITH PASSWORD 'mtr';
> -CREATE DATABASE mtr OWNER=mtr ENCODING='UTF8';
> +CREATE DATABASE mtr OWNER=mtr ENCODING='UTF8MB3';

same

>  GRANT ALL ON DATABASE mtr TO mtr;
>  \c mtr
>  SET role mtr;
> diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
> index e5164fcc4fa..a26a05c2cf9 100644
> --- a/storage/innobase/fts/fts0opt.cc
> +++ b/storage/innobase/fts/fts0opt.cc
> @@ -330,7 +330,7 @@ fts_word_t*
>  fts_word_init(
>  /*==========*/
>  	fts_word_t*	word,		/*!< in: word to initialize */
> -	byte*		utf8,		/*!< in: UTF-8 string */
> +	byte*		utf8mb3,		/*!< in: UTF-8 string */

don't rename variables, please

>  	ulint		len)		/*!< in: length of string in bytes */
>  {
>  	mem_heap_t*	heap = mem_heap_create(sizeof(fts_node_t));
> diff --git a/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_french.result b/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_french.result
> index 3f24de87035..3659aa5aee8 100644
> --- a/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_french.result
> +++ b/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_french.result
> @@ -7,5 +7,4 @@ FULLTEXT INDEX (content)
>  INSERT INTO diaries VALUES ("Je suis un garçon.");
>  SELECT * FROM diaries WHERE MATCH (content) AGAINST ("garcon");
>  content
> -Je suis un garçon.

This looks like a bug: the matching row has disappeared from the expected result.

>  DROP TABLE diaries;
> diff --git a/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_japanese.result b/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_japanese.result
> index 94ef2608b81..79dac1e63a7 100644
> --- a/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_japanese.result
> +++ b/storage/mroonga/mysql-test/mroonga/storage/r/collation_utf8_unicode_ci_japanese.result
> @@ -7,5 +7,4 @@ FULLTEXT INDEX (content)
>  INSERT INTO diaries VALUES ("ひらがなとカタカナを覚えました。");
>  SELECT * FROM diaries WHERE MATCH (content) AGAINST ("かたかな");
>  content
> -ひらがなとカタカナを覚えました。

That too: the expected row is gone here as well.

>  DROP TABLE diaries;
> diff --git a/storage/mroonga/vendor/groonga/CMakeLists.txt b/storage/mroonga/vendor/groonga/CMakeLists.txt
> index d271d4c4eb9..fc134b81cde 100644
> --- a/storage/mroonga/vendor/groonga/CMakeLists.txt
> +++ b/storage/mroonga/vendor/groonga/CMakeLists.txt
> @@ -268,7 +268,7 @@ if(UNIX)
>    ac_check_funcs(pthread_condattr_setpshared)
>  endif()
>  
> -option(GRN_WITH_NFKC "use NFKC based UTF8 normalization." ON)
> +option(GRN_WITH_NFKC "use NFKC based UTF8MB3 normalization." ON)

Not here, please: this is a vendored Groonga build option, not a MariaDB character set name.

>  
>  if(WIN32)
>    ac_check_headers(winsock2.h)
> diff --git a/storage/mroonga/vendor/groonga/benchmark/bench-nfkc.c b/storage/mroonga/vendor/groonga/benchmark/bench-nfkc.c
> index ebae95b273b..3997b933e87 100644
> --- a/storage/mroonga/vendor/groonga/benchmark/bench-nfkc.c
> +++ b/storage/mroonga/vendor/groonga/benchmark/bench-nfkc.c
> @@ -83,11 +83,11 @@ static void
>  bench_char_type(gpointer user_data)
>  {
>    uint64_t code_point;
> -  char utf8[7];
> +  char utf8mb3[7];

Don't rename variables, especially not in vendored code.

>  
>    for (code_point = 1; code_point < MAX_UNICODE; code_point++) {
> -    ucs2utf8(code_point, (unsigned char *)utf8);
> -    grn_nfkc50_char_type(utf8);
> +    ucs2utf8(code_point, (unsigned char *)utf8mb3);
> +    grn_nfkc50_char_type(utf8mb3);
>    }
>  }
>  
> diff --git a/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/dump_difference_utf8.rb b/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/dump_difference_utf8.rb
> index 4b6fde8c7b0..ce20a2c5b40 100644
> --- a/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/dump_difference_utf8.rb
> +++ b/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/dump_difference_utf8.rb
> @@ -36,8 +36,8 @@ parser.sorted_pages.each do |page, characters|
>      next if base == sort
>      n_differences += 1
>      utf8s = [base, upper, lower, sort]
> -    formatted_code_points = utf8s.collect do |utf8|
> -      "%#07x" % Unicode.from_utf8(utf8)
> +    formatted_code_points = utf8s.collect do |utf8mb3|
> +      "%#07x" % Unicode.from_utf8(utf8mb3)

Just revert all changes under storage/mroonga/vendor/*.

>      end
>      if sort.bytesize > base.bytesize
>        n_expanded_sort_characters += 1
> diff --git a/storage/sphinx/mysql-test/sphinx/my.cnf b/storage/sphinx/mysql-test/sphinx/my.cnf
> index f60380b7171..22cc06914f4 100644
> --- a/storage/sphinx/mysql-test/sphinx/my.cnf
> +++ b/storage/sphinx/mysql-test/sphinx/my.cnf
> @@ -7,7 +7,7 @@ xmlpipe_command = cat @ENV.MTR_SUITE_DIR/testdata.xml
>  [index test1]
>  source = src1
>  docinfo = extern
> -charset_type = utf-8
> +charset_type = utf-8mb3

Revert this: charset_type is a Sphinx setting, not a MariaDB character set name.

>  path = @ENV.MYSQLTEST_VARDIR/searchd/test1
>  
>  [indexer]
> diff --git a/storage/spider/mysql-test/spider/bg/my.cnf b/storage/spider/mysql-test/spider/bg/my.cnf
> index 246099c623e..39f5bd01c67 100644
> --- a/storage/spider/mysql-test/spider/bg/my.cnf
> +++ b/storage/spider/mysql-test/spider/bg/my.cnf
> @@ -75,15 +75,15 @@ MASTER_1_MYSOCK=          @mysqld.1.1.socket
>  MASTER_1_ENGINE_TYPE=     Spider
>  #MASTER_1_ENGINE_TYPE=     MyISAM
>  MASTER_1_ENGINE=          ENGINE=Spider
> -MASTER_1_CHARSET=         DEFAULT CHARSET=utf8
> +MASTER_1_CHARSET=         DEFAULT CHARSET=utf8mb3

Make sure to update both the charset and the collation (e.g. utf8_unicode_ci
should become utf8mb3_unicode_ci) here and in all other .cnf files in the spider suite.

>  MASTER_1_ENGINE2=         ENGINE=MyISAM
> -MASTER_1_CHARSET2=        DEFAULT CHARSET=utf8
> -MASTER_1_CHARSET3=        DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci
> +MASTER_1_CHARSET2=        DEFAULT CHARSET=utf8mb3
> +MASTER_1_CHARSET3=        DEFAULT CHARSET=utf8mb3 COLLATE=utf8_unicode_ci
>  SLAVE1_1_MYPORT=          @mysqld.4.1.port
>  SLAVE1_1_MYSOCK=          @mysqld.4.1.socket
>  SLAVE1_1_ENGINE_TYPE=     MyISAM
>  SLAVE1_1_ENGINE=          ENGINE=MyISAM
> -SLAVE1_1_CHARSET=         DEFAULT CHARSET=utf8
> +SLAVE1_1_CHARSET=         DEFAULT CHARSET=utf8mb3
>  USE_CHILD_GROUP2=         1
>  OUTPUT_CHILD_GROUP2=      0
>  CHILD2_1_MYPORT=          @mysqld.2.1.port
> diff --git a/storage/spider/spd_init_query.h b/storage/spider/spd_init_query.h
> index 19b04d50b82..f12cef377e2 100644
> --- a/storage/spider/spd_init_query.h
> +++ b/storage/spider/spd_init_query.h
> @@ -559,7 +559,7 @@ static LEX_STRING spider_init_queries[] = {
>      "      table_name char(64) not null default '',"
>      "      primary key (table_id),"
>      "      unique uk1(db_name, table_name)"
> -    "    ) engine=Aria transactional=1 default charset=utf8 collate=utf8_bin;"
> +    "    ) engine=Aria transactional=1 default charset=utf8mb3 collate=utf8_bin;"

Always change both the charset and the collation (utf8_bin should become
utf8mb3_bin), everywhere in this file.

>      "    create table if not exists mysql.spider_rewrite_table_tables("
>      "      table_id bigint unsigned not null,"
>      "      partition_id bigint unsigned not null auto_increment,"
> diff --git a/tests/mysql_client_test.c b/tests/mysql_client_test.c
> index 0043786d477..5d9213591fb 100644
> --- a/tests/mysql_client_test.c
> +++ b/tests/mysql_client_test.c
> @@ -19236,7 +19236,7 @@ static void test_bug12337762()
>    rc= mysql_query(mysql, "create table charset_tab("\
>                           "txt1 varchar(32) character set Latin1,"\
>                           "txt2 varchar(32) character set Latin1 collate latin1_bin,"\
> -                         "txt3 varchar(32) character set utf8 collate utf8_bin"\
> +                         "txt3 varchar(32) character set utf8mb3 collate utf8_bin"\

Change both the charset and the collation here, too (utf8mb3 and utf8mb3_bin).

>  						 ")");
>    
>    DIE_UNLESS(rc == 0);

Regards,
Sergei
VP of MariaDB Server Engineering
and security@xxxxxxxxxxx