← Back to team overview

maria-developers team mailing list archive

Re: Please review MDEV-6353 my_ismbchar() and my_mbcharlen() refactoring

 

Hello Sergei,

On 03/30/2016 08:41 PM, Sergei Golubchik wrote:
> Hi, Alexander!
> 
> On Mar 30, Alexander Barkov wrote:
>> commit 4ab28aca964fa646aa55676db813dbed66b83093
>> Author: Alexander Barkov <bar@xxxxxxxxxxx>
>> Date:   Mon Mar 28 11:05:51 2016 +0400
>>
>>     MDEV-6353 my_ismbchar() and my_mbcharlen() refactoring, part 1.
>>     Fixing the debug_sync and create_options related code not to use my_mbcharlen():
>>     - debug_sync_token() now uses cs->cset->scan().
>>       Passing the end of the string pointer to debug_sync_update() in order to be
>>       able to use scan(). Adding support for a new pattern scan(MY_SEQ_NONSPACES).
>>       It scans everything that scan(MY_SEQ_SPACES) does not.
>>     - Fixing set_one_value() to iterate bytes one by one. This is safe, because
>>       ',' cannot be a part of a multi-byte character in UTF8.
> 
> Looks ok, thanks!
> But don't push this commit alone, please.
> Wait until all parts of MDEV-6353 are ready and push them together.

Please review the final version, now removing all my_mbcharlen().

Thanks.

> 
> Regards,
> Sergei
> Chief Architect MariaDB
> and security@xxxxxxxxxxx
> 
diff --git a/client/mysqltest.cc b/client/mysqltest.cc
index f09ad31..afb746c 100644
--- a/client/mysqltest.cc
+++ b/client/mysqltest.cc
@@ -6571,37 +6571,35 @@ int read_line(char *buf, int size)
 
     if (!skip_char)
     {
-      /* Could be a multibyte character */
-      /* This code is based on the code in "sql_load.cc" */
-#ifdef USE_MB
-      int charlen = my_mbcharlen(charset_info, (unsigned char) c);
-      /* We give up if multibyte character is started but not */
-      /* completed before we pass buf_end */
-      if ((charlen > 1) && (p + charlen) <= buf_end)
+      *p++= c;
+      if (use_mb(charset_info))
       {
-	int i;
-	char* mb_start = p;
-
-	*p++ = c;
-
-	for (i= 1; i < charlen; i++)
-	{
-	  c= my_getc(cur_file->file);
-	  if (feof(cur_file->file))
-	    goto found_eof;
-	  *p++ = c;
-	}
-	if (! my_ismbchar(charset_info, mb_start, p))
-	{
-	  /* It was not a multiline char, push back the characters */
-	  /* We leave first 'c', i.e. pretend it was a normal char */
-	  while (p-1 > mb_start)
-	    my_ungetc(*--p);
-	}
+        const char *mb_start= p - 1;
+        /* Could be a multibyte character */
+        /* See a similar code in "sql_load.cc" */
+        for ( ; p < buf_end; )
+        {
+          int charlen= my_charlen(charset_info, mb_start, p);
+          if (charlen > 0)
+            break; /* Full character */
+          if (MY_CS_IS_TOOSMALL(charlen))
+          {
+            /* We give up if multibyte character is started but not */
+            /* completed before we pass buf_end */
+            c= my_getc(cur_file->file);
+            if (feof(cur_file->file))
+              goto found_eof;
+            *p++ = c;
+            continue;
+          }
+          DBUG_ASSERT(charlen == MY_CS_ILSEQ);
+          /* It was not a multibyte char, push back the characters */
+          /* We leave first 'c', i.e. pretend it was a normal char */
+          while (p - 1 > mb_start)
+            my_ungetc(*--p);
+          break;
+        }
       }
-      else
-#endif
-	*p++= c;
     }
   }
   die("The input buffer is too small for this query.x\n"      \
diff --git a/include/m_ctype.h b/include/m_ctype.h
index c892d576..bb633f8 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -186,6 +186,7 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
 
 #define MY_SEQ_INTTAIL	1
 #define MY_SEQ_SPACES	2
+#define MY_SEQ_NONSPACES 3 /* Skip non-space characters, including bad bytes */
 
         /* My charsets_list flags */
 #define MY_CS_COMPILED  1      /* compiled-in sets               */
@@ -403,7 +404,6 @@ struct my_charset_handler_st
 {
   my_bool (*init)(struct charset_info_st *, MY_CHARSET_LOADER *loader);
   /* Multibyte routines */
-  uint    (*mbcharlen)(CHARSET_INFO *, uint c);
   size_t  (*numchars)(CHARSET_INFO *, const char *b, const char *e);
   size_t  (*charpos)(CHARSET_INFO *, const char *b, const char *e,
                      size_t pos);
@@ -779,7 +779,6 @@ size_t my_well_formed_char_length_8bit(CHARSET_INFO *cs,
                                        size_t nchars,
                                        MY_STRCOPY_STATUS *status);
 int my_charlen_8bit(CHARSET_INFO *, const uchar *str, const uchar *end);
-uint my_mbcharlen_8bit(CHARSET_INFO *, uint c);
 
 
 /* Functions for multibyte charsets */
@@ -1010,11 +1009,19 @@ int my_charlen(CHARSET_INFO *cs, const char *str, const char *end)
   return (cs->cset->charlen)(cs, (const uchar *) str,
                                  (const uchar *) end);
 }
-#ifdef USE_MB
-#define my_mbcharlen(s, a)            ((s)->cset->mbcharlen((s),(a)))
-#else
-#define my_mbcharlen(s, a)            1
-#endif
+
+
+/** Byte length of the character at str (requires str < end). Broken and
+  incomplete byte sequences are reported as 1 byte, never 0 or negative, so
+  loops that advance by the result always make progress. */
+static inline
+int my_charlen_fix(CHARSET_INFO *cs, const char *str, const char *end)
+{
+  int char_length= my_charlen(cs, str, end);
+  DBUG_ASSERT(str < end);
+  return char_length > 0 ? char_length : 1; /* bad/short sequence => 1 byte */
+}
+
 
 #define my_caseup_str(s, a)           ((s)->cset->caseup_str((s), (a)))
 #define my_casedn_str(s, a)           ((s)->cset->casedn_str((s), (a)))
diff --git a/mysys/charset.c b/mysys/charset.c
index 3c134dc..253dc72 100644
--- a/mysys/charset.c
+++ b/mysys/charset.c
@@ -54,6 +54,12 @@ get_collation_number_internal(const char *name)
 }
 
 
+static my_bool is_multi_byte_ident(CHARSET_INFO *cs, uchar ch)
+{
+  int chlen= my_charlen(cs, (const char *) &ch, (const char *) &ch + 1);
+  return MY_CS_IS_TOOSMALL(chlen) ? TRUE : FALSE; /* lone byte is incomplete */
+}
+
 static my_bool init_state_maps(struct charset_info_st *cs)
 {
   uint i;
@@ -73,10 +79,8 @@ static my_bool init_state_maps(struct charset_info_st *cs)
       state_map[i]=(uchar) MY_LEX_IDENT;
     else if (my_isdigit(cs,i))
       state_map[i]=(uchar) MY_LEX_NUMBER_IDENT;
-#if defined(USE_MB) && defined(USE_MB_IDENT)
-    else if (my_mbcharlen(cs, i)>1)
+    else if (is_multi_byte_ident(cs, i))
       state_map[i]=(uchar) MY_LEX_IDENT;
-#endif
     else if (my_isspace(cs,i))
       state_map[i]=(uchar) MY_LEX_SKIP;
     else
@@ -909,8 +913,8 @@ size_t escape_string_for_mysql(CHARSET_INFO *charset_info,
   {
     char escape= 0;
 #ifdef USE_MB
-    int tmp_length;
-    if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
+    int tmp_length= my_charlen(charset_info, from, end);
+    if (use_mb_flag && tmp_length > 1)
     {
       if (to + tmp_length > to_end)
       {
@@ -933,7 +937,7 @@ size_t escape_string_for_mysql(CHARSET_INFO *charset_info,
      multi-byte character into a valid one. For example, 0xbf27 is not
      a valid GBK character, but 0xbf5c is. (0x27 = ', 0x5c = \)
     */
-    if (use_mb_flag && (tmp_length= my_mbcharlen(charset_info, *from)) > 1)
+    if (use_mb_flag && tmp_length != 1)
       escape= *from;
     else
 #endif
diff --git a/sql/create_options.cc b/sql/create_options.cc
index 66515be..3011c4b 100644
--- a/sql/create_options.cc
+++ b/sql/create_options.cc
@@ -184,7 +184,7 @@ static bool set_one_value(ha_create_table_option *opt,
       {
         for (end=start;
              *end && *end != ',';
-             end+= my_mbcharlen(system_charset_info, *end)) /* no-op */;
+             end++) /* no-op */;
         if (!my_strnncoll(system_charset_info,
                           (uchar*)start, end-start,
                           (uchar*)value->str, value->length))
diff --git a/sql/debug_sync.cc b/sql/debug_sync.cc
index 8b3412e..e84f1e8 100644
--- a/sql/debug_sync.cc
+++ b/sql/debug_sync.cc
@@ -847,16 +847,16 @@ static bool debug_sync_set_action(THD *thd, st_debug_sync_action *action)
     to the string terminator ASCII NUL ('\0').
 */
 
-static char *debug_sync_token(char **token_p, uint *token_length_p, char *ptr)
+static char *debug_sync_token(char **token_p, uint *token_length_p,
+                              char *ptr, char *ptrend)
 {
   DBUG_ASSERT(token_p);
   DBUG_ASSERT(token_length_p);
   DBUG_ASSERT(ptr);
 
   /* Skip leading space */
-  while (my_isspace(system_charset_info, *ptr))
-    ptr+= my_mbcharlen(system_charset_info, (uchar) *ptr);
-
+  ptr+= system_charset_info->cset->scan(system_charset_info,
+                                        ptr, ptrend, MY_SEQ_SPACES);
   if (!*ptr)
   {
     ptr= NULL;
@@ -867,8 +867,8 @@ static char *debug_sync_token(char **token_p, uint *token_length_p, char *ptr)
   *token_p= ptr;
 
   /* Find token end. */
-  while (*ptr && !my_isspace(system_charset_info, *ptr))
-    ptr+= my_mbcharlen(system_charset_info, (uchar) *ptr);
+  ptr+= system_charset_info->cset->scan(system_charset_info,
+                                        ptr, ptrend, MY_SEQ_NONSPACES);
 
   /* Get token length. */
   *token_length_p= ptr - *token_p;
@@ -876,18 +876,19 @@ static char *debug_sync_token(char **token_p, uint *token_length_p, char *ptr)
   /* If necessary, terminate token. */
   if (*ptr)
   {
+    DBUG_ASSERT(ptr < ptrend);
     /* Get terminator character length. */
-    uint mbspacelen= my_mbcharlen(system_charset_info, (uchar) *ptr);
+    int mbspacelen= my_charlen(system_charset_info, ptr, ptrend);
 
     /* Terminate token. */
     *ptr= '\0';
 
     /* Skip the terminator. */
-    ptr+= mbspacelen;
+    ptr+= mbspacelen < 1 ? 1 : mbspacelen;
 
     /* Skip trailing space */
-    while (my_isspace(system_charset_info, *ptr))
-      ptr+= my_mbcharlen(system_charset_info, (uchar) *ptr);
+    ptr+= system_charset_info->cset->scan(system_charset_info,
+                                          ptr, ptrend, MY_SEQ_SPACES);
   }
 
  end:
@@ -917,7 +918,8 @@ static char *debug_sync_token(char **token_p, uint *token_length_p, char *ptr)
     undefined in this case.
 */
 
-static char *debug_sync_number(ulong *number_p, char *actstrptr)
+static char *debug_sync_number(ulong *number_p, char *actstrptr,
+                                                char *actstrend)
 {
   char                  *ptr;
   char                  *ept;
@@ -927,7 +929,7 @@ static char *debug_sync_number(ulong *number_p, char *actstrptr)
   DBUG_ASSERT(actstrptr);
 
   /* Get token from string. */
-  if (!(ptr= debug_sync_token(&token, &token_length, actstrptr)))
+  if (!(ptr= debug_sync_token(&token, &token_length, actstrptr, actstrend)))
     goto end;
 
   *number_p= strtoul(token, &ept, 10);
@@ -971,7 +973,7 @@ static char *debug_sync_number(ulong *number_p, char *actstrptr)
     for the string.
 */
 
-static bool debug_sync_eval_action(THD *thd, char *action_str)
+static bool debug_sync_eval_action(THD *thd, char *action_str, char *action_end)
 {
   st_debug_sync_action  *action= NULL;
   const char            *errmsg;
@@ -986,7 +988,7 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
   /*
     Get debug sync point name. Or a special command.
   */
-  if (!(ptr= debug_sync_token(&token, &token_length, action_str)))
+  if (!(ptr= debug_sync_token(&token, &token_length, action_str, action_end)))
   {
     errmsg= "Missing synchronization point name";
     goto err;
@@ -1009,7 +1011,7 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
   /*
     Get kind of action to be taken at sync point.
   */
-  if (!(ptr= debug_sync_token(&token, &token_length, ptr)))
+  if (!(ptr= debug_sync_token(&token, &token_length, ptr, action_end)))
   {
     /* No action present. Try special commands. Token unchanged. */
 
@@ -1090,7 +1092,7 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
   if (!my_strcasecmp(system_charset_info, token, "SIGNAL"))
   {
     /* It is SIGNAL. Signal name must follow. */
-    if (!(ptr= debug_sync_token(&token, &token_length, ptr)))
+    if (!(ptr= debug_sync_token(&token, &token_length, ptr, action_end)))
     {
       errmsg= "Missing signal name after action SIGNAL";
       goto err;
@@ -1108,7 +1110,7 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
     action->execute= 1;
 
     /* Get next token. If none follows, set action. */
-    if (!(ptr= debug_sync_token(&token, &token_length, ptr)))
+    if (!(ptr= debug_sync_token(&token, &token_length, ptr, action_end)))
       goto set_action;
   }
 
@@ -1118,7 +1120,7 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
   if (!my_strcasecmp(system_charset_info, token, "WAIT_FOR"))
   {
     /* It is WAIT_FOR. Wait_for signal name must follow. */
-    if (!(ptr= debug_sync_token(&token, &token_length, ptr)))
+    if (!(ptr= debug_sync_token(&token, &token_length, ptr, action_end)))
     {
       errmsg= "Missing signal name after action WAIT_FOR";
       goto err;
@@ -1137,7 +1139,7 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
     action->timeout= opt_debug_sync_timeout;
 
     /* Get next token. If none follows, set action. */
-    if (!(ptr= debug_sync_token(&token, &token_length, ptr)))
+    if (!(ptr= debug_sync_token(&token, &token_length, ptr, action_end)))
       goto set_action;
 
     /*
@@ -1146,14 +1148,14 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
     if (!my_strcasecmp(system_charset_info, token, "TIMEOUT"))
     {
       /* It is TIMEOUT. Number must follow. */
-      if (!(ptr= debug_sync_number(&action->timeout, ptr)))
+      if (!(ptr= debug_sync_number(&action->timeout, ptr, action_end)))
       {
         errmsg= "Missing valid number after TIMEOUT";
         goto err;
       }
 
       /* Get next token. If none follows, set action. */
-      if (!(ptr= debug_sync_token(&token, &token_length, ptr)))
+      if (!(ptr= debug_sync_token(&token, &token_length, ptr, action_end)))
         goto set_action;
     }
   }
@@ -1174,14 +1176,14 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
     }
 
     /* Number must follow. */
-    if (!(ptr= debug_sync_number(&action->execute, ptr)))
+    if (!(ptr= debug_sync_number(&action->execute, ptr, action_end)))
     {
       errmsg= "Missing valid number after EXECUTE";
       goto err;
     }
 
     /* Get next token. If none follows, set action. */
-    if (!(ptr= debug_sync_token(&token, &token_length, ptr)))
+    if (!(ptr= debug_sync_token(&token, &token_length, ptr, action_end)))
       goto set_action;
   }
 
@@ -1191,14 +1193,14 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
   if (!my_strcasecmp(system_charset_info, token, "HIT_LIMIT"))
   {
     /* Number must follow. */
-    if (!(ptr= debug_sync_number(&action->hit_limit, ptr)))
+    if (!(ptr= debug_sync_number(&action->hit_limit, ptr, action_end)))
     {
       errmsg= "Missing valid number after HIT_LIMIT";
       goto err;
     }
 
     /* Get next token. If none follows, set action. */
-    if (!(ptr= debug_sync_token(&token, &token_length, ptr)))
+    if (!(ptr= debug_sync_token(&token, &token_length, ptr, action_end)))
       goto set_action;
   }
 
@@ -1246,7 +1248,7 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
     terminators in the string. So we need to take a copy here.
 */
 
-bool debug_sync_update(THD *thd, char *val_str)
+bool debug_sync_update(THD *thd, char *val_str, size_t len)
 {
   DBUG_ENTER("debug_sync_update");
   DBUG_PRINT("debug_sync", ("set action: '%s'", val_str));
@@ -1255,8 +1257,9 @@ bool debug_sync_update(THD *thd, char *val_str)
     debug_sync_eval_action() places '\0' in the string, which itself
     must be '\0' terminated.
   */
+  DBUG_ASSERT(val_str[len] == '\0');
   DBUG_RETURN(opt_debug_sync_timeout ?
-              debug_sync_eval_action(thd, val_str) :
+              debug_sync_eval_action(thd, val_str, val_str + len) :
               FALSE);
 }
 
@@ -1592,7 +1595,7 @@ bool debug_sync_set_action(THD *thd, const char *action_str, size_t len)
   DBUG_ASSERT(action_str);
   
   value= strmake_root(thd->mem_root, action_str, len);
-  rc= debug_sync_eval_action(thd, value);
+  rc= debug_sync_eval_action(thd, value, value + len);
   DBUG_RETURN(rc);
 }
 
diff --git a/sql/debug_sync.h b/sql/debug_sync.h
index bf1b316..339a211 100644
--- a/sql/debug_sync.h
+++ b/sql/debug_sync.h
@@ -45,6 +45,9 @@ extern void debug_sync_init_thread(THD *thd);
 extern void debug_sync_end_thread(THD *thd);
 extern bool debug_sync_set_action(THD *thd, const char *action_str, size_t len);
 
+extern bool debug_sync_update(THD *thd, char *val_str, size_t len);
+extern uchar *debug_sync_value_ptr(THD *thd);
+
 #endif /* defined(ENABLED_DEBUG_SYNC) */
 
 #endif /* DEBUG_SYNC_INCLUDED */
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index e3b7056..e29608b 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -3226,7 +3226,7 @@ int select_export::send_data(List<Item> &items)
 
           if ((NEED_ESCAPING(*pos) ||
                (check_second_byte &&
-                my_mbcharlen(character_set_client, (uchar) *pos) == 2 &&
+                ((uchar) *pos) > 0x7F /* a potential MB2HEAD */ &&
                 pos + 1 < end &&
                 NEED_ESCAPING(pos[1]))) &&
               /*
diff --git a/sql/sys_vars.ic b/sql/sys_vars.ic
index 373f583..bb290b6 100644
--- a/sql/sys_vars.ic
+++ b/sql/sys_vars.ic
@@ -1434,6 +1434,9 @@ class Sys_var_plugin: public sys_var
 };
 
 #if defined(ENABLED_DEBUG_SYNC)
+
+#include "debug_sync.h"
+
 /**
   The class for @@debug_sync session-only variable
 */
@@ -1462,15 +1465,21 @@ class Sys_var_debug_sync :public sys_var
     String str(buff, sizeof(buff), system_charset_info), *res;
 
     if (!(res=var->value->val_str(&str)))
+    {
       var->save_result.string_value.str= const_cast<char*>("");
+      var->save_result.string_value.length= 0;
+    }
     else
+    {
       var->save_result.string_value.str= thd->strmake(res->ptr(), res->length());
+      var->save_result.string_value.length= res->length();
+    }
     return false;
   }
   bool session_update(THD *thd, set_var *var)
   {
-    extern bool debug_sync_update(THD *thd, char *val_str);
-    return debug_sync_update(thd, var->save_result.string_value.str);
+    return debug_sync_update(thd, var->save_result.string_value.str,
+                                  var->save_result.string_value.length);
   }
   bool global_update(THD *thd, set_var *var)
   {
@@ -1488,7 +1497,6 @@ class Sys_var_debug_sync :public sys_var
   }
   uchar *session_value_ptr(THD *thd, const LEX_STRING *base)
   {
-    extern uchar *debug_sync_value_ptr(THD *thd);
     return debug_sync_value_ptr(thd);
   }
   uchar *global_value_ptr(THD *thd, const LEX_STRING *base)
diff --git a/storage/federated/ha_federated.cc b/storage/federated/ha_federated.cc
index 478a8f1..2334848 100644
--- a/storage/federated/ha_federated.cc
+++ b/storage/federated/ha_federated.cc
@@ -561,8 +561,7 @@ static bool append_ident(String *string, const char *name, size_t length,
     for (name_end= name+length; name < name_end; name+= clen)
     {
       uchar c= *(uchar *) name;
-      if (!(clen= my_mbcharlen(system_charset_info, c)))
-        clen= 1;
+      clen= my_charlen_fix(system_charset_info, name, name_end);
       if (clen == 1 && c == (uchar) quote_char &&
           (result= string->append(&quote_char, 1, system_charset_info)))
         goto err;
diff --git a/storage/federatedx/ha_federatedx.cc b/storage/federatedx/ha_federatedx.cc
index 890d1bf..56d900c 100644
--- a/storage/federatedx/ha_federatedx.cc
+++ b/storage/federatedx/ha_federatedx.cc
@@ -500,8 +500,7 @@ bool append_ident(String *string, const char *name, uint length,
     for (name_end= name+length; name < name_end; name+= clen)
     {
       uchar c= *(uchar *) name;
-      if (!(clen= my_mbcharlen(system_charset_info, c)))
-        clen= 1;
+      clen= my_charlen_fix(system_charset_info, name, name_end);
       if (clen == 1 && c == (uchar) quote_char &&
           (result= string->append(&quote_char, 1, system_charset_info)))
         goto err;
diff --git a/storage/spider/spd_db_conn.cc b/storage/spider/spd_db_conn.cc
index 69a05dc..a65338c 100644
--- a/storage/spider/spd_db_conn.cc
+++ b/storage/spider/spd_db_conn.cc
@@ -1370,7 +1370,7 @@ int spider_db_append_name_with_quote_str(
   for (name_end = name + length; name < name_end; name += length)
   {
     head_code = *name;
-    if (!(length = my_mbcharlen(system_charset_info, (uchar) head_code)))
+    if ((length= my_charlen(system_charset_info, name, name_end)) < 1)
     {
       my_message(ER_SPIDER_WRONG_CHARACTER_IN_NAME_NUM,
         ER_SPIDER_WRONG_CHARACTER_IN_NAME_STR, MYF(0));
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index 9629319..9ae394e 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -848,12 +848,6 @@ static uint16 big5strokexfrm(uint16 i)
 }
 
 
-static uint mbcharlen_big5(CHARSET_INFO *cs __attribute__((unused)), uint c)
-{
-  return (isbig5head(c)? 2 : 1);
-}
-
-
 /* page 0 0xA140-0xC7FC */
 static const uint16 tab_big5_uni0[]={
 0x3000,0xFF0C,0x3001,0x3002,0xFF0E,0x2022,0xFF1B,0xFF1A,
@@ -6731,7 +6725,6 @@ static MY_COLLATION_HANDLER my_collation_handler_big5_bin=
 static MY_CHARSET_HANDLER my_charset_big5_handler=
 {
   NULL,			/* init */
-  mbcharlen_big5,
   my_numchars_mb,
   my_charpos_mb,
   my_well_formed_len_big5,
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index 8331de3..aab7f2b 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -225,13 +225,6 @@ static int my_strcasecmp_bin(CHARSET_INFO * cs __attribute__((unused)),
 }
 
 
-uint my_mbcharlen_8bit(CHARSET_INFO *cs __attribute__((unused)),
-                      uint c __attribute__((unused)))
-{
-  return 1;
-}
-
-
 static int my_mb_wc_bin(CHARSET_INFO *cs __attribute__((unused)),
 			my_wc_t *wc,
 			const uchar *str,
@@ -510,7 +503,6 @@ static MY_COLLATION_HANDLER my_collation_binary_handler =
 static MY_CHARSET_HANDLER my_charset_handler=
 {
   NULL,			/* init */
-  my_mbcharlen_8bit,	/* mbcharlen     */
   my_numchars_8bit,
   my_charpos_8bit,
   my_well_formed_len_8bit,
diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c
index 2163662..151fac8 100644
--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
@@ -191,12 +191,6 @@ static const uchar sort_order_cp932[]=
 #include "ctype-mb.ic"
 
 
-static uint mbcharlen_cp932(CHARSET_INFO *cs __attribute__((unused)),uint c)
-{
-  return (iscp932head((uchar) c) ? 2 : 1);
-}
-
-
 #define cp932code(c,d)	((((uint) (uchar)(c)) << 8) | (uint) (uchar) (d))
 
 
@@ -34687,7 +34681,6 @@ static MY_COLLATION_HANDLER my_collation_handler_cp932_bin=
 static MY_CHARSET_HANDLER my_charset_handler=
 {
   NULL,			/* init */
-  mbcharlen_cp932,
   my_numchars_mb,
   my_charpos_mb,
   my_well_formed_len_cp932,
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index 19ed586..d238913 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -210,12 +210,6 @@ static const uchar sort_order_euc_kr[]=
 #include "ctype-mb.ic"
 
 
-static uint mbcharlen_euc_kr(CHARSET_INFO *cs __attribute__((unused)),uint c)
-{
-  return (iseuc_kr_head(c) ? 2 : 1);
-}
-
-
 static MY_UNICASE_CHARACTER cA3[256]=
 {
   {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */
@@ -9979,7 +9973,6 @@ static MY_COLLATION_HANDLER my_collation_handler_euckr_bin=
 static MY_CHARSET_HANDLER my_charset_handler=
 {
   NULL,			/* init */
-  mbcharlen_euc_kr,
   my_numchars_mb,
   my_charpos_mb,
   my_well_formed_len_euckr,
diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c
index 469d3a5..caafd1d 100644
--- a/strings/ctype-eucjpms.c
+++ b/strings/ctype-eucjpms.c
@@ -221,12 +221,6 @@ static const uchar sort_order_eucjpms[]=
 #include "strcoll.ic"
 
 
-static uint mbcharlen_eucjpms(CHARSET_INFO *cs __attribute__((unused)),uint c)
-{
-  return (iseucjpms(c)? 2: iseucjpms_ss2(c)? 2: iseucjpms_ss3(c)? 3: 1);
-}
-
-
 /* Case info pages for JIS-X-0208 range */
 
 static MY_UNICASE_CHARACTER cA2[256]=
@@ -67511,7 +67505,6 @@ static MY_COLLATION_HANDLER my_collation_eucjpms_bin_handler =
 static MY_CHARSET_HANDLER my_charset_handler=
 {
     NULL,		/* init */
-    mbcharlen_eucjpms,
     my_numchars_mb,
     my_charpos_mb,
     my_well_formed_len_eucjpms,
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index a77237c..dbb92fa 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -173,12 +173,6 @@ static const uchar sort_order_gb2312[]=
 #include "ctype-mb.ic"
 
 
-static uint mbcharlen_gb2312(CHARSET_INFO *cs __attribute__((unused)),uint c)
-{
-  return (isgb2312head(c)? 2 : 1);
-}
-
-
 static MY_UNICASE_CHARACTER cA2[256]=
 {
   {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */
@@ -6385,7 +6379,6 @@ static MY_COLLATION_HANDLER my_collation_handler_gb2312_bin=
 static MY_CHARSET_HANDLER my_charset_handler=
 {
   NULL,			/* init */
-  mbcharlen_gb2312,
   my_numchars_mb,
   my_charpos_mb,
   my_well_formed_len_gb2312,
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index e4e015a..617d72d 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -3451,11 +3451,6 @@ static uint16 gbksortorder(uint16 i)
 }
 
 
-static uint mbcharlen_gbk(CHARSET_INFO *cs __attribute__((unused)),uint c)
-{
-  return (isgbkhead(c)? 2 : 1);
-}
-
 /* page 0 0x8140-0xFE4F */
 static const uint16 tab_gbk_uni0[]={
 0x4E02,0x4E04,0x4E05,0x4E06,0x4E0F,0x4E12,0x4E17,0x4E1F,
@@ -10666,7 +10661,6 @@ static MY_COLLATION_HANDLER my_collation_handler_gbk_bin=
 static MY_CHARSET_HANDLER my_charset_handler=
 {
   NULL,			/* init */
-  mbcharlen_gbk,
   my_numchars_mb,
   my_charpos_mb,
   my_well_formed_len_gbk,
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index aba63d9..fe25534 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -396,7 +396,6 @@ int my_wc_mb_latin1(CHARSET_INFO *cs  __attribute__((unused)),
 static MY_CHARSET_HANDLER my_charset_handler=
 {
     NULL,			/* init */
-    my_mbcharlen_8bit,
     my_numchars_8bit,
     my_charpos_8bit,
     my_well_formed_len_8bit,
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index 3fa66cb..56b3309 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -230,7 +230,7 @@ int my_strcasecmp_mb(CHARSET_INFO * cs,const char *s, const char *t)
         if (*s++ != *t++) 
           return 1;
     }
-    else if (my_mbcharlen(cs, *t) > 1)
+    else if (my_charlen(cs, t, t + cs->mbmaxlen) > 1)
       return 1;
     else if (map[(uchar) *s++] != map[(uchar) *t++])
       return 1;
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index 5e5a345..f405c4f 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -1059,6 +1059,13 @@ size_t my_scan_8bit(CHARSET_INFO *cs, const char *str, const char *end, int sq)
         break;
     }
     return (size_t) (str - str0);
+  case MY_SEQ_NONSPACES:
+    for ( ; str < end ; str++)
+    {
+      if (my_isspace(cs, *str))
+        break;
+    }
+    return (size_t) (str - str0);
   default:
     return 0;
   }
@@ -1916,7 +1923,6 @@ my_strxfrm_pad_desc_and_reverse(CHARSET_INFO *cs,
 MY_CHARSET_HANDLER my_charset_8bit_handler=
 {
     my_cset_init_8bit,
-    my_mbcharlen_8bit,		/* mbcharlen     */
     my_numchars_8bit,
     my_charpos_8bit,
     my_well_formed_len_8bit,
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index ebcea22..e054614 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -192,12 +192,6 @@ static const uchar sort_order_sjis[]=
 #include "ctype-mb.ic"
 
 
-static uint mbcharlen_sjis(CHARSET_INFO *cs __attribute__((unused)),uint c)
-{
-  return (issjishead((uchar) c) ? 2 : 1);
-}
-
-
 #define sjiscode(c,d)	((((uint) (uchar)(c)) << 8) | (uint) (uchar) (d))
 
 
@@ -34066,7 +34060,6 @@ static MY_COLLATION_HANDLER my_collation_handler_sjis_bin=
 static MY_CHARSET_HANDLER my_charset_handler=
 {
   NULL,			/* init */
-  mbcharlen_sjis,
   my_numchars_mb,
   my_charpos_mb,
   my_well_formed_len_sjis,
diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c
index 711bb21..82fd864 100644
--- a/strings/ctype-tis620.c
+++ b/strings/ctype-tis620.c
@@ -834,7 +834,6 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
 static MY_CHARSET_HANDLER my_charset_handler=
 {
     NULL,		/* init */
-    my_mbcharlen_8bit,	/* mbcharlen */
     my_numchars_8bit,
     my_charpos_8bit,
     my_well_formed_len_8bit,
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index 74e474c..b5fab16 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -1049,6 +1049,9 @@ my_scan_mb2(CHARSET_INFO *cs __attribute__((unused)),
     {
     }
     return (size_t) (str - str0);
+  case MY_SEQ_NONSPACES:
+    DBUG_ASSERT(0);
+    /* pass through */
   default:
     return 0;
   }
@@ -1431,15 +1434,6 @@ my_charlen_utf16(CHARSET_INFO *cs, const uchar *str, const uchar *end)
 /* Defines my_well_formed_char_length_utf16 */
 
 
-static uint
-my_mbcharlen_utf16(CHARSET_INFO *cs  __attribute__((unused)),
-                   uint c __attribute__((unused)))
-{
-  DBUG_ASSERT(0);
-  return MY_UTF16_HIGH_HEAD(c) ? 4 : 2;
-}
-
-
 static size_t
 my_numchars_utf16(CHARSET_INFO *cs,
                   const char *b, const char *e)
@@ -1567,7 +1561,6 @@ static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
 MY_CHARSET_HANDLER my_charset_utf16_handler=
 {
   NULL,                /* init         */
-  my_mbcharlen_utf16,  /* mbcharlen    */
   my_numchars_utf16,
   my_charpos_utf16,
   my_well_formed_len_utf16,
@@ -1789,7 +1782,6 @@ static MY_COLLATION_HANDLER my_collation_utf16le_bin_handler =
 static MY_CHARSET_HANDLER my_charset_utf16le_handler=
 {
   NULL,                /* init         */
-  my_mbcharlen_utf16,
   my_numchars_utf16,
   my_charpos_utf16,
   my_well_formed_len_utf16,
@@ -2083,14 +2075,6 @@ my_charlen_utf32(CHARSET_INFO *cs __attribute__((unused)),
 /* Defines my_well_formed_char_length_utf32 */
 
 
-static uint
-my_mbcharlen_utf32(CHARSET_INFO *cs  __attribute__((unused)) , 
-                   uint c __attribute__((unused)))
-{
-  return 4;
-}
-
-
 static int
 my_vsnprintf_utf32(char *dst, size_t n, const char* fmt, va_list ap)
 {
@@ -2484,6 +2468,9 @@ my_scan_utf32(CHARSET_INFO *cs,
       str+= res;
     }
     return (size_t) (str - str0);
+  case MY_SEQ_NONSPACES:
+    DBUG_ASSERT(0);
+    /* pass through */
   default:
     return 0;
   }
@@ -2525,7 +2512,6 @@ static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
 MY_CHARSET_HANDLER my_charset_utf32_handler=
 {
   NULL, /* init */
-  my_mbcharlen_utf32,
   my_numchars_utf32,
   my_charpos_utf32,
   my_well_formed_len_utf32,
@@ -2862,13 +2848,6 @@ my_fill_ucs2(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
-static uint my_mbcharlen_ucs2(CHARSET_INFO *cs  __attribute__((unused)) , 
-                              uint c __attribute__((unused)))
-{
-  return 2;
-}
-
-
 static
 size_t my_numchars_ucs2(CHARSET_INFO *cs __attribute__((unused)),
                         const char *b, const char *e)
@@ -3003,7 +2982,6 @@ static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
 MY_CHARSET_HANDLER my_charset_ucs2_handler=
 {
     NULL,		/* init */
-    my_mbcharlen_ucs2,	/* mbcharlen    */
     my_numchars_ucs2,
     my_charpos_ucs2,
     my_well_formed_len_ucs2,
diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c
index b24fdb3..786ae99 100644
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@@ -220,12 +220,6 @@ static const uchar sort_order_ujis[]=
 #include "strcoll.ic"
 
 
-static uint mbcharlen_ujis(CHARSET_INFO *cs __attribute__((unused)),uint c)
-{
-  return (isujis(c)? 2: isujis_ss2(c)? 2: isujis_ss3(c)? 3: 1);
-}
-
-
 static
 size_t my_numcells_eucjp(CHARSET_INFO *cs __attribute__((unused)),
                        const char *str, const char *str_end)
@@ -67255,7 +67249,6 @@ static MY_COLLATION_HANDLER my_collation_ujis_bin_handler =
 static MY_CHARSET_HANDLER my_charset_handler=
 {
     NULL,		/* init */
-    mbcharlen_ujis,
     my_numchars_mb,
     my_charpos_mb,
     my_well_formed_len_ujis,
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 3a5616b..b6a7a0d 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -5426,21 +5426,6 @@ my_weight_mb3_utf8_general_mysql500_ci(uchar b0, uchar b1, uchar b2)
 #include "strcoll.ic"
 
 
-static uint my_mbcharlen_utf8(CHARSET_INFO *cs  __attribute__((unused)),
-                              uint c)
-{
-  if (c < 0x80)
-    return 1;
-  else if (c < 0xc2)
-    return 0; /* Illegal mb head */
-  else if (c < 0xe0)
-    return 2;
-  else if (c < 0xf0)
-    return 3;
-  return 0; /* Illegal mb head */;
-}
-
-
 static MY_COLLATION_HANDLER my_collation_utf8_general_ci_handler =
 {
     NULL,               /* init */
@@ -5491,7 +5476,6 @@ static MY_COLLATION_HANDLER my_collation_utf8_bin_handler =
 MY_CHARSET_HANDLER my_charset_utf8_handler=
 {
     NULL,               /* init */
-    my_mbcharlen_utf8,
     my_numchars_mb,
     my_charpos_mb,
     my_well_formed_len_utf8,
@@ -7045,7 +7029,6 @@ static MY_COLLATION_HANDLER my_collation_filename_handler =
 static MY_CHARSET_HANDLER my_charset_filename_handler=
 {
     NULL,               /* init */
-    my_mbcharlen_utf8,
     my_numchars_mb,
     my_charpos_mb,
     my_well_formed_len_mb,
@@ -7111,57 +7094,6 @@ struct charset_info_st my_charset_filename=
 };
 
 
-#ifdef MY_TEST_UTF8
-#include <stdio.h>
-
-static void test_mb(CHARSET_INFO *cs, uchar *s)
-{
-  while(*s)
-  {
-    if (my_ismbhead_utf8(cs,*s))
-    {
-      uint len=my_mbcharlen_utf8(cs,*s);
-      while(len--)
-      {
-        printf("%c",*s);
-        s++;
-      }
-      printf("\n");
-    }
-    else
-    {
-      printf("%c\n",*s);
-      s++;
-    }
-  }
-}
-
-int main()
-{
-  char str[1024]=" utf8 test проба ПЕРА по-РУССКИ";
-  CHARSET_INFO *cs;
-
-  test_mb(cs,(uchar*)str);
-
-  printf("orig      :'%s'\n",str);
-
-  my_caseup_utf8(cs,str,15);
-  printf("caseup    :'%s'\n",str);
-
-  my_caseup_str_utf8(cs,str);
-  printf("caseup_str:'%s'\n",str);
-
-  my_casedn_utf8(cs,str,15);
-  printf("casedn    :'%s'\n",str);
-
-  my_casedn_str_utf8(cs,str);
-  printf("casedn_str:'%s'\n",str);
-
-  return 0;
-}
-
-#endif
-
 #endif /* HAVE_CHARSET_UTF8 */
 
 
@@ -7755,23 +7687,6 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
 #include "strcoll.ic"
 
 
-static uint
-my_mbcharlen_utf8mb4(CHARSET_INFO *cs  __attribute__((unused)), uint c)
-{
-  if (c < 0x80)
-    return 1;
-  if (c < 0xc2)
-    return 0; /* Illegal mb head */
-  if (c < 0xe0)
-    return 2;
-  if (c < 0xf0)
-    return 3;
-  if (c < 0xf8)
-    return 4;
-  return 0; /* Illegal mb head */;
-}
-
-
 static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
 {
   NULL,               /* init */
@@ -7807,7 +7722,6 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
 MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
 {
   NULL,               /* init */
-  my_mbcharlen_utf8mb4,
   my_numchars_mb,
   my_charpos_mb,
   my_well_formed_len_utf8mb4,
diff --git a/strings/my_strchr.c b/strings/my_strchr.c
index 0305ef8..2365731 100644
--- a/strings/my_strchr.c
+++ b/strings/my_strchr.c
@@ -38,7 +38,7 @@
     const char *acc_end= (ACC) + (LEN);                                 \
     for (ptr_str= (STR) ; ptr_str < (END) ; ptr_str+= mbl)              \
     {                                                                   \
-      mbl= my_mbcharlen((CS), *(uchar*)ptr_str);                        \
+      mbl= my_charlen_fix((CS), ptr_str, (END));                        \
       if (mbl < 2)                                                      \
       {                                                                 \
         DBUG_ASSERT(mbl == 1);                                          \
@@ -63,10 +63,9 @@ end:                                                                    \
 char *my_strchr(CHARSET_INFO *cs, const char *str, const char *end,
                 pchar c)
 {
-  uint mbl;
   while (str < end)
   {
-    mbl= my_mbcharlen(cs, *(uchar *)str);
+    uint mbl= my_ismbchar(cs, str, end);
     if (mbl < 2)
     {
       if (*str == c)
diff --git a/strings/my_vsnprintf.c b/strings/my_vsnprintf.c
index 4178b20..75514a9 100644
--- a/strings/my_vsnprintf.c
+++ b/strings/my_vsnprintf.c
@@ -168,8 +168,7 @@ static char *backtick_string(CHARSET_INFO *cs, char *to, const char *end,
   for ( ; par < par_end; par+= char_len)
   {
     uchar c= *(uchar *) par;
-    if (!(char_len= my_mbcharlen(cs, c)))
-      char_len= 1;
+    char_len= my_charlen_fix(cs, par, par_end);
     if (char_len == 1 && c == (uchar) quote_char )
     {
       if (start + 1 >= end)

Follow ups

References