maria-developers team mailing list archive

Thread
Date

MDEV-8214 Asian MB2 charsets: compare broken bytes as "greater than any non-broken character"

To: Sergei Golubchik <serg@xxxxxxxxxxx>, maria-developers <maria-developers@xxxxxxxxxxxxxxxxxxx>
From: Alexander Barkov <bar@xxxxxxxxxxx>
Date: Mon, 25 May 2015 00:08:32 +0400
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Thunderbird/31.6.0

  Hi Sergei,

For simplicity I decided to split:

MDEV-8036 Fix all collations to compare broken bytes as "greater than any non-broken character"


into sub-tasks.


Please review a patch for the first sub-task MDEV-8214.


The overall plan is:
- MDEV-8214, for the Asian charsets with mbmaxlen=2
- MDEV-8215, for the Asian charsets with mbmaxlen=3, based on MDEV-8214
- MDEV-8036, the rest (Unicode character sets)


Thanks.

diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index eda81c0..aa06a7a 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -49,6 +49,7 @@
 #define big5tail(e)	((uchar)(e&0xff))
 
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _big5
+#define IS_MB1_CHAR(x)        ((uchar) (x) < 0x80)
 #define IS_MB2_CHAR(x,y)      (isbig5head(x) && isbig5tail(y))
 #define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
@@ -849,89 +850,6 @@ static uint16 big5strokexfrm(uint16 i)
 }
 
 
-
-static int my_strnncoll_big5_internal(const uchar **a_res,
-				      const uchar **b_res, size_t length)
-{
-  const uchar *a= *a_res, *b= *b_res;
-
-  while (length--)
-  {
-    if ((length > 0) && isbig5code(*a,*(a+1)) && isbig5code(*b, *(b+1)))
-    {
-      if (*a != *b || *(a+1) != *(b+1))
-	return ((int) big5code(*a,*(a+1)) -
-		(int) big5code(*b,*(b+1)));
-      a+= 2;
-      b+= 2;
-      length--;
-    }
-    else if (sort_order_big5[*a++] !=
-	     sort_order_big5[*b++])
-      return ((int) sort_order_big5[a[-1]] -
-	      (int) sort_order_big5[b[-1]]);
-  }
-  *a_res= a;
-  *b_res= b;
-  return 0;
-}
-
-
-/* Compare strings */
-
-static int my_strnncoll_big5(CHARSET_INFO *cs __attribute__((unused)), 
-			     const uchar *a, size_t a_length,
-                             const uchar *b, size_t b_length,
-                             my_bool b_is_prefix)
-{
-  size_t length= MY_MIN(a_length, b_length);
-  int res= my_strnncoll_big5_internal(&a, &b, length);
-  return res ? res : (int)((b_is_prefix ? length : a_length) - b_length);
-}
-
-
-/* compare strings, ignore end space */
-
-static int my_strnncollsp_big5(CHARSET_INFO * cs __attribute__((unused)), 
-			       const uchar *a, size_t a_length, 
-			       const uchar *b, size_t b_length,
-                               my_bool diff_if_only_endspace_difference)
-{
-  size_t length= MY_MIN(a_length, b_length);
-  int res= my_strnncoll_big5_internal(&a, &b, length);
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
-  diff_if_only_endspace_difference= 0;
-#endif
-
-  if (!res && a_length != b_length)
-  {
-    const uchar *end;
-    int swap= 1;
-    if (diff_if_only_endspace_difference)
-      res= 1;                                   /* Assume 'a' is bigger */
-    /*
-      Check the next not space character of the longer key. If it's < ' ',
-      then it's smaller than the other key.
-    */
-    if (a_length < b_length)
-    {
-      /* put longer key in a */
-      a_length= b_length;
-      a= b;
-      swap= -1;                                 /* swap sign of result */
-      res= -res;
-    }
-    for (end= a + a_length-length; a < end ; a++)
-    {
-      if (*a != ' ')
-	return (*a < ' ') ? -swap : swap;
-    }
-  }
-  return res;
-}
-
-
 static size_t
 my_strnxfrm_big5(CHARSET_INFO *cs,
                  uchar *dst, size_t dstlen, uint nweights,
@@ -6853,11 +6771,29 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
-static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler =
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _big5_chinese_ci
+#define WEIGHT_MB1(x)        (sort_order_big5[(uchar) (x)])
+#define WEIGHT_MB2(x,y)      (big5code(x, y))
+#include "ctype-strcoll.ic"
+
+
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _big5_bin
+#define WEIGHT_MB1(x)        ((uchar) (x))
+#define WEIGHT_MB2(x,y)      (big5code(x, y))
+#include "ctype-strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_big5_chinese_ci=
 {
   NULL,			/* init */
-  my_strnncoll_big5,
-  my_strnncollsp_big5,
+  my_strnncoll_big5_chinese_ci,
+  my_strnncollsp_big5_chinese_ci,
   my_strnxfrm_big5,
   my_strnxfrmlen_simple,
   my_like_range_mb,
@@ -6868,6 +6804,23 @@ static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler =
   my_propagate_simple
 };
 
+
+static MY_COLLATION_HANDLER my_collation_handler_big5_bin=
+{
+  NULL,	                /* init */
+  my_strnncoll_big5_bin,
+  my_strnncollsp_big5_bin,
+  my_strnxfrm_mb,
+  my_strnxfrmlen_simple,
+  my_like_range_mb,
+  my_wildcmp_mb_bin,
+  my_strcasecmp_mb_bin,
+  my_instr_mb,
+  my_hash_sort_mb_bin,
+  my_propagate_simple
+};
+
+
 static MY_CHARSET_HANDLER my_charset_big5_handler=
 {
   NULL,			/* init */
@@ -6931,7 +6884,7 @@ struct charset_info_st my_charset_big5_chinese_ci=
     1,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_big5_handler,
-    &my_collation_big5_chinese_ci_handler
+    &my_collation_handler_big5_chinese_ci
 };
 
 
@@ -6964,7 +6917,7 @@ struct charset_info_st my_charset_big5_bin=
     1,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_big5_handler,
-    &my_collation_mb_bin_handler
+    &my_collation_handler_big5_bin
 };
 
 
diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c
index 2e26a98..2454c9a 100644
--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
@@ -185,6 +185,7 @@ static const uchar sort_order_cp932[]=
 
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _cp932
 #define IS_8BIT_CHAR(x)       iscp932kata(x)
+#define IS_MB1_CHAR(x)        ((uchar) (x) < 0x80 || iscp932kata(x))
 #define IS_MB2_CHAR(x,y)      (iscp932head(x) && iscp932tail(y))
 #define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
@@ -1717,90 +1718,6 @@ MY_UNICASE_INFO my_caseinfo_cp932=
   my_caseinfo_pages_cp932
 };
 
-static int my_strnncoll_cp932_internal(CHARSET_INFO *cs,
-				      const uchar **a_res, size_t a_length,
-				      const uchar **b_res, size_t b_length)
-{
-  const uchar *a= *a_res, *b= *b_res;
-  const uchar *a_end= a + a_length;
-  const uchar *b_end= b + b_length;
-  while (a < a_end && b < b_end)
-  {
-    if (ismbchar_cp932(cs,(char*) a, (char*) a_end) &&
-	ismbchar_cp932(cs,(char*) b, (char*) b_end))
-    {
-      uint a_char= cp932code(*a, *(a+1));
-      uint b_char= cp932code(*b, *(b+1));
-      if (a_char != b_char)
-	return a_char - b_char;
-      a += 2;
-      b += 2;
-    } else
-    {
-      if (sort_order_cp932[(uchar)*a] != sort_order_cp932[(uchar)*b])
-	return sort_order_cp932[(uchar)*a] - sort_order_cp932[(uchar)*b];
-      a++;
-      b++;
-    }
-  }
-  *a_res= a;
-  *b_res= b;
-  return 0;
-}
-
-
-static int my_strnncoll_cp932(CHARSET_INFO *cs __attribute__((unused)),
-			      const uchar *a, size_t a_length, 
-			      const uchar *b, size_t b_length,
-                              my_bool b_is_prefix)
-{
-  int res= my_strnncoll_cp932_internal(cs, &a, a_length, &b, b_length);
-  if (b_is_prefix && a_length > b_length)
-    a_length= b_length;
-  return res ? res : (int) (a_length - b_length);
-}
-
-
-static int my_strnncollsp_cp932(CHARSET_INFO *cs __attribute__((unused)),
-                                const uchar *a, size_t a_length, 
-                                const uchar *b, size_t b_length,
-                                my_bool diff_if_only_endspace_difference
-                                __attribute__((unused)))
-{
-  const uchar *a_end= a + a_length;
-  const uchar *b_end= b + b_length;
-  int res= my_strnncoll_cp932_internal(cs, &a, a_length, &b, b_length);
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
-  diff_if_only_endspace_difference= 0;
-#endif
-
-  if (!res && (a != a_end || b != b_end))
-  {
-    int swap= 1;
-    if (diff_if_only_endspace_difference)
-      res= 1;                                   /* Assume 'a' is bigger */
-    /*
-      Check the next not space character of the longer key. If it's < ' ',
-      then it's smaller than the other key.
-    */
-    if (a == a_end)
-    {
-      /* put shorter key in a */
-      a_end= b_end;
-      a= b;
-      swap= -1;				/* swap sign of result */
-      res= -res;
-    }
-    for (; a < a_end ; a++)
-    {
-      if (*a != (uchar) ' ')
-	return (*a < (uchar) ' ') ? -swap : swap;
-    }
-  }
-  return res;
-}
-
 
 static const uint16 cp932_to_unicode[65536]=
 {
@@ -34720,15 +34637,44 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
-static MY_COLLATION_HANDLER my_collation_ci_handler =
+/*
+  cp932_japanese_ci and cp932_bin sort character blocks in this order:
+  1. [00..7F]                - 7BIT characters (ASCII)
+  2. [81..9F][40..7E,80..FC] - MB2 characters, part1
+  3. [A1..DF]                - 8BIT characters (Kana)
+  4. [E0..FC][40..7E,80..FC] - MB2 characters, part2
+*/
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_PAD_SPACE
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _cp932_japanese_ci
+#define WEIGHT_PAD_SPACE     (256 * (int) ' ')
+#define WEIGHT_MB1(x)        (256 * (int) sort_order_cp932[(uchar) (x)])
+#define WEIGHT_MB2(x,y)      (cp932code(x, y))
+#include "ctype-strcoll.ic"
+
+
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_PAD_SPACE
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _cp932_bin
+#define WEIGHT_PAD_SPACE     (256 * (int) ' ')
+#define WEIGHT_MB1(x)        (256 * (int) (uchar) (x))
+#define WEIGHT_MB2(x,y)      (cp932code(x, y))
+#include "ctype-strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_cp932_japanese_ci=
 {
-  NULL,			/* init */
-  my_strnncoll_cp932,
-  my_strnncollsp_cp932,
+  NULL,                  /* init */
+  my_strnncoll_cp932_japanese_ci,
+  my_strnncollsp_cp932_japanese_ci,
   my_strnxfrm_mb,
   my_strnxfrmlen_simple,
   my_like_range_mb,
-  my_wildcmp_mb,	/* wildcmp  */
+  my_wildcmp_mb,
   my_strcasecmp_8bit,
   my_instr_mb,
   my_hash_sort_simple,
@@ -34736,6 +34682,22 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
 };
 
 
+static MY_COLLATION_HANDLER my_collation_handler_cp932_bin=
+{
+  NULL,	                /* init */
+  my_strnncoll_cp932_bin,
+  my_strnncollsp_cp932_bin,
+  my_strnxfrm_mb,
+  my_strnxfrmlen_simple,
+  my_like_range_mb,
+  my_wildcmp_mb_bin,
+  my_strcasecmp_mb_bin,
+  my_instr_mb,
+  my_hash_sort_mb_bin,
+  my_propagate_simple
+};
+
+
 static MY_CHARSET_HANDLER my_charset_handler=
 {
   NULL,			/* init */
@@ -34800,7 +34762,7 @@ struct charset_info_st my_charset_cp932_japanese_ci=
     1,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_handler,
-    &my_collation_ci_handler
+    &my_collation_handler_cp932_japanese_ci
 };
 
 struct charset_info_st my_charset_cp932_bin=
@@ -34832,7 +34794,7 @@ struct charset_info_st my_charset_cp932_bin=
     1,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_handler,
-    &my_collation_mb_bin_handler
+    &my_collation_handler_cp932_bin
 };
 
 #endif
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index a2c95bf..db47b3d 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -201,8 +201,10 @@ static const uchar sort_order_euc_kr[]=
                               iseuc_kr_tail2(c) || \
                               iseuc_kr_tail3(c))
 
+#define euckrcode(c,d)        (((uchar)(c) <<8) | (uchar)(d))
 
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _euckr
+#define IS_MB1_CHAR(x)        ((uchar) (x) < 0x80)
 #define IS_MB2_CHAR(x,y)      (iseuc_kr_head(x) && iseuc_kr_tail(y))
 #define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
@@ -9938,21 +9940,56 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
-static MY_COLLATION_HANDLER my_collation_ci_handler =
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _euckr_korean_ci
+#define WEIGHT_MB1(x)        (sort_order_euc_kr[(uchar) (x)])
+#define WEIGHT_MB2(x,y)      (euckrcode(x, y))
+#include "ctype-strcoll.ic"
+
+
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _euckr_bin
+#define WEIGHT_MB1(x)        ((uchar) (x))
+#define WEIGHT_MB2(x,y)      (euckrcode(x, y))
+#include "ctype-strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_euckr_korean_ci=
 {
-  NULL,			/* init */
-  my_strnncoll_simple,  /* strnncoll  */
-  my_strnncollsp_simple,
-  my_strnxfrm_mb,	/* strnxfrm   */
+  NULL,                 /* init */
+  my_strnncoll_euckr_korean_ci,
+  my_strnncollsp_euckr_korean_ci,
+  my_strnxfrm_mb,
   my_strnxfrmlen_simple,
-  my_like_range_mb,     /* like_range */
-  my_wildcmp_mb,	/* wildcmp    */
+  my_like_range_mb,
+  my_wildcmp_mb,
   my_strcasecmp_mb,
   my_instr_mb,
   my_hash_sort_simple,
   my_propagate_simple
 };
 
+
+static MY_COLLATION_HANDLER my_collation_handler_euckr_bin=
+{
+  NULL,                 /* init */
+  my_strnncoll_euckr_bin,
+  my_strnncollsp_euckr_bin,
+  my_strnxfrm_mb,
+  my_strnxfrmlen_simple,
+  my_like_range_mb,
+  my_wildcmp_mb_bin,
+  my_strcasecmp_mb_bin,
+  my_instr_mb,
+  my_hash_sort_mb_bin,
+  my_propagate_simple
+};
+
+
 static MY_CHARSET_HANDLER my_charset_handler=
 {
   NULL,			/* init */
@@ -10017,7 +10054,7 @@ struct charset_info_st my_charset_euckr_korean_ci=
     0,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_handler,
-    &my_collation_ci_handler
+    &my_collation_handler_euckr_korean_ci
 };
 
 
@@ -10050,7 +10087,7 @@ struct charset_info_st my_charset_euckr_bin=
     0,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_handler,
-    &my_collation_mb_bin_handler
+    &my_collation_handler_euckr_bin
 };
 
 #endif
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index 129e8ed..abe9bc8 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -163,9 +163,11 @@ static const uchar sort_order_gb2312[]=
 
 #define isgb2312head(c) (0xa1<=(uchar)(c) && (uchar)(c)<=0xf7)
 #define isgb2312tail(c) (0xa1<=(uchar)(c) && (uchar)(c)<=0xfe)
+#define gb2312code(c,d) (((uchar)(c) <<8) | (uchar)(d))
 
 
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _gb2312
+#define IS_MB1_CHAR(x)        ((uchar) (x) < 0x80)
 #define IS_MB2_CHAR(x,y)      (isgb2312head(x) && isgb2312tail(y))
 #define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
@@ -6341,11 +6343,29 @@ my_mb_wc_gb2312(CHARSET_INFO *cs  __attribute__((unused)),
 }
 
 
-static MY_COLLATION_HANDLER my_collation_ci_handler =
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _gb2312_chinese_ci
+#define WEIGHT_MB1(x)        (sort_order_gb2312[(uchar) (x)])
+#define WEIGHT_MB2(x,y)      (gb2312code(x, y))
+#include "ctype-strcoll.ic"
+
+
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _gb2312_bin
+#define WEIGHT_MB1(x)        ((uchar) (x))
+#define WEIGHT_MB2(x,y)      (gb2312code(x, y))
+#include "ctype-strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_gb2312_chinese_ci=
 {
-  NULL,			/* init */
-  my_strnncoll_simple,  /* strnncoll  */
-  my_strnncollsp_simple,
+  NULL,                 /* init */
+  my_strnncoll_gb2312_chinese_ci,
+  my_strnncollsp_gb2312_chinese_ci,
   my_strnxfrm_mb,       /* strnxfrm   */
   my_strnxfrmlen_simple,
   my_like_range_mb,     /* like_range */
@@ -6356,6 +6376,24 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
   my_propagate_simple
 };
 
+
+static MY_COLLATION_HANDLER my_collation_handler_gb2312_bin=
+{
+  NULL,	                /* init */
+  my_strnncoll_gb2312_bin,
+  my_strnncollsp_gb2312_bin,
+  my_strnxfrm_mb,
+  my_strnxfrmlen_simple,
+  my_like_range_mb,
+  my_wildcmp_mb_bin,
+  my_strcasecmp_mb_bin,
+  my_instr_mb,
+  my_hash_sort_mb_bin,
+  my_propagate_simple
+};
+
+
+
 static MY_CHARSET_HANDLER my_charset_handler=
 {
   NULL,			/* init */
@@ -6420,9 +6458,10 @@ struct charset_info_st my_charset_gb2312_chinese_ci=
     0,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_handler,
-    &my_collation_ci_handler
+    &my_collation_handler_gb2312_chinese_ci
 };
 
+
 struct charset_info_st my_charset_gb2312_bin=
 {
     86,0,0,		/* number */
@@ -6452,7 +6491,7 @@ struct charset_info_st my_charset_gb2312_bin=
     0,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_handler,
-    &my_collation_mb_bin_handler
+    &my_collation_handler_gb2312_bin
 };
 
 #endif
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index b3bd1ef..df02f3f 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -44,6 +44,7 @@
 #define gbktail(e)     ((uchar)(e&0xff))
 
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _gbk
+#define IS_MB1_CHAR(x)        ((uchar) (x) < 0x80)
 #define IS_MB2_CHAR(x,y)      (isgbkhead(x) && isgbktail(y))
 #define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
@@ -3450,87 +3451,6 @@ static uint16 gbksortorder(uint16 i)
 }
 
 
-int my_strnncoll_gbk_internal(const uchar **a_res, const uchar **b_res,
-			      size_t length)
-{
-  const uchar *a= *a_res, *b= *b_res;
-  uint a_char,b_char; 
-
-  while (length--)
-  {
-    if ((length > 0) && isgbkcode(*a,*(a+1)) && isgbkcode(*b, *(b+1)))
-    {
-      a_char= gbkcode(*a,*(a+1));
-      b_char= gbkcode(*b,*(b+1));
-      if (a_char != b_char)
-        return ((int) gbksortorder((uint16) a_char) -
-		(int) gbksortorder((uint16) b_char));
-      a+= 2;
-      b+= 2;
-      length--;
-    }
-    else if (sort_order_gbk[*a++] != sort_order_gbk[*b++])
-      return ((int) sort_order_gbk[a[-1]] -
-	      (int) sort_order_gbk[b[-1]]);
-  }
-  *a_res= a;
-  *b_res= b;
-  return 0;
-}
-
-
-
-int my_strnncoll_gbk(CHARSET_INFO *cs __attribute__((unused)),
-		     const uchar *a, size_t a_length,
-                     const uchar *b, size_t b_length,
-                     my_bool b_is_prefix)
-{
-  size_t length= MY_MIN(a_length, b_length);
-  int res= my_strnncoll_gbk_internal(&a, &b, length);
-  return res ? res : (int) ((b_is_prefix ? length : a_length) - b_length);
-}
-
-
-static int my_strnncollsp_gbk(CHARSET_INFO * cs __attribute__((unused)),
-			      const uchar *a, size_t a_length, 
-			      const uchar *b, size_t b_length,
-                              my_bool diff_if_only_endspace_difference)
-{
-  size_t length= MY_MIN(a_length, b_length);
-  int res= my_strnncoll_gbk_internal(&a, &b, length);
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
-  diff_if_only_endspace_difference= 0;
-#endif
-
-  if (!res && a_length != b_length)
-  {
-    const uchar *end;
-    int swap= 1;
-    if (diff_if_only_endspace_difference)
-      res= 1;                                   /* Assume 'a' is bigger */
-    /*
-      Check the next not space character of the longer key. If it's < ' ',
-      then it's smaller than the other key.
-    */
-    if (a_length < b_length)
-    {
-      /* put shorter key in a */
-      a_length= b_length;
-      a= b;
-      swap= -1;				/* swap sign of result */
-      res= -res;
-    }
-    for (end= a + a_length-length; a < end ; a++)
-    {
-      if (*a != ' ')
-	return (*a < ' ') ? -swap : swap;
-    }
-  }
-  return res;
-}
-
-
 static size_t
 my_strnxfrm_gbk(CHARSET_INFO *cs,
                 uchar *dst, size_t dstlen, uint nweights,
@@ -10735,11 +10655,29 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
-static MY_COLLATION_HANDLER my_collation_ci_handler =
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _gbk_chinese_ci
+#define WEIGHT_MB1(x)        (sort_order_gbk[(uchar) (x)])
+#define WEIGHT_MB2(x,y)      (gbksortorder(gbkcode(x,y)))
+#include "ctype-strcoll.ic"
+
+
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _gbk_bin
+#define WEIGHT_MB1(x)        ((uchar) (x))
+#define WEIGHT_MB2(x,y)      (gbkcode(x,y))
+#include "ctype-strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_gbk_chinese_ci=
 {
-  NULL,			/* init */
-  my_strnncoll_gbk,
-  my_strnncollsp_gbk,
+  NULL,                 /* init */
+  my_strnncoll_gbk_chinese_ci,
+  my_strnncollsp_gbk_chinese_ci,
   my_strnxfrm_gbk,
   my_strnxfrmlen_simple,
   my_like_range_mb,
@@ -10750,6 +10688,24 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
   my_propagate_simple
 };
 
+
+static MY_COLLATION_HANDLER my_collation_handler_gbk_bin=
+{
+  NULL,                 /* init */
+  my_strnncoll_gbk_bin,
+  my_strnncollsp_gbk_bin,
+  my_strnxfrm_mb,
+  my_strnxfrmlen_simple,
+  my_like_range_mb,
+  my_wildcmp_mb_bin,
+  my_strcasecmp_mb_bin,
+  my_instr_mb,
+  my_hash_sort_mb_bin,
+  my_propagate_simple
+};
+
+
+
 static MY_CHARSET_HANDLER my_charset_handler=
 {
   NULL,			/* init */
@@ -10814,7 +10770,7 @@ struct charset_info_st my_charset_gbk_chinese_ci=
     1,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_handler,
-    &my_collation_ci_handler
+    &my_collation_handler_gbk_chinese_ci
 };
 
 struct charset_info_st my_charset_gbk_bin=
@@ -10846,7 +10802,7 @@ struct charset_info_st my_charset_gbk_bin=
     1,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_handler,
-    &my_collation_mb_bin_handler
+    &my_collation_handler_gbk_bin
 };
 
 
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index bbf0026..bb48e41 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -186,6 +186,7 @@ static const uchar sort_order_sjis[]=
 
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _sjis
 #define IS_8BIT_CHAR(x)       issjiskata(x)
+#define IS_MB1_CHAR(x)        ((uchar) (x) < 0x80 || issjiskata(x))
 #define IS_MB2_CHAR(x,y)      (issjishead(x) && issjistail(y))
 #define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
@@ -1088,90 +1089,6 @@ static MY_UNICASE_INFO my_caseinfo_sjis=
 };
 
 
-static int my_strnncoll_sjis_internal(CHARSET_INFO *cs,
-				      const uchar **a_res, size_t a_length,
-				      const uchar **b_res, size_t b_length)
-{
-  const uchar *a= *a_res, *b= *b_res;
-  const uchar *a_end= a + a_length;
-  const uchar *b_end= b + b_length;
-  while (a < a_end && b < b_end)
-  {
-    if (ismbchar_sjis(cs,(char*) a, (char*) a_end) &&
-	ismbchar_sjis(cs,(char*) b, (char*) b_end))
-    {
-      uint a_char= sjiscode(*a, *(a+1));
-      uint b_char= sjiscode(*b, *(b+1));
-      if (a_char != b_char)
-	return (int) a_char - (int) b_char;
-      a += 2;
-      b += 2;
-    } else
-    {
-      if (sort_order_sjis[(uchar)*a] != sort_order_sjis[(uchar)*b])
-	return sort_order_sjis[(uchar)*a] - sort_order_sjis[(uchar)*b];
-      a++;
-      b++;
-    }
-  }
-  *a_res= a;
-  *b_res= b;
-  return 0;
-}
-
-
-static int my_strnncoll_sjis(CHARSET_INFO *cs __attribute__((unused)),
-                             const uchar *a, size_t a_length, 
-                             const uchar *b, size_t b_length,
-                             my_bool b_is_prefix)
-{
-  int res= my_strnncoll_sjis_internal(cs, &a, a_length, &b, b_length);
-  if (b_is_prefix && a_length > b_length)
-    a_length= b_length;
-  return res ? res : (int) (a_length - b_length);
-}
-
-
-static int my_strnncollsp_sjis(CHARSET_INFO *cs __attribute__((unused)),
-			       const uchar *a, size_t a_length, 
-			       const uchar *b, size_t b_length,
-                               my_bool diff_if_only_endspace_difference)
-{
-  const uchar *a_end= a + a_length, *b_end= b + b_length;
-  int res= my_strnncoll_sjis_internal(cs, &a, a_length, &b, b_length);
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
-  diff_if_only_endspace_difference= 0;
-#endif
-
-  if (!res && (a != a_end || b != b_end))
-  {
-    int swap= 1;
-    if (diff_if_only_endspace_difference)
-      res= 1;                                   /* Assume 'a' is bigger */
-    /*
-      Check the next not space character of the longer key. If it's < ' ',
-      then it's smaller than the other key.
-    */
-    if (a == a_end)
-    {
-      /* put shorter key in a */
-      a_end= b_end;
-      a= b;
-      swap= -1;				/* swap sign of result */
-      res= -res;
-    }
-    for (; a < a_end ; a++)
-    {
-      if (*a != ' ')
-	return (*a < ' ') ? -swap : swap;
-    }
-  }
-  return res;
-}
-
-
-
 /* SJIS->Unicode conversion table */
 static uint16 sjis_to_unicode[65536]=
 {
@@ -34099,15 +34016,44 @@ size_t my_numcells_sjis(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
-static MY_COLLATION_HANDLER my_collation_ci_handler =
+/*
+  sjis_japanese_ci and sjis_bin sort character blocks in this order:
+  1. [00..7F]                - 7BIT characters (ASCII)
+  2. [81..9F][40..7E,80..FC] - MB2 characters, part1
+  3. [A1..DF]                - 8BIT characters (Kana)
+  4. [E0..FC][40..7E,80..FC] - MB2 characters, part2
+*/
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_PAD_SPACE
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _sjis_japanese_ci
+#define WEIGHT_PAD_SPACE     (256 * (int) ' ')
+#define WEIGHT_MB1(x)        (256 * (int) sort_order_sjis[(uchar) (x)])
+#define WEIGHT_MB2(x,y)      (sjiscode(x, y))
+#include "ctype-strcoll.ic"
+
+
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_PAD_SPACE
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#define MY_FUNCTION_NAME(x)   my_ ## x ## _sjis_bin
+#define WEIGHT_PAD_SPACE     (256 * (int) ' ')
+#define WEIGHT_MB1(x)        (256 * (int) (uchar) (x))
+#define WEIGHT_MB2(x,y)      (sjiscode(x, y))
+#include "ctype-strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_sjis_japanese_ci=
 {
-  NULL,			/* init */
-  my_strnncoll_sjis,
-  my_strnncollsp_sjis,
+  NULL,                 /* init */
+  my_strnncoll_sjis_japanese_ci,
+  my_strnncollsp_sjis_japanese_ci,
   my_strnxfrm_mb,
   my_strnxfrmlen_simple,
   my_like_range_mb,
-  my_wildcmp_mb,	/* wildcmp  */
+  my_wildcmp_mb,
   my_strcasecmp_8bit,
   my_instr_mb,
   my_hash_sort_simple,
@@ -34115,6 +34061,22 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
 };
 
 
+static MY_COLLATION_HANDLER my_collation_handler_sjis_bin=
+{
+  NULL,                 /* init */
+  my_strnncoll_sjis_bin,
+  my_strnncollsp_sjis_bin,
+  my_strnxfrm_mb,
+  my_strnxfrmlen_simple,
+  my_like_range_mb,
+  my_wildcmp_mb_bin,
+  my_strcasecmp_mb_bin,
+  my_instr_mb,
+  my_hash_sort_mb_bin,
+  my_propagate_simple
+};
+
+
 static MY_CHARSET_HANDLER my_charset_handler=
 {
   NULL,			/* init */
@@ -34179,7 +34141,7 @@ struct charset_info_st my_charset_sjis_japanese_ci=
     1,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_handler,
-    &my_collation_ci_handler
+    &my_collation_handler_sjis_japanese_ci
 };
 
 struct charset_info_st my_charset_sjis_bin=
@@ -34211,7 +34173,7 @@ struct charset_info_st my_charset_sjis_bin=
     1,                  /* escape_with_backslash_is_dangerous */
     1,                  /* levels_for_order   */
     &my_charset_handler,
-    &my_collation_mb_bin_handler
+    &my_collation_handler_sjis_bin
 };
 
 #endif
diff --git a/strings/ctype-strcoll.ic b/strings/ctype-strcoll.ic
new file mode 100644
index 0000000..7217f99
--- /dev/null
+++ b/strings/ctype-strcoll.ic
@@ -0,0 +1,208 @@
+/*
+   Copyright (c) 2015, MariaDB Foundation
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+
+#ifndef MY_FUNCTION_NAME
+#error MY_FUNCTION_NAME is not defined
+#endif
+
+
+/*
+  The weight for automatically padded spaces when comparing strings with
+  the PAD SPACE property.
+  Should normally be equal to the weight of a regular space.
+*/
+#ifndef WEIGHT_PAD_SPACE
+#define WEIGHT_PAD_SPACE  (' ')
+#endif
+
+
+/*
+  Weight of an illegal byte.
+  Must be greater than weight of any normal character.
+  Two bad bytes are compared binary.
+*/
+#ifndef WEIGHT_ILSEQ
+#define WEIGHT_ILSEQ(x)   (0xFF00 + (x))
+#endif
+
+
+/**
+  Scan a valid character, or a bad byte, or an auto-padded space
+  from a string and calculate the weight of the scanned sequence.
+
+  @param [OUT] weight - the weight is returned here
+  @param str          - the string
+  @param end          - the end of the string
+  @return             - the number of bytes scanned
+
+  The including source file must define the following macros:
+  IS_MB1_CHAR(x)
+  IS_MB2_CHAR(x,y)
+  WEIGHT_PAD_SPACE
+  WEIGHT_MB1(x)
+  WEIGHT_MB2(x,y)
+  WEIGHT_ILSEQ(x)
+*/
+static inline uint
+MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
+{
+  if (str >= end)                        /* Nothing left to scan */
+  {
+    *weight= WEIGHT_PAD_SPACE;           /* Report an auto-padded space */
+    return 0;                            /* 0 bytes tells callers "ended" */
+  }
+
+  if (IS_MB1_CHAR(*str))
+  {
+    *weight= WEIGHT_MB1(*str);           /* A valid single byte character */
+    return 1;
+  }
+
+  if (str + 2 > end)                     /* The string ended unexpectedly */
+    goto bad;                            /* Treat as a bad byte */
+
+  if (IS_MB2_CHAR(str[0], str[1]))
+  {
+    *weight= WEIGHT_MB2(str[0], str[1]);
+    return 2;                            /* A valid two-byte character */
+  }
+
+bad:
+  *weight= WEIGHT_ILSEQ(str[0]);         /* Bad byte */
+  return 1;                              /* Consume only the offending byte */
+}
+
+
+/**
+  Compare two strings according to the collation,
+  without handling the PAD SPACE property.
+
+  Note, cs->coll->strnncoll() is usually used to compare identifiers.
+  Perhaps we should eventually (in 10.2?) create a new collation
+  my_charset_utf8_general_ci_no_pad and have only one comparison function
+  in MY_COLLATION_HANDLER.
+
+  @param cs          - the character set and collation
+  @param a           - the left string
+  @param a_length    - the length of the left string
+  @param b           - the right string
+  @param b_length    - the length of the right string
+  @param b_is_prefix - if the caller wants to check if "b" is a prefix of "a"
+  @return            - 0 if equal (or "b" is a prefix and b_is_prefix is set),
+                       <0 if "a" sorts before "b", >0 if "a" sorts after "b"
+*/
+static int
+MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs __attribute__((unused)),
+                            const uchar *a, size_t a_length,
+                            const uchar *b, size_t b_length,
+                            my_bool b_is_prefix)
+{
+  const uchar *a_end= a + a_length;
+  const uchar *b_end= b + b_length;
+  for ( ; ; )
+  {
+    int a_weight, b_weight, res;
+    uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
+    uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
+    /*
+      a_wlen  b_wlen Comment
+      ------  ------ -------
+      0       0      Strings ended simultaneously, "a" and "b" are equal.
+      0       >0     "a" is a prefix of "b", so "a" is smaller.
+      >0      0      "b" is a prefix of "a", check b_is_prefix.
+      >0      >0     Two weights were scanned, check weight difference.
+      The prefix cases return a fixed sign: a scanned weight may be 0.
+    */
+    if (!a_wlen)
+      return b_wlen ? -1 : 0;
+
+    if (!b_wlen)
+      return b_is_prefix ? 0 : +1;
+
+    if ((res= (a_weight - b_weight)))
+      return res;
+    /* None of the strings has ended yet. */
+    DBUG_ASSERT(a < a_end);
+    DBUG_ASSERT(b < b_end);
+    a+= a_wlen;
+    b+= b_wlen;
+  }
+  DBUG_ASSERT(0);
+  return 0;
+}
+
+
+/**
+  Compare two strings according to the collation, with PAD SPACE handling.
+
+  @param cs          - the character set and collation
+  @param a           - the left string
+  @param a_length    - the length of the left string
+  @param b           - the right string
+  @param b_length    - the length of the right string
+  @param diff_if_only_endspace_difference - not used in the code.
+                       TODO: this should be eventually removed (in 10.2?)
+  @return            - the comparison result
+*/
+
+static int
+MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
+                              const uchar *a, size_t a_length,
+                              const uchar *b, size_t b_length,
+                              my_bool diff_if_only_endspace_difference
+                              __attribute__((unused)))
+{
+  const uchar *a_end= a + a_length;
+  const uchar *b_end= b + b_length;
+  for ( ; ; )                            /* Exits only via return */
+  {
+    int a_weight, b_weight, res;
+    uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
+    uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
+    if ((res= (a_weight - b_weight)))
+    {
+      /*
+        Got two different weights. Each weight can be generated by either of:
+        - a real character
+        - a bad byte sequence or an incomplete byte sequence
+        - an auto-generated trailing space (PAD SPACE)
+        It does not matter how exactly each weight was generated.
+        Just return the weight difference.
+      */
+      return res;
+    }
+    if (!a_wlen && !b_wlen)
+    {
+      /*
+        Got two auto-generated trailing spaces, i.e.
+        both strings have now ended, so they are equal.
+      */
+      DBUG_ASSERT(a == a_end);
+      DBUG_ASSERT(b == b_end);
+      return 0;
+    }
+    /*
+      At least one of the strings has not ended yet, continue comparison.
+    */
+    DBUG_ASSERT(a < a_end || b < b_end);
+    a+= a_wlen;                          /* wlen is 0 for an ended string, */
+    b+= b_wlen;                          /* so its pointer stays at the end */
+  }
+  DBUG_ASSERT(0);                        /* Not reached */
+  return 0;
+}
diff --git a/unittest/strings/strings-t.c b/unittest/strings/strings-t.c
index 6baef04..27c39fb 100644
--- a/unittest/strings/strings-t.c
+++ b/unittest/strings/strings-t.c
@@ -95,11 +95,361 @@ static CHARSET_INFO *charset_list[]=
 };
 
 
+typedef struct
+{
+  const char *a;
+  size_t alen;
+  const char *b;
+  size_t blen;
+  int res;
+} STRNNCOLL_PARAM;
+
+
+#define CSTR(x)  (x),(sizeof(x)-1)
+
+/*
+  Byte sequence types used in the tests:
+    8BIT     - an 8-bit byte (>= 0x80) which makes a single-byte character
+    MB2      - two bytes that make a valid character
+    H2       - a byte which is a valid MB2 head byte
+    T2       - a byte which is a valid MB2 tail byte
+    ILSEQ    - a byte which makes an illegal sequence
+    H2+ILSEQ - a sequence that starts with a valid H2 byte,
+               but not followed by a valid T2 byte.
+
+  Charset H2               T2                      8BIT
+  ------- ---------------- ---------------         -------- 
+  big5    [A1..F9]         [40..7E,A1..FE]          
+  euckr   [81..FE]         [41..5A,61..7A,81..FE]   
+  gb2312  [A1..F7]         [A1..FE]
+  gbk     [81..FE]         [40..7E,80..FE]
+
+  cp932   [81..9F,E0..FC]  [40..7E,80..FC]         [A1..DF] 
+  sjis    [81..9F,E0..FC]  [40..7E,80..FC]         [A1..DF]
+
+
+  Essential byte sequences in various character sets:
+
+  Sequence  big5   cp932      euckr  gb2312    gbk   sjis
+  --------  ----   -----      -----  ------    ---   ----
+  80        ILSEQ  ILSEQ      ILSEQ  ILSEQ     ILSEQ ILSEQ
+  81        ILSEQ  H2         H2     ILSEQ     H2    H2
+  A1        H2     8BIT       H2     H2        H2    8BIT
+  A1A1      MB2    8BIT+8BIT  MB2    MB2       MB2   8BIT+8BIT
+  E0E0      MB2    MB2        MB2    MB2       MB2   MB2
+  F9FE      MB2    H2+ILSEQ   MB2    ILSEQ+T2  MB2   H2+ILSEQ
+*/
+
+
+/*
+  For character sets that have the following byte sequences:
+    80   - ILSEQ
+    81   - ILSEQ or H2
+    F9   - ILSEQ or H2
+    A1A1 - MB2 or 8BIT+8BIT
+    E0E0 - MB2
+*/
+STRNNCOLL_PARAM strcoll_mb2_common[]=
+{
+  /* Compare two good sequences */
+  {CSTR(""),         CSTR(""),           0},
+  {CSTR(""),         CSTR(" "),          0},
+  {CSTR(""),         CSTR("A"),         -1},
+  {CSTR(""),         CSTR("a"),         -1},
+  {CSTR(""),         CSTR("\xA1\xA1"),  -1},
+  {CSTR(""),         CSTR("\xE0\xE0"),  -1},
+
+  {CSTR(" "),        CSTR(""),          0},
+  {CSTR(" "),        CSTR(" "),         0},
+  {CSTR(" "),        CSTR("A"),        -1},
+  {CSTR(" "),        CSTR("a"),        -1},
+  {CSTR(" "),        CSTR("\xA1\xA1"), -1},
+  {CSTR(" "),        CSTR("\xE0\xE0"), -1},
+
+  {CSTR("a"),        CSTR(""),          1},
+  {CSTR("a"),        CSTR(" "),         1},
+  {CSTR("a"),        CSTR("a"),         0},
+  {CSTR("a"),        CSTR("\xA1\xA1"), -1},
+  {CSTR("a"),        CSTR("\xE0\xE0"), -1},
+
+  {CSTR("\xA1\xA1"), CSTR("\xA1\xA1"),  0},
+  {CSTR("\xA1\xA1"), CSTR("\xE0\xE0"), -1},
+
+  /* Compare a good character to an illegal or an incomplete sequence */
+  {CSTR(""),         CSTR("\x80"),     -1},
+  {CSTR(""),         CSTR("\x81"),     -1},
+  {CSTR(""),         CSTR("\xF9"),     -1},
+
+  {CSTR(" "),        CSTR("\x80"),     -1},
+  {CSTR(" "),        CSTR("\x81"),     -1},
+  {CSTR(" "),        CSTR("\xF9"),     -1},
+
+  {CSTR("a"),        CSTR("\x80"),     -1},
+  {CSTR("a"),        CSTR("\x81"),     -1},
+  {CSTR("a"),        CSTR("\xF9"),     -1},
+
+  {CSTR("\xA1\xA1"), CSTR("\x80"),     -1},
+  {CSTR("\xA1\xA1"), CSTR("\x81"),     -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9"),     -1},
+
+  {CSTR("\xE0\xE0"), CSTR("\x80"),     -1},
+  {CSTR("\xE0\xE0"), CSTR("\x81"),     -1},
+  {CSTR("\xE0\xE0"), CSTR("\xF9"),     -1},
+
+  /* Compare two bad/incomplete sequences */
+  {CSTR("\x80"),     CSTR("\x80"),      0},
+  {CSTR("\x80"),     CSTR("\x81"),     -1},
+  {CSTR("\x80"),     CSTR("\xF9"),     -1},
+  {CSTR("\x81"),     CSTR("\x81"),      0},
+  {CSTR("\x81"),     CSTR("\xF9"),     -1},
+
+  {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+  For character sets that have good mb2 characters A1A1 and F9FE
+*/
+STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
+{
+  /* Compare two good characters */
+  {CSTR(""),         CSTR("\xF9\xFE"), -1},
+  {CSTR(" "),        CSTR("\xF9\xFE"), -1},
+  {CSTR("a")       , CSTR("\xF9\xFE"), -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"),  0},
+
+  /* Compare a good character to an illegal or an incomplete sequence */
+  {CSTR(""),         CSTR("\xA1"),     -1},
+  {CSTR(""),         CSTR("\xF9"),     -1},
+  {CSTR("a"),        CSTR("\xA1"),     -1},
+  {CSTR("a"),        CSTR("\xF9"),     -1},
+
+  {CSTR("\xA1\xA1"), CSTR("\xA1"),     -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9"),     -1},
+
+  {CSTR("\xF9\xFE"), CSTR("\x80"),     -1},
+  {CSTR("\xF9\xFE"), CSTR("\x81"),     -1},
+  {CSTR("\xF9\xFE"), CSTR("\xA1"),     -1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9"),     -1},
+
+  /* Compare two bad/incomplete sequences */
+  {CSTR("\x80"),     CSTR("\xA1"),     -1},
+  {CSTR("\x80"),     CSTR("\xF9"),     -1},
+
+  {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+  For character sets that have:
+    A1A1 - a good mb2 character
+    F9FE - a bad sequence
+*/
+STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
+{
+  /* Compare a good character to an illegal or an incomplete sequence */
+  {CSTR(""),         CSTR("\xF9\xFE"), -1},
+  {CSTR(" "),        CSTR("\xF9\xFE"), -1},
+  {CSTR("a")       , CSTR("\xF9\xFE"), -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+
+  {CSTR(""),         CSTR("\xA1"),     -1},
+  {CSTR(""),         CSTR("\xF9"),     -1},
+  {CSTR("a"),        CSTR("\xA1"),     -1},
+  {CSTR("a"),        CSTR("\xF9"),     -1},
+
+  {CSTR("\xA1\xA1"), CSTR("\xA1"),     -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9"),     -1},
+
+  /* Compare two bad/incomplete sequences */
+  {CSTR("\xF9\xFE"), CSTR("\x80"),     1},
+  {CSTR("\xF9\xFE"), CSTR("\x81"),     1},
+  {CSTR("\xF9\xFE"), CSTR("\xA1"),     1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9"),     1},
+  {CSTR("\x80"),     CSTR("\xA1"),     -1},
+  {CSTR("\x80"),     CSTR("\xF9"),     -1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"),  0},
+
+  {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+  For character sets that have:
+    80   - ILSEQ or H2
+    81   - ILSEQ or H2
+    A1   - 8BIT
+    F9   - ILSEQ or H2
+    F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ)
+*/
+STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
+{
+  /* Compare two good characters */
+  {CSTR(""),         CSTR("\xA1"),     -1},
+  {CSTR("\xA1\xA1"), CSTR("\xA1"),      1},
+
+  /* Compare a good character to an illegal or an incomplete sequence */
+  {CSTR(""),         CSTR("\xF9"),     -1},
+  {CSTR(""),         CSTR("\xF9\xFE"), -1},
+  {CSTR(" "),        CSTR("\xF9\xFE"), -1},
+  {CSTR("a"),        CSTR("\xF9\xFE"), -1},
+  {CSTR("a"),        CSTR("\xA1"),     -1},
+  {CSTR("a"),        CSTR("\xF9"),     -1},
+
+  {CSTR("\xA1\xA1"), CSTR("\xF9"),     -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+
+  {CSTR("\xF9\xFE"), CSTR("\x80"),     1},
+  {CSTR("\xF9\xFE"), CSTR("\x81"),     1},
+  {CSTR("\xF9\xFE"), CSTR("\xA1"),     1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9"),     1},
+
+  {CSTR("\x80"),     CSTR("\xA1"),      1},
+
+  /* Compare two bad/incomplete sequences */
+  {CSTR("\x80"),     CSTR("\xF9"),     -1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"),  0},
+
+  {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+  For character sets (e.g. cp932 and sjis) that have:
+    8181 - a valid MB2 character
+    A1   - a valid 8BIT character
+    E0E0 - a valid MB2 character
+  and sort in this order:
+    8181 < A1 < E0E0
+*/
+STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
+{
+  {CSTR("\x81\x81"), CSTR("\xA1"),     -1},
+  {CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1},
+  {CSTR("\xA1"),     CSTR("\xE0\xE0"), -1},
+
+  {NULL, 0, NULL, 0, 0}
+};
+
+
+static void
+str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
+{
+  char *dstend= dst + dstlen;
+  const char *srcend= src + srclen;
+  for (*dst= '\0' ; dst + 3 < dstend && src < srcend; )
+  {
+    sprintf(dst, "%02X", (unsigned char) src[0]);
+    dst+=2;
+    src++;
+  }
+}
+
+
+/*
+  Check if the two comparison results are semantically equal:
+  both are negative, both are positive, or both are zero.
+*/
+static int
+eqres(int ares, int bres)
+{
+  return (ares < 0 && bres < 0) ||
+         (ares > 0 && bres > 0) ||
+         (ares == 0 && bres == 0);
+}
+
+
+static int
+strcollsp(CHARSET_INFO *cs, const STRNNCOLL_PARAM *param)
+{
+  int failed= 0;
+  const STRNNCOLL_PARAM *p;
+  diag("%-20s %-10s %-10s %10s %10s", "Collation", "a", "b", "ExpectSign", "Actual");
+  for (p= param; p->a; p++)
+  {
+    char ahex[64], bhex[64];
+    int res= cs->coll->strnncollsp(cs, (uchar *) p->a, p->alen,
+                                       (uchar *) p->b, p->blen, 0);
+    str2hex(ahex, sizeof(ahex), p->a, p->alen);
+    str2hex(bhex, sizeof(bhex), p->b, p->blen);
+    diag("%-20s %-10s %-10s %10d %10d%s",
+         cs->name, ahex, bhex, p->res, res,
+         eqres(res, p->res) ? "" : " FAILED");
+    if (!eqres(res, p->res))
+    {
+      failed++;
+    }
+    else
+    {
+      /* Test in reverse order */
+      res= cs->coll->strnncollsp(cs, (uchar *) p->b, p->blen,
+                                     (uchar *) p->a, p->alen, 0);
+      if (!eqres(res, -p->res))
+      {
+        diag("Comparison in reverse order failed. Expected %d, got %d",
+             -p->res, res);
+        failed++;
+      }
+    }
+  }
+  return failed;
+}
+
+
+static int
+test_strcollsp()
+{
+  int failed= 0;
+#ifdef HAVE_CHARSET_big5
+  failed+= strcollsp(&my_charset_big5_chinese_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_big5_chinese_ci, strcoll_mb2_A1A1_mb2_F9FE);
+  failed+= strcollsp(&my_charset_big5_bin,        strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_big5_bin,        strcoll_mb2_A1A1_mb2_F9FE);
+#endif
+#ifdef HAVE_CHARSET_cp932
+  failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_mb1_A1_bad_F9FE);
+  failed+= strcollsp(&my_charset_cp932_bin,         strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_cp932_bin,         strcoll_mb1_A1_bad_F9FE);
+  failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_8181_A1_E0E0);
+  failed+= strcollsp(&my_charset_cp932_bin,         strcoll_8181_A1_E0E0);
+#endif
+#ifdef HAVE_CHARSET_euckr
+  failed+= strcollsp(&my_charset_euckr_korean_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_euckr_korean_ci, strcoll_mb2_A1A1_mb2_F9FE);
+  failed+= strcollsp(&my_charset_euckr_bin,       strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_euckr_bin,       strcoll_mb2_A1A1_mb2_F9FE);
+#endif
+#ifdef HAVE_CHARSET_gb2312
+  failed+= strcollsp(&my_charset_gb2312_chinese_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_gb2312_chinese_ci, strcoll_mb2_A1A1_bad_F9FE);
+  failed+= strcollsp(&my_charset_gb2312_bin,        strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_gb2312_bin,        strcoll_mb2_A1A1_bad_F9FE);
+#endif
+#ifdef HAVE_CHARSET_gbk
+  failed+= strcollsp(&my_charset_gbk_chinese_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_gbk_chinese_ci, strcoll_mb2_A1A1_mb2_F9FE);
+  failed+= strcollsp(&my_charset_gbk_bin,        strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_gbk_bin,        strcoll_mb2_A1A1_mb2_F9FE);
+#endif
+#ifdef HAVE_CHARSET_sjis
+  failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_sjis_bin,         strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_mb1_A1_bad_F9FE);
+  failed+= strcollsp(&my_charset_sjis_bin,         strcoll_mb1_A1_bad_F9FE);
+  failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0);
+  failed+= strcollsp(&my_charset_sjis_bin,         strcoll_8181_A1_E0E0);
+#endif
+  return failed;
+}
+
+
 int main()
 {
   size_t i, failed= 0;
   
-  plan(1);
+  plan(2);
   diag("Testing my_like_range_xxx() functions");
   
   for (i= 0; i < array_elements(charset_list); i++)
@@ -112,5 +462,10 @@ int main()
     }
   }
   ok(failed == 0, "Testing my_like_range_xxx() functions");
+  
+  diag("Testing cs->coll->strnncollsp()");
+  failed= test_strcollsp();
+  ok(failed == 0, "Testing cs->coll->strnncollsp()");
+
   return exit_status();
 }

Follow ups

Re: MDEV-8214 Asian MB2 charsets: compare broken bytes as "greater than any non-broken character"
From: Sergei Golubchik, 2015-06-18