maria-developers team mailing list archive
-
maria-developers team
-
Mailing list archive
-
Message #01610
bzr commit into MariaDB 5.1, with Maria 1.5:maria branch (monty:2777)
#At lp:maria based on revid:knielsen@xxxxxxxxxxxxxxx-20091127132059-3su1w7xhsbbtpg6f
2777 Michael Widenius 2009-11-30
Added more general support for sorting 2 characters as one (contractions)
Added support for Croatian sorting orders utf8_croatian_ci and ucs2_croatian_ci.
Patch done by Alexander Barkov. See http://www.collation-charts.org/articles/croatian.htm
modified:
include/m_ctype.h
mysql-test/r/ctype_uca.result
mysql-test/t/ctype_uca.test
mysys/charset-def.c
strings/ctype-mb.c
strings/ctype-uca.c
strings/ctype-ucs2.c
per-file messages:
mysql-test/r/ctype_uca.result
Added testing of Croatian sort order
mysql-test/t/ctype_uca.test
Added testing of Croatian sort order
=== modified file 'include/m_ctype.h'
--- a/include/m_ctype.h 2009-09-07 20:50:10 +0000
+++ b/include/m_ctype.h 2009-11-30 12:42:24 +0000
@@ -49,6 +49,24 @@ typedef struct unicase_info_st
extern MY_UNICASE_INFO *my_unicase_default[256];
extern MY_UNICASE_INFO *my_unicase_turkish[256];
+#define MY_UCA_MAX_CONTRACTION 4
+#define MY_UCA_MAX_WEIGHT_SIZE 8
+
+typedef struct my_contraction_t
+{
+ my_wc_t ch[MY_UCA_MAX_CONTRACTION]; /* Character sequence */
+ uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];/* Its weight string, 0-terminated */
+} MY_CONTRACTION;
+
+
+typedef struct my_contraction_list_t
+{
+ size_t nitems; /* Number of items in the list */
+ MY_CONTRACTION *item; /* List of contractions */
+ char *flags; /* Character flags, e.g. "is contraction head" */
+} MY_CONTRACTIONS;
+
+
typedef struct uni_ctype_st
{
uchar pctype;
@@ -262,7 +280,7 @@ typedef struct charset_info_st
uchar *to_lower;
uchar *to_upper;
uchar *sort_order;
- uint16 *contractions;
+ MY_CONTRACTIONS *contractions;
uint16 **sort_order_big;
uint16 *tab_to_uni;
MY_UNI_IDX *tab_from_uni;
@@ -475,6 +493,13 @@ my_bool my_charset_is_ascii_based(CHARSE
my_bool my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs);
uint my_charset_repertoire(CHARSET_INFO *cs);
+my_bool my_uca_have_contractions(CHARSET_INFO *cs);
+my_bool my_uca_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc);
+my_bool my_uca_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc);
+uint16 *my_uca_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2);
+
+
+
#define _MY_U 01 /* Upper case */
#define _MY_L 02 /* Lower case */
=== modified file 'mysql-test/r/ctype_uca.result'
--- a/mysql-test/r/ctype_uca.result 2008-03-26 09:51:16 +0000
+++ b/mysql-test/r/ctype_uca.result 2009-11-30 12:42:24 +0000
@@ -159,6 +159,7 @@ insert into t1 values (_ucs2 0x01fc),(_u
insert into t1 values ('AA'),('Aa'),('aa'),('aA');
insert into t1 values ('CH'),('Ch'),('ch'),('cH');
insert into t1 values ('DZ'),('Dz'),('dz'),('dZ');
+insert into t1 values ('DŽ'),('Dž'),('dž'),('dŽ');
insert into t1 values ('IJ'),('Ij'),('ij'),('iJ');
insert into t1 values ('LJ'),('Lj'),('lj'),('lJ');
insert into t1 values ('LL'),('Ll'),('ll'),('lL');
@@ -181,7 +182,7 @@ C,c,��,��,��,��,��,��,��,��,��,��
CH,Ch,cH,ch
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��
��
@@ -286,7 +287,7 @@ C,c,��,��,��,��,��,��,��,��,��,��
CH,Ch,cH,ch
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��,��
��
@@ -400,6 +401,7 @@ CH,Ch,cH,ch
��,��
D,d,��,��
DZ,Dz,dZ,dz,��,��,��,��,��,��
+D��,D��,d��,d��
��,��
��
��
@@ -513,7 +515,7 @@ C,c,��,��,��,��,��,��,��,��,��,��
CH,Ch,cH,ch
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��
��
@@ -622,6 +624,7 @@ CH,Ch,cH,ch
��,��
D,d,��,��
DZ,Dz,dZ,dz,��,��,��,��,��,��
+D��,D��,d��,d��
��,��
��
��
@@ -729,7 +732,7 @@ CH,Ch,cH,ch
��,��
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��
��
@@ -840,6 +843,7 @@ CH,Ch,cH,ch
��,��
D,d,��,��
DZ,Dz,dZ,dz
+D��,D��,d��,d��
��,��,��,��,��,��
��,��
��
@@ -951,7 +955,7 @@ C,c,��,��,��,��,��,��,��,��,��,��
CH,Ch,cH,ch
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��
��
@@ -1056,7 +1060,7 @@ C,c,��,��,��,��,��,��,��,��,��,��
CH,Ch,cH,ch
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��
��
@@ -1164,7 +1168,7 @@ CH,Ch,cH,ch
��,��
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��
��
@@ -1275,6 +1279,7 @@ cH
��,��
D,d,��,��
DZ,Dz,dZ,dz,��,��,��,��,��,��
+D��,D��,d��,d��
��,��
��
��
@@ -1382,7 +1387,7 @@ C,c,��,��,��,��,��,��,��,��,��,��
CH,Ch,cH,ch
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��
��
@@ -1491,6 +1496,7 @@ cH
��,��
D,d,��,��
DZ,Dz,dZ,dz,��,��,��,��,��,��
+D��,D��,d��,d��
��,��
��
��
@@ -1599,6 +1605,7 @@ cH
��,��
D,d,��,��
DZ,Dz,dZ,dz,��,��,��,��,��,��
+D��,D��,d��,d��
��,��
��
��
@@ -1707,7 +1714,7 @@ cH
CH,Ch,ch
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��
��
@@ -1813,7 +1820,7 @@ C,c,��,��,��,��,��,��,��,��,��,��
CH,Ch,cH,ch
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��
��
@@ -1921,7 +1928,7 @@ CH,Ch,cH,ch
��,��
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��
��
@@ -2030,7 +2037,7 @@ C,c,��,��,��,��,��,��,��,��,��,��
CH,Ch,cH,ch
��,��
D,d,��,��
-DZ,Dz,dZ,dz,��,��,��,��,��,��
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,��
��,��
��
��
@@ -2121,6 +2128,118 @@ Z,z,��,��,��,��,��,��
��
��
��
+select group_concat(c1 order by c1) from t1 group by c1 collate utf8_croatian_ci;
+group_concat(c1 order by c1)
+��
+��
+A,a,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��
+AA,Aa,aA,aa
+��,��,��,��,��,��
+B,b
+��
+��
+��,��
+C,c,��,��,��,��,��,��
+CH,Ch,cH,ch
+��,��
+��,��
+��,��
+D,d,��,��
+DZ,Dz,dZ,dz,��,��,��
+d��
+D��,D��,d��,��,��,��
+��,��
+��
+��
+��,��
+��,��
+E,e,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��
+��,��
+��
+��
+F,f
+��,��
+G,g,��,��,��,��,��,��,��,��,��,��,��,��
+��,��
+��
+��
+��,��
+H,h,��,��
+��,��
+��,��
+I,i,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��
+IJ,Ij,iJ,ij,��,��
+��
+��
+��
+J,j,��,��,��
+K,k,��,��,��,��
+��,��
+L,l,��,��,��,��,��,��
+��,��
+lJ
+LL,Ll,lL,ll
+LJ,Lj,lj,��,��,��
+��,��
+��
+��
+M,m
+N,n,��,��,��,��,��,��,��,��,��,��
+nJ
+NJ,Nj,nj,��,��,��
+��
+��
+��,��
+O,o,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��
+OE,Oe,oE,oe,��,��
+��,��,��,��
+��
+��
+P,p
+��,��
+Q,q
+��
+R,r,��,��,��,��,��,��
+RR,Rr,rR,rr
+��
+S,s,��,��,��,��,��,��,��
+SS,Ss,sS,ss,��
+��,��
+��
+��
+T,t,��,��,��,��
+��
+��,��
+��
+��,��
+��
+U,u,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��
+��
+��
+V,v
+��
+W,w,��,��
+X,x
+Y,y,��,��,��,��,��,��
+��,��
+Z,z,��,��,��,��
+��
+��,��
+��,��
+��,��,��
+��,��
+��
+��,��
+��,��
+��
+��,��
+��,��
+��,��
+��
+��
+��
+��
+��
drop table t1;
SET NAMES utf8;
CREATE TABLE t1 (c varchar(255) NOT NULL COLLATE utf8_general_ci, INDEX (c));
=== modified file 'mysql-test/t/ctype_uca.test'
--- a/mysql-test/t/ctype_uca.test 2008-02-20 18:49:26 +0000
+++ b/mysql-test/t/ctype_uca.test 2009-11-30 12:42:24 +0000
@@ -186,6 +186,7 @@ insert into t1 values (_ucs2 0x01fc),(_u
insert into t1 values ('AA'),('Aa'),('aa'),('aA');
insert into t1 values ('CH'),('Ch'),('ch'),('cH');
insert into t1 values ('DZ'),('Dz'),('dz'),('dZ');
+insert into t1 values ('DŽ'),('Dž'),('dž'),('dŽ');
insert into t1 values ('IJ'),('Ij'),('ij'),('iJ');
insert into t1 values ('LJ'),('Lj'),('lj'),('lJ');
insert into t1 values ('LL'),('Ll'),('ll'),('lL');
@@ -213,6 +214,7 @@ select group_concat(c1 order by c1) from
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_roman_ci;
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_esperanto_ci;
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_hungarian_ci;
+select group_concat(c1 order by c1) from t1 group by c1 collate utf8_croatian_ci;
drop table t1;
=== modified file 'mysys/charset-def.c'
--- a/mysys/charset-def.c 2007-06-21 20:10:40 +0000
+++ b/mysys/charset-def.c 2009-11-30 12:42:24 +0000
@@ -42,6 +42,7 @@ extern CHARSET_INFO my_charset_ucs2_roma
extern CHARSET_INFO my_charset_ucs2_persian_uca_ci;
extern CHARSET_INFO my_charset_ucs2_esperanto_uca_ci;
extern CHARSET_INFO my_charset_ucs2_hungarian_uca_ci;
+extern CHARSET_INFO my_charset_ucs2_croatian_uca_ci;
#endif
#ifdef HAVE_CHARSET_utf8
@@ -63,6 +64,7 @@ extern CHARSET_INFO my_charset_utf8_roma
extern CHARSET_INFO my_charset_utf8_persian_uca_ci;
extern CHARSET_INFO my_charset_utf8_esperanto_uca_ci;
extern CHARSET_INFO my_charset_utf8_hungarian_uca_ci;
+extern CHARSET_INFO my_charset_utf8_croatian_uca_ci;
#ifdef HAVE_UTF8_GENERAL_CS
extern CHARSET_INFO my_charset_utf8_general_cs;
#endif
@@ -152,6 +154,7 @@ my_bool init_compiled_charsets(myf flags
add_compiled_collation(&my_charset_ucs2_persian_uca_ci);
add_compiled_collation(&my_charset_ucs2_esperanto_uca_ci);
add_compiled_collation(&my_charset_ucs2_hungarian_uca_ci);
+ add_compiled_collation(&my_charset_ucs2_croatian_uca_ci);
#endif
#endif
@@ -186,6 +189,7 @@ my_bool init_compiled_charsets(myf flags
add_compiled_collation(&my_charset_utf8_persian_uca_ci);
add_compiled_collation(&my_charset_utf8_esperanto_uca_ci);
add_compiled_collation(&my_charset_utf8_hungarian_uca_ci);
+ add_compiled_collation(&my_charset_utf8_croatian_uca_ci);
#endif
#endif
=== modified file 'strings/ctype-mb.c'
--- a/strings/ctype-mb.c 2009-02-13 16:41:47 +0000
+++ b/strings/ctype-mb.c 2009-11-30 12:42:24 +0000
@@ -567,8 +567,7 @@ my_bool my_like_range_mb(CHARSET_INFO *c
char *min_end= min_str + res_length;
char *max_end= max_str + res_length;
size_t maxcharlen= res_length / cs->mbmaxlen;
- const char *contraction_flags= cs->contractions ?
- ((const char*) cs->contractions) + 0x40*0x40 : NULL;
+ my_bool have_contractions= my_uca_have_contractions(cs);
for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--)
{
@@ -636,8 +635,8 @@ fill_max_and_min:
'ab\min\min\min\min' and 'ab\max\max\max\max'.
*/
- if (contraction_flags && ptr + 1 < end &&
- contraction_flags[(uchar) *ptr])
+ if (have_contractions && ptr + 1 < end &&
+ my_uca_can_be_contraction_head(cs, (uchar) *ptr))
{
/* Ptr[0] is a contraction head. */
@@ -659,8 +658,8 @@ fill_max_and_min:
is not a contraction, then we put only ptr[0],
and continue with ptr[1] on the next loop.
*/
- if (contraction_flags[(uchar) ptr[1]] &&
- cs->contractions[(*ptr-0x40)*0x40 + ptr[1] - 0x40])
+ if (my_uca_can_be_contraction_tail(cs, (uchar) ptr[1]) &&
+ my_uca_contraction2_weight(cs, (uchar) ptr[0], (uchar) ptr[1]))
{
/* Contraction found */
if (maxcharlen == 1 || min_str + 1 >= min_end)
=== modified file 'strings/ctype-uca.c'
--- a/strings/ctype-uca.c 2009-11-16 20:49:51 +0000
+++ b/strings/ctype-uca.c 2009-11-30 12:42:24 +0000
@@ -6713,6 +6713,16 @@ static const char hungarian[]=
"&U < \\u00FC <<< \\u00DC << \\u0171 <<< \\u0170";
+static const char croatian[]=
+
+"&C < \\u010D <<< \\u010C < \\u0107 <<< \\u0106 "
+"&D < d\\u017E <<< \\u01C6 <<< D\\u017E <<< \\u01C5 <<< D\\u017D <<< \\u01C4 "
+" < \\u0111 <<< \\u0110 "
+"&L < lj <<< \\u01C9 <<< Lj <<< \\u01C8 <<< LJ <<< \\u01C7 "
+"&N < nj <<< \\u01CC <<< Nj <<< \\u01CB <<< NJ <<< \\u01CA "
+"&S < \\u0161 <<< \\u0160 "
+"&Z < \\u017E <<< \\u017D";
+
/*
Unicode Collation Algorithm:
Collation element (weight) scanner,
@@ -6726,7 +6736,7 @@ typedef struct my_uca_scanner_st
const uchar *send; /* End of the input string */
uchar *uca_length;
uint16 **uca_weight;
- uint16 *contractions;
+ MY_CONTRACTIONS *contractions;
uint16 implicit[2];
int page;
int code;
@@ -6747,6 +6757,164 @@ typedef struct my_uca_scanner_handler_st
static uint16 nochar[]= {0,0};
+#define MY_UCA_CNT_FLAG_SIZE 4096
+#define MY_UCA_CNT_FLAG_MASK 4095
+
+#define MY_UCA_CNT_HEAD 1
+#define MY_UCA_CNT_TAIL 2
+
+
+
+
+/********** Helper functions to handle contraction ************/
+
+
+/**
+ Mark a character as a contraction part
+
+ @cs Pointer to CHARSET_INFO data
+ @wc Unicode code point
+ @flag flag: "is contraction head", "is contraction tail"
+*/
+
+static void
+my_uca_add_contraction_flag(CHARSET_INFO *cs, my_wc_t wc, int flag)
+{
+ cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK]|= flag;
+}
+
+
+/**
+ Add a new contraction into contraction list
+
+ @cs Pointer to CHARSET_INFO data
+ @wc Unicode code points of the characters
+ @len Number of characters
+
+ @return New contraction
+ @retval Pointer to a newly added contraction
+*/
+
+static MY_CONTRACTION *
+my_uca_add_contraction(CHARSET_INFO *cs,
+ my_wc_t *wc, int len __attribute__((unused)))
+{
+ MY_CONTRACTIONS *list= cs->contractions;
+ MY_CONTRACTION *next= &list->item[list->nitems];
+ DBUG_ASSERT(len == 2); /* We currently support only contraction2 */
+ next->ch[0]= wc[0];
+ next->ch[1]= wc[1];
+ list->nitems++;
+ return next;
+}
+
+
+/**
+ Allocate and initialize memory for contraction list and flags
+
+ @cs Pointer to CHARSET_INFO data
+ @alloc Memory allocation function (typically points to my_alloc_once)
+ @n Number of contractions
+
+ @return Error code
+ @retval 0 - memory allocated successfully
+ @retval 1 - not enough memory
+*/
+
+static my_bool
+my_uca_alloc_contractions(CHARSET_INFO *cs, void *(*alloc)(size_t), size_t n)
+{
+ uint size= n * sizeof(MY_CONTRACTION);
+ if (!(cs->contractions= (*alloc)(sizeof(MY_CONTRACTIONS))))
+ return 1;
+ bzero(cs->contractions, sizeof(MY_CONTRACTIONS));
+ if (!(cs->contractions->item= (*alloc)(size)) ||
+ !(cs->contractions->flags= (char*) (*alloc)(MY_UCA_CNT_FLAG_SIZE)))
+ return 1;
+ bzero((void*) cs->contractions->item, size);
+ bzero((void*) cs->contractions->flags, MY_UCA_CNT_FLAG_SIZE);
+ return 0;
+}
+
+
+/**
+ Check if UCA data has contractions (public version)
+
+ @cs Pointer to CHARSET_INFO data
+ @retval 0 - no contraction, 1 - have contractions.
+*/
+
+my_bool
+my_uca_have_contractions(CHARSET_INFO *cs)
+{
+ return cs->contractions != NULL;
+}
+
+
+/**
+ Check if a character can be contraction head
+
+ @cs Pointer to CHARSET_INFO data
+ @wc Code point
+
+ @retval 0 - cannot be contraction head
+ @retval 1 - can be contraction head
+*/
+
+my_bool
+my_uca_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc)
+{
+ return cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_HEAD;
+}
+
+
+/**
+ Check if a character can be contraction tail
+
+ @cs Pointer to CHARSET_INFO data
+ @wc Code point
+
+ @retval 0 - cannot be contraction tail
+ @retval 1 - can be contraction tail
+*/
+
+my_bool
+my_uca_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc)
+{
+ return cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_TAIL;
+}
+
+
+/**
+ Find a contraction and return its weight array
+
+ @cs Pointer to CHARSET data
+ @wc1 First character
+ @wc2 Second character
+
+ @return Weight array
+ @retval NULL - no contraction found
+ @retval ptr - contraction weight array
+*/
+
+uint16 *
+my_uca_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
+{
+ MY_CONTRACTIONS *list= cs->contractions;
+ MY_CONTRACTION *c, *last;
+ for (c= list->item, last= &list->item[list->nitems]; c < last; c++)
+ {
+ if (c->ch[0] == wc1 && c->ch[1] == wc2)
+ {
+ return c->weight;
+ }
+ }
+ return NULL;
+}
+
+
+
+
#ifdef HAVE_CHARSET_ucs2
/*
Initialize collation weight scanner
@@ -6766,7 +6934,7 @@ static uint16 nochar[]= {0,0};
*/
static void my_uca_scanner_init_ucs2(my_uca_scanner *scanner,
- CHARSET_INFO *cs __attribute__((unused)),
+ CHARSET_INFO *cs,
const uchar *str, size_t length)
{
scanner->wbeg= nochar;
@@ -6777,6 +6945,7 @@ static void my_uca_scanner_init_ucs2(my_
scanner->uca_length= cs->sort_order;
scanner->uca_weight= cs->sort_order_big;
scanner->contractions= cs->contractions;
+ scanner->cs= cs;
return;
}
@@ -6865,18 +7034,23 @@ static int my_uca_scanner_next_ucs2(my_u
if (scanner->contractions && (scanner->sbeg <= scanner->send))
{
- int cweight;
+ my_wc_t wc1= ((scanner->page << 8) | scanner->code);
- if (!scanner->page && !scanner->sbeg[0] &&
- (scanner->sbeg[1] > 0x40) && (scanner->sbeg[1] < 0x80) &&
- (scanner->code > 0x40) && (scanner->code < 0x80) &&
- (cweight= scanner->contractions[(scanner->code-0x40)*0x40+scanner->sbeg[1]-0x40]))
+ if (my_uca_can_be_contraction_head(scanner->cs, wc1))
+ {
+ uint16 *cweight;
+ my_wc_t wc2= (((my_wc_t) scanner->sbeg[0]) << 8) | scanner->sbeg[1];
+ if (my_uca_can_be_contraction_tail(scanner->cs, wc2) &&
+ (cweight= my_uca_contraction2_weight(scanner->cs,
+ scanner->code,
+ scanner->sbeg[1])))
{
scanner->implicit[0]= 0;
scanner->wbeg= scanner->implicit;
scanner->sbeg+=2;
- return cweight;
+ return *cweight;
}
+ }
}
if (!ucaw[scanner->page])
@@ -6959,23 +7133,22 @@ static int my_uca_scanner_next_any(my_uc
scanner->code= wc & 0xFF;
scanner->sbeg+= mb_len;
- if (scanner->contractions && !scanner->page &&
- (scanner->code > 0x40) && (scanner->code < 0x80))
+ if (my_uca_have_contractions(scanner->cs) &&
+ my_uca_can_be_contraction_head(scanner->cs, wc))
{
- uint page1, code1, cweight;
+ my_wc_t wc2;
+ uint16 *cweight;
- if (((mb_len= scanner->cs->cset->mb_wc(scanner->cs, &wc,
+ if (((mb_len= scanner->cs->cset->mb_wc(scanner->cs, &wc2,
scanner->sbeg,
scanner->send)) >=0) &&
- (!(page1= (wc >> 8))) &&
- ((code1= (wc & 0xFF)) > 0x40) &&
- (code1 < 0x80) &&
- (cweight= scanner->contractions[(scanner->code-0x40)*0x40 + code1-0x40]))
+ my_uca_can_be_contraction_tail(scanner->cs, wc2) &&
+ (cweight= my_uca_contraction2_weight(scanner->cs, wc, wc2)))
{
scanner->implicit[0]= 0;
scanner->wbeg= scanner->implicit;
scanner->sbeg+= mb_len;
- return cweight;
+ return *cweight;
}
}
@@ -7012,6 +7185,33 @@ static my_uca_scanner_handler my_any_uca
my_uca_scanner_next_any
};
+
+
+/**
+ Helper function:
+ Find address of weights of the given character.
+
+ @cs Pointer to CHARSET_INFO data; its sort_order_big and
+ sort_order arrays supply the weights and weight lengths
+ @wc Character Unicode code point
+
+ @return Weight array
+ @retval pointer to weight array for the given character,
+ or NULL if the character's page has no weight array.
+*/
+
+static inline uint16 *
+my_char_weight_addr(CHARSET_INFO *cs, uint wc)
+{
+ uint page= (wc >> 8);
+ uint ofst= wc & 0xFF;
+ return cs->sort_order_big[page] ?
+ cs->sort_order_big[page] + ofst * cs->sort_order[page] :
+ NULL;
+}
+
+
+
/*
Compares two strings according to the collation
@@ -7683,8 +7883,8 @@ ex:
typedef struct my_coll_rule_item_st
{
- uint base; /* Base character */
- uint curr[2]; /* Current character */
+ my_wc_t base; /* Base character */
+ my_wc_t curr[2]; /* Current character */
int diff[3]; /* Primary, Secondary and Tertiary difference */
} MY_COLL_RULE;
@@ -7834,6 +8034,7 @@ static int my_coll_rule_parse(MY_COLL_RU
static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(size_t))
{
MY_COLL_RULE rule[MY_MAX_COLL_RULE];
+ MY_COLL_RULE *r, *rfirst, *rlast;
char errstr[128];
uchar *newlengths;
uint16 **newweights;
@@ -7858,6 +8059,9 @@ static my_bool create_tailoring(CHARSET_
return 1;
}
+ rfirst= rule;
+ rlast= rule + rc;
+
if (!cs->caseinfo)
cs->caseinfo= my_unicase_default;
@@ -7941,44 +8145,21 @@ static my_bool create_tailoring(CHARSET_
/* Now process contractions */
if (ncontractions)
{
- /*
- 8K for weights for basic latin letter pairs,
- plus 256 bytes for "is contraction part" flags.
- */
- uint size= 0x40*0x40*sizeof(uint16) + 256;
- char *contraction_flags;
- if (!(cs->contractions= (uint16*) (*alloc)(size)))
- return 1;
- bzero((void*)cs->contractions, size);
- contraction_flags= ((char*) cs->contractions) + 0x40*0x40;
- for (i=0; i < rc; i++)
+ if (my_uca_alloc_contractions(cs, alloc, ncontractions))
+ return 1;
+ for (r= rfirst; r < rlast; r++)
{
- if (rule[i].curr[1])
+ uint16 *to;
+ if (r->curr[1]) /* Contraction */
{
- uint pageb= (rule[i].base >> 8) & 0xFF;
- uint chb= rule[i].base & 0xFF;
- uint16 *offsb= defweights[pageb] + chb*deflengths[pageb];
- uint offsc;
-
- if (offsb[1] ||
- rule[i].curr[0] < 0x40 || rule[i].curr[0] > 0x7f ||
- rule[i].curr[1] < 0x40 || rule[i].curr[1] > 0x7f)
- {
- /*
- TODO: add error reporting;
- We support only basic latin letters contractions at this point.
- Also, We don't support contractions with weight longer than one.
- Otherwise, we'd need much more memory.
- */
- return 1;
- }
- offsc= (rule[i].curr[0]-0x40)*0x40+(rule[i].curr[1]-0x40);
-
- /* Copy base weight applying primary difference */
- cs->contractions[offsc]= offsb[0] + rule[i].diff[0];
- /* Mark both letters as "is contraction part */
- contraction_flags[rule[i].curr[0]]= 1;
- contraction_flags[rule[i].curr[1]]= 1;
+ /* Mark both letters as "is contraction part" */
+ my_uca_add_contraction_flag(cs, r->curr[0], MY_UCA_CNT_HEAD);
+ my_uca_add_contraction_flag(cs, r->curr[1], MY_UCA_CNT_TAIL);
+ to= my_uca_add_contraction(cs, r->curr, 2)->weight;
+ /* Copy weight from the reset character */
+ to[0]= my_char_weight_addr(cs, r->base)[0];
+ /* Apply primary difference */
+ to[0]+= r->diff[0];
}
}
}
@@ -8701,6 +8882,39 @@ CHARSET_INFO my_charset_ucs2_hungarian_u
};
+CHARSET_INFO my_charset_ucs2_croatian_uca_ci=
+{
+ 149,0,0, /* number */
+ MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE,
+ "ucs2", /* cs name */
+ "ucs2_croatian_ci", /* name */
+ "", /* comment */
+ croatian, /* tailoring */
+ NULL, /* ctype */
+ NULL, /* to_lower */
+ NULL, /* to_upper */
+ NULL, /* sort_order */
+ NULL, /* contractions */
+ NULL, /* sort_order_big*/
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ my_unicase_default, /* caseinfo */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 8, /* strxfrm_multiply */
+ 1, /* caseup_multiply */
+ 1, /* casedn_multiply */
+ 2, /* mbminlen */
+ 2, /* mbmaxlen */
+ 9, /* min_sort_char */
+ 0xFFFF, /* max_sort_char */
+ ' ', /* pad char */
+ 0, /* escape_with_backslash_is_dangerous */
+ &my_charset_ucs2_handler,
+ &my_collation_ucs2_uca_handler
+};
+
+
#endif
@@ -9358,6 +9572,38 @@ CHARSET_INFO my_charset_utf8_hungarian_u
&my_collation_any_uca_handler
};
+CHARSET_INFO my_charset_utf8_croatian_uca_ci=
+{
+ 213,0,0, /* number */
+ MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE,
+ "utf8", /* cs name */
+ "utf8_croatian_ci", /* name */
+ "", /* comment */
+ croatian, /* tailoring */
+ ctype_utf8, /* ctype */
+ NULL, /* to_lower */
+ NULL, /* to_upper */
+ NULL, /* sort_order */
+ NULL, /* contractions */
+ NULL, /* sort_order_big*/
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ my_unicase_default, /* caseinfo */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 8, /* strxfrm_multiply */
+ 1, /* caseup_multiply */
+ 1, /* casedn_multiply */
+ 1, /* mbminlen */
+ 3, /* mbmaxlen */
+ 9, /* min_sort_char */
+ 0xFFFF, /* max_sort_char */
+ ' ', /* pad char */
+ 0, /* escape_with_backslash_is_dangerous */
+ &my_charset_utf8_handler,
+ &my_collation_any_uca_handler
+};
+
#endif /* HAVE_CHARSET_utf8 */
#endif /* HAVE_UCA_COLLATIONS */
=== modified file 'strings/ctype-ucs2.c'
--- a/strings/ctype-ucs2.c 2009-10-15 21:38:29 +0000
+++ b/strings/ctype-ucs2.c 2009-11-30 12:42:24 +0000
@@ -1526,8 +1526,7 @@ my_bool my_like_range_ucs2(CHARSET_INFO
char *min_org=min_str;
char *min_end=min_str+res_length;
size_t charlen= res_length / cs->mbmaxlen;
- const char *contraction_flags= cs->contractions ?
- ((const char*) cs->contractions) + 0x40*0x40 : NULL;
+ my_bool have_contractions= my_uca_have_contractions(cs);
for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0
; ptr+=2, charlen--)
@@ -1567,8 +1566,9 @@ fill_max_and_min:
return 0;
}
- if (contraction_flags && ptr + 3 < end &&
- ptr[0] == '\0' && contraction_flags[(uchar) ptr[1]])
+ if (have_contractions && ptr + 3 < end &&
+ ptr[0] == '\0' &&
+ my_uca_can_be_contraction_head(cs, (uchar) ptr[1]))
{
/* Contraction head found */
if (ptr[2] == '\0' && (ptr[3] == w_one || ptr[3] == w_many))
@@ -1581,8 +1581,9 @@ fill_max_and_min:
Check if the second letter can be contraction part,
and if two letters really produce a contraction.
*/
- if (ptr[2] == '\0' && contraction_flags[(uchar) ptr[3]] &&
- cs->contractions[(ptr[1]-0x40)*0x40 + ptr[3] - 0x40])
+ if (ptr[2] == '\0' &&
+ my_uca_can_be_contraction_tail(cs, (uchar) ptr[3]) &&
+ my_uca_contraction2_weight(cs,(uchar) ptr[1], (uchar) ptr[3]))
{
/* Contraction found */
if (charlen == 1 || min_str + 2 >= min_end)