Skip to content

Commit

Permalink
WL#2673 Unicode Collation Algorithm new version
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander Barkov committed Jun 25, 2010
1 parent b8d94ee commit 89a7720
Show file tree
Hide file tree
Showing 37 changed files with 19,500 additions and 1,490 deletions.
73 changes: 61 additions & 12 deletions include/m_ctype.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,63 @@ extern "C" {



typedef struct unicase_info_st
typedef struct unicase_info_char_st
{
uint32 toupper;
uint32 tolower;
uint32 sort;
} MY_UNICASE_CHARACTER;


typedef struct unicase_info_st
{
my_wc_t maxchar;
MY_UNICASE_CHARACTER **page;
} MY_UNICASE_INFO;


extern MY_UNICASE_INFO *my_unicase_default[256];
extern MY_UNICASE_INFO *my_unicase_turkish[256];
extern MY_UNICASE_INFO my_unicase_default;
extern MY_UNICASE_INFO my_unicase_turkish;
extern MY_UNICASE_INFO my_unicase_unicode520;

#define MY_UCA_MAX_CONTRACTION 4
#define MY_UCA_MAX_WEIGHT_SIZE 8

typedef struct my_contraction_t
{
my_wc_t ch[MY_UCA_MAX_CONTRACTION]; /* Character sequence */
uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];/* Its weight string, 0-terminated */
} MY_CONTRACTION;



typedef struct my_contraction_list_t
{
size_t nitems; /* Number of items in the list */
MY_CONTRACTION *item; /* List of contractions */
char *flags; /* Character flags, e.g. "is contraction head") */
} MY_CONTRACTIONS;



typedef struct uca_info_st
{
my_wc_t maxchar;
uchar *lengths;
uint16 **weights;
MY_CONTRACTIONS contractions;
} MY_UCA_INFO;



my_bool my_uca_have_contractions(MY_UCA_INFO *uca);
my_bool my_uca_can_be_contraction_head(MY_UCA_INFO *uca, my_wc_t wc);
my_bool my_uca_can_be_contraction_tail(MY_UCA_INFO *uca, my_wc_t wc);
uint16 *my_uca_contraction2_weight(MY_UCA_INFO *uca, my_wc_t wc1, my_wc_t wc2);


extern MY_UCA_INFO my_uca_v400;


typedef struct uni_ctype_st
{
Expand Down Expand Up @@ -183,7 +230,8 @@ struct charset_info_st;
/* See strings/CHARSET_INFO.txt for information about this structure */
typedef struct my_collation_handler_st
{
my_bool (*init)(struct charset_info_st *, void *(*alloc)(size_t));
my_bool (*init)(struct charset_info_st *, void *(*alloc)(size_t),
char *error, size_t errsize);
/* Collation routines */
int (*strnncoll)(struct charset_info_st *,
const uchar *, size_t, const uchar *, size_t, my_bool);
Expand Down Expand Up @@ -235,7 +283,8 @@ typedef size_t (*my_charset_conv_case)(struct charset_info_st *,
/* See strings/CHARSET_INFO.txt about information on this structure */
typedef struct my_charset_handler_st
{
my_bool (*init)(struct charset_info_st *, void *(*alloc)(size_t));
my_bool (*init)(struct charset_info_st *, void *(*alloc)(size_t),
char *error, size_t errsize);
/* Multibyte routines */
uint (*ismbchar)(struct charset_info_st *, const char *, const char *);
uint (*mbcharlen)(struct charset_info_st *, uint c);
Expand Down Expand Up @@ -320,20 +369,19 @@ typedef struct charset_info_st
uchar *to_lower;
uchar *to_upper;
uchar *sort_order;
uint16 *contractions;
uint16 **sort_order_big;
MY_UCA_INFO *uca;
uint16 *tab_to_uni;
MY_UNI_IDX *tab_from_uni;
MY_UNICASE_INFO **caseinfo;
MY_UNICASE_INFO *caseinfo;
uchar *state_map;
uchar *ident_map;
uint strxfrm_multiply;
uchar caseup_multiply;
uchar casedn_multiply;
uint mbminlen;
uint mbmaxlen;
uint16 min_sort_char;
uint16 max_sort_char; /* For LIKE optimization */
my_wc_t min_sort_char;
my_wc_t max_sort_char; /* For LIKE optimization */
uchar pad_char;
my_bool escape_with_backslash_is_dangerous;
uchar levels_for_compare;
Expand Down Expand Up @@ -584,10 +632,11 @@ int my_wildcmp_unicode(CHARSET_INFO *cs,
const char *str, const char *str_end,
const char *wildstr, const char *wildend,
int escape, int w_one, int w_many,
MY_UNICASE_INFO **weights);
MY_UNICASE_INFO *weights);

extern my_bool my_parse_charset_xml(const char *bug, size_t len,
int (*add)(CHARSET_INFO *cs));
int (*add)(CHARSET_INFO *cs),
char *error, size_t errsize);
extern char *my_strchr(CHARSET_INFO *cs, const char *str, const char *end,
pchar c);

Expand Down
166 changes: 166 additions & 0 deletions mysql-test/include/ctype_unicode520.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#
# WL#2673 Unicode collation algorithm new version
#
CREATE TABLE t1 AS SELECT repeat('a', 10) as c LIMIT 0;
SHOW CREATE TABLE t1;

#
# Unicode-5.0.0 characters
#

# Latin Extended-B and IP extensions
INSERT INTO t1 VALUES (_utf32 0x0180),(_utf32 0x023A);
INSERT INTO t1 VALUES (_utf32 0x023B),(_utf32 0x023C);
INSERT INTO t1 VALUES (_utf32 0x023D),(_utf32 0x023E);
INSERT INTO t1 VALUES (_utf32 0x0241),(_utf32 0x0242);
INSERT INTO t1 VALUES (_utf32 0x0243),(_utf32 0x0244);
INSERT INTO t1 VALUES (_utf32 0x0245),(_utf32 0x0246);
INSERT INTO t1 VALUES (_utf32 0x0247),(_utf32 0x0248);
INSERT INTO t1 VALUES (_utf32 0x0249),(_utf32 0x024A);
INSERT INTO t1 VALUES (_utf32 0x024B),(_utf32 0x024C);
INSERT INTO t1 VALUES (_utf32 0x024D),(_utf32 0x024E);
INSERT INTO t1 VALUES (_utf32 0x024F),(_utf32 0x026B);
INSERT INTO t1 VALUES (_utf32 0x027D),(_utf32 0x0289);
INSERT INTO t1 VALUES (_utf32 0x028C);

# Greek and Coptic
INSERT INTO t1 VALUES (_utf32 0x037B), (_utf32 0x037C);
INSERT INTO t1 VALUES (_utf32 0x037D), (_utf32 0x03FD);
INSERT INTO t1 VALUES (_utf32 0x03FE), (_utf32 0x03FF);

# Cyrillic
INSERT INTO t1 VALUES (_utf32 0x04C0), (_utf32 0x04CF);
INSERT INTO t1 VALUES (_utf32 0x04F6), (_utf32 0x04F7);
INSERT INTO t1 VALUES (_utf32 0x04FA), (_utf32 0x04FB);
INSERT INTO t1 VALUES (_utf32 0x04FC), (_utf32 0x04FD);
INSERT INTO t1 VALUES (_utf32 0x04FE), (_utf32 0x04FF);
INSERT INTO t1 VALUES (_utf32 0x0510), (_utf32 0x0511);
INSERT INTO t1 VALUES (_utf32 0x0512), (_utf32 0x0513);

# Georgian, Georgian Supplement
INSERT INTO t1 VALUES (_utf32 0x10A0), (_utf32 0x10A1);
INSERT INTO t1 VALUES (_utf32 0x10A2), (_utf32 0x10A3);
INSERT INTO t1 VALUES (_utf32 0x10A4), (_utf32 0x10A5);
INSERT INTO t1 VALUES (_utf32 0x10A6), (_utf32 0x10A7);
INSERT INTO t1 VALUES (_utf32 0x2D00), (_utf32 0x2D01);
INSERT INTO t1 VALUES (_utf32 0x2D02), (_utf32 0x2D03);
INSERT INTO t1 VALUES (_utf32 0x2D04), (_utf32 0x2D05);
INSERT INTO t1 VALUES (_utf32 0x2D06), (_utf32 0x2D07);

# Phonetic Extensions
INSERT INTO t1 VALUES (_utf32 0x1D7D);

# Letterlike Symbols
INSERT INTO t1 VALUES (_utf32 0x2132),(_utf32 0x214E);

# Number Forms
INSERT INTO t1 VALUES (_utf32 0x2183),(_utf32 0x2184);

# Coptic
INSERT INTO t1 VALUES (_utf32 0x2C80), (_utf32 0x2C81);
INSERT INTO t1 VALUES (_utf32 0x2C82), (_utf32 0x2C83);
INSERT INTO t1 VALUES (_utf32 0x2C84), (_utf32 0x2C85);
INSERT INTO t1 VALUES (_utf32 0x2C86), (_utf32 0x2C87);
INSERT INTO t1 VALUES (_utf32 0x2C88), (_utf32 0x2C89);
INSERT INTO t1 VALUES (_utf32 0x2C8A), (_utf32 0x2C8B);
INSERT INTO t1 VALUES (_utf32 0x2C8C), (_utf32 0x2C8D);
INSERT INTO t1 VALUES (_utf32 0x2C8E), (_utf32 0x2C8F);

# Latin Extended-C
INSERT INTO t1 VALUES (_utf32 0x2C60), (_utf32 0x2C61);
INSERT INTO t1 VALUES (_utf32 0x2C62), (_utf32 0x2C63);
INSERT INTO t1 VALUES (_utf32 0x2C64), (_utf32 0x2C65);
INSERT INTO t1 VALUES (_utf32 0x2C66), (_utf32 0x2C67);
INSERT INTO t1 VALUES (_utf32 0x2C68), (_utf32 0x2C69);
INSERT INTO t1 VALUES (_utf32 0x2C6A), (_utf32 0x2C6B);
INSERT INTO t1 VALUES (_utf32 0x2C6C), (_utf32 0x2C75);
INSERT INTO t1 VALUES (_utf32 0x2C76);

# Glagolitic
INSERT INTO t1 VALUES (_utf32 0x2C00), (_utf32 0x2C01);
INSERT INTO t1 VALUES (_utf32 0x2C02), (_utf32 0x2C03);
INSERT INTO t1 VALUES (_utf32 0x2C04), (_utf32 0x2C05);
INSERT INTO t1 VALUES (_utf32 0x2C06), (_utf32 0x2C07);
INSERT INTO t1 VALUES (_utf32 0x2C30), (_utf32 0x2C31);
INSERT INTO t1 VALUES (_utf32 0x2C32), (_utf32 0x2C33);
INSERT INTO t1 VALUES (_utf32 0x2C34), (_utf32 0x2C35);
INSERT INTO t1 VALUES (_utf32 0x2C36), (_utf32 0x2C37);

# Deseret
INSERT INTO t1 VALUES (_utf32 0x10400), (_utf32 0x10401);
INSERT INTO t1 VALUES (_utf32 0x10402), (_utf32 0x10403);
INSERT INTO t1 VALUES (_utf32 0x10404), (_utf32 0x10405);
INSERT INTO t1 VALUES (_utf32 0x10406), (_utf32 0x10407);
INSERT INTO t1 VALUES (_utf32 0x10428), (_utf32 0x10429);
INSERT INTO t1 VALUES (_utf32 0x1042A), (_utf32 0x1042B);
INSERT INTO t1 VALUES (_utf32 0x1042C), (_utf32 0x1042D);
INSERT INTO t1 VALUES (_utf32 0x1042E), (_utf32 0x1042F);


#
# Unicode 5.1.0 characters
#

INSERT INTO t1 VALUES (_utf32 0x0370); # GREEK CAPITAL LETTER HETA
INSERT INTO t1 VALUES (_utf32 0x0371); # GREEK SMALL LETTER HETA
INSERT INTO t1 VALUES (_utf32 0x0372); # GREEK CAPITAL LETTER ARCHAIC SAMPI
INSERT INTO t1 VALUES (_utf32 0x0373); # GREEK SMALL LETTER ARCHAIC SAMPI

INSERT INTO t1 VALUES (_utf32 0x0514); # CYRILLIC CAPITAL LETTER LHA
INSERT INTO t1 VALUES (_utf32 0x0515); # CYRILLIC SMALL LETTER LHA
INSERT INTO t1 VALUES (_utf32 0x0516); # CYRILLIC CAPITAL LETTER RHA
INSERT INTO t1 VALUES (_utf32 0x0517); # CYRILLIC SMALL LETTER RHA

INSERT INTO t1 VALUES (_utf32 0xA640); # CYRILLIC CAPITAL LETTER ZEMLYA
INSERT INTO t1 VALUES (_utf32 0xA641); # CYRILLIC SMALL LETTER ZEMLYA
INSERT INTO t1 VALUES (_utf32 0xA642); # CYRILLIC CAPITAL LETTER DZELO
INSERT INTO t1 VALUES (_utf32 0xA643); # CYRILLIC SMALL LETTER DZELO

INSERT INTO t1 VALUES (_utf32 0xA722); # LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF
INSERT INTO t1 VALUES (_utf32 0xA723); # LATIN SMALL LETTER EGYPTOLOGICAL ALEF
INSERT INTO t1 VALUES (_utf32 0xA724); # LATIN CAPITAL LETTER EGYPTOLOGICAL AIN
INSERT INTO t1 VALUES (_utf32 0xA725); # LATIN SMALL LETTER EGYPTOLOGICAL AIN

INSERT INTO t1 VALUES (_utf32 0xA726); # LATIN CAPITAL LETTER HENG
INSERT INTO t1 VALUES (_utf32 0xA727); # LATIN SMALL LETTER HENG
INSERT INTO t1 VALUES (_utf32 0xA728); # LATIN CAPITAL LETTER TZ
INSERT INTO t1 VALUES (_utf32 0xA729); # LATIN SMALL LETTER TZ
INSERT INTO t1 VALUES (_utf32 0xA72A); # LATIN CAPITAL LETTER TRESILLO
INSERT INTO t1 VALUES (_utf32 0xA72B); # LATIN SMALL LETTER TRESILLO

#
# Unicode 5.2.0 characters
#

INSERT INTO t1 VALUES (_utf32 0x2CEB); # COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI;Lu;0;L;;;;;N;;;;2CEC;
INSERT INTO t1 VALUES (_utf32 0x2CEC); # COPTIC SMALL LETTER CRYPTOGRAMMIC SHEI;Ll;0;L;;;;;N;;;2CEB;;2CEB
INSERT INTO t1 VALUES (_utf32 0x2CED); # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA;Lu;0;L;;;;;N;;;;2CEE;
INSERT INTO t1 VALUES (_utf32 0x2CEE); # COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA;Ll;0;L;;;;;N;;;2CED;;2CED

#
# Check case folding and UCA weights
#
SELECT hex(c), hex(lower(c)), hex(upper(c)), hex(weight_string(c)), c
FROM t1 ORDER BY c, BINARY c;


#
# Check that LIKE works fine with and without index.
# This test makes sure that cs->min_sort_char and cs->max_sort_char
# are set properly
# Also check that LIKE is case insensitive for supplementary characters
#
INSERT INTO t1 VALUES ('a');
INSERT INTO t1 VALUES (concat(_utf32 0x61, _utf32 0xFFFF));
INSERT INTO t1 VALUES (concat(_utf32 0x61, _utf32 0x10FFFF));
INSERT INTO t1 VALUES (concat(_utf32 0x61, _utf32 0x10400));
SELECT hex(c), hex(weight_string(c)) FROM t1 WHERE c LIKE 'a%' ORDER BY c;
SELECT hex(c), hex(weight_string(c)), c FROM t1 WHERE c LIKE _utf32 0x10400 ORDER BY c, BINARY c;
SELECT hex(c), hex(weight_string(c)), c FROM t1 WHERE c LIKE _utf32 0x10428 ORDER BY c, BINARY c;
ALTER TABLE t1 ADD KEY(c);
EXPLAIN SELECT hex(c) FROM t1 WHERE c LIKE 'a%' ORDER BY c;
SELECT hex(c), hex(weight_string(c)) FROM t1 WHERE c LIKE 'a%' ORDER BY c;
SELECT hex(c), hex(weight_string(c)), c FROM t1 WHERE c LIKE _utf32 0x10400 ORDER BY c, BINARY c;
SELECT hex(c), hex(weight_string(c)), c FROM t1 WHERE c LIKE _utf32 0x10428 ORDER BY c, BINARY c;

DROP TABLE t1;
65 changes: 65 additions & 0 deletions mysql-test/r/ctype_ldml.result
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,51 @@ select * from t1 where c1='b';
c1
a
drop table t1;
SELECT hex(weight_string(_utf8mb4'a' collate utf8mb4_test_ci));
hex(weight_string(_utf8mb4'a' collate utf8mb4_test_ci))
120F
SELECT hex(weight_string(convert(_utf32 0x10002 using utf8mb4) collate utf8mb4_test_ci));
hex(weight_string(convert(_utf32 0x10002 using utf8mb4) collate utf8mb4_test_ci))
314A
SELECT hex(@a:=convert(_utf32 0x10400 using utf8mb4) collate utf8mb4_test_ci), hex(lower(@a));
hex(@a:=convert(_utf32 0x10400 using utf8mb4) collate utf8mb4_test_ci) hex(lower(@a))
F0909080 F09090A8
SELECT hex(@a:=convert(_utf32 0x10428 using utf8mb4) collate utf8mb4_test_ci), hex(upper(@a));
hex(@a:=convert(_utf32 0x10428 using utf8mb4) collate utf8mb4_test_ci) hex(upper(@a))
F09090A8 F0909080
SELECT hex(@a:=convert(_utf32 0x2C00 using utf8mb4) collate utf8mb4_test_ci), hex(lower(@a));
hex(@a:=convert(_utf32 0x2C00 using utf8mb4) collate utf8mb4_test_ci) hex(lower(@a))
E2B080 E2B0B0
SELECT hex(@a:=convert(_utf32 0x2C30 using utf8mb4) collate utf8mb4_test_ci), hex(upper(@a));
hex(@a:=convert(_utf32 0x2C30 using utf8mb4) collate utf8mb4_test_ci) hex(upper(@a))
E2B0B0 E2B080
SELECT hex(weight_string(convert(_utf32 0x61 using utf8mb4) collate utf8mb4_test_ci));
hex(weight_string(convert(_utf32 0x61 using utf8mb4) collate utf8mb4_test_ci))
120F
SELECT hex(weight_string(convert(_utf32 0x62 using utf8mb4) collate utf8mb4_test_ci));
hex(weight_string(convert(_utf32 0x62 using utf8mb4) collate utf8mb4_test_ci))
120F
SELECT hex(weight_string(convert(_utf32 0x10062 using utf8mb4) collate utf8mb4_test_ci));
hex(weight_string(convert(_utf32 0x10062 using utf8mb4) collate utf8mb4_test_ci))
120F
SELECT hex(weight_string(convert(_utf32 0x10400 using utf8mb4) collate utf8mb4_test_ci));
hex(weight_string(convert(_utf32 0x10400 using utf8mb4) collate utf8mb4_test_ci))
30D2
SELECT hex(weight_string(convert(_utf32 0x100400 using utf8mb4) collate utf8mb4_test_ci));
hex(weight_string(convert(_utf32 0x100400 using utf8mb4) collate utf8mb4_test_ci))
30D2
SELECT hex(weight_string(_utf8mb4 0x64 collate utf8mb4_test_ci));
hex(weight_string(_utf8mb4 0x64 collate utf8mb4_test_ci))
1250
SELECT hex(weight_string(convert(_ucs2 0x0064017e using utf8mb4) collate utf8mb4_test_ci));
hex(weight_string(convert(_ucs2 0x0064017e using utf8mb4) collate utf8mb4_test_ci))
1251
SELECT hex(weight_string(convert(_ucs2 0x0044017e using utf8mb4) collate utf8mb4_test_ci));
hex(weight_string(convert(_ucs2 0x0044017e using utf8mb4) collate utf8mb4_test_ci))
1251
SELECT hex(weight_string(convert(_ucs2 0x0044017d using utf8mb4) collate utf8mb4_test_ci));
hex(weight_string(convert(_ucs2 0x0044017d using utf8mb4) collate utf8mb4_test_ci))
1251
CREATE TABLE t1 (
col1 varchar(100) character set utf8 collate utf8_test_ci
);
Expand Down Expand Up @@ -411,6 +456,7 @@ select * from information_schema.collations where id>256 order by id;
COLLATION_NAME CHARACTER_SET_NAME ID IS_DEFAULT IS_COMPILED SORTLEN
utf8mb4_test_ci utf8mb4 326 8
utf16_test_ci utf16 327 8
utf8mb4_test_400_ci utf8mb4 328 8
utf8_phone_ci utf8 352 8
utf8_test_ci utf8 353 8
ucs2_test_ci ucs2 358 8
Expand All @@ -423,6 +469,7 @@ latin1_test latin1 99 Yes 1
utf8_test_ci utf8 353 8
ucs2_test_ci ucs2 358 8
utf8mb4_test_ci utf8mb4 326 8
utf8mb4_test_400_ci utf8mb4 328 8
utf16_test_ci utf16 327 8
utf32_test_ci utf32 391 8
show collation like 'ucs2_vn_ci';
Expand All @@ -449,3 +496,21 @@ SHOW COLLATION LIKE 'utf8_phone_ci';
Collation Charset Id Default Compiled Sortlen
utf8_phone_ci utf8 352 8
SET NAMES utf8;
SELECT hex(weight_string(_utf8mb4'a' collate utf8mb4_test_400_ci));
hex(weight_string(_utf8mb4'a' collate utf8mb4_test_400_ci))
0E33
SELECT hex(weight_string(convert(_utf32 0x10002 using utf8mb4) collate utf8mb4_test_400_ci));
hex(weight_string(convert(_utf32 0x10002 using utf8mb4) collate utf8mb4_test_400_ci))
FFFD
SELECT hex(@a:=convert(_utf32 0x10400 using utf8mb4) collate utf8mb4_test_400_ci), hex(lower(@a));
hex(@a:=convert(_utf32 0x10400 using utf8mb4) collate utf8mb4_test_400_ci) hex(lower(@a))
F0909080 F0909080
SELECT hex(@a:=convert(_utf32 0x10428 using utf8mb4) collate utf8mb4_test_400_ci), hex(upper(@a));
hex(@a:=convert(_utf32 0x10428 using utf8mb4) collate utf8mb4_test_400_ci) hex(upper(@a))
F09090A8 F09090A8
SELECT hex(@a:=convert(_utf32 0x2C00 using utf8mb4) collate utf8mb4_test_400_ci), hex(lower(@a));
hex(@a:=convert(_utf32 0x2C00 using utf8mb4) collate utf8mb4_test_400_ci) hex(lower(@a))
E2B080 E2B080
SELECT hex(@a:=convert(_utf32 0x2C30 using utf8mb4) collate utf8mb4_test_400_ci), hex(upper(@a));
hex(@a:=convert(_utf32 0x2C30 using utf8mb4) collate utf8mb4_test_400_ci) hex(upper(@a))
E2B0B0 E2B0B0
Loading

0 comments on commit 89a7720

Please sign in to comment.