Skip to content

Commit

Permalink
Added Chinese HZ encoding support. Fixed ASCII-range character convers…
Browse files Browse the repository at this point in the history
…ion, which did not work in EUC-CN and EUC-KR.
  • Loading branch information
Rui Hirokawa committed May 12, 2002
1 parent 38ad391 commit 767fa10
Show file tree
Hide file tree
Showing 5 changed files with 272 additions and 11 deletions.
119 changes: 113 additions & 6 deletions ext/mbstring/mbfilter.c
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,16 @@ static mbfl_language mbfl_language_japanese = {
mbfl_no_encoding_7bit
};

/*
 * Language descriptor for Korean; listed in mbfl_language_ptr_table.
 *
 * NOTE(review): the mbfl_language declaration is not visible here, so the
 * per-field notes below are inferred from the initializer values and the
 * sibling descriptors -- confirm against mbfilter.h.
 */
static mbfl_language mbfl_language_korean = {
mbfl_no_language_korean,
"Korean", /* presumably the full language name */
"ko", /* presumably the short language code */
NULL,
mbfl_no_encoding_2022kr, /* presumably the mail charset (ISO-2022-KR) -- confirm */
mbfl_no_encoding_base64, /* presumably the mail header encoding -- confirm */
mbfl_no_encoding_7bit /* presumably the mail body transfer encoding -- confirm */
};

static mbfl_language mbfl_language_english = {
mbfl_no_language_english,
"English",
Expand All @@ -155,20 +165,32 @@ static mbfl_language mbfl_language_english = {
mbfl_no_encoding_8bit
};

static mbfl_language mbfl_language_chinese = {
mbfl_no_language_chinese,
"Chinese",
"zh",
/*
 * Language descriptor for Simplified Chinese ("zh-cn").
 *
 * The initializer has seven positional fields like the other language
 * descriptors in this file.  The left-over mbfl_no_encoding_2022jp entry
 * (a Japanese charset inherited from the old generic "Chinese" descriptor)
 * is dropped so that HZ, a 7-bit GB2312 encoding, is the charset slot.
 *
 * NOTE(review): field roles (name, short name, aliases, mail charset,
 * header encoding, body encoding) are inferred from the values; confirm
 * against the mbfl_language declaration.
 */
static mbfl_language mbfl_language_simplified_chinese = {
	mbfl_no_language_simplified_chinese,
	"Simplified Chinese",
	"zh-cn",
	NULL,
	mbfl_no_encoding_hz,
	mbfl_no_encoding_base64,
	mbfl_no_encoding_7bit
};

static mbfl_language mbfl_language_traditional_chinese = {
mbfl_no_language_traditional_chinese,
"Traditional Chinese",
"zh-tw",
NULL,
mbfl_no_encoding_hz,
mbfl_no_encoding_base64,
mbfl_no_encoding_7bit
};

/*
 * NULL-terminated table of the language descriptors defined in this file.
 *
 * The generic "Chinese" descriptor was split into Simplified and
 * Traditional Chinese, so the table lists those two and no longer refers
 * to mbfl_language_chinese.
 */
static mbfl_language *mbfl_language_ptr_table[] = {
	&mbfl_language_uni,
	&mbfl_language_japanese,
	&mbfl_language_korean,
	&mbfl_language_simplified_chinese,
	&mbfl_language_traditional_chinese,
	&mbfl_language_english,
	NULL
};
Expand Down Expand Up @@ -707,6 +729,15 @@ static mbfl_encoding mbfl_encoding_cp936 = {
MBFL_ENCTYPE_MBCS
};

/*
 * Encoding descriptor for HZ.  It is registered in mbfl_encoding_ptr_list
 * when HAVE_MBSTR_CN is defined.
 *
 * NOTE(review): the mbfl_encoding declaration is not visible here; the
 * per-field notes are inferred from the values -- confirm in mbfilter.h.
 */
static mbfl_encoding mbfl_encoding_hz = {
mbfl_no_encoding_hz,
"HZ", /* presumably the encoding name */
"HZ-GB-2312", /* presumably the MIME charset name */
NULL,
NULL,
MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_SHFTCODE /* multi-byte, and uses shift (mode-switch) sequences */
};

#endif /* HAVE_MBSTR_CN */

#if defined(HAVE_MBSTR_TW)
Expand Down Expand Up @@ -967,6 +998,7 @@ static mbfl_encoding *mbfl_encoding_ptr_list[] = {
#if defined(HAVE_MBSTR_CN)
&mbfl_encoding_euc_cn,
&mbfl_encoding_cp936,
&mbfl_encoding_hz,
#endif
#if defined(HAVE_MBSTR_TW)
&mbfl_encoding_euc_tw,
Expand Down Expand Up @@ -1072,6 +1104,7 @@ static int mbfl_filt_ident_2022jp(int c, mbfl_identify_filter *filter TSRMLS_DC)
#if defined(HAVE_MBSTR_CN)
static int mbfl_filt_ident_euccn(int c, mbfl_identify_filter *filter TSRMLS_DC);
static int mbfl_filt_ident_cp936(int c, mbfl_identify_filter *filter TSRMLS_DC);
static int mbfl_filt_ident_hz(int c, mbfl_identify_filter *filter TSRMLS_DC);
#endif /* HAVE_MBSTR_CN */

#if defined(HAVE_MBSTR_TW)
Expand Down Expand Up @@ -1605,6 +1638,23 @@ static struct mbfl_convert_vtbl vtbl_wchar_cp936 = {
mbfl_filt_conv_common_dtor,
mbfl_filt_conv_wchar_cp936,
mbfl_filt_conv_common_flush };

/*
 * Conversion table entry for HZ => wchar.  Pairs the two encoding ids with
 * the common constructor/destructor, the HZ decoder and the common flush.
 */
static struct mbfl_convert_vtbl vtbl_hz_wchar = {
mbfl_no_encoding_hz, /* source encoding */
mbfl_no_encoding_wchar, /* destination encoding */
mbfl_filt_conv_common_ctor,
mbfl_filt_conv_common_dtor,
mbfl_filt_conv_hz_wchar, /* per-byte decoder */
mbfl_filt_conv_common_flush };

/*
 * Conversion table entry for wchar => HZ.  Unlike the decoder entry it uses
 * a dedicated flush, mbfl_filt_conv_any_hz_flush, which writes the closing
 * "~}" when the encoder is still in GB2312 mode.
 */
static struct mbfl_convert_vtbl vtbl_wchar_hz = {
mbfl_no_encoding_wchar, /* source encoding */
mbfl_no_encoding_hz, /* destination encoding */
mbfl_filt_conv_common_ctor,
mbfl_filt_conv_common_dtor,
mbfl_filt_conv_wchar_hz, /* per-character encoder */
mbfl_filt_conv_any_hz_flush };

#endif /* HAVE_MBSTR_CN */

#if defined(HAVE_MBSTR_TW)
Expand Down Expand Up @@ -1923,6 +1973,8 @@ static struct mbfl_convert_vtbl *mbfl_convert_filter_list[] = {
&vtbl_wchar_euccn,
&vtbl_cp936_wchar,
&vtbl_wchar_cp936,
&vtbl_hz_wchar,
&vtbl_wchar_hz,
#endif
#if defined(HAVE_MBSTR_TW)
&vtbl_euctw_wchar,
Expand Down Expand Up @@ -2083,6 +2135,13 @@ static struct mbfl_identify_vtbl vtbl_identify_cp936 = {
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
mbfl_filt_ident_cp936 };

/*
 * Identify-filter table entry for HZ: the encoding id, the common
 * constructor/destructor, and the HZ byte checker.  It is listed in
 * mbfl_identify_filter_list when HAVE_MBSTR_CN is defined.
 */
static struct mbfl_identify_vtbl vtbl_identify_hz = {
mbfl_no_encoding_hz,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
mbfl_filt_ident_hz };

#endif /* HAVE_MBSTR_CN */

#if defined(HAVE_MBSTR_TW)
Expand Down Expand Up @@ -2218,6 +2277,7 @@ static struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
#if defined(HAVE_MBSTR_CN)
&vtbl_identify_euccn,
&vtbl_identify_cp936,
&vtbl_identify_hz,
#endif
#if defined(HAVE_MBSTR_TW)
&vtbl_identify_euctw,
Expand Down Expand Up @@ -5756,6 +5816,53 @@ mbfl_filt_ident_cp936(int c, mbfl_identify_filter *filter TSRMLS_DC)
return c;
}

/*
 * Identify filter for HZ: checks one input byte against the HZ grammar and
 * sets filter->flag to 1 when the byte cannot occur at this position.
 *
 * filter->status layout:
 *   bits 4..7 : 0x00 = ASCII mode, 0x10 = GB2312 mode
 *   bits 0..3 : 0 = idle, 1 = second GB2312 byte expected,
 *               2 = '~' seen, escape character expected
 *
 * Accepted escapes are "~{" (enter GB2312), "~}" (back to ASCII), "~~"
 * (literal tilde) and "~\n" (RFC 1843 line continuation, which does not
 * change the mode).  Always returns c.
 */
static int
mbfl_filt_ident_hz(int c, mbfl_identify_filter *filter TSRMLS_DC)
{
	switch (filter->status & 0xf) {
/*	case 0x00:	 ASCII */
/*	case 0x10:	 GB2312 */
	case 0:
		if (c == 0x7e) {	/* '~': start of an escape sequence */
			filter->status += 2;
		} else if (filter->status == 0x10 && c > 0x20 && c < 0x7f) {	/* DBCS first char */
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) {	/* latin, CTLs */
			;
		} else {
			filter->flag = 1;	/* bad */
		}
		break;

/*	case 0x11:	 GB2312 second char */
	case 1:
		filter->status &= ~0xf;
		if (c < 0x21 || c > 0x7e) {	/* bad */
			filter->flag = 1;
		}
		break;

/*	'~' seen */
	case 2:
		if (c == 0x7d) {		/* '}' */
			filter->status = 0;
		} else if (c == 0x7b) {		/* '{' */
			filter->status = 0x10;
		} else if (c == 0x7e) {		/* '~' */
			filter->status = 0;
		} else if (c == 0x0a) {		/* "~\n": line continuation, mode unchanged */
			filter->status &= ~0xf;
		} else {
			filter->flag = 1;	/* bad */
			filter->status &= ~0xf;
		}
		break;

	default:
		filter->status = 0;
		break;
	}

	return c;
}

#endif /* HAVE_MBSTR_CN */

#if defined(HAVE_MBSTR_TW)
Expand Down
5 changes: 4 additions & 1 deletion ext/mbstring/mbfilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ enum mbfl_no_language {
mbfl_no_language_polish, /* pl */
mbfl_no_language_portuguese, /* pt */
mbfl_no_language_swedish, /* sv */
mbfl_no_language_chinese, /* zh */
mbfl_no_language_simplified_chinese, /* zh-cn */
mbfl_no_language_traditional_chinese, /* zh-tw */
mbfl_no_language_max
};

Expand Down Expand Up @@ -172,7 +173,9 @@ enum mbfl_no_encoding {
mbfl_no_encoding_euc_tw,
mbfl_no_encoding_big5,
mbfl_no_encoding_euc_kr,
mbfl_no_encoding_2022kr,
mbfl_no_encoding_uhc,
mbfl_no_encoding_hz,
mbfl_no_encoding_charset_max
};

Expand Down
152 changes: 150 additions & 2 deletions ext/mbstring/mbfilter_cn.c
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,8 @@ mbfl_filt_conv_wchar_euccn(int c, mbfl_convert_filter *filter TSRMLS_DC)
c1 = (s >> 8) & 0xff;
c2 = s & 0xff;

if (c1 < 0xa1 || c2 < 0xa1) { /* exclude CP932 extension */
s = 0;
if (c1 < 0xa1 || c2 < 0xa1) { /* exclude CP936 extension */
s = c;
}

if (s <= 0) {
Expand Down Expand Up @@ -259,6 +259,154 @@ mbfl_filt_conv_wchar_cp936(int c, mbfl_convert_filter *filter TSRMLS_DC)
return c;
}


/*
 * HZ => wchar
 *
 * Decodes one byte of an HZ stream and passes the resulting character to
 * filter->output_function.
 *
 * filter->status layout:
 *   bits 4..7 : 0x00 = ASCII mode, 0x10 = GB2312 mode
 *   bits 0..3 : 0 = idle, 1 = first GB2312 byte held in filter->cache,
 *               2 = '~' seen, escape character expected
 *
 * Escapes: "~{" enters GB2312 mode, "~}" returns to ASCII, "~~" yields a
 * literal '~', and "~\n" is the RFC 1843 line continuation (consumed with
 * no output).  Any other byte after '~' is reported through the
 * MBFL_WCSGROUP_THROUGH group and the decoder returns to the idle state of
 * the current mode, so later input is not swallowed.
 *
 * Returns c.  NOTE(review): CK() is defined elsewhere; it presumably
 * returns early when the output function reports failure.
 */
int
mbfl_filt_conv_hz_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC)
{
	int c1, s, w;

	switch (filter->status & 0xf) {
/*	case 0x00:	 ASCII */
/*	case 0x10:	 GB2312 */
	case 0:
		if (c == 0x7e) {	/* '~': start of an escape sequence */
			filter->status += 2;
		} else if (filter->status == 0x10 && c > 0x20 && c < 0x7f) {	/* DBCS first char */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) {	/* latin, CTLs */
			CK((*filter->output_function)(c, filter->data TSRMLS_CC));
		} else {
			w = c & MBFL_WCSGROUP_MASK;
			w |= MBFL_WCSGROUP_THROUGH;
			CK((*filter->output_function)(w, filter->data TSRMLS_CC));
		}
		break;

/*	case 0x11:	 GB2312 second char */
	case 1:
		filter->status &= ~0xf;
		c1 = filter->cache;
		if (c1 > 0x20 && c1 < 0x7f && c > 0x20 && c < 0x7f) {
			/* index as if both bytes had their high bit set (EUC-CN form) */
			s = (c1 - 1)*192 + c + 0x40;	/* GB2312 */
			if (s >= 0 && s < cp936_ucs_table_size) {
				w = cp936_ucs_table[s];
			} else {
				w = 0;
			}
			if (w <= 0) {
				/* no mapping: pass the raw code on in the GB2312 plane */
				w = (c1 << 8) | c;
				w &= MBFL_WCSPLANE_MASK;
				w |= MBFL_WCSPLANE_GB2312;
			}
			CK((*filter->output_function)(w, filter->data TSRMLS_CC));
		} else if ((c >= 0 && c < 0x21) || c == 0x7f) {		/* CTLs */
			CK((*filter->output_function)(c, filter->data TSRMLS_CC));
		} else {
			w = (c1 << 8) | c;
			w &= MBFL_WCSGROUP_MASK;
			w |= MBFL_WCSGROUP_THROUGH;
			CK((*filter->output_function)(w, filter->data TSRMLS_CC));
		}
		break;

/*	'~' seen */
	case 2:
		if (c == 0x7d) {		/* '}' */
			filter->status = 0x0;
		} else if (c == 0x7b) {		/* '{' */
			filter->status = 0x10;
		} else if (c == 0x7e) {		/* '~' */
			filter->status = 0x0;
			CK((*filter->output_function)(0x007e, filter->data TSRMLS_CC));
		} else {
			/* leave the escape state first so an output failure cannot wedge it */
			filter->status &= ~0xf;
			if (c != 0x0a) {	/* "~\n" is a line continuation: no output */
				w = (0x7e << 8) | c;
				w &= MBFL_WCSGROUP_MASK;
				w |= MBFL_WCSGROUP_THROUGH;
				CK((*filter->output_function)(w, filter->data TSRMLS_CC));
			}
		}
		break;

	default:
		filter->status = 0;
		break;
	}

	return c;
}

/*
 * wchar => HZ
 *
 * Encodes one character as HZ and writes the bytes to
 * filter->output_function.
 *
 * The character is looked up in the CP936 tables.  Only codes whose two
 * bytes both lie in 0xA1..0xFE (the GB2312 area) can be expressed in HZ;
 * they are turned into a 7-bit pair by subtracting 0x8080.  Every other
 * CP936 code is treated as unmappable, so CP936 extension codes are never
 * emitted as a 7-bit pair that would decode to a different character.
 *
 * filter->status (bits 8..15): 0 = ASCII mode, 0x200 = GB2312 mode.
 * "~{" / "~}" are written on mode changes and '~' is written as "~~".
 *
 * Unmappable characters go to mbfl_filt_conv_illegal_output unless
 * illegal_mode is MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE.  Returns c.
 * NOTE(review): CK() is defined elsewhere; it presumably returns early
 * when the output function reports failure.
 */
int
mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter TSRMLS_DC)
{
	int s, hi, lo;

	s = 0;
	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		s = ucs_hff_cp936_table[c - ucs_hff_cp936_table_min];
	}

	if (s >= 0x0080) {
		hi = s >> 8;
		lo = s & 0xff;
		if (hi >= 0xa1 && hi <= 0xfe && lo >= 0xa1 && lo <= 0xfe) {
			s -= 0x8080;	/* EUC-CN form -> 7-bit GB2312 pair */
		} else {
			s = -1;		/* CP936 extension: not representable in HZ */
		}
	} else if (s <= 0) {
		/* no mapping found; only U+0000 itself maps to byte 0 */
		s = (c == 0) ? 0 : -1;
	}

	if (s >= 0) {
		if (s < 0x80) {	/* ASCII */
			if ((filter->status & 0xff00) != 0) {
				CK((*filter->output_function)(0x7e, filter->data TSRMLS_CC));		/* '~' */
				CK((*filter->output_function)(0x7d, filter->data TSRMLS_CC));		/* '}' */
			}
			filter->status = 0;
			if (s == 0x7e) {
				/* a literal '~' is written as "~~" */
				CK((*filter->output_function)(0x7e, filter->data TSRMLS_CC));
			}
			CK((*filter->output_function)(s, filter->data TSRMLS_CC));
		} else {	/* GB 2312-80 */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x7e, filter->data TSRMLS_CC));		/* '~' */
				CK((*filter->output_function)(0x7b, filter->data TSRMLS_CC));		/* '{' */
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data TSRMLS_CC));
			CK((*filter->output_function)(s & 0x7f, filter->data TSRMLS_CC));
		}
	} else {
		if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
			CK(mbfl_filt_conv_illegal_output(c, filter TSRMLS_CC));
		}
	}

	return c;
}

/*
 * Flush for conversions that produce HZ.
 *
 * When bits 8..15 of filter->status are non-zero (the encoder is not in
 * ASCII mode), the closing escape "~}" is written.  Those bits are then
 * cleared.  Returns 0.
 */
int
mbfl_filt_conv_any_hz_flush(mbfl_convert_filter *filter TSRMLS_DC)
{
	const int shift_state = filter->status & 0xff00;

	if (shift_state != 0) {
		/* return to ASCII mode with "~}" */
		CK((*filter->output_function)(0x7e, filter->data TSRMLS_CC));	/* '~' */
		CK((*filter->output_function)(0x7d, filter->data TSRMLS_CC));	/* '}' */
	}

	filter->status &= 0xff;

	return 0;
}

#endif /* HAVE_MBSTR_CN */

/*
Expand Down
3 changes: 3 additions & 0 deletions ext/mbstring/mbfilter_cn.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,8 @@ int mbfl_filt_conv_euccn_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC);
int mbfl_filt_conv_wchar_euccn(int c, mbfl_convert_filter *filter TSRMLS_DC);
int mbfl_filt_conv_cp936_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC);
int mbfl_filt_conv_wchar_cp936(int c, mbfl_convert_filter *filter TSRMLS_DC);
int mbfl_filt_conv_hz_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC);
int mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter TSRMLS_DC);
int mbfl_filt_conv_any_hz_flush(mbfl_convert_filter *filter TSRMLS_DC);

#endif /* MBFL_MBFILTER_CN_H */
Loading

0 comments on commit 767fa10

Please sign in to comment.