diff --git a/glib/gutf8.c b/glib/gutf8.c index 579c017f7..f48ed4af3 100644 --- a/glib/gutf8.c +++ b/glib/gutf8.c @@ -677,6 +677,8 @@ g_utf8_get_char_validated (const gchar *p, return result; } +#define CONT_BYTE_FAST(p) ((guchar)*p++ & 0x3f) + /** * g_utf8_to_ucs4_fast: * @str: a UTF-8 encoded string @@ -729,39 +731,52 @@ g_utf8_to_ucs4_fast (const gchar *str, p = str; for (i=0; i < n_chars; i++) { - gunichar wc = (guchar)*p++; + guchar first = (guchar)*p++; + gunichar wc; - if (wc < 0x80) + if (first < 0xc0) { - result[i] = wc; + /* We really hope first < 0x80, but we don't want to test an + * extra branch for invalid input, which this function + * does not care about. Handling unexpected continuation bytes + * here will do the least damage. */ + wc = first; } else - { - gunichar mask = 0x40; - - if (G_UNLIKELY ((wc & mask) == 0)) - { - /* It's an out-of-sequence 10xxxxxxx byte. - * Rather than making an ugly hash of this and the next byte - * and overrunning the buffer, it's more useful to treat it - * with a replacement character - */ - result[i] = 0xfffd; - continue; - } - - do - { - wc <<= 6; - wc |= (guchar)(*p++) & 0x3f; - mask <<= 5; - } - while((wc & mask) != 0); - - wc &= mask - 1; - - result[i] = wc; + { + gunichar c1 = CONT_BYTE_FAST(p); + if (first < 0xe0) + { + wc = ((first & 0x1f) << 6) | c1; + } + else + { + gunichar c2 = CONT_BYTE_FAST(p); + if (first < 0xf0) + { + wc = ((first & 0x0f) << 12) | (c1 << 6) | c2; + } + else + { + gunichar c3 = CONT_BYTE_FAST(p); + wc = ((first & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3; + if (G_UNLIKELY (first >= 0xf8)) + { + /* This can't be valid UTF-8, but g_utf8_next_char() + * and company allow out-of-range sequences */ + gunichar mask = 1 << 20; + while ((wc & mask) != 0) + { + wc <<= 6; + wc |= CONT_BYTE_FAST(p); + mask <<= 5; + } + wc &= mask - 1; + } + } + } } + result[i] = wc; } result[i] = 0; @@ -1442,20 +1457,18 @@ g_ucs4_to_utf16 (const gunichar *str, return result; } -#define CONTINUATION_CHAR \ - G_STMT_START { \ - if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ \ - goto error; \ - val <<= 6; \ - val |= (*(guchar *)p) & 0x3f; \ - } G_STMT_END +#define VALIDATE_BYTE(mask, expect) \ + G_STMT_START { \ + if (G_UNLIKELY((*(guchar *)p & (mask)) != (expect))) \ + goto error; \ + } G_STMT_END + +/* see IETF RFC 3629 Section 4 */ static const gchar * fast_validate (const char *str) { - gunichar val = 0; - gunichar min = 0; const gchar *p; for (p = str; *p; p++) @@ -1465,49 +1478,56 @@ fast_validate (const char *str) else { const gchar *last; - + last = p; - if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */ + if (*(guchar *)p < 0xe0) /* 110xxxxx */ { if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) goto error; - p++; - if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */ - goto error; } else { - if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */ + if (*(guchar *)p < 0xf0) /* 1110xxxx */ { - min = (1 << 11); - val = *(guchar *)p & 0x0f; - goto TWO_REMAINING; + switch (*(guchar *)p++ & 0x0f) + { + case 0: + VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */ + break; + case 0x0d: + VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */ + break; + default: + VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ + } } - else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */ + else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */ { - min = (1 << 16); - val = *(guchar *)p & 0x07; + switch (*(guchar *)p++ & 0x07) + { + case 0: + VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ + if (G_UNLIKELY((*(guchar *)p & 0x30) == 0)) + goto error; + break; + case 4: + VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */ + break; + default: + VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ + } + p++; + VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ } else goto error; - - p++; - CONTINUATION_CHAR; - TWO_REMAINING: - p++; - CONTINUATION_CHAR; - p++; - CONTINUATION_CHAR; - - if (G_UNLIKELY (val < min)) - goto error; + } + + p++; + VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - if (G_UNLIKELY (!UNICODE_VALID(val))) - goto error; - } - continue; - + error: return last; } @@ -1521,8 +1541,6 @@ fast_validate_len (const char *str, gssize max_len) { - gunichar val = 0; - gunichar min = 0; const gchar *p; g_assert (max_len >= 0); @@ -1534,57 +1552,65 @@ fast_validate_len (const char *str, else { const gchar *last; - + last = p; - if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */ + if (*(guchar *)p < 0xe0) /* 110xxxxx */ { if (G_UNLIKELY (max_len - (p - str) < 2)) goto error; if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) goto error; - p++; - if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */ - goto error; } else { - if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */ + if (*(guchar *)p < 0xf0) /* 1110xxxx */ { if (G_UNLIKELY (max_len - (p - str) < 3)) goto error; - - min = (1 << 11); - val = *(guchar *)p & 0x0f; - goto TWO_REMAINING; + + switch (*(guchar *)p++ & 0x0f) + { + case 0: + VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */ + break; + case 0x0d: + VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */ + break; + default: + VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ + } } - else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */ + else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */ { if (G_UNLIKELY (max_len - (p - str) < 4)) goto error; - - min = (1 << 16); - val = *(guchar *)p & 0x07; + + switch (*(guchar *)p++ & 0x07) + { + case 0: + VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ + if (G_UNLIKELY((*(guchar *)p & 0x30) == 0)) + goto error; + break; + case 4: + VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */ + break; + default: + VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ + } + p++; + VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ } else goto error; - - p++; - CONTINUATION_CHAR; - TWO_REMAINING: - p++; - CONTINUATION_CHAR; - p++; - CONTINUATION_CHAR; - - if (G_UNLIKELY (val < min)) - goto error; - if (G_UNLIKELY (!UNICODE_VALID(val))) - goto error; - } - + } + + p++; + VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ + continue; - + error: return last; } diff --git a/glib/tests/utf8-performance.c b/glib/tests/utf8-performance.c index 20e5e0215..5049b99e9 100644 --- a/glib/tests/utf8-performance.c +++ b/glib/tests/utf8-performance.c @@ -35,24 +35,33 @@ static const char str_cyrillic[] = /* First sentence from the Wikipedia article: * http://zh.wikipedia.org/w/index.php?title=%E6%B1%89%E5%AD%97&oldid=13053137 */ -static const char str_chinese[] = +static const char str_han[] = "漢字,亦稱中文字、中国字,在台灣又被稱為國字,是漢字文化圈廣泛使用的一種文字,屬於表意文字的詞素音節文字"; typedef int (* GrindFunc) (const char *, gsize); +#define GRIND_LOOP_BEGIN \ + { \ + int i; \ + for (i = 0; i < NUM_ITERATIONS; i++) + +#define GRIND_LOOP_END \ + } + static int grind_get_char (const char *str, gsize len) { gunichar acc = 0; - int i; - for (i = 0; i < NUM_ITERATIONS; i++) + GRIND_LOOP_BEGIN { const char *p = str; - while (*p) { - acc += g_utf8_get_char (p); - p = g_utf8_next_char (p); - } + while (*p) + { + acc += g_utf8_get_char (p); + p = g_utf8_next_char (p); + } } + GRIND_LOOP_END; return acc; } @@ -60,28 +69,29 @@ static int grind_get_char_validated (const char *str, gsize len) { gunichar acc = 0; - int i; - for (i = 0; i < NUM_ITERATIONS; i++) + GRIND_LOOP_BEGIN { const char *p = str; - while (*p) { - acc += g_utf8_get_char_validated (p, -1); - p = g_utf8_next_char (p); - } + while (*p) + { + acc += g_utf8_get_char_validated (p, -1); + p = g_utf8_next_char (p); + } } + GRIND_LOOP_END; return acc; } static int grind_utf8_to_ucs4 (const char *str, gsize len) { - int i; - for (i = 0; i < NUM_ITERATIONS; i++) + GRIND_LOOP_BEGIN { gunichar *ustr; ustr = g_utf8_to_ucs4 (str, -1, NULL, NULL, NULL); g_free (ustr); } + GRIND_LOOP_END; return 0; } @@ -89,8 +99,7 @@ static int grind_get_char_backwards (const char *str, gsize len) { gunichar acc = 0; - int i; - for (i = 0; i < NUM_ITERATIONS; i++) + GRIND_LOOP_BEGIN { const char *p = str + len; do @@ -100,51 +109,78 @@ grind_get_char_backwards (const char *str, gsize len) } while (p != str); } + GRIND_LOOP_END; return acc; } static int grind_utf8_to_ucs4_sized (const char *str, gsize len) { - int i; - for (i = 0; i < NUM_ITERATIONS; i++) + GRIND_LOOP_BEGIN { gunichar *ustr; ustr = g_utf8_to_ucs4 (str, len, NULL, NULL, NULL); g_free (ustr); } + GRIND_LOOP_END; return 0; } static int grind_utf8_to_ucs4_fast (const char *str, gsize len) { - int i; - for (i = 0; i < NUM_ITERATIONS; i++) + GRIND_LOOP_BEGIN { gunichar *ustr; ustr = g_utf8_to_ucs4_fast (str, -1, NULL); g_free (ustr); } + GRIND_LOOP_END; return 0; } static int grind_utf8_to_ucs4_fast_sized (const char *str, gsize len) { - int i; - for (i = 0; i < NUM_ITERATIONS; i++) + GRIND_LOOP_BEGIN { gunichar *ustr; ustr = g_utf8_to_ucs4_fast (str, len, NULL); g_free (ustr); } + GRIND_LOOP_END; + return 0; +} + +static int +grind_utf8_validate (const char *str, gsize len) +{ + GRIND_LOOP_BEGIN + g_utf8_validate (str, -1, NULL); + GRIND_LOOP_END; + return 0; +} + +static int +grind_utf8_validate_sized (const char *str, gsize len) +{ + GRIND_LOOP_BEGIN + g_utf8_validate (str, len, NULL); + GRIND_LOOP_END; return 0; } +typedef struct _GrindData { + GrindFunc func; + const char *str; +} GrindData; + static void -perform_for (GrindFunc grind_func, const char *str, const char *label) +perform (gconstpointer data) { + GrindData *gd = (GrindData *) data; + GrindFunc grind_func = gd->func; + const char *str = gd->str; gsize len; gulong bytes_ground; gdouble time_elapsed; @@ -161,21 +197,32 @@ perform_for (GrindFunc grind_func, const char *str, const char *label) result = ((gdouble) bytes_ground / time_elapsed) * 1.0e-6; - g_test_maximized_result (result, "%-9s %6.1f MB/s", label, result); + g_test_maximized_result (result, "%7.1f MB/s", result); + + g_slice_free (GrindData, gd); } static void -perform (gconstpointer data) +add_cases(const char *path, GrindFunc func) { - GrindFunc grind_func = (GrindFunc) data; - - if (!g_test_perf ()) - return; - - perform_for (grind_func, str_ascii, "ASCII:"); - perform_for (grind_func, str_latin1, "Latin-1:"); - perform_for (grind_func, str_cyrillic, "Cyrillic:"); - perform_for (grind_func, str_chinese, "Chinese:"); +#define ADD_CASE(script) \ + G_STMT_START { \ + GrindData *gd; \ + gchar *full_path; \ + gd = g_slice_new0(GrindData); \ + gd->func = func; \ + gd->str = str_##script; \ + full_path = g_strdup_printf("%s/" #script, path); \ + g_test_add_data_func (full_path, gd, perform); \ + g_free (full_path); \ + } G_STMT_END + + ADD_CASE(ascii); + ADD_CASE(latin1); + ADD_CASE(cyrillic); + ADD_CASE(han); + +#undef ADD_CASE } int @@ -185,13 +232,15 @@ main (int argc, char **argv) if (g_test_perf ()) { - g_test_add_data_func ("/utf8/perf/get_char", grind_get_char, perform); - g_test_add_data_func ("/utf8/perf/get_char-backwards", grind_get_char_backwards, perform); - g_test_add_data_func ("/utf8/perf/get_char_validated", grind_get_char_validated, perform); - g_test_add_data_func ("/utf8/perf/utf8_to_ucs4", grind_utf8_to_ucs4, perform); - g_test_add_data_func ("/utf8/perf/utf8_to_ucs4-sized", grind_utf8_to_ucs4_sized, perform); - g_test_add_data_func ("/utf8/perf/utf8_to_ucs4_fast", grind_utf8_to_ucs4_fast, perform); - g_test_add_data_func ("/utf8/perf/utf8_to_ucs4_fast-sized", grind_utf8_to_ucs4_fast_sized, perform); + add_cases ("/utf8/perf/get_char", grind_get_char); + add_cases ("/utf8/perf/get_char-backwards", grind_get_char_backwards); + add_cases ("/utf8/perf/get_char_validated", grind_get_char_validated); + add_cases ("/utf8/perf/utf8_to_ucs4", grind_utf8_to_ucs4); + add_cases ("/utf8/perf/utf8_to_ucs4-sized", grind_utf8_to_ucs4_sized); + add_cases ("/utf8/perf/utf8_to_ucs4_fast", grind_utf8_to_ucs4_fast); + add_cases ("/utf8/perf/utf8_to_ucs4_fast-sized", grind_utf8_to_ucs4_fast_sized); + add_cases ("/utf8/perf/utf8_validate", grind_utf8_validate); + add_cases ("/utf8/perf/utf8_validate-sized", grind_utf8_validate_sized); } return g_test_run ();