From 4d7f52e7ddd9eb600c5e15622286b127e18a28a1 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Fri, 25 Dec 2020 08:00:32 -0700 Subject: [PATCH] expose WTF-8 converters on all platforms --- .../scribblings/reference/bytes.scrbl | 41 +++-- .../scribblings/reference/reference.scrbl | 6 + .../tests/racket/unicode.rktl | 69 ++++--- racket/src/bc/src/string.c | 174 +++++++++--------- racket/src/cs/schemified/io.scm | 50 +++-- racket/src/io/converter/main.rkt | 23 +-- racket/src/io/converter/utf-8.rkt | 28 +-- racket/src/io/demo.rkt | 5 +- 8 files changed, 215 insertions(+), 181 deletions(-) diff --git a/pkgs/racket-doc/scribblings/reference/bytes.scrbl b/pkgs/racket-doc/scribblings/reference/bytes.scrbl index 597c9d2c31a..19d1616b2bc 100644 --- a/pkgs/racket-doc/scribblings/reference/bytes.scrbl +++ b/pkgs/racket-doc/scribblings/reference/bytes.scrbl @@ -453,30 +453,44 @@ Certain encoding combinations are always available: @item{@racket[(bytes-open-converter "platform-UTF-8" "platform-UTF-16")] --- converts UTF-8 to UTF-16 on @|AllUnix|, where each UTF-16 code unit is a sequence of two bytes ordered by the current - platform's endianness. On Windows, the input can include - encodings that are not valid UTF-8, but which naturally extend the - UTF-8 encoding to support unpaired surrogate code units, and the - output is a sequence of UTF-16 code units (as little-endian byte - pairs), potentially including unpaired surrogates.} + platform's endianness. 
On Windows, the conversion is the same + as @racket[(bytes-open-converter "WTF-8" "WTF-16")] to support + unpaired surrogate code units.} @item{@racket[(bytes-open-converter "platform-UTF-8-permissive" "platform-UTF-16")] --- like @racket[(bytes-open-converter "platform-UTF-8" "platform-UTF-16")], but an input byte that is not part of a valid UTF-8 encoding sequence (or valid for the unpaired-surrogate extension on - Windows) is effectively replaced with @racket[(char->integer #\?)].} + Windows) is effectively replaced with @racketvalfont{#\uFFFD}.} @item{@racket[(bytes-open-converter "platform-UTF-16" "platform-UTF-8")] --- converts UTF-16 (bytes ordered by the current platform's - endianness) to UTF-8 on @|AllUnix|. On Windows, the input can - include UTF-16 code units that are unpaired surrogates, and the - corresponding output includes an encoding of each surrogate in a - natural extension of UTF-8. On @|AllUnix|, surrogates are + endianness) to UTF-8 on @|AllUnix|. On Windows, the conversion + is the same as @racket[(bytes-open-converter "WTF-16" "WTF-8")] + to support unpaired surrogates. On @|AllUnix|, surrogates are assumed to be paired: a pair of bytes with the bits @code{#xD800} starts a surrogate pair, and the @code{#x03FF} bits are used from the pair and following pair (independent of the value of the @code{#xDC00} bits). 
On all platforms, performance may be poor when decoding from an odd offset within an input byte string.} + @item{@racket[(bytes-open-converter "WTF-8" "WTF-16")] + --- converts the WTF-8 @cite["Sapin18"] superset of UTF-8 to a + superset of UTF-16 to support unpaired surrogate code units, where + each UTF-16 code unit is a sequence of two bytes ordered by the + current platform's endianness.} + + @item{@racket[(bytes-open-converter "WTF-8-permissive" "WTF-16")] + --- like @racket[(bytes-open-converter "WTF-8" "WTF-16")], + but an input byte that is not part of a valid WTF-8 encoding + sequence is effectively replaced with @racketvalfont{#\uFFFD}.} + + @item{@racket[(bytes-open-converter "WTF-16" "WTF-8")] + --- converts the WTF-16 @cite["Sapin18"] superset of UTF-16 to the + WTF-8 superset of UTF-8. The input can include UTF-16 code units + that are unpaired surrogates, and the corresponding output includes + an encoding of each surrogate in a natural extension of UTF-8.} + ] A newly opened byte converter is registered with the current custodian @@ -501,7 +515,12 @@ current executable's directory at run time, and the DLL must either supply @tt{_errno} or link to @filepath{msvcrt.dll} for @tt{_errno}; otherwise, only the guaranteed combinations are available. -Use @racket[bytes-convert] with the result to convert byte strings.} +Use @racket[bytes-convert] with the result to convert byte strings. 
+ +@history[#:changed "7.9.0.17" @elem{Added built-in converters for + @racket["WTF-8"], + @racket["WTF-8-permissive"], and + @racket["WTF-16"].}]} @defproc[(bytes-close-converter [converter bytes-converter?]) void]{ diff --git a/pkgs/racket-doc/scribblings/reference/reference.scrbl b/pkgs/racket-doc/scribblings/reference/reference.scrbl index 334d0575a1a..e98cad657a6 100644 --- a/pkgs/racket-doc/scribblings/reference/reference.scrbl +++ b/pkgs/racket-doc/scribblings/reference/reference.scrbl @@ -209,6 +209,12 @@ The @racketmodname[racket] library combines #:url "https://doi.org/10.1017/CBO9780511574962" #:date "1999") + (bib-entry #:key "Sapin18" + #:author "Simon Sapin" + #:title "The WTF-8 Encoding" + #:url "http://simonsapin.github.io/wtf-8/" + #:date "2018") + (bib-entry #:key "Shan04" #:author "Ken Shan" #:title "Shift to Control" diff --git a/pkgs/racket-test-core/tests/racket/unicode.rktl b/pkgs/racket-test-core/tests/racket/unicode.rktl index 7658ad86ac0..bc328309a80 100644 --- a/pkgs/racket-test-core/tests/racket/unicode.rktl +++ b/pkgs/racket-test-core/tests/racket/unicode.rktl @@ -889,11 +889,14 @@ (go (lambda (n p) (read-n n p 1))) (go (lambda (n p) (read-n n p 2)))))) ;; Test UTF-16 - (let ([c (bytes-open-converter "platform-UTF-8" "platform-UTF-16")]) + (for ([c (list (bytes-open-converter "platform-UTF-8" "platform-UTF-16") + (bytes-open-converter "WTF-8" "WTF-16"))] + [wtf? (list (eq? 'windows (system-type)) + #t)]) (let-values ([(s2 n status) (bytes-convert c s)]) (case parse-status [(surrogate1 surrogate2) - (if (eq? (system-type) 'windows) + (if wtf? (begin (if (eq? parse-status 'surrogate1) (test 'aborts 'status status) @@ -975,20 +978,23 @@ basic-utf-8-tests)) ;; Further UTF-16 tests -(let ([c (bytes-open-converter "platform-UTF-16" "platform-UTF-8")]) +(for ([c (list (bytes-open-converter "platform-UTF-16" "platform-UTF-8") + (bytes-open-converter "WTF-16" "WTF-8"))] + [wtf? (list (eq? 
'windows (system-type)) + #t)]) (let-values ([(s n status) (bytes-convert c (bytes-append (integer->integer-bytes #xD800 2 #f) (integer->integer-bytes #xDC00 2 #f)))]) (test-values (list #"" 0 'aborts) (lambda () (bytes-convert c (integer->integer-bytes #xD800 2 #f) ))) - ;; Windows: unpaired surrogates allowed: - (when (eq? 'windows (system-type)) + ;; WTF: unpaired surrogates allowed: + (when wtf? (test-values (list #"" 0 'aborts) (lambda () (bytes-convert c (integer->integer-bytes #xD8FF 2 #f)))) (test-values (list #"\355\277\277" 2 'complete) (lambda () (bytes-convert c (integer->integer-bytes #xDFFF 2 #f))))) - ;; Non-windows: after #xD800 bits, surrogate pair is assumed - (unless (eq? 'windows (system-type)) + ;; UTF: after #xD800 bits, surrogate pair is assumed + (unless wtf? (test-values (list #"" 0 'aborts) (lambda () (bytes-convert c (integer->integer-bytes #xD800 2 #f)))) (test-values (list #"" 0 'aborts) @@ -1027,29 +1033,32 @@ (test-values '(#"" complete) (lambda () (bytes-convert-end c)))) -(when (eq? 
(system-type) 'windows) - (let ([c (bytes-open-converter "platform-UTF-8-permissive" "platform-UTF-16")]) - ;; Check that we use all 6 bytes of #"\355\240\200\355\260\200" or none - (test-values (list 12 6 'complete) - (lambda () - (bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes 12)))) - ;; If we can't look all the way to the end, reliably abort without writing: - (let ([s (make-bytes 12 (char->integer #\x))]) - (let loop ([n 1]) - (unless (= n 6) - (test-values (list 0 0 'aborts) - (lambda () - (bytes-convert c #"\355\240\200\355\260\200" 0 n s))) - (test #"xxxxxxxxxxxx" values s) ; no writes to bytes string - (loop (add1 n))))) - (let ([s (make-bytes 12 (char->integer #\x))]) - (let loop ([n 0]) - (unless (= n 12) - (test-values (list 0 0 'continues) - (lambda () - (bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes n)))) - (test #"xxxxxxxxxxxx" values s) ; no writes to bytes string - (loop (add1 n))))))) +(for ([c (append + (if (eq? (system-type) 'windows) + (list (bytes-open-converter "platform-UTF-8-permissive" "platform-UTF-16")) + null) + (list (bytes-open-converter "WTF-8-permissive" "WTF-16")))]) + ;; Check that we use all 6 bytes of #"\355\240\200\355\260\200" or none + (test-values (list 12 6 'complete) + (lambda () + (bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes 12)))) + ;; If we can't look all the way to the end, reliably abort without writing: + (let ([s (make-bytes 12 (char->integer #\x))]) + (let loop ([n 1]) + (unless (= n 6) + (test-values (list 0 0 'aborts) + (lambda () + (bytes-convert c #"\355\240\200\355\260\200" 0 n s))) + (test #"xxxxxxxxxxxx" values s) ; no writes to bytes string + (loop (add1 n))))) + (let ([s (make-bytes 12 (char->integer #\x))]) + (let loop ([n 0]) + (unless (= n 12) + (test-values (list 0 0 'continues) + (lambda () + (bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes n)))) + (test #"xxxxxxxxxxxx" values s) ; no writes to bytes string + (loop (add1 n)))))) ;; Seems like this 
sort of thing should be covered above, and maybe it ;; it after some other corrections. But just in case: diff --git a/racket/src/bc/src/string.c b/racket/src/bc/src/string.c index de007f66b73..81a344c30d6 100644 --- a/racket/src/bc/src/string.c +++ b/racket/src/bc/src/string.c @@ -29,6 +29,12 @@ # define mzLOCALE_IS_UTF_8(s) (!(rktio_convert_properties(scheme_rktio) & RKTIO_CONVERTER_SUPPORTED)) #endif +#ifdef WINDOWS_UNICODE_SUPPORT +# define WIN_UTF16_AS_WTF16(utf16) utf16 +#else +# define WIN_UTF16_AS_WTF16(utf16) 0 +#endif + #define mzICONV_KIND 0 #define mzUTF8_KIND 1 #define mzUTF8_TO_UTF16_KIND 2 @@ -39,7 +45,7 @@ typedef struct Scheme_Converter { short closed; short kind; rktio_converter_t *cd; - int permissive; + int permissive, wtf; Scheme_Custodian_Reference *mref; } Scheme_Converter; @@ -154,13 +160,13 @@ static int mz_char_strcmp_ci(const char *who, const mzchar *str1, intptr_t l1, c static int mz_strcmp(const char *who, unsigned char *str1, intptr_t l1, unsigned char *str2, intptr_t l2); XFORM_NONGCING static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t end, - unsigned int *us, intptr_t dstart, intptr_t dend, - intptr_t *ipos, intptr_t *jpos, - char compact, char utf16, - int *state, int might_continue, int permissive); + unsigned int *us, intptr_t dstart, intptr_t dend, + intptr_t *ipos, intptr_t *jpos, + char compact, char utf16, + int *state, int might_continue, int permissive, int wtf); XFORM_NONGCING static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t end, - unsigned char *s, intptr_t dstart, intptr_t dend, - intptr_t *_ipos, intptr_t *_opos, char utf16); + unsigned char *s, intptr_t dstart, intptr_t dend, + intptr_t *_ipos, intptr_t *_opos, char utf16, int wtf); static char *string_to_from_locale(int to_bytes, char *in, intptr_t delta, intptr_t len, @@ -1199,7 +1205,7 @@ do_byte_string_to_char_string(const char *who, NULL, 0, -1, NULL, NULL, 0, 0, NULL, 0, - (perm > -1) ? 
0xD800 : 0); + (perm > -1) ? 0xD800 : 0, 0); if (ulen < 0) { scheme_contract_error(who, "string is not a well-formed UTF-8 encoding", @@ -1212,7 +1218,7 @@ do_byte_string_to_char_string(const char *who, v, 0, -1, NULL, NULL, 0, 0, NULL, 0, - (perm > -1) ? 0xD800 : 0); + (perm > -1) ? 0xD800 : 0, 0); if (perm > -1) { for (i = 0; i < ulen; i++) { @@ -1562,7 +1568,7 @@ byte_string_utf8_index(int argc, Scheme_Object *argv[]) result = utf8_decode_x((unsigned char *)chars, istart, ifinish, NULL, 0, pos, &ipos, &opos, - 0, 0, NULL, 0, perm ? 1 : 0); + 0, 0, NULL, 0, perm ? 1 : 0, 0); if (((result < 0) && (result != -3)) || ((ipos == ifinish) && (opos <= pos))) @@ -1610,7 +1616,7 @@ byte_string_utf8_ref(int argc, Scheme_Object *argv[]) utf8_decode_x((unsigned char *)chars, istart, ifinish, NULL, 0, pos, &ipos, &opos, - 0, 0, NULL, 0, perm ? 1 : 0); + 0, 0, NULL, 0, perm ? 1 : 0, 0); if (opos < pos) return scheme_false; istart = ipos; @@ -1619,7 +1625,7 @@ byte_string_utf8_ref(int argc, Scheme_Object *argv[]) utf8_decode_x((unsigned char *)chars, istart, ifinish, us, 0, 1, &ipos, &opos, - 0, 0, NULL, 0, perm ? 0xFFFFFF : 0); + 0, 0, NULL, 0, perm ? 
0xFFFFFF : 0, 0); if (opos < 1) return scheme_false; @@ -2771,7 +2777,7 @@ static char *do_convert(rktio_converter_t *cd, r = utf8_decode_x((unsigned char *)in, id + dip, iilen, (unsigned int *)out, (od + dop) >> 2, iolen >> 2, &ipos, &opos, - 0, 0, NULL, 0, 0); + 0, 0, NULL, 0, 0, 0); opos <<= 2; dop = (opos - od); @@ -4334,7 +4340,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e) Scheme_Converter *c; rktio_converter_t *cd; int kind; - int permissive; + int permissive, wtf; int need_regis = 1; Scheme_Custodian_Reference *mref; @@ -4354,6 +4360,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e) permissive = 0; cd = NULL; need_regis = 0; + wtf = 0; } else if ((!strcmp(from_e, "platform-UTF-8") || !strcmp(from_e, "platform-UTF-8-permissive")) && !strcmp(to_e, "platform-UTF-16")) { @@ -4364,12 +4371,32 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e) permissive = 0; cd = NULL; need_regis = 0; + wtf = WIN_UTF16_AS_WTF16(1); + } else if ((!strcmp(from_e, "WTF-8") + || !strcmp(from_e, "WTF-8-permissive")) + && !strcmp(to_e, "WTF-16")) { + kind = mzUTF8_TO_UTF16_KIND; + if (!strcmp(from_e, "WTF-8-permissive")) + permissive = 0xFFFD; + else + permissive = 0; + cd = NULL; + need_regis = 0; + wtf = 1; } else if (!strcmp(from_e, "platform-UTF-16") && !strcmp(to_e, "platform-UTF-8")) { kind = mzUTF16_TO_UTF8_KIND; permissive = 0; cd = NULL; need_regis = 0; + wtf = WIN_UTF16_AS_WTF16(1); + } else if (!strcmp(from_e, "WTF-16") + && !strcmp(to_e, "WTF-8")) { + kind = mzUTF16_TO_UTF8_KIND; + permissive = 0; + cd = NULL; + need_regis = 0; + wtf = 1; } else { char *tmp_from_e = NULL, *tmp_to_e = NULL; @@ -4397,6 +4424,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e) kind = mzICONV_KIND; permissive = 0; + wtf = 0; } c = MALLOC_ONE_TAGGED(Scheme_Converter); @@ -4404,6 +4432,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e) c->closed = 0; 
c->kind = kind; c->permissive = permissive; + c->wtf = wtf; c->cd = cd; if (!need_regis) mref = NULL; @@ -4537,7 +4566,7 @@ static Scheme_Object *convert_one(const char *who, int opos, int argc, Scheme_Ob status = utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1, (unsigned char *)r, ostart, ofinish, - &amt_read, &amt_wrote, 1); + &amt_read, &amt_wrote, 1, c->wtf); amt_read -= (istart >> 1); @@ -4547,7 +4576,7 @@ static Scheme_Object *convert_one(const char *who, int opos, int argc, Scheme_Ob r = (char *)scheme_malloc_atomic(amt_wrote + 1); utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1, (unsigned char *)r, ostart, ofinish, - NULL, NULL, 1); + NULL, NULL, 1, c->wtf); r[amt_wrote] = 0; } amt_read <<= 1; @@ -4600,7 +4629,7 @@ static Scheme_Object *convert_one(const char *who, int opos, int argc, Scheme_Ob status = utf8_decode_x((unsigned char *)instr, istart, ifinish, (unsigned int *)r, _ostart, _ofinish, &amt_read, &amt_wrote, - 1, utf16, NULL, 1, c->permissive); + 1, utf16, NULL, 1, c->permissive, c->wtf); if (utf16) { _ostart <<= 1; @@ -4624,7 +4653,7 @@ static Scheme_Object *convert_one(const char *who, int opos, int argc, Scheme_Ob utf8_decode_x((unsigned char *)instr, istart, ifinish, (unsigned int *)r, ostart, _ofinish, NULL, NULL, - 1, utf16, NULL, 1, c->permissive); + 1, utf16, NULL, 1, c->permissive, c->wtf); r[amt_wrote] = 0; } } else if (!r) @@ -4721,7 +4750,7 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e unsigned int *us, intptr_t dstart, intptr_t dend, intptr_t *ipos, intptr_t *jpos, char compact, char utf16, int *_state, - int might_continue, int permissive) + int might_continue, int permissive, int wtf) /* Results: non-negative => translation complete, = number of produced chars -1 => input ended in middle of encoding (only if might_continue) @@ -4731,8 +4760,8 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e ipos & jpos are filled with 
ending positions (between [d]start and [d]end) before return, unless they are NULL. - compact => UTF-8 to UTF-8 or UTF-16 --- the latter if utf16 - for Windows for utf16, decode extended UTF-8 that allows surrogates + compact => UTF-8 to UTF-8 or UTF-16 --- the latter if utf16; + for utf16 and wtf, decode extended UTF-8 that allows surrogates _state provides initial state and is filled with ending state; when it's not NULL, the us must be NULL @@ -4742,16 +4771,13 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e permissive is non-zero => use permissive as value for bad byte sequences. When generating UTF-8, this must be an ASCII character or U+FFFD. */ - { intptr_t i, j, oki; int failmode = -3, state; int init_doki; int nextbits, v; unsigned int sc; -# ifdef WINDOWS_UNICODE_SUPPORT int pending_surrogate = 0; -# endif if (_state) { state = (*_state) & 0x7; @@ -4817,12 +4843,10 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e /* We finished. 
One last check: */ if ((((v >= 0xD800) && (v <= 0xDFFF)) || (v > 0x10FFFF)) -# ifdef WINDOWS_UNICODE_SUPPORT - && (!utf16 - /* If UTF-16 for Windows, just apply upper-limit check */ - || (v > 0x10FFFF)) -# endif - ) { + && (!wtf + || !utf16 + /* If WTF-16, just apply upper-limit check */ + || (v > 0x10FFFF))) { /* UTF-16 surrogates or other illegal code units */ if (permissive) { v = permissive; @@ -4902,14 +4926,12 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e if (compact) { if (utf16) { if (v > 0xFFFF) { -# ifdef WINDOWS_UNICODE_SUPPORT if (pending_surrogate) { if (us) ((unsigned short *)us)[j] = pending_surrogate; j++; /* Accept previously written unpaired surrogate */ pending_surrogate = 0; } -# endif if (j + 1 >= dend) break; if (us) { @@ -4918,8 +4940,7 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e ((unsigned short *)us)[j+1] = 0xDC00 | (v & 0x3FF); } j++; - } else { -# ifdef WINDOWS_UNICODE_SUPPORT + } else if (wtf) { /* We allow a surrogate by itself, but don't allow a 0xDC00 after a 0xD800, otherwise multiple encodings can map to the same thing. 
*/ @@ -4977,10 +4998,9 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e --j; /* don't accept unpaired surrogate, yet */ else if (us) ((unsigned short *)us)[j] = v; -# else + } else { if (us) ((unsigned short *)us)[j] = v; -# endif } } else { intptr_t delta; @@ -5031,7 +5051,6 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e failmode = -1; i = end - 1; /* to ensure that failmode is returned */ } else if (permissive) { -# ifdef WINDOWS_UNICODE_SUPPORT if (pending_surrogate) { /* Unpaired surrogate before permissive replacements */ if (utf16 && (j < dend)) { @@ -5041,7 +5060,6 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e } pending_surrogate = 0; } -#endif for (i = oki; i < end; i++) { if (j < dend) { if (us) { @@ -5061,7 +5079,6 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e } } -# ifdef WINDOWS_UNICODE_SUPPORT if (pending_surrogate) { if (!might_continue) { /* Accept unpaired surrogate at end of input */ @@ -5074,7 +5091,6 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e oki -= 3; } } -#endif if (ipos) *ipos = oki; @@ -5084,12 +5100,10 @@ static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t e if (i < end) return failmode; -# ifdef WINDOWS_UNICODE_SUPPORT if (pending_surrogate) { /* input must have ended right after surrogate */ return -1; } -#endif return j - dstart; } @@ -5099,7 +5113,7 @@ intptr_t scheme_utf8_decode(const unsigned char *s, intptr_t start, intptr_t end intptr_t *ipos, char utf16, int permissive) { return utf8_decode_x(s, start, end, us, dstart, dend, - ipos, NULL, utf16, utf16, NULL, 0, permissive); + ipos, NULL, utf16, utf16, NULL, 0, permissive, WIN_UTF16_AS_WTF16(utf16)); } intptr_t scheme_utf8_decode_offset_prefix(const unsigned char *s, intptr_t start, intptr_t end, @@ -5107,7 +5121,7 @@ intptr_t scheme_utf8_decode_offset_prefix(const 
unsigned char *s, intptr_t start intptr_t *ipos, char utf16, int permissive) { return utf8_decode_x(s, start, end, us, dstart, dend, - ipos, NULL, utf16, utf16, NULL, 1, permissive); + ipos, NULL, utf16, utf16, NULL, 1, permissive, WIN_UTF16_AS_WTF16(utf16)); } intptr_t scheme_utf8_decode_as_prefix(const unsigned char *s, intptr_t start, intptr_t end, @@ -5117,13 +5131,13 @@ intptr_t scheme_utf8_decode_as_prefix(const unsigned char *s, intptr_t start, in { intptr_t opos; utf8_decode_x(s, start, end, us, dstart, dend, - ipos, &opos, utf16, utf16, NULL, 1, permissive); + ipos, &opos, utf16, utf16, NULL, 1, permissive, WIN_UTF16_AS_WTF16(utf16)); return opos - dstart; } intptr_t scheme_utf8_decode_all(const unsigned char *s, intptr_t len, unsigned int *us, int permissive) { - return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 0, permissive); + return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 0, permissive, 0); } intptr_t scheme_utf8_decode_prefix(const unsigned char *s, intptr_t len, unsigned int *us, int permissive) @@ -5142,7 +5156,7 @@ intptr_t scheme_utf8_decode_prefix(const unsigned char *s, intptr_t len, unsigne return len; } - return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 1, permissive); + return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 1, permissive, 0); } mzchar *scheme_utf8_decode_to_buffer_len(const unsigned char *s, intptr_t len, @@ -5152,7 +5166,7 @@ mzchar *scheme_utf8_decode_to_buffer_len(const unsigned char *s, intptr_t len, ulen = utf8_decode_x(s, 0, len, NULL, 0, -1, NULL, NULL, 0, 0, - NULL, 0, 0); + NULL, 0, 0, 0); if (ulen < 0) return NULL; if (ulen + 1 > blen) { @@ -5160,7 +5174,7 @@ mzchar *scheme_utf8_decode_to_buffer_len(const unsigned char *s, intptr_t len, } utf8_decode_x(s, 0, len, buf, 0, -1, NULL, NULL, 0, 0, - NULL, 0, 0); + NULL, 0, 0, 0); buf[ulen] = 0; *_ulen = ulen; return buf; @@ -5193,14 +5207,14 @@ intptr_t scheme_utf8_decode_count(const unsigned char *s, 
intptr_t start, intptr NULL, 0, -1, NULL, &pos, 0, 0, _state, - might_continue, permissive); + might_continue, permissive, 0); return pos; } static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t end, - unsigned char *s, intptr_t dstart, intptr_t dend, - intptr_t *_ipos, intptr_t *_opos, char utf16) + unsigned char *s, intptr_t dstart, intptr_t dend, + intptr_t *_ipos, intptr_t *_opos, char utf16, int wtf) /* Results: -1 => input ended in the middle of an encoding - only when utf16 and _opos non-negative => reports number of bytes/code-units produced */ @@ -5222,30 +5236,23 @@ static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t e end and _opos is 0. The well-formedness assumption was probably not a good idea, but note that it's explicitly documented to behave that way. */ -# ifdef WINDOWS_UNICODE_SUPPORT -# define UNPAIRED_MASK 0xFC00 -# else -# define UNPAIRED_MASK 0xF800 -# endif - if (((i + 1) == end) && ((wc & UNPAIRED_MASK) == 0xD800) && _opos) { +# define UNPAIRED_MASK(wtf) (wtf ? 0xFC00 : 0xF800) + if (((i + 1) == end) && ((wc & UNPAIRED_MASK(wtf)) == 0xD800) && _opos) { /* Ended in the middle of a surrogate pair */ *_opos = j; if (_ipos) *_ipos = i; return -1; } -# ifdef WINDOWS_UNICODE_SUPPORT - if ((wc & 0xFC00) != 0xD800) { + if (wtf && ((wc & 0xFC00) != 0xD800)) { /* Count as one */ - } else if ((i + 1 >= end) - || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00)) { - } else -# endif - { - i++; - wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF); - wc += 0x10000; - } + } else if (wtf && ((i + 1 >= end) + || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00))) { + } else { + i++; + wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF); + wc += 0x10000; + } } } else { wc = us[i]; @@ -5279,26 +5286,23 @@ static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t e /* Unparse surrogates. 
We assume that the surrogates are well formed on non-Windows platforms, but when _opos, we detect ending in the middle of an surrogate pair. */ - if (((i + 1) == end) && ((wc & UNPAIRED_MASK) == 0xD800) && _opos) { + if (((i + 1) == end) && ((wc & UNPAIRED_MASK(wtf)) == 0xD800) && _opos) { /* Ended in the middle of a surrogate pair */ *_opos = j; if (_ipos) *_ipos = i; return -1; } -# ifdef WINDOWS_UNICODE_SUPPORT - if ((wc & 0xFC00) != 0xD800) { + if (wtf && ((wc & 0xFC00) != 0xD800)) { /* Let the misplaced surrogate through */ - } else if ((i + 1 >= end) - || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00)) { + } else if (wtf && ((i + 1 >= end) + || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00))) { /* Let the misplaced surrogate through */ - } else -# endif - { - i++; - wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF); - wc += 0x10000; - } + } else { + i++; + wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF); + wc += 0x10000; + } } } else { wc = us[i]; @@ -5360,12 +5364,12 @@ intptr_t scheme_utf8_encode(const unsigned int *us, intptr_t start, intptr_t end { return utf8_encode_x(us, start, end, s, dstart, -1, - NULL, NULL, utf16); + NULL, NULL, utf16, WIN_UTF16_AS_WTF16(utf16)); } intptr_t scheme_utf8_encode_all(const unsigned int *us, intptr_t len, unsigned char *s) { - return utf8_encode_x(us, 0, len, s, 0, -1, NULL, NULL, 0 /* utf16 */); + return utf8_encode_x(us, 0, len, s, 0, -1, NULL, NULL, 0 /* utf16 */, 0); } char *scheme_utf8_encode_to_buffer_len(const mzchar *s, intptr_t len, @@ -5389,11 +5393,11 @@ char *scheme_utf8_encode_to_buffer_len(const mzchar *s, intptr_t len, } } - slen = utf8_encode_x(s, 0, len, NULL, 0, -1, NULL, NULL, 0); + slen = utf8_encode_x(s, 0, len, NULL, 0, -1, NULL, NULL, 0, 0); if (slen + 1 > blen) { buf = (char *)scheme_malloc_atomic(slen + 1); } - utf8_encode_x(s, 0, len, (unsigned char *)buf, 0, -1, NULL, NULL, 0); + utf8_encode_x(s, 0, len, (unsigned char *)buf, 0, -1, NULL, NULL, 0, 
0); buf[slen] = 0; *_slen = slen; return buf; diff --git a/racket/src/cs/schemified/io.scm b/racket/src/cs/schemified/io.scm index 9102eb0158b..5a4d7a23d1f 100644 --- a/racket/src/cs/schemified/io.scm +++ b/racket/src/cs/schemified/io.scm @@ -14846,9 +14846,9 @@ (if (let ((or-part_0 (eq? from_0 'utf-16))) (if or-part_0 or-part_0 - (let ((or-part_1 (eq? from_0 'utf-16-ish))) + (let ((or-part_1 (eq? from_0 'wtf-16))) (if or-part_1 or-part_1 (eq? from_0 'utf-16-assume))))) - (let ((temp33_0 (eq? from_0 'utf-16-ish))) + (let ((temp33_0 (eq? from_0 'wtf-16))) (let ((temp34_0 (eq? from_0 'utf-16-assume))) (utf-16-ish-reencode!.1 temp34_0 @@ -14861,19 +14861,17 @@ dest-end_0))) (let ((or-part_0 (eq? from_0 'utf-8-permissive))) (let ((temp41_0 - (if or-part_0 - or-part_0 - (eq? from_0 'utf-8-ish-permissive)))) + (if or-part_0 or-part_0 (eq? from_0 'wtf-8-permissive)))) (let ((temp42_0 - (let ((or-part_1 (eq? from_0 'utf-8-ish))) + (let ((or-part_1 (eq? from_0 'wtf-8))) (if or-part_1 or-part_1 - (eq? from_0 'utf-8-ish-permissive))))) + (eq? from_0 'wtf-8-permissive))))) (let ((temp43_0 (let ((or-part_1 (eq? to_0 'utf-16))) (if or-part_1 or-part_1 - (let ((or-part_2 (eq? to_0 'utf-16-ish))) + (let ((or-part_2 (eq? to_0 'wtf-16))) (if or-part_2 or-part_2 (eq? to_0 'utf-16-assume))))))) @@ -14902,7 +14900,7 @@ (define utf-8-ish-reencode!.1 (|#%name| utf-8-ish-reencode! - (lambda (from-utf-8-ish?3_0 + (lambda (from-wtf-8?3_0 permissive?2_0 to-utf-16?4_0 in-bstr8_0 @@ -15111,7 +15109,7 @@ (if (if (>= next-accum_0 55296) (<= next-accum_0 57343) #f) - (if from-utf-8-ish?3_0 + (if from-wtf-8?3_0 (if (= i_0 in-end10_0) (let ((app_0 (- base-i_0 in-start9_0))) @@ -15439,7 +15437,7 @@ (|#%name| utf-16-ish-reencode! 
(lambda (assume-paired-surrogates?16_0 - from-utf-16-ish?15_0 + from-wtf-16?15_0 in-bstr19_0 in-start20_0 in-end21_0 @@ -15693,10 +15691,10 @@ v2_0 1023)))))) (continue_0 v3_0 (+ i_0 4))) - (if from-utf-16-ish?15_0 + (if from-wtf-16?15_0 (continue_0 v_0 (+ i_0 2)) (done_0 'error))))))) - (if from-utf-16-ish?15_0 + (if from-wtf-16?15_0 (continue_0 v_0 (+ i_0 2)) (done_0 'error))) (continue_0 v_0 (+ i_0 2))))))))))))))) @@ -15806,10 +15804,10 @@ 'bytes-converter 'custodian-reference)))))) (define windows? (eq? 'windows (system-type))) -(define platform-utf-8 (if windows? 'utf-8-ish 'utf-8)) +(define platform-utf-8 (if windows? 'wtf-8 'utf-8)) (define platform-utf-8-permissive - (if windows? 'utf-8-ish-permissive 'utf-8-permissive)) -(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16-assume)) + (if windows? 'wtf-8-permissive 'utf-8-permissive)) +(define platform-utf-16 (if windows? 'wtf-16 'utf-16-assume)) (define bytes-open-converter-in-custodian (lambda (who_0 cust_0 from-str_0 to-str_0) (begin @@ -15843,23 +15841,21 @@ (bytes-converter1.1 (utf-8-converter1.1 platform-utf-16 platform-utf-8) #f) - (if (if (string=? from-str_0 "UTF-8-ish") - (string=? to-str_0 "UTF-16-ish") + (if (if (string=? from-str_0 "WTF-8") + (string=? to-str_0 "WTF-16") #f) - (bytes-converter1.1 - (utf-8-converter1.1 'utf-8-ish 'utf-16-ish) - #f) - (if (if (string=? from-str_0 "UTF-8-ish-permissive") - (string=? to-str_0 "UTF-16-ish") + (bytes-converter1.1 (utf-8-converter1.1 'wtf-8 'wtf-16) #f) + (if (if (string=? from-str_0 "WTF-8-permissive") + (string=? to-str_0 "WTF-16") #f) (bytes-converter1.1 - (utf-8-converter1.1 'utf-8-ish-permissive 'utf-16-ish) + (utf-8-converter1.1 'wtf-8-permissive 'wtf-16) #f) - (if (if (string=? from-str_0 "UTF-16-ish") - (string=? to-str_0 "UTF-8-ish") + (if (if (string=? from-str_0 "WTF-16") + (string=? 
to-str_0 "WTF-8") #f) (bytes-converter1.1 - (utf-8-converter1.1 'utf-16-ish 'utf-8-ish) + (utf-8-converter1.1 'wtf-16 'wtf-8) #f) (if (if (let ((or-part_0 (if (string=? from-str_0 "UTF-8") diff --git a/racket/src/io/converter/main.rkt b/racket/src/io/converter/main.rkt index a56250789ce..b61259de954 100644 --- a/racket/src/io/converter/main.rkt +++ b/racket/src/io/converter/main.rkt @@ -24,9 +24,9 @@ ;; intended for converting to and from arbitrary 16-byte sequences, ;; which is useful for encoding Windows paths. (define windows? (eq? 'windows (system-type))) -(define platform-utf-8 (if windows? 'utf-8-ish 'utf-8)) -(define platform-utf-8-permissive (if windows? 'utf-8-ish-permissive 'utf-8-permissive)) -(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16-assume)) +(define platform-utf-8 (if windows? 'wtf-8 'utf-8)) +(define platform-utf-8-permissive (if windows? 'wtf-8-permissive 'utf-8-permissive)) +(define platform-utf-16 (if windows? 'wtf-16 'utf-16-assume)) (define (bytes-open-converter-in-custodian who cust from-str to-str) (check who string? from-str) @@ -47,17 +47,18 @@ [(and (string=? from-str "platform-UTF-16") (string=? to-str "platform-UTF-8")) (bytes-converter (utf-8-converter platform-utf-16 platform-utf-8) #f)] - ;; "UTF-8-ish" is also known as "WTF-8". - ;; "UTF-16-ish" is similar to UTF-16, but allows unpaired surrogates --- which is still + ;; WTF-16 is similar to UTF-16, but allows unpaired surrogates --- which is still ;; different from UCS-2, since paired surrogates are decoded as in UTF-16. - [(and (string=? from-str "UTF-8-ish") (string=? to-str "UTF-16-ish")) - (bytes-converter (utf-8-converter 'utf-8-ish 'utf-16-ish) + ;; WTF-8 is the analogous extension of UTF-8, where a surrogate pair encoded + ;; as a sequence of unpaired surrogates is specifically disallowed. + [(and (string=? from-str "WTF-8") (string=? to-str "WTF-16")) + (bytes-converter (utf-8-converter 'wtf-8 'wtf-16) #f)] - [(and (string=? 
from-str "UTF-8-ish-permissive") (string=? to-str "UTF-16-ish")) - (bytes-converter (utf-8-converter 'utf-8-ish-permissive 'utf-16-ish) + [(and (string=? from-str "WTF-8-permissive") (string=? to-str "WTF-16")) + (bytes-converter (utf-8-converter 'wtf-8-permissive 'wtf-16) #f)] - [(and (string=? from-str "UTF-16-ish") (string=? to-str "UTF-8-ish")) - (bytes-converter (utf-8-converter 'utf-16-ish 'utf-8-ish) + [(and (string=? from-str "WTF-16") (string=? to-str "WTF-8")) + (bytes-converter (utf-8-converter 'wtf-16 'wtf-8) #f)] [(and (or (and (string=? from-str "UTF-8") (string=? to-str "")) (and (string=? from-str "") (string=? to-str "UTF-8"))) diff --git a/racket/src/io/converter/utf-8.rkt b/racket/src/io/converter/utf-8.rkt index bc09ce2916d..22fefc352e1 100644 --- a/racket/src/io/converter/utf-8.rkt +++ b/racket/src/io/converter/utf-8.rkt @@ -19,20 +19,20 @@ (define to (utf-8-converter-to c)) (define-values (in-consumed out-produced status) (if (or (eq? from 'utf-16) - (eq? from 'utf-16-ish) + (eq? from 'wtf-16) (eq? from 'utf-16-assume)) (utf-16-ish-reencode! src src-start src-end dest dest-start dest-end - #:from-utf-16-ish? (eq? from 'utf-16-ish) + #:from-wtf-16? (eq? from 'wtf-16) #:assume-paired-surrogates? (eq? from 'utf-16-assume)) (utf-8-ish-reencode! src src-start src-end dest dest-start dest-end #:permissive? (or (eq? from 'utf-8-permissive) - (eq? from 'utf-8-ish-permissive)) - #:from-utf-8-ish? (or (eq? from 'utf-8-ish) - (eq? from 'utf-8-ish-permissive)) + (eq? from 'wtf-8-permissive)) + #:from-wtf-8? (or (eq? from 'wtf-8) + (eq? from 'wtf-8-permissive)) #:to-utf-16? (or (eq? to 'utf-16) - (eq? to 'utf-16-ish) + (eq? to 'wtf-16) (eq? to 'utf-16-assume))))) (values in-consumed out-produced @@ -44,17 +44,17 @@ ;; Similar to `utf-8-decode` in "../string/utf-8-decode.rkt", but ;; "decodes" back to a byte string either as UTF-8 or UTF-16, and also -;; supports a "utf-8-ish" encoding that allows unpaired surrogates. 
+;; supports a WTF-8 encoding that allows unpaired surrogates. ;; ;; There's a lot of similarly to the implementation of `utf-8-decode`, ;; but with enough differences to make abstraction difficult. (define (utf-8-ish-reencode! in-bstr in-start in-end out-bstr out-start out-end #:permissive? permissive? - #:from-utf-8-ish? from-utf-8-ish? + #:from-wtf-8? from-wtf-8? #:to-utf-16? to-utf-16?) (let loop ([i in-start] [j out-start] [base-i in-start] [accum 0] [remaining 0] - ;; for '-ish' mode to UTF-16: + ;; for WTF-8 mode to WTF-16: [pending-surrogate #f]) ;; Used to write a pending surrogate before continuing to write other: @@ -196,7 +196,7 @@ [(and (v . >= . #xD800) (v . <= . #xDFFF)) (cond - [from-utf-8-ish? + [from-wtf-8? ;; Assuming `to-utf-16?`... ;; Allow an unpaired surrogate, but make sure it's really unpaired (cond @@ -268,7 +268,7 @@ (- j out-start) 'continues)]))] [else - ;; For UTF-8-to-UTF-8 with no "-ish" corrections, we can just copy + ;; For UTF-8-to-UTF-8 (no WTF-8), we can just copy ;; the input encoding bytes to the output bytes (define next-i (add1 i)) (let loop ([from-i base-i] [to-j j]) @@ -329,7 +329,7 @@ ;; Converts UTF-16 into UTF-8 (define (utf-16-ish-reencode! in-bstr in-start in-end out-bstr out-start out-end - #:from-utf-16-ish? from-utf-16-ish? + #:from-wtf-16? from-wtf-16? #:assume-paired-surrogates? assume-paired-surrogates?) (let loop ([i in-start] [j out-start]) (define (done status) @@ -378,7 +378,7 @@ (bitwise-ior (arithmetic-shift (bitwise-and v #x3FF) 10) (bitwise-and v2 #x3FF)))) (continue v3 (+ i 4))] - [from-utf-16-ish? + [from-wtf-16? ;; continue anyway as as unpaired surrogate (continue v (+ i 2))] [else @@ -386,7 +386,7 @@ [else ;; unpaired surrogate (cond - [from-utf-16-ish? + [from-wtf-16? 
;; continue anyway (continue v (+ i 2))] [else (done 'error)])])] diff --git a/racket/src/io/demo.rkt b/racket/src/io/demo.rkt index 8c784127d41..7c33decef89 100644 --- a/racket/src/io/demo.rkt +++ b/racket/src/io/demo.rkt @@ -574,7 +574,7 @@ (call-with-values (lambda () (bytes-convert c #"\360\220\220\200")) list)) (test (void) (bytes-close-converter c))) -(let ([c (bytes-open-converter "UTF-8-ish" "UTF-16-ish")]) +(let ([c (bytes-open-converter "WTF-8" "WTF-16")]) (test `(,(reorder #"A\0\200\0") 3 complete) (call-with-values (lambda () (bytes-convert c #"A\302\200")) list)) (test `(,(reorder #"A\0") 1 error) @@ -586,7 +586,6 @@ (test `(,(reorder #"\1\334") 3 complete) (call-with-values (lambda () (bytes-convert c #"\355\260\201")) list)) ;; surrogate pair where each is separately encoded, high before low - (log-error "here") (test `(,(reorder #"") 0 error) (call-with-values (lambda () (bytes-convert c #"\355\240\200\355\260\201")) list)) ;; surrogate pair where each is separately encoded, low before high @@ -599,7 +598,7 @@ (call-with-values (lambda () (bytes-convert c #"\360\220\220\200")) list)) (test (void) (bytes-close-converter c))) -(let ([c (bytes-open-converter "UTF-16-ish" "UTF-8-ish")]) +(let ([c (bytes-open-converter "WTF-16" "WTF-8")]) (test `(#"A\302\200" 4 complete) (call-with-values (lambda () (bytes-convert c (reorder #"A\0\200\0"))) list)) ;; unpaired high surrogate