From f3ecc3cdc5160aeab966a09bcec25eabe61fe295 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Wed, 4 Nov 2015 16:54:05 -0800 Subject: [PATCH] transcode(..., stopOnError:) => transcode(..., stoppingOnError:) --- "\\" | 960 +++++++++++++++++++ stdlib/public/core/Character.swift | 2 +- stdlib/public/core/StringBuffer.swift | 4 +- stdlib/public/core/StringCore.swift | 2 +- stdlib/public/core/Unicode.swift | 2 +- validation-test/stdlib/Unicode.swift | 24 +- validation-test/stdlib/UnicodeTrie.swift.gyb | 2 +- 7 files changed, 978 insertions(+), 18 deletions(-) create mode 100644 "\\" diff --git "a/\\" "b/\\" new file mode 100644 index 0000000000000..3bc33e693b2dd --- /dev/null +++ "b/\\" @@ -0,0 +1,960 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See http://swift.org/LICENSE.txt for license information +// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +// +//===----------------------------------------------------------------------===// + + +// Conversions between different Unicode encodings. Note that UTF-16 and +// UTF-32 decoding are *not* currently resilient to erroneous data. + +/// The result of one Unicode decoding step. +/// +/// A unicode scalar value, an indication that no more unicode scalars +/// are available, or an indication of a decoding error. +public enum UnicodeDecodingResult { + case Result(UnicodeScalar) + case EmptyInput + case Error + + /// Return true if `self` indicates no more unicode scalars are + /// available. + @warn_unused_result + public func isEmptyInput() -> Bool { + switch self { + case .EmptyInput: + return true + default: + return false + } + } +} + +/// A Unicode [encoding scheme](http://www.unicode.org/glossary/#character_encoding_scheme). +/// +/// Consists of an underlying [code unit](http://www.unicode.org/glossary/#code_unit) and functions to +/// translate between sequences of these code units and [unicode scalar values](http://www.unicode.org/glossary/#unicode_scalar_value). +public protocol UnicodeCodecType { + + /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this + /// encoding. + typealias CodeUnit + + init() + + /// Start or continue decoding a UTF sequence. + /// + /// In order to decode a code unit sequence completely, this function should + /// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`. + /// Checking that the iterator was exhausted is not sufficient. The decoder + /// can have an internal buffer that is pre-filled with data from the input + /// iterator. + /// + /// Because of buffering, it is impossible to find the corresponding position + /// in the iterator for a given returned `UnicodeScalar` or an error. + /// + /// - parameter next: An *iterator* of code units to be decoded. + mutating func decode< + I : IteratorProtocol where I.Element == CodeUnit + >(inout next: I) -> UnicodeDecodingResult + + /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by + /// calling `output` on each `CodeUnit`. + static func encode(input: UnicodeScalar, output: (CodeUnit) -> Void) +} + +/// A codec for [UTF-8](http://www.unicode.org/glossary/#UTF_8). +public struct UTF8 : UnicodeCodecType { + + /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this + /// encoding. + public typealias CodeUnit = UInt8 + + public init() {} + + /// Returns the number of expected trailing bytes for a given first byte: 0, + /// 1, 2 or 3. If the first byte can not start a valid UTF-8 code unit + /// sequence, returns 4. + @warn_unused_result + public static func _numTrailingBytes(cu0: CodeUnit) -> UInt8 { + if _fastPath(cu0 & 0x80 == 0) { + // 0x00 -- 0x7f: 1-byte sequences. + return 0 + } + + // 0xc0 -- 0xc1: invalid first byte. + // 0xc2 -- 0xdf: 2-byte sequences. + // 0xe0 -- 0xef: 3-byte sequences. + // 0xf0 -- 0xf4: 4-byte sequences. + // 0xf5 -- 0xff: invalid first byte. + + // The rules above are represented as a lookup table. The lookup table + // consists of two words, where `high` contains the high bit of the result, + // `low` contains the low bit. + // + // Bit patterns: + // high | low | meaning + // -----+-----+---------------- + // 0 | 0 | 2-byte sequence + // 0 | 1 | 3-byte sequence + // 1 | 0 | 4-byte sequence + // 1 | 1 | invalid + // + // This implementation allows us to handle these cases without branches. + + // ---------0xf?------- ---------0xe?------- ---------0xd?------- ---------0xc?------- + let low: UInt64 = + 0b1111_1111__1110_0000__1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0011 + let high: UInt64 = + 0b1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0000__0000_0000__0000_0011 + + let index = UInt64(max(0, Int(cu0) - 0xc0)) + let highBit = ((high >> index) & 1) << 1 + let lowBit = (low >> index) & 1 + return UInt8(1 + (highBit | lowBit)) + } + + /// Lookahead buffer used for UTF-8 decoding. New bytes are inserted at LSB, + /// and bytes are read at MSB. + var _decodeLookahead: UInt32 = 0 + + /// Flags with layout: `0bxxxx_yyyy`. + /// + /// `xxxx` is the EOF flag. It means that the input iterator has signaled + /// end of sequence. Out of the four bits, only one bit can be set. The bit + /// position specifies how many bytes have been consumed from the lookahead + /// buffer already. A value of `1000` means that there are `yyyy` bytes in + /// the buffer, `0100` means that there are `yyyy - 1` bytes, `0010` -- + /// `yyyy - 2`, `0001` -- `yyyy - 3`. + /// + /// `yyyy` specifies how many bytes are valid in the lookahead buffer. Value + /// is expressed in unary code. Valid values: `1111` (4), `0111` (3), + /// `0011` (2), `0001` (1), `0000` (0). + /// + /// This representation is crafted to allow one to consume a byte from a + /// buffer with a shift, and update flags with a single-bit right shift. + var _lookaheadFlags: UInt8 = 0 + + /// Return `true` if the LSB bytes in `buffer` are well-formed UTF-8 code + /// unit sequence. + @warn_unused_result + static func _isValidUTF8Impl(buffer: UInt32, length: UInt8) -> Bool { + switch length { + case 4: + let cu3 = UInt8((buffer >> 24) & 0xff) + if cu3 < 0x80 || cu3 > 0xbf { + return false + } + fallthrough + case 3: + let cu2 = UInt8((buffer >> 16) & 0xff) + if cu2 < 0x80 || cu2 > 0xbf { + return false + } + fallthrough + case 2: + let cu0 = UInt8(buffer & 0xff) + let cu1 = UInt8((buffer >> 8) & 0xff) + switch cu0 { + case 0xe0: + if cu1 < 0xa0 || cu1 > 0xbf { + return false + } + case 0xed: + if cu1 < 0x80 || cu1 > 0x9f { + return false + } + case 0xf0: + if cu1 < 0x90 || cu1 > 0xbf { + return false + } + case 0xf4: + if cu1 < 0x80 || cu1 > 0x8f { + return false + } + default: + _sanityCheck(cu0 >= 0xc2 && cu0 <= 0xf4, + "invalid first bytes should be handled in the caller") + if cu1 < 0x80 || cu1 > 0xbf { + return false + } + } + return true + + default: + _sanityCheckFailure("one-byte sequences should be handled in the caller") + } + } + + /// Return `true` if the LSB bytes in `buffer` are well-formed UTF-8 code + /// unit sequence. + @warn_unused_result + static func _isValidUTF8(buffer: UInt32, validBytes: UInt8) -> Bool { + _sanityCheck(validBytes & 0b0000_1111 != 0, + "input buffer should not be empty") + + let cu0 = UInt8(buffer & 0xff) + let trailingBytes = _numTrailingBytes(cu0) + switch trailingBytes { + case 0: + return true + + case 1, 2, 3: + // We *don't* need to check the if the buffer actually contains at least + // `trailingBytes` bytes. Here's why. + // + // If the buffer is not full -- contains fewer than 4 bytes, we are at + // EOF, and the buffer will be padded with 0x00. Thus, an incomplete + // code unit sequence just before EOF would be seen by code below as + // padded with nuls. This sequence will be rejected by the logic in + // `_isValidUTF8Impl`, because the nul byte is not a valid continuation + // byte for UTF-8. + return _isValidUTF8Impl(buffer, length: trailingBytes + 1) + + default: + return false + } + } + + /// Given an ill-formed sequence, find the length of its maximal subpart. + @inline(never) + @warn_unused_result + static func _findMaximalSubpartOfIllFormedUTF8Sequence( + buffer: UInt32, validBytes: UInt8) -> UInt8 { + var buffer = buffer + var validBytes = validBytes + // This function is '@inline(never)' because it is used only in the error + // handling path. + + // Clear EOF flag, we don't care about it. + validBytes &= 0b0000_1111 + + _sanityCheck(validBytes != 0, + "input buffer should not be empty") + _sanityCheck(!UTF8._isValidUTF8(buffer, validBytes: validBytes), + "input sequence should be ill-formed UTF-8") + + // Unicode 6.3.0, D93b: + // + // Maximal subpart of an ill-formed subsequence: The longest code unit + // subsequence starting at an unconvertible offset that is either: + // a. the initial subsequence of a well-formed code unit sequence, or + // b. a subsequence of length one. + + // Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 + // Byte Sequences. + + let cu0 = UInt8(buffer & 0xff) + buffer >>= 8 + validBytes >>= 1 + if (cu0 >= 0xc2 && cu0 <= 0xdf) { + // First byte is valid, but we know that this code unit sequence is + // invalid, so the maximal subpart has to end after the first byte. + return 1 + } + + if validBytes == 0 { + return 1 + } + + let cu1 = UInt8(buffer & 0xff) + buffer >>= 8 + validBytes >>= 1 + + if (cu0 == 0xe0) { + return (cu1 >= 0xa0 && cu1 <= 0xbf) ? 2 : 1 + } + if (cu0 >= 0xe1 && cu0 <= 0xec) { + return (cu1 >= 0x80 && cu1 <= 0xbf) ? 2 : 1 + } + if (cu0 == 0xed) { + return (cu1 >= 0x80 && cu1 <= 0x9f) ? 2 : 1 + } + if (cu0 >= 0xee && cu0 <= 0xef) { + return (cu1 >= 0x80 && cu1 <= 0xbf) ? 2 : 1 + } + if (cu0 == 0xf0) { + if (cu1 >= 0x90 && cu1 <= 0xbf) { + if validBytes == 0 { + return 2 + } + + let cu2 = UInt8(buffer & 0xff) + return (cu2 >= 0x80 && cu2 <= 0xbf) ? 3 : 2 + } + return 1 + } + if (cu0 >= 0xf1 && cu0 <= 0xf3) { + if (cu1 >= 0x80 && cu1 <= 0xbf) { + if validBytes == 0 { + return 2 + } + + let cu2 = UInt8(buffer & 0xff) + return (cu2 >= 0x80 && cu2 <= 0xbf) ? 3 : 2 + } + return 1 + } + if (cu0 == 0xf4) { + if (cu1 >= 0x80 && cu1 <= 0x8f) { + if validBytes == 0 { + return 2 + } + + let cu2 = UInt8(buffer & 0xff) + return (cu2 >= 0x80 && cu2 <= 0xbf) ? 3 : 2 + } + return 1 + } + + _sanityCheck((cu0 >= 0x80 && cu0 <= 0xc1) || cu0 >= 0xf5, + "case analysis above should have handled all valid first bytes") + + // There are no well-formed sequences that start with these bytes. Maximal + // subpart is defined to have length 1 in these cases. + return 1 + } + + /// Start or continue decoding a UTF sequence. + /// + /// In order to decode a code unit sequence completely, this function should + /// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`. + /// Checking that the iterator was exhausted is not sufficient. The decoder + /// can have an internal buffer that is pre-filled with data from the input + /// iterator. + /// + /// Because of buffering, it is impossible to find the corresponding position + /// in the iterator for a given returned `UnicodeScalar` or an error. + /// + /// - parameter next: A *iterator* over the code units to be decoded. + public mutating func decode< + I : IteratorProtocol where I.Element == CodeUnit + >(inout next: I) -> UnicodeDecodingResult { + // If the EOF flag is not set, fill the lookahead buffer from the input + // iterator. + if _lookaheadFlags & 0b1111_0000 == 0 { + // Add more bytes into the buffer until we have 4. + while _lookaheadFlags != 0b0000_1111 { + if let codeUnit = next.next() { + _decodeLookahead = (_decodeLookahead << 8) | UInt32(codeUnit) + _lookaheadFlags = (_lookaheadFlags << 1) | 1 + } else { + // Set the EOF flag. + switch _lookaheadFlags & 0b0000_1111 { + case 0b1111: + _sanityCheckFailure("should have not entered buffer refill loop") + case 0b0111: + _lookaheadFlags |= 0b0100_0000 + case 0b0011: + _lookaheadFlags |= 0b0010_0000 + case 0b0001: + _lookaheadFlags |= 0b0001_0000 + case 0b0000: + _lookaheadFlags |= 0b1000_0000 + return .EmptyInput + default: + _sanityCheckFailure("bad value in _lookaheadFlags") + } + break + } + } + } + + if _slowPath(_lookaheadFlags & 0b0000_1111 == 0) { + return .EmptyInput + } + + if _slowPath(_lookaheadFlags & 0b1111_0000 != 0) { + // Reached EOF. Restore the invariant: first unread byte is always at + // MSB. + switch _lookaheadFlags & 0b1111_0000 { + case 0b1000_0000: + break + case 0b0100_0000: + _decodeLookahead <<= 1 * 8 + case 0b0010_0000: + _decodeLookahead <<= 2 * 8 + case 0b0001_0000: + _decodeLookahead <<= 3 * 8 + default: + _sanityCheckFailure("bad value in _lookaheadFlags") + } + _lookaheadFlags = (_lookaheadFlags & 0b0000_1111) | 0b1000_0000 + } + + // The first byte to read is located at MSB of `_decodeLookahead`. Get a + // representation of the buffer where we can read bytes starting from LSB. + var buffer = _decodeLookahead.byteSwapped + if _slowPath(!UTF8._isValidUTF8(buffer, validBytes: _lookaheadFlags)) { + // The code unit sequence is ill-formed. According to Unicode + // recommendation, replace the maximal subpart of ill-formed sequence + // with one replacement character. + _lookaheadFlags >>= + UTF8._findMaximalSubpartOfIllFormedUTF8Sequence(buffer, + validBytes: _lookaheadFlags) + return .Error + } + + // At this point we know that `buffer` starts with a well-formed code unit + // sequence. Decode it. + // + // When consuming bytes from the `buffer`, we just need to update + // `_lookaheadFlags`. The stored buffer in `_decodeLookahead` will be + // shifted at the beginning of the next decoding cycle. + let cu0 = UInt8(buffer & 0xff) + buffer >>= 8 + _lookaheadFlags >>= 1 + + if cu0 < 0x80 { + // 1-byte sequences. + return .Result(UnicodeScalar(UInt32(cu0))) + } + + // Start with octet 1 (we'll mask off high bits later). + var result = UInt32(cu0) + + let cu1 = UInt8(buffer & 0xff) + buffer >>= 8 + _lookaheadFlags >>= 1 + result = (result << 6) | UInt32(cu1 & 0x3f) + if cu0 < 0xe0 { + // 2-byte sequences. + return .Result(UnicodeScalar(result & 0x000007ff)) // 11 bits + } + + let cu2 = UInt8(buffer & 0xff) + buffer >>= 8 + _lookaheadFlags >>= 1 + result = (result << 6) | UInt32(cu2 & 0x3f) + if cu0 < 0xf0 { + // 3-byte sequences. + return .Result(UnicodeScalar(result & 0x0000ffff)) // 16 bits + } + + // 4-byte sequences. + let cu3 = UInt8(buffer & 0xff) + _lookaheadFlags >>= 1 + result = (result << 6) | UInt32(cu3 & 0x3f) + return .Result(UnicodeScalar(result & 0x001fffff)) // 21 bits + } + + /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by + /// calling `output` on each `CodeUnit`. + public static func encode( + input: UnicodeScalar, + output put: (CodeUnit) -> Void + ) { + var c = UInt32(input) + var buf3 = UInt8(c & 0xFF) + + if c >= UInt32(1<<7) { + c >>= 6 + buf3 = (buf3 & 0x3F) | 0x80 // 10xxxxxx + var buf2 = UInt8(c & 0xFF) + if c < UInt32(1<<5) { + buf2 |= 0xC0 // 110xxxxx + } + else { + c >>= 6 + buf2 = (buf2 & 0x3F) | 0x80 // 10xxxxxx + var buf1 = UInt8(c & 0xFF) + if c < UInt32(1<<4) { + buf1 |= 0xE0 // 1110xxxx + } + else { + c >>= 6 + buf1 = (buf1 & 0x3F) | 0x80 // 10xxxxxx + put(UInt8(c | 0xF0)) // 11110xxx + } + put(buf1) + } + put(buf2) + } + put(buf3) + } + + /// Return `true` if `byte` is a continuation byte of the form + /// `0b10xxxxxx`. + @warn_unused_result + public static func isContinuation(byte: CodeUnit) -> Bool { + return byte & 0b11_00__0000 == 0b10_00__0000 + } + + var _value = UInt8() +} + +/// A codec for [UTF-16](http://www.unicode.org/glossary/#UTF_16). +public struct UTF16 : UnicodeCodecType { + /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this + /// encoding. + public typealias CodeUnit = UInt16 + + public init() {} + + /// A lookahead buffer for one UTF-16 code unit. + var _decodeLookahead: UInt32 = 0 + + /// Flags with layout: `0b0000_00xy`. + /// + /// `y` is the EOF flag. + /// + /// `x` is set when `_decodeLookahead` contains a code unit. + var _lookaheadFlags: UInt8 = 0 + + /// Start or continue decoding a UTF sequence. + /// + /// In order to decode a code unit sequence completely, this function should + /// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`. + /// Checking that the iterator was exhausted is not sufficient. The decoder + /// can have an internal buffer that is pre-filled with data from the input + /// iterator. + /// + /// Because of buffering, it is impossible to find the corresponding position + /// in the iterator for a given returned `UnicodeScalar` or an error. + /// + /// - parameter next: An *iterator* of code units to be decoded. + public mutating func decode< + I : IteratorProtocol where I.Element == CodeUnit + >(inout input: I) -> UnicodeDecodingResult { + if _lookaheadFlags & 0b01 != 0 { + return .EmptyInput + } + + // Note: maximal subpart of ill-formed sequence for UTF-16 can only have + // length 1. Length 0 does not make sense. Neither does length 2 -- in + // that case the sequence is valid. + + var unit0: UInt32 + if _fastPath(_lookaheadFlags & 0b10 == 0) { + if let first = input.next() { + unit0 = UInt32(first) + } else { + // Set EOF flag. + _lookaheadFlags |= 0b01 + return .EmptyInput + } + } else { + // Fetch code unit from the lookahead buffer and note this fact in flags. + unit0 = _decodeLookahead + _lookaheadFlags &= 0b01 + } + + // A well-formed pair of surrogates looks like this: + // [1101 10ww wwxx xxxx] [1101 11xx xxxx xxxx] + + if _fastPath((unit0 >> 11) != 0b1101_1) { + // Neither high-surrogate, nor low-surrogate -- sequence of 1 code unit, + // decoding is trivial. + return .Result(UnicodeScalar(unit0)) + } + + if _slowPath((unit0 >> 10) == 0b1101_11) { + // `unit0` is a low-surrogate. We have an ill-formed sequence. + return .Error + } + + // At this point we know that `unit0` is a high-surrogate. + + var unit1: UInt32 + if let second = input.next() { + unit1 = UInt32(second) + } else { + // EOF reached. Set EOF flag. + _lookaheadFlags |= 0b01 + + // We have seen a high-surrogate and EOF, so we have an ill-formed + // sequence. + return .Error + } + + if _fastPath((unit1 >> 10) == 0b1101_11) { + // `unit1` is a low-surrogate. We have a well-formed surrogate pair. + + let result = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff)) + return .Result(UnicodeScalar(result)) + } + + // Otherwise, we have an ill-formed sequence. These are the possible + // cases: + // + // * `unit1` is a high-surrogate, so we have a pair of two high-surrogates. + // + // * `unit1` is not a surrogate. We have an ill-formed sequence: + // high-surrogate followed by a non-surrogate. + + // Save the second code unit in the lookahead buffer. + _decodeLookahead = unit1 + _lookaheadFlags |= 0b10 + return .Error + } + + /// Try to decode one Unicode scalar, and return the actual number of code + /// units it spanned in the input. This function may consume more code + /// units than required for this scalar. + mutating func _decodeOne< + I : IteratorProtocol where I.Element == CodeUnit + >(inout input: I) -> (UnicodeDecodingResult, Int) { + let result = decode(&input) + switch result { + case .Result(let us): + return (result, UTF16.width(us)) + + case .EmptyInput: + return (result, 0) + + case .Error: + return (result, 1) + } + } + + /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by + /// calling `output` on each `CodeUnit`. + public static func encode( + input: UnicodeScalar, + output put: (CodeUnit) -> Void + ) { + let scalarValue: UInt32 = UInt32(input) + + if scalarValue <= UInt32(UInt16.max) { + put(UInt16(scalarValue)) + } + else { + let lead_offset = UInt32(0xd800) - UInt32(0x10000 >> 10) + put(UInt16(lead_offset + (scalarValue >> 10))) + put(UInt16(0xdc00 + (scalarValue & 0x3ff))) + } + } + + var _value = UInt16() +} + +/// A codec for [UTF-32](http://www.unicode.org/glossary/#UTF_32). +public struct UTF32 : UnicodeCodecType { + /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this + /// encoding. + public typealias CodeUnit = UInt32 + + public init() {} + + /// Start or continue decoding a UTF sequence. + /// + /// In order to decode a code unit sequence completely, this function should + /// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`. + /// Checking that the iterator was exhausted is not sufficient. The decoder + /// can have an internal buffer that is pre-filled with data from the input + /// iterator. + /// + /// Because of buffering, it is impossible to find the corresponding position + /// in the iterator for a given returned `UnicodeScalar` or an error. + /// + /// - parameter next: An *iterator* over the code units to be decoded. + public mutating func decode< + I : IteratorProtocol where I.Element == CodeUnit + >(inout input: I) -> UnicodeDecodingResult { + return UTF32._decode(&input) + } + + static func _decode< + I : IteratorProtocol where I.Element == CodeUnit + >(inout input: I) -> UnicodeDecodingResult { + guard let x = input.next() else { return .EmptyInput } + if _fastPath((x >> 11) != 0b1101_1 && x <= 0x10ffff) { + return .Result(UnicodeScalar(x)) + } else { + return .Error + } + } + + /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by + /// calling `output` on each `CodeUnit`. + public static func encode( + input: UnicodeScalar, + output put: (CodeUnit) -> Void + ) { + put(UInt32(input)) + } +} + +/// Translate `input`, in the given `InputEncoding`, into `output`, in +/// the given `OutputEncoding`. +/// +/// - parameter stopOnError: Causes encoding to stop when an encoding +/// error is detected in `input`, if `true`. Otherwise, U+FFFD +/// replacement characters are inserted for each detected error. +public func transcode< + Input : IteratorProtocol, + InputEncoding : UnicodeCodecType, + OutputEncoding : UnicodeCodecType + where InputEncoding.CodeUnit == Input.Element>( + inputEncoding: InputEncoding.Type, _ outputEncoding: OutputEncoding.Type, + _ input: Input, _ output: (OutputEncoding.CodeUnit) -> Void, + stoppingOnError stopOnError: Bool +) -> Bool { + + var input = input + + // NB. It is not possible to optimize this routine to a memcpy if + // InputEncoding == OutputEncoding. The reason is that memcpy will not + // substitute U+FFFD replacement characters for ill-formed sequences. + + var inputDecoder = inputEncoding.init() + var hadError = false + for var scalar = inputDecoder.decode(&input); + !scalar.isEmptyInput(); + scalar = inputDecoder.decode(&input) { + switch scalar { + case .Result(let us): + OutputEncoding.encode(us, output: output) + case .EmptyInput: + _sanityCheckFailure("should not enter the loop when input becomes empty") + case .Error: + if stopOnError { + return (hadError: true) + } else { + OutputEncoding.encode("\u{fffd}", output: output) + hadError = true + } + } + } + return hadError +} + +/// Transcode UTF-16 to UTF-8, replacing ill-formed sequences with U+FFFD. +/// +/// Returns the index of the first unhandled code unit and the UTF-8 data +/// that was encoded. +@warn_unused_result +internal func _transcodeSomeUTF16AsUTF8< + Input : Collection + where + Input.Iterator.Element == UInt16>( + input: Input, _ startIndex: Input.Index +) -> (Input.Index, _StringCore.UTF8Chunk) { + typealias UTF8Chunk = _StringCore.UTF8Chunk + + let endIndex = input.endIndex + let utf8Max = sizeof(UTF8Chunk.self) + var result: UTF8Chunk = 0 + var utf8Count = 0 + var nextIndex = startIndex + while nextIndex != input.endIndex && utf8Count != utf8Max { + let u = UInt(input[nextIndex]) + let shift = UTF8Chunk(utf8Count * 8) + var utf16Length: Input.Index.Distance = 1 + + if _fastPath(u <= 0x7f) { + result |= UTF8Chunk(u) << shift + ++utf8Count + } else { + var scalarUtf8Length: Int + var r: UInt + if _fastPath((u >> 11) != 0b1101_1) { + // Neither high-surrogate, nor low-surrogate -- well-formed sequence + // of 1 code unit, decoding is trivial. + if u < 0x800 { + r = 0b10__00_0000__110__0_0000 + r |= u >> 6 + r |= (u & 0b11_1111) << 8 + scalarUtf8Length = 2 + } + else { + r = 0b10__00_0000__10__00_0000__1110__0000 + r |= u >> 12 + r |= ((u >> 6) & 0b11_1111) << 8 + r |= (u & 0b11_1111) << 16 + scalarUtf8Length = 3 + } + } else { + let unit0 = u + if _slowPath((unit0 >> 10) == 0b1101_11) { + // `unit0` is a low-surrogate. We have an ill-formed sequence. + // Replace it with U+FFFD. + r = 0xbdbfef + scalarUtf8Length = 3 + } else if _slowPath(nextIndex.advancedBy(1) == endIndex) { + // We have seen a high-surrogate and EOF, so we have an ill-formed + // sequence. Replace it with U+FFFD. + r = 0xbdbfef + scalarUtf8Length = 3 + } else { + let unit1 = UInt(input[nextIndex.advancedBy(1)]) + if _fastPath((unit1 >> 10) == 0b1101_11) { + // `unit1` is a low-surrogate. We have a well-formed surrogate + // pair. + let v = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff)) + + r = 0b10__00_0000__10__00_0000__10__00_0000__1111_0__000 + r |= v >> 18 + r |= ((v >> 12) & 0b11_1111) << 8 + r |= ((v >> 6) & 0b11_1111) << 16 + r |= (v & 0b11_1111) << 24 + scalarUtf8Length = 4 + utf16Length = 2 + } else { + // Otherwise, we have an ill-formed sequence. Replace it with + // U+FFFD. + r = 0xbdbfef + scalarUtf8Length = 3 + } + } + } + // Don't overrun the buffer + if utf8Count + scalarUtf8Length > utf8Max { + break + } + result |= numericCast(r) << shift + utf8Count += scalarUtf8Length + } + nextIndex = nextIndex.advancedBy(utf16Length) + } + // FIXME: Annoying check, courtesy of + if utf8Count < sizeofValue(result) { + result |= ~0 << numericCast(utf8Count * 8) + } + return (nextIndex, result) +} + +/// Instances of conforming types are used in internal `String` +/// representation. +public // @testable +protocol _StringElementType { + @warn_unused_result + static func _toUTF16CodeUnit(_: Self) -> UTF16.CodeUnit + + @warn_unused_result + static func _fromUTF16CodeUnit(utf16: UTF16.CodeUnit) -> Self +} + +extension UTF16.CodeUnit : _StringElementType { + public // @testable + static func _toUTF16CodeUnit(x: UTF16.CodeUnit) -> UTF16.CodeUnit { + return x + } + public // @testable + static func _fromUTF16CodeUnit( + utf16: UTF16.CodeUnit + ) -> UTF16.CodeUnit { + return utf16 + } +} + +extension UTF8.CodeUnit : _StringElementType { + public // @testable + static func _toUTF16CodeUnit(x: UTF8.CodeUnit) -> UTF16.CodeUnit { + _sanityCheck(x <= 0x7f, "should only be doing this with ASCII") + return UTF16.CodeUnit(x) + } + public // @testable + static func _fromUTF16CodeUnit( + utf16: UTF16.CodeUnit + ) -> UTF8.CodeUnit { + _sanityCheck(utf16 <= 0x7f, "should only be doing this with ASCII") + return UTF8.CodeUnit(utf16) + } +} + +extension UTF16 { + /// Return the number of code units required to encode `x`. + @warn_unused_result + public static func width(x: UnicodeScalar) -> Int { + return x.value <= 0xFFFF ? 1 : 2 + } + + /// Return the high surrogate code unit of a [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair) representing + /// `x`. + /// + /// - Requires: `width(x) == 2`. + @warn_unused_result + public static func leadSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit { + _precondition(width(x) == 2) + return UTF16.CodeUnit((x.value - 0x1_0000) >> (10 as UInt32)) + 0xD800 + } + + /// Return the low surrogate code unit of a [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair) representing + /// `x`. + /// + /// - Requires: `width(x) == 2`. + @warn_unused_result + public static func trailSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit { + _precondition(width(x) == 2) + return UTF16.CodeUnit( + (x.value - 0x1_0000) & (((1 as UInt32) << 10) - 1) + ) + 0xDC00 + } + + @warn_unused_result + public static func isLeadSurrogate(x: CodeUnit) -> Bool { + return 0xD800...0xDBFF ~= x + } + + @warn_unused_result + public static func isTrailSurrogate(x: CodeUnit) -> Bool { + return 0xDC00...0xDFFF ~= x + } + + public // @testable + static func _copy( + source: UnsafeMutablePointer, + destination: UnsafeMutablePointer, count: Int + ) { + if strideof(T.self) == strideof(U.self) { + _memcpy( + dest: UnsafeMutablePointer(destination), + src: UnsafeMutablePointer(source), + size: UInt(count) * UInt(strideof(U.self))) + } + else { + for i in 0..( + _: Encoding.Type, input: Input, repairIllFormedSequences: Bool + ) -> (Int, Bool)? { + var input = input + var count = 0 + var isAscii = true + + var inputDecoder = Encoding() + loop: + while true { + switch inputDecoder.decode(&input) { + case .Result(let us): + if us.value > 0x7f { + isAscii = false + } + count += width(us) + case .EmptyInput: + break loop + case .Error: + if !repairIllFormedSequences { + return .None + } + isAscii = false + count += width(UnicodeScalar(0xfffd)) + } + } + return (count, isAscii) + } +} + diff --git a/stdlib/public/core/Character.swift b/stdlib/public/core/Character.swift index cf455f676f06a..58d1542e029a8 100644 --- a/stdlib/public/core/Character.swift +++ b/stdlib/public/core/Character.swift @@ -210,7 +210,7 @@ public struct Character : } transcode( UTF8.self, UTF16.self, _SmallUTF8(u8).iterator(), output, - stopOnError: false) + stoppingOnError: false) self.data = u16 } diff --git a/stdlib/public/core/StringBuffer.swift b/stdlib/public/core/StringBuffer.swift index d2ee61efb876c..39e76cb135f52 100644 --- a/stdlib/public/core/StringBuffer.swift +++ b/stdlib/public/core/StringBuffer.swift @@ -110,7 +110,7 @@ public struct _StringBuffer { } let hadError = transcode( encoding, UTF32.self, input.iterator(), sink, - stopOnError: true) + stoppingOnError: true) _sanityCheck(!hadError, "string can not be ASCII if there were decoding errors") return (result, hadError) } @@ -121,7 +121,7 @@ public struct _StringBuffer { } let hadError = transcode( encoding, UTF16.self, input.iterator(), sink, - stopOnError: !repairIllFormedSequences) + stoppingOnError: !repairIllFormedSequences) return (result, hadError) } } diff --git a/stdlib/public/core/StringCore.swift b/stdlib/public/core/StringCore.swift index 9e5b9e2a44e92..9dd10a8f8b22e 100644 --- a/stdlib/public/core/StringCore.swift +++ b/stdlib/public/core/StringCore.swift @@ -340,7 +340,7 @@ public struct _StringCore { count: count ).iterator(), output, - stopOnError: true + stoppingOnError: true ) _sanityCheck(!hadError, "Swift.String with native storage should not have unpaired surrogates") } diff --git a/stdlib/public/core/Unicode.swift b/stdlib/public/core/Unicode.swift index ad29f5068a019..3bc33e693b2dd 100644 --- a/stdlib/public/core/Unicode.swift +++ b/stdlib/public/core/Unicode.swift @@ -694,7 +694,7 @@ public func transcode< where InputEncoding.CodeUnit == Input.Element>( inputEncoding: InputEncoding.Type, _ outputEncoding: OutputEncoding.Type, _ input: Input, _ output: (OutputEncoding.CodeUnit) -> Void, - stopOnError: Bool + stoppingOnError stopOnError: Bool ) -> Bool { var input = input diff --git a/validation-test/stdlib/Unicode.swift b/validation-test/stdlib/Unicode.swift index e7e899cbb631b..21c0baa392fb1 100644 --- a/validation-test/stdlib/Unicode.swift +++ b/validation-test/stdlib/Unicode.swift @@ -116,7 +116,7 @@ func checkDecodeUTF( var decoded = [UInt32]() let output: (UInt32) -> Void = { decoded.append($0) } let iterator = EOFCountingIterator(utfStr) - transcode(codec, UTF32.self, iterator, output, stopOnError: true) + transcode(codec, UTF32.self, iterator, output, stoppingOnError: true) expectGE(1, iterator.numTimesReturnedEOF) if expectedHead != decoded { return assertionFailure() @@ -133,7 +133,7 @@ func checkDecodeUTF( var decoded = [UInt32]() let output: (UInt32) -> Void = { decoded.append($0) } let iterator = EOFCountingIterator(utfStr) - transcode(codec, UTF32.self, iterator, output, stopOnError: false) + transcode(codec, UTF32.self, iterator, output, stoppingOnError: false) expectEqual(1, iterator.numTimesReturnedEOF) if expected != decoded { return assertionFailure() @@ -175,7 +175,7 @@ func checkEncodeUTF8(expected: [UInt8], let output: (UInt8) -> Void = { encoded.append($0) } let iterator = EOFCountingIterator(scalars) let hadError = - transcode(UTF32.self, UTF8.self, iterator, output, stopOnError: true) + transcode(UTF32.self, UTF8.self, iterator, output, stoppingOnError: true) expectFalse(hadError) expectGE(1, iterator.numTimesReturnedEOF) if expected != encoded { @@ -2085,7 +2085,7 @@ UnicodeAPIs.test("transcode/MutableArray") { var input: [UInt16] = [ 0x0041, 0x0042 ] var transcoded = [UInt16]() let output: (UInt16) -> Void = { transcoded.append($0) } - transcode(UTF16.self, UTF16.self, input.iterator(), output, stopOnError: true) + transcode(UTF16.self, UTF16.self, input.iterator(), output, stoppingOnError: true) expectEqual(input, transcoded) } @@ -2093,7 +2093,7 @@ UnicodeAPIs.test("transcode/ReferenceTypedArray") { var input: [UInt16] = [ 0x0041, 0x0042 ] var transcoded = [UInt16]() let output: (UInt16) -> Void = { transcoded.append($0) } - transcode(UTF16.self, UTF16.self, input.iterator(), output, stopOnError: true) + transcode(UTF16.self, UTF16.self, input.iterator(), output, stoppingOnError: true) expectEqual(input, transcoded) } @@ -2114,7 +2114,7 @@ class NonContiguousNSString : NSString { let output: (UInt16) -> Void = { encoded.append($0) } let iterator = utf8.iterator() let hadError = - transcode(UTF8.self, UTF16.self, iterator, output, stopOnError: true) + transcode(UTF8.self, UTF16.self, iterator, output, stoppingOnError: true) expectFalse(hadError) self.init(encoded) } @@ -2129,7 +2129,7 @@ class NonContiguousNSString : NSString { let output: (UInt16) -> Void = { encoded.append($0) } let iterator = scalars.iterator() let hadError = - transcode(UTF32.self, UTF16.self, iterator, output, stopOnError: true) + transcode(UTF32.self, UTF16.self, iterator, output, stoppingOnError: true) expectFalse(hadError) self.init(encoded) } @@ -2184,7 +2184,7 @@ StringCookedViews.test("UTF8ForContiguousUTF16") { let output: (UInt16) -> Void = { backingStorage.append($0) } var iterator = test.scalars.iterator() - transcode(UTF32.self, UTF16.self, iterator, output, stopOnError: false) + transcode(UTF32.self, UTF16.self, iterator, output, stoppingOnError: false) backingStorage.withUnsafeBufferPointer { (ptr) -> Void in @@ -2205,7 +2205,7 @@ StringCookedViews.test("UTF8ForContiguousUTF16") { let output: (UInt8) -> Void = { expected.append($0) } var expectedScalars = test.scalarsHead + test.scalarsRepairedTail var iterator = expectedScalars.iterator() - transcode(UTF32.self, UTF8.self, iterator, output, stopOnError: false) + transcode(UTF32.self, UTF8.self, iterator, output, stoppingOnError: false) checkUTF8View(expected, subject, test.loc.withCurrentLoc()) } @@ -2251,7 +2251,7 @@ StringCookedViews.test("UTF8ForNonContiguousUTF16") { let output: (UInt8) -> Void = { expected.append($0) } var expectedScalars = test.scalarsHead + test.scalarsRepairedTail var iterator = expectedScalars.iterator() - transcode(UTF32.self, UTF8.self, iterator, output, stopOnError: false) + transcode(UTF32.self, UTF8.self, iterator, output, stoppingOnError: false) var nss = NonContiguousNSString(test.encoded) verifyThatStringIsOpaqueForCoreFoundation(nss) @@ -2319,7 +2319,7 @@ StringCookedViews.test("UTF16") { let output: (UInt16) -> Void = { expected.append($0) } var expectedScalars = test.scalars var iterator = expectedScalars.iterator() - transcode(UTF32.self, UTF16.self, iterator, output, stopOnError: false) + transcode(UTF32.self, UTF16.self, iterator, output, stoppingOnError: false) var nss = NonContiguousNSString(test.scalars) checkUTF16View(expected, nss as String, test.loc.withCurrentLoc()) @@ -2331,7 +2331,7 @@ StringCookedViews.test("UTF16") { let output: (UInt16) -> Void = { expected.append($0) } var expectedScalars = test.scalarsHead + test.scalarsRepairedTail var iterator = expectedScalars.iterator() - transcode(UTF32.self, UTF16.self, iterator, output, stopOnError: false) + transcode(UTF32.self, UTF16.self, iterator, output, stoppingOnError: false) checkUTF16View(expected, subject, test.loc.withCurrentLoc()) } diff --git a/validation-test/stdlib/UnicodeTrie.swift.gyb b/validation-test/stdlib/UnicodeTrie.swift.gyb index 501ceb1a7cb95..74a8f580d6f94 100644 --- a/validation-test/stdlib/UnicodeTrie.swift.gyb +++ b/validation-test/stdlib/UnicodeTrie.swift.gyb @@ -96,7 +96,7 @@ class NonContiguousNSString : NSString { var iter = scalars.iterator() let output: (UInt16) -> Void = { encoded.append($0) } let hadError = - transcode(UTF32.self, UTF16.self, iter, output, stopOnError: true) + transcode(UTF32.self, UTF16.self, iter, output, stoppingOnError: true) expectFalse(hadError) self.init(encoded) }