From f3ecc3cdc5160aeab966a09bcec25eabe61fe295 Mon Sep 17 00:00:00 2001
From: Dmitri Gribenko <gribozavr@gmail.com>
Date: Wed, 4 Nov 2015 16:54:05 -0800
Subject: [PATCH] transcode(..., stopOnError:) => transcode(...,
 stoppingOnError:)

---
 "\\"                                         | 960 +++++++++++++++++++
 stdlib/public/core/Character.swift           |   2 +-
 stdlib/public/core/StringBuffer.swift        |   4 +-
 stdlib/public/core/StringCore.swift          |   2 +-
 stdlib/public/core/Unicode.swift             |   2 +-
 validation-test/stdlib/Unicode.swift         |  24 +-
 validation-test/stdlib/UnicodeTrie.swift.gyb |   2 +-
 7 files changed, 978 insertions(+), 18 deletions(-)
 create mode 100644 "\\"

diff --git "a/\\" "b/\\"
new file mode 100644
index 0000000000000..3bc33e693b2dd
--- /dev/null
+++ "b/\\"
@@ -0,0 +1,960 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See http://swift.org/LICENSE.txt for license information
+// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+
+// Conversions between different Unicode encodings.  The UTF-8, UTF-16 and
+// UTF-32 decoders below all report ill-formed input as `.Error`.
+
+/// The result of one Unicode decoding step.
+///
+/// A Unicode scalar value (`Result`), an indication that no more Unicode
+/// scalars are available (`EmptyInput`), or a decoding error (`Error`).
+public enum UnicodeDecodingResult {
+  case Result(UnicodeScalar)
+  case EmptyInput
+  case Error
+
+  /// Return `true` if `self` is `.EmptyInput`, i.e. no more Unicode scalars
+  /// are available.
+  @warn_unused_result
+  public func isEmptyInput() -> Bool {
+    switch self {
+    case .EmptyInput:
+      return true
+    default:
+      return false
+    }
+  }
+}
+
+/// A Unicode [encoding scheme](http://www.unicode.org/glossary/#character_encoding_scheme).
+///
+/// Consists of an underlying [code unit](http://www.unicode.org/glossary/#code_unit) and functions to
+/// translate between sequences of these code units and [Unicode scalar values](http://www.unicode.org/glossary/#unicode_scalar_value).
+public protocol UnicodeCodecType {
+
+  /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
+  /// encoding.
+  typealias CodeUnit
+
+  init()
+
+  /// Start or continue decoding a UTF sequence.
+  ///
+  /// In order to decode a code unit sequence completely, this function should
+  /// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
+  /// Checking that the iterator was exhausted is not sufficient.  The decoder
+  /// can have an internal buffer that is pre-filled with data from the input
+  /// iterator.
+  ///
+  /// Because of buffering, it is impossible to find the corresponding position
+  /// in the iterator for a given returned `UnicodeScalar` or an error.
+  ///
+  /// - parameter next: An *iterator* over the code units to be decoded.
+  mutating func decode<
+    I : IteratorProtocol where I.Element == CodeUnit
+  >(inout next: I) -> UnicodeDecodingResult
+
+  /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
+  /// calling `output` once for each `CodeUnit`.
+  static func encode(input: UnicodeScalar, output: (CodeUnit) -> Void)
+}
+
+/// A codec for [UTF-8](http://www.unicode.org/glossary/#UTF_8).
+public struct UTF8 : UnicodeCodecType {
+
+  /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
+  /// encoding.
+  public typealias CodeUnit = UInt8
+
+  public init() {}
+
+  /// Returns the number of expected trailing bytes for a given first byte: 0,
+  /// 1, 2 or 3.  If the first byte cannot start a valid UTF-8 code unit
+  /// sequence, returns 4.
+  @warn_unused_result
+  public static func _numTrailingBytes(cu0: CodeUnit) -> UInt8 {
+    if _fastPath(cu0 & 0x80 == 0) {
+      // 0x00 -- 0x7f: 1-byte sequences.
+      return 0
+    }
+
+    // 0x80 -- 0xc1: invalid first byte (0x80 -- 0xbf clamp to table index 0).
+    // 0xc2 -- 0xdf: 2-byte sequences.
+    // 0xe0 -- 0xef: 3-byte sequences.
+    // 0xf0 -- 0xf4: 4-byte sequences.
+    // 0xf5 -- 0xff: invalid first byte.
+
+    // The rules above are represented as a lookup table.  The lookup table
+    // consists of two words, where `high` contains the high bit of the result,
+    // `low` contains the low bit.
+    //
+    // Bit patterns:
+    // high | low | meaning
+    // -----+-----+----------------
+    //   0  |  0  | 2-byte sequence
+    //   0  |  1  | 3-byte sequence
+    //   1  |  0  | 4-byte sequence
+    //   1  |  1  | invalid
+    //
+    // This implementation allows us to handle these cases without branches.
+
+    //    ---------0xf?-------  ---------0xe?-------  ---------0xd?-------  ---------0xc?-------
+    let low: UInt64 =
+        0b1111_1111__1110_0000__1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0011
+    let high: UInt64 =
+        0b1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0000__0000_0000__0000_0011
+
+    let index = UInt64(max(0, Int(cu0) - 0xc0))
+    let highBit = ((high >> index) & 1) << 1
+    let lowBit = (low >> index) & 1
+    return UInt8(1 + (highBit | lowBit))
+  }
+
+  /// Lookahead buffer used for UTF-8 decoding.  New bytes are inserted at LSB,
+  /// and bytes are read at MSB.
+  var _decodeLookahead: UInt32 = 0
+
+  /// Flags with layout: `0bxxxx_yyyy`.
+  ///
+  /// `xxxx` is the EOF flag.  It means that the input iterator has signaled
+  /// end of sequence.  Out of the four bits, only one bit can be set.  The bit
+  /// position specifies how many bytes have been consumed from the lookahead
+  /// buffer already.  A value of `1000` means that there are `yyyy` bytes in
+  /// the buffer, `0100` means that there are `yyyy - 1` bytes, `0010` --
+  /// `yyyy - 2`, `0001` -- `yyyy - 3`.
+  ///
+  /// `yyyy` specifies how many bytes are valid in the lookahead buffer.  Value
+  /// is expressed in unary code.  Valid values: `1111` (4), `0111` (3),
+  /// `0011` (2), `0001` (1), `0000` (0).
+  ///
+  /// This representation is crafted to allow one to consume a byte from a
+  /// buffer with a shift, and update flags with a single-bit right shift.
+  var _lookaheadFlags: UInt8 = 0
+
+  /// Return `true` if the `length` (2, 3 or 4) least-significant bytes of
+  /// `buffer` form a well-formed UTF-8 code unit sequence, first byte at LSB.
+  @warn_unused_result
+  static func _isValidUTF8Impl(buffer: UInt32, length: UInt8) -> Bool {
+    switch length {
+    case 4:
+      let cu3 = UInt8((buffer >> 24) & 0xff)
+      if cu3 < 0x80 || cu3 > 0xbf {
+        return false
+      }
+      fallthrough
+    case 3:
+      let cu2 = UInt8((buffer >> 16) & 0xff)
+      if cu2 < 0x80 || cu2 > 0xbf {
+        return false
+      }
+      fallthrough
+    case 2:
+      let cu0 = UInt8(buffer & 0xff)
+      let cu1 = UInt8((buffer >> 8) & 0xff)
+      switch cu0 {
+      case 0xe0:
+        if cu1 < 0xa0 || cu1 > 0xbf {
+          return false
+        }
+      case 0xed:
+        if cu1 < 0x80 || cu1 > 0x9f {
+          return false
+        }
+      case 0xf0:
+        if cu1 < 0x90 || cu1 > 0xbf {
+          return false
+        }
+      case 0xf4:
+        if cu1 < 0x80 || cu1 > 0x8f {
+          return false
+        }
+      default:
+        _sanityCheck(cu0 >= 0xc2 && cu0 <= 0xf4,
+            "invalid first bytes should be handled in the caller")
+        if cu1 < 0x80 || cu1 > 0xbf {
+          return false
+        }
+      }
+      return true
+
+    default:
+      _sanityCheckFailure("one-byte sequences should be handled in the caller")
+    }
+  }
+
+  /// Return `true` if `buffer` starts, at its LSB, with a well-formed UTF-8
+  /// code unit sequence.  `validBytes` is only sanity-checked to be non-empty.
+  @warn_unused_result
+  static func _isValidUTF8(buffer: UInt32, validBytes: UInt8) -> Bool {
+    _sanityCheck(validBytes & 0b0000_1111 != 0,
+        "input buffer should not be empty")
+
+    let cu0 = UInt8(buffer & 0xff)
+    let trailingBytes = _numTrailingBytes(cu0)
+    switch trailingBytes {
+    case 0:
+      return true
+
+    case 1, 2, 3:
+      // We *don't* need to check if the buffer actually contains at least
+      // `trailingBytes` bytes.  Here's why.
+      //
+      // If the buffer is not full -- contains fewer than 4 bytes, we are at
+      // EOF, and the buffer will be padded with 0x00.  Thus, an incomplete
+      // code unit sequence just before EOF would be seen by code below as
+      // padded with nuls.  This sequence will be rejected by the logic in
+      // `_isValidUTF8Impl`, because the nul byte is not a valid continuation
+      // byte for UTF-8.
+      return _isValidUTF8Impl(buffer, length: trailingBytes + 1)
+
+    default:
+      return false
+    }
+  }
+
+  /// Given an ill-formed sequence, find the length (1...3) of its maximal subpart.
+  @inline(never)
+  @warn_unused_result
+  static func _findMaximalSubpartOfIllFormedUTF8Sequence(
+      buffer: UInt32, validBytes: UInt8) -> UInt8 {
+    var buffer = buffer
+    var validBytes = validBytes
+    // This function is '@inline(never)' because it is used only in the error
+    // handling path.
+
+    // Clear EOF flag, we don't care about it.
+    validBytes &= 0b0000_1111
+
+    _sanityCheck(validBytes != 0,
+        "input buffer should not be empty")
+    _sanityCheck(!UTF8._isValidUTF8(buffer, validBytes: validBytes),
+        "input sequence should be ill-formed UTF-8")
+
+    // Unicode 6.3.0, D93b:
+    //
+    //     Maximal subpart of an ill-formed subsequence: The longest code unit
+    //     subsequence starting at an unconvertible offset that is either:
+    //     a. the initial subsequence of a well-formed code unit sequence, or
+    //     b. a subsequence of length one.
+
+    // Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
+    // Byte Sequences.
+
+    let cu0 = UInt8(buffer & 0xff)
+    buffer >>= 8
+    validBytes >>= 1
+    if (cu0 >= 0xc2 && cu0 <= 0xdf) {
+      // First byte is valid, but we know that this code unit sequence is
+      // invalid, so the maximal subpart has to end after the first byte.
+      return 1
+    }
+
+    if validBytes == 0 {
+      return 1
+    }
+
+    let cu1 = UInt8(buffer & 0xff)
+    buffer >>= 8
+    validBytes >>= 1
+
+    if (cu0 == 0xe0) {
+      return (cu1 >= 0xa0 && cu1 <= 0xbf) ? 2 : 1
+    }
+    if (cu0 >= 0xe1 && cu0 <= 0xec) {
+      return (cu1 >= 0x80 && cu1 <= 0xbf) ? 2 : 1
+    }
+    if (cu0 == 0xed) {
+      return (cu1 >= 0x80 && cu1 <= 0x9f) ? 2 : 1
+    }
+    if (cu0 >= 0xee && cu0 <= 0xef) {
+      return (cu1 >= 0x80 && cu1 <= 0xbf) ? 2 : 1
+    }
+    if (cu0 == 0xf0) {
+      if (cu1 >= 0x90 && cu1 <= 0xbf) {
+        if validBytes == 0 {
+          return 2
+        }
+
+        let cu2 = UInt8(buffer & 0xff)
+        return (cu2 >= 0x80 && cu2 <= 0xbf) ? 3 : 2
+      }
+      return 1
+    }
+    if (cu0 >= 0xf1 && cu0 <= 0xf3) {
+      if (cu1 >= 0x80 && cu1 <= 0xbf) {
+        if validBytes == 0 {
+          return 2
+        }
+
+        let cu2 = UInt8(buffer & 0xff)
+        return (cu2 >= 0x80 && cu2 <= 0xbf) ? 3 : 2
+      }
+      return 1
+    }
+    if (cu0 == 0xf4) {
+      if (cu1 >= 0x80 && cu1 <= 0x8f) {
+        if validBytes == 0 {
+          return 2
+        }
+
+        let cu2 = UInt8(buffer & 0xff)
+        return (cu2 >= 0x80 && cu2 <= 0xbf) ? 3 : 2
+      }
+      return 1
+    }
+
+    _sanityCheck((cu0 >= 0x80 && cu0 <= 0xc1) || cu0 >= 0xf5,
+        "case analysis above should have handled all valid first bytes")
+
+    // There are no well-formed sequences that start with these bytes.  Maximal
+    // subpart is defined to have length 1 in these cases.
+    return 1
+  }
+
+  /// Start or continue decoding a UTF sequence.
+  ///
+  /// In order to decode a code unit sequence completely, this function should
+  /// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
+  /// Checking that the iterator was exhausted is not sufficient.  The decoder
+  /// can have an internal buffer that is pre-filled with data from the input
+  /// iterator.
+  ///
+  /// Because of buffering, it is impossible to find the corresponding position
+  /// in the iterator for a given returned `UnicodeScalar` or an error.
+  ///
+  /// - parameter next: An *iterator* over the code units to be decoded.
+  public mutating func decode<
+    I : IteratorProtocol where I.Element == CodeUnit
+  >(inout next: I) -> UnicodeDecodingResult {
+    // If the EOF flag is not set, fill the lookahead buffer from the input
+    // iterator.
+    if _lookaheadFlags & 0b1111_0000 == 0 {
+      // Add more bytes into the buffer until we have 4.
+      while _lookaheadFlags != 0b0000_1111 {
+        if let codeUnit = next.next() {
+          _decodeLookahead = (_decodeLookahead << 8) | UInt32(codeUnit)
+          _lookaheadFlags = (_lookaheadFlags << 1) | 1
+        } else {
+          // Set the EOF flag.
+          switch _lookaheadFlags & 0b0000_1111 {
+          case 0b1111:
+            _sanityCheckFailure("should have not entered buffer refill loop")
+          case 0b0111:
+            _lookaheadFlags |= 0b0100_0000
+          case 0b0011:
+            _lookaheadFlags |= 0b0010_0000
+          case 0b0001:
+            _lookaheadFlags |= 0b0001_0000
+          case 0b0000:
+            _lookaheadFlags |= 0b1000_0000
+            return .EmptyInput
+          default:
+            _sanityCheckFailure("bad value in _lookaheadFlags")
+          }
+          break
+        }
+      }
+    }
+
+    if _slowPath(_lookaheadFlags & 0b0000_1111 == 0) {
+      return .EmptyInput
+    }
+
+    if _slowPath(_lookaheadFlags & 0b1111_0000 != 0) {
+      // Reached EOF.  Restore the invariant: first unread byte is always at
+      // MSB.
+      switch _lookaheadFlags & 0b1111_0000 {
+      case 0b1000_0000:
+        break
+      case 0b0100_0000:
+        _decodeLookahead <<= 1 * 8
+      case 0b0010_0000:
+        _decodeLookahead <<= 2 * 8
+      case 0b0001_0000:
+        _decodeLookahead <<= 3 * 8
+      default:
+        _sanityCheckFailure("bad value in _lookaheadFlags")
+      }
+      _lookaheadFlags = (_lookaheadFlags & 0b0000_1111) | 0b1000_0000
+    }
+
+    // The first byte to read is located at MSB of `_decodeLookahead`.  Get a
+    // representation of the buffer where we can read bytes starting from LSB.
+    var buffer = _decodeLookahead.byteSwapped
+    if _slowPath(!UTF8._isValidUTF8(buffer, validBytes: _lookaheadFlags)) {
+      // The code unit sequence is ill-formed.  According to Unicode
+      // recommendation, replace the maximal subpart of ill-formed sequence
+      // with one replacement character.
+      _lookaheadFlags >>=
+          UTF8._findMaximalSubpartOfIllFormedUTF8Sequence(buffer,
+              validBytes: _lookaheadFlags)
+      return .Error
+    }
+
+    // At this point we know that `buffer` starts with a well-formed code unit
+    // sequence.  Decode it.
+    //
+    // When consuming bytes from the `buffer`, we just need to update
+    // `_lookaheadFlags`.  The stored buffer in `_decodeLookahead` will be
+    // shifted at the beginning of the next decoding cycle.
+    let cu0 = UInt8(buffer & 0xff)
+    buffer >>= 8
+    _lookaheadFlags >>= 1
+
+    if cu0 < 0x80 {
+      // 1-byte sequences.
+      return .Result(UnicodeScalar(UInt32(cu0)))
+    }
+
+    // Start with octet 1 (we'll mask off high bits later).
+    var result = UInt32(cu0)
+
+    let cu1 = UInt8(buffer & 0xff)
+    buffer >>= 8
+    _lookaheadFlags >>= 1
+    result = (result << 6) | UInt32(cu1 & 0x3f)
+    if cu0 < 0xe0 {
+      // 2-byte sequences.
+      return .Result(UnicodeScalar(result & 0x000007ff)) // 11 bits
+    }
+
+    let cu2 = UInt8(buffer & 0xff)
+    buffer >>= 8
+    _lookaheadFlags >>= 1
+    result = (result << 6) | UInt32(cu2 & 0x3f)
+    if cu0 < 0xf0 {
+      // 3-byte sequences.
+      return .Result(UnicodeScalar(result & 0x0000ffff)) // 16 bits
+    }
+
+    // 4-byte sequences.
+    let cu3 = UInt8(buffer & 0xff)
+    _lookaheadFlags >>= 1
+    result = (result << 6) | UInt32(cu3 & 0x3f)
+    return .Result(UnicodeScalar(result & 0x001fffff)) // 21 bits
+  }
+
+  /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
+  /// calling `output` on each `CodeUnit`, lead byte first.
+  public static func encode(
+    input: UnicodeScalar,
+    output put: (CodeUnit) -> Void
+  ) {
+    var c = UInt32(input)
+    var buf3 = UInt8(c & 0xFF)
+
+    if c >= UInt32(1<<7) {
+      c >>= 6
+      buf3 = (buf3 & 0x3F) | 0x80 // 10xxxxxx
+      var buf2 = UInt8(c & 0xFF)
+      if c < UInt32(1<<5) {
+        buf2 |= 0xC0              // 110xxxxx
+      }
+      else {
+        c >>= 6
+        buf2 = (buf2 & 0x3F) | 0x80 // 10xxxxxx
+        var buf1 = UInt8(c & 0xFF)
+        if c < UInt32(1<<4) {
+          buf1 |= 0xE0              // 1110xxxx
+        }
+        else {
+          c >>= 6
+          buf1 = (buf1 & 0x3F) | 0x80 // 10xxxxxx
+          put(UInt8(c | 0xF0)) // 11110xxx
+        }
+        put(buf1)
+      }
+      put(buf2)
+    }
+    put(buf3)
+  }
+
+  /// Return `true` if `byte` is a continuation byte of the form
+  /// `0b10xxxxxx`.
+  @warn_unused_result
+  public static func isContinuation(byte: CodeUnit) -> Bool {
+    return byte & 0b11_00__0000 == 0b10_00__0000
+  }
+
+  var _value =  UInt8()
+}
+
+/// A codec for [UTF-16](http://www.unicode.org/glossary/#UTF_16).
+public struct UTF16 : UnicodeCodecType {
+  /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
+  /// encoding.
+  public typealias CodeUnit = UInt16
+
+  public init() {}
+
+  /// A lookahead buffer for one UTF-16 code unit.
+  var _decodeLookahead: UInt32 = 0
+
+  /// Flags with layout: `0b0000_00xy`.
+  ///
+  /// `y` is the EOF flag.
+  ///
+  /// `x` is set when `_decodeLookahead` contains a code unit.
+  var _lookaheadFlags: UInt8 = 0
+
+  /// Start or continue decoding a UTF sequence.
+  ///
+  /// In order to decode a code unit sequence completely, this function should
+  /// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
+  /// Checking that the iterator was exhausted is not sufficient.  The decoder
+  /// can have an internal buffer that is pre-filled with data from the input
+  /// iterator.
+  ///
+  /// Because of buffering, it is impossible to find the corresponding position
+  /// in the iterator for a given returned `UnicodeScalar` or an error.
+  ///
+  /// - parameter input: An *iterator* of code units to be decoded.
+  public mutating func decode<
+    I : IteratorProtocol where I.Element == CodeUnit
+  >(inout input: I) -> UnicodeDecodingResult {
+    if _lookaheadFlags & 0b01 != 0 {
+      return .EmptyInput
+    }
+
+    // Note: maximal subpart of ill-formed sequence for UTF-16 can only have
+    // length 1.  Length 0 does not make sense.  Neither does length 2 -- in
+    // that case the sequence is valid.
+
+    var unit0: UInt32
+    if _fastPath(_lookaheadFlags & 0b10 == 0) {
+      if let first = input.next() {
+        unit0 = UInt32(first)
+      } else {
+        // Set EOF flag.
+        _lookaheadFlags |= 0b01
+        return .EmptyInput
+      }
+    } else {
+      // Fetch code unit from the lookahead buffer and note this fact in flags.
+      unit0 = _decodeLookahead
+      _lookaheadFlags &= 0b01
+    }
+
+    // A well-formed pair of surrogates looks like this:
+    // [1101 10ww wwxx xxxx] [1101 11xx xxxx xxxx]
+
+    if _fastPath((unit0 >> 11) != 0b1101_1) {
+      // Neither high-surrogate, nor low-surrogate -- sequence of 1 code unit,
+      // decoding is trivial.
+      return .Result(UnicodeScalar(unit0))
+    }
+
+    if _slowPath((unit0 >> 10) == 0b1101_11) {
+      // `unit0` is a low-surrogate.  We have an ill-formed sequence.
+      return .Error
+    }
+
+    // At this point we know that `unit0` is a high-surrogate.
+
+    var unit1: UInt32
+    if let second = input.next() {
+      unit1 = UInt32(second)
+    } else {
+      // EOF reached.  Set EOF flag.
+      _lookaheadFlags |= 0b01
+
+      // We have seen a high-surrogate and EOF, so we have an ill-formed
+      // sequence.
+      return .Error
+    }
+
+    if _fastPath((unit1 >> 10) == 0b1101_11) {
+      // `unit1` is a low-surrogate.  We have a well-formed surrogate pair.
+
+      let result = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))
+      return .Result(UnicodeScalar(result))
+    }
+
+    // Otherwise, we have an ill-formed sequence.  These are the possible
+    // cases:
+    //
+    // * `unit1` is a high-surrogate, so we have a pair of two high-surrogates.
+    //
+    // * `unit1` is not a surrogate.  We have an ill-formed sequence:
+    //   high-surrogate followed by a non-surrogate.
+
+    // Save the second code unit in the lookahead buffer.
+    _decodeLookahead = unit1
+    _lookaheadFlags |= 0b10
+    return .Error
+  }
+
+  /// Try to decode one Unicode scalar, and return the number of code units it
+  /// spanned in the input (1 for an error, 0 for empty input).  This function
+  /// may consume more code units than required for this scalar.
+  mutating func _decodeOne<
+    I : IteratorProtocol where I.Element == CodeUnit
+  >(inout input: I) -> (UnicodeDecodingResult, Int) {
+    let result = decode(&input)
+    switch result {
+    case .Result(let us):
+      return (result, UTF16.width(us))
+
+    case .EmptyInput:
+      return (result, 0)
+
+    case .Error:
+      return (result, 1)
+    }
+  }
+
+  /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
+  /// calling `output` on each `CodeUnit` (lead surrogate first, if any).
+  public static func encode(
+    input: UnicodeScalar,
+    output put: (CodeUnit) -> Void
+  ) {
+    let scalarValue: UInt32 = UInt32(input)
+
+    if scalarValue <= UInt32(UInt16.max) {
+      put(UInt16(scalarValue))
+    }
+    else {
+      let lead_offset = UInt32(0xd800) - UInt32(0x10000 >> 10)
+      put(UInt16(lead_offset + (scalarValue >> 10)))
+      put(UInt16(0xdc00 + (scalarValue & 0x3ff)))
+    }
+  }
+
+  var _value = UInt16()
+}
+
+/// A codec for [UTF-32](http://www.unicode.org/glossary/#UTF_32).
+public struct UTF32 : UnicodeCodecType {
+  /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
+  /// encoding.
+  public typealias CodeUnit = UInt32
+
+  public init() {}
+
+  /// Start or continue decoding a UTF sequence.
+  ///
+  /// In order to decode a code unit sequence completely, this function should
+  /// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
+  /// Checking that the iterator was exhausted is not sufficient.  The decoder
+  /// can have an internal buffer that is pre-filled with data from the input
+  /// iterator.
+  ///
+  /// Because of buffering, it is impossible to find the corresponding position
+  /// in the iterator for a given returned `UnicodeScalar` or an error.
+  ///
+  /// - parameter input: An *iterator* over the code units to be decoded.
+  public mutating func decode<
+    I : IteratorProtocol where I.Element == CodeUnit
+  >(inout input: I) -> UnicodeDecodingResult {
+    return UTF32._decode(&input)
+  }
+
+  static func _decode<
+    I : IteratorProtocol where I.Element == CodeUnit
+  >(inout input: I) -> UnicodeDecodingResult {
+    guard let x = input.next() else { return .EmptyInput }
+    if _fastPath((x >> 11) != 0b1101_1 && x <= 0x10ffff) {
+      return .Result(UnicodeScalar(x))
+    } else {
+      return .Error
+    }
+  }
+
+  /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
+  /// calling `output` on each `CodeUnit` (always exactly one for UTF-32).
+  public static func encode(
+    input: UnicodeScalar,
+    output put: (CodeUnit) -> Void
+  ) {
+    put(UInt32(input))
+  }
+}
+
+/// Translate `input`, in the given `InputEncoding`, into `output`, in
+/// the given `OutputEncoding`.  Returns `true` iff an error was detected.
+///
+/// - parameter stopOnError: (Passed as `stoppingOnError:`.)  If `true`,
+///   transcoding stops at the first decoding error found in `input`.
+///   Otherwise, U+FFFD is emitted in place of each detected error.
+public func transcode<
+  Input : IteratorProtocol,
+  InputEncoding : UnicodeCodecType,
+  OutputEncoding : UnicodeCodecType
+  where InputEncoding.CodeUnit == Input.Element>(
+  inputEncoding: InputEncoding.Type, _ outputEncoding: OutputEncoding.Type,
+  _ input: Input, _ output: (OutputEncoding.CodeUnit) -> Void,
+  stoppingOnError stopOnError: Bool
+) -> Bool {
+
+  var input = input
+
+  // NB.  It is not possible to optimize this routine to a memcpy if
+  // InputEncoding == OutputEncoding.  The reason is that memcpy will not
+  // substitute U+FFFD replacement characters for ill-formed sequences.
+
+  var inputDecoder = inputEncoding.init()
+  var hadError = false
+  for var scalar = inputDecoder.decode(&input);
+          !scalar.isEmptyInput();
+          scalar = inputDecoder.decode(&input) {
+    switch scalar {
+    case .Result(let us):
+      OutputEncoding.encode(us, output: output)
+    case .EmptyInput:
+      _sanityCheckFailure("should not enter the loop when input becomes empty")
+    case .Error:
+      if stopOnError {
+        return (hadError: true)
+      } else {
+        OutputEncoding.encode("\u{fffd}", output: output)
+        hadError = true
+      }
+    }
+  }
+  return hadError
+}
+
+/// Transcode UTF-16 to UTF-8, replacing ill-formed sequences with U+FFFD.
+///
+/// Returns the index of the first unhandled code unit and the UTF-8 data
+/// that was encoded (first byte in the LSB, unused bytes set to 0xff).
+@warn_unused_result
+internal func _transcodeSomeUTF16AsUTF8<
+  Input : Collection
+  where
+  Input.Iterator.Element == UInt16>(
+  input: Input, _ startIndex: Input.Index
+) -> (Input.Index, _StringCore.UTF8Chunk) {
+  typealias UTF8Chunk = _StringCore.UTF8Chunk
+
+  let endIndex = input.endIndex
+  let utf8Max = sizeof(UTF8Chunk.self)
+  var result: UTF8Chunk = 0
+  var utf8Count = 0
+  var nextIndex = startIndex
+  while nextIndex != input.endIndex && utf8Count != utf8Max {
+    let u = UInt(input[nextIndex])
+    let shift = UTF8Chunk(utf8Count * 8)
+    var utf16Length: Input.Index.Distance = 1
+
+    if _fastPath(u <= 0x7f) {
+      result |= UTF8Chunk(u) << shift
+      ++utf8Count
+    } else {
+      var scalarUtf8Length: Int
+      var r: UInt
+      if _fastPath((u >> 11) != 0b1101_1) {
+        // Neither high-surrogate, nor low-surrogate -- well-formed sequence
+        // of 1 code unit, decoding is trivial.
+        if u < 0x800 {
+          r = 0b10__00_0000__110__0_0000
+          r |= u >> 6
+          r |= (u & 0b11_1111) << 8
+          scalarUtf8Length = 2
+        }
+        else {
+          r = 0b10__00_0000__10__00_0000__1110__0000
+          r |= u >> 12
+          r |= ((u >> 6) & 0b11_1111) << 8
+          r |= (u        & 0b11_1111) << 16
+          scalarUtf8Length = 3
+        }
+      } else {
+        let unit0 = u
+        if _slowPath((unit0 >> 10) == 0b1101_11) {
+          // `unit0` is a low-surrogate.  We have an ill-formed sequence.
+          // Replace it with U+FFFD (UTF-8 bytes EF BF BD, first byte in LSB).
+          r = 0xbdbfef
+          scalarUtf8Length = 3
+        } else if _slowPath(nextIndex.advancedBy(1) == endIndex) {
+          // We have seen a high-surrogate and EOF, so we have an ill-formed
+          // sequence.  Replace it with U+FFFD.
+          r = 0xbdbfef
+          scalarUtf8Length = 3
+        } else {
+          let unit1 = UInt(input[nextIndex.advancedBy(1)])
+          if _fastPath((unit1 >> 10) == 0b1101_11) {
+            // `unit1` is a low-surrogate.  We have a well-formed surrogate
+            // pair.
+            let v = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))
+
+            r = 0b10__00_0000__10__00_0000__10__00_0000__1111_0__000
+            r |= v >> 18
+            r |= ((v >> 12) & 0b11_1111) << 8
+            r |= ((v >> 6) & 0b11_1111) << 16
+            r |= (v        & 0b11_1111) << 24
+            scalarUtf8Length = 4
+            utf16Length = 2
+          } else {
+            // Otherwise, we have an ill-formed sequence.  Replace it with
+            // U+FFFD.
+            r = 0xbdbfef
+            scalarUtf8Length = 3
+          }
+        }
+      }
+      // Don't overrun the buffer: stop before a scalar that would not fit.
+      if utf8Count + scalarUtf8Length > utf8Max {
+        break
+      }
+      result |= numericCast(r) << shift
+      utf8Count += scalarUtf8Length
+    }
+    nextIndex = nextIndex.advancedBy(utf16Length)
+  }
+  // FIXME: Annoying check, courtesy of <rdar://problem/16740169>
+  if utf8Count < sizeofValue(result) {
+    result |= ~0 << numericCast(utf8Count * 8)
+  }
+  return (nextIndex, result)
+}
+
+/// Instances of conforming types are used in the internal `String`
+/// representation; each converts to and from a UTF-16 code unit.
+public // @testable
+protocol _StringElementType {
+  @warn_unused_result
+  static func _toUTF16CodeUnit(_: Self) -> UTF16.CodeUnit
+
+  @warn_unused_result
+  static func _fromUTF16CodeUnit(utf16: UTF16.CodeUnit) -> Self
+}
+
+extension UTF16.CodeUnit : _StringElementType {
+  public // @testable
+  static func _toUTF16CodeUnit(x: UTF16.CodeUnit) -> UTF16.CodeUnit {
+    return x  // identity: `x` already is a UTF-16 code unit
+  }
+  public // @testable
+  static func _fromUTF16CodeUnit(
+    utf16: UTF16.CodeUnit
+  ) -> UTF16.CodeUnit {
+    return utf16  // identity
+  }
+}
+
+extension UTF8.CodeUnit : _StringElementType {
+  public // @testable
+  static func _toUTF16CodeUnit(x: UTF8.CodeUnit) -> UTF16.CodeUnit {
+    _sanityCheck(x <= 0x7f, "should only be doing this with ASCII")
+    return UTF16.CodeUnit(x)  // numeric value is preserved
+  }
+  public // @testable
+  static func _fromUTF16CodeUnit(
+    utf16: UTF16.CodeUnit
+  ) -> UTF8.CodeUnit {
+    _sanityCheck(utf16 <= 0x7f, "should only be doing this with ASCII")
+    return UTF8.CodeUnit(utf16)  // numeric value is preserved
+  }
+}
+
+extension UTF16 {
+  /// Return the number of UTF-16 code units (1 or 2) required to encode `x`.
+  @warn_unused_result
+  public static func width(x: UnicodeScalar) -> Int {
+    return x.value <= 0xFFFF ? 1 : 2
+  }
+
+  /// Return the high surrogate code unit of a [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair) representing
+  /// `x`.
+  ///
+  /// - Requires: `width(x) == 2`.
+  @warn_unused_result
+  public static func leadSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit {
+    _precondition(width(x) == 2)
+    return UTF16.CodeUnit((x.value - 0x1_0000) >> (10 as UInt32)) + 0xD800
+  }
+
+  /// Return the low surrogate code unit of a [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair) representing
+  /// `x`.
+  ///
+  /// - Requires: `width(x) == 2`.
+  @warn_unused_result
+  public static func trailSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit {
+    _precondition(width(x) == 2)
+    return UTF16.CodeUnit(
+      (x.value - 0x1_0000) & (((1 as UInt32) << 10) - 1)
+    ) + 0xDC00
+  }
+
+  @warn_unused_result
+  public static func isLeadSurrogate(x: CodeUnit) -> Bool {
+    return 0xD800...0xDBFF ~= x
+  }
+
+  @warn_unused_result
+  public static func isTrailSurrogate(x: CodeUnit) -> Bool {
+    return 0xDC00...0xDFFF ~= x
+  }
+
+  public // @testable
+  static func _copy<T : _StringElementType, U : _StringElementType>(
+    source: UnsafeMutablePointer<T>,
+    destination: UnsafeMutablePointer<U>, count: Int
+  ) {
+    if strideof(T.self) == strideof(U.self) {  // same stride: raw byte copy
+      _memcpy(
+        dest: UnsafeMutablePointer(destination),
+        src: UnsafeMutablePointer(source),
+        size: UInt(count) * UInt(strideof(U.self)))
+    }
+    else {
+      for i in 0..<count {
+        let u16 = T._toUTF16CodeUnit((source + i).memory)
+        (destination + i).memory = U._fromUTF16CodeUnit(u16)
+      }
+    }
+  }
+
+  /// Returns the number of UTF-16 code units required for the given code unit
+  /// sequence when transcoded to UTF-16, and a bit describing if the sequence
+  /// was found to contain only ASCII characters.
+  ///
+  /// If `repairIllFormedSequences` is `true`, the function always succeeds and
+  /// each ill-formed sequence is counted as one U+FFFD.  If it is `false`,
+  /// `nil` is returned if an ill-formed code unit sequence is found in `input`.
+  @warn_unused_result
+  public static func measure<
+      Encoding : UnicodeCodecType, Input : IteratorProtocol
+      where Encoding.CodeUnit == Input.Element
+  >(
+    _: Encoding.Type, input: Input, repairIllFormedSequences: Bool
+  ) -> (Int, Bool)? {
+    var input = input
+    var count = 0
+    var isAscii = true
+
+    var inputDecoder = Encoding()
+    loop:
+    while true {
+      switch inputDecoder.decode(&input) {
+      case .Result(let us):
+        if us.value > 0x7f {
+          isAscii = false
+        }
+        count += width(us)
+      case .EmptyInput:
+        break loop
+      case .Error:
+        if !repairIllFormedSequences {
+          return .None
+        }
+        isAscii = false
+        count += width(UnicodeScalar(0xfffd))
+      }
+    }
+    return (count, isAscii)
+  }
+}
+
diff --git a/stdlib/public/core/Character.swift b/stdlib/public/core/Character.swift
index cf455f676f06a..58d1542e029a8 100644
--- a/stdlib/public/core/Character.swift
+++ b/stdlib/public/core/Character.swift
@@ -210,7 +210,7 @@ public struct Character :
       }
       transcode(
         UTF8.self, UTF16.self, _SmallUTF8(u8).iterator(), output,
-        stopOnError: false)
+        stoppingOnError: false)
       self.data = u16
     }
 
diff --git a/stdlib/public/core/StringBuffer.swift b/stdlib/public/core/StringBuffer.swift
index d2ee61efb876c..39e76cb135f52 100644
--- a/stdlib/public/core/StringBuffer.swift
+++ b/stdlib/public/core/StringBuffer.swift
@@ -110,7 +110,7 @@ public struct _StringBuffer {
       }
       let hadError = transcode(
         encoding, UTF32.self, input.iterator(), sink,
-        stopOnError: true)
+        stoppingOnError: true)
       _sanityCheck(!hadError, "string can not be ASCII if there were decoding errors")
       return (result, hadError)
     }
@@ -121,7 +121,7 @@ public struct _StringBuffer {
       }
       let hadError = transcode(
         encoding, UTF16.self, input.iterator(), sink,
-        stopOnError: !repairIllFormedSequences)
+        stoppingOnError: !repairIllFormedSequences)
       return (result, hadError)
     }
   }
diff --git a/stdlib/public/core/StringCore.swift b/stdlib/public/core/StringCore.swift
index 9e5b9e2a44e92..9dd10a8f8b22e 100644
--- a/stdlib/public/core/StringCore.swift
+++ b/stdlib/public/core/StringCore.swift
@@ -340,7 +340,7 @@ public struct _StringCore {
             count: count
           ).iterator(),
           output,
-          stopOnError: true
+          stoppingOnError: true
         )
         _sanityCheck(!hadError, "Swift.String with native storage should not have unpaired surrogates")
       }
diff --git a/stdlib/public/core/Unicode.swift b/stdlib/public/core/Unicode.swift
index ad29f5068a019..3bc33e693b2dd 100644
--- a/stdlib/public/core/Unicode.swift
+++ b/stdlib/public/core/Unicode.swift
@@ -694,7 +694,7 @@ public func transcode<
   where InputEncoding.CodeUnit == Input.Element>(
   inputEncoding: InputEncoding.Type, _ outputEncoding: OutputEncoding.Type,
   _ input: Input, _ output: (OutputEncoding.CodeUnit) -> Void,
-  stopOnError: Bool
+  stoppingOnError stopOnError: Bool
 ) -> Bool {
 
   var input = input
diff --git a/validation-test/stdlib/Unicode.swift b/validation-test/stdlib/Unicode.swift
index e7e899cbb631b..21c0baa392fb1 100644
--- a/validation-test/stdlib/Unicode.swift
+++ b/validation-test/stdlib/Unicode.swift
@@ -116,7 +116,7 @@ func checkDecodeUTF<Codec : UnicodeCodecType>(
     var decoded = [UInt32]()
     let output: (UInt32) -> Void = { decoded.append($0) }
     let iterator = EOFCountingIterator(utfStr)
-    transcode(codec, UTF32.self, iterator, output, stopOnError: true)
+    transcode(codec, UTF32.self, iterator, output, stoppingOnError: true)
     expectGE(1, iterator.numTimesReturnedEOF)
     if expectedHead != decoded {
       return assertionFailure()
@@ -133,7 +133,7 @@ func checkDecodeUTF<Codec : UnicodeCodecType>(
     var decoded = [UInt32]()
     let output: (UInt32) -> Void = { decoded.append($0) }
     let iterator = EOFCountingIterator(utfStr)
-    transcode(codec, UTF32.self, iterator, output, stopOnError: false)
+    transcode(codec, UTF32.self, iterator, output, stoppingOnError: false)
     expectEqual(1, iterator.numTimesReturnedEOF)
     if expected != decoded {
       return assertionFailure()
@@ -175,7 +175,7 @@ func checkEncodeUTF8(expected: [UInt8],
   let output: (UInt8) -> Void = { encoded.append($0) }
   let iterator = EOFCountingIterator(scalars)
   let hadError =
-    transcode(UTF32.self, UTF8.self, iterator, output, stopOnError: true)
+    transcode(UTF32.self, UTF8.self, iterator, output, stoppingOnError: true)
   expectFalse(hadError)
   expectGE(1, iterator.numTimesReturnedEOF)
   if expected != encoded {
@@ -2085,7 +2085,7 @@ UnicodeAPIs.test("transcode/MutableArray") {
   var input: [UInt16] = [ 0x0041, 0x0042 ]
   var transcoded = [UInt16]()
   let output: (UInt16) -> Void = { transcoded.append($0) }
-  transcode(UTF16.self, UTF16.self, input.iterator(), output, stopOnError: true)
+  transcode(UTF16.self, UTF16.self, input.iterator(), output, stoppingOnError: true)
   expectEqual(input, transcoded)
 }
 
@@ -2093,7 +2093,7 @@ UnicodeAPIs.test("transcode/ReferenceTypedArray") {
   var input: [UInt16] = [ 0x0041, 0x0042 ]
   var transcoded = [UInt16]()
   let output: (UInt16) -> Void = { transcoded.append($0) }
-  transcode(UTF16.self, UTF16.self, input.iterator(), output, stopOnError: true)
+  transcode(UTF16.self, UTF16.self, input.iterator(), output, stoppingOnError: true)
   expectEqual(input, transcoded)
 }
 
@@ -2114,7 +2114,7 @@ class NonContiguousNSString : NSString {
     let output: (UInt16) -> Void = { encoded.append($0) }
     let iterator = utf8.iterator()
     let hadError =
-      transcode(UTF8.self, UTF16.self, iterator, output, stopOnError: true)
+      transcode(UTF8.self, UTF16.self, iterator, output, stoppingOnError: true)
     expectFalse(hadError)
     self.init(encoded)
   }
@@ -2129,7 +2129,7 @@ class NonContiguousNSString : NSString {
     let output: (UInt16) -> Void = { encoded.append($0) }
     let iterator = scalars.iterator()
     let hadError =
-      transcode(UTF32.self, UTF16.self, iterator, output, stopOnError: true)
+      transcode(UTF32.self, UTF16.self, iterator, output, stoppingOnError: true)
     expectFalse(hadError)
     self.init(encoded)
   }
@@ -2184,7 +2184,7 @@ StringCookedViews.test("UTF8ForContiguousUTF16") {
     let output: (UInt16) -> Void = { backingStorage.append($0) }
 
     var iterator = test.scalars.iterator()
-    transcode(UTF32.self, UTF16.self, iterator, output, stopOnError: false)
+    transcode(UTF32.self, UTF16.self, iterator, output, stoppingOnError: false)
 
     backingStorage.withUnsafeBufferPointer {
       (ptr) -> Void in
@@ -2205,7 +2205,7 @@ StringCookedViews.test("UTF8ForContiguousUTF16") {
     let output: (UInt8) -> Void = { expected.append($0) }
     var expectedScalars = test.scalarsHead + test.scalarsRepairedTail
     var iterator = expectedScalars.iterator()
-    transcode(UTF32.self, UTF8.self, iterator, output, stopOnError: false)
+    transcode(UTF32.self, UTF8.self, iterator, output, stoppingOnError: false)
 
     checkUTF8View(expected, subject, test.loc.withCurrentLoc())
   }
@@ -2251,7 +2251,7 @@ StringCookedViews.test("UTF8ForNonContiguousUTF16") {
       let output: (UInt8) -> Void = { expected.append($0) }
       var expectedScalars = test.scalarsHead + test.scalarsRepairedTail
       var iterator = expectedScalars.iterator()
-      transcode(UTF32.self, UTF8.self, iterator, output, stopOnError: false)
+      transcode(UTF32.self, UTF8.self, iterator, output, stoppingOnError: false)
 
       var nss = NonContiguousNSString(test.encoded)
       verifyThatStringIsOpaqueForCoreFoundation(nss)
@@ -2319,7 +2319,7 @@ StringCookedViews.test("UTF16") {
     let output: (UInt16) -> Void = { expected.append($0) }
     var expectedScalars = test.scalars
     var iterator = expectedScalars.iterator()
-    transcode(UTF32.self, UTF16.self, iterator, output, stopOnError: false)
+    transcode(UTF32.self, UTF16.self, iterator, output, stoppingOnError: false)
 
     var nss = NonContiguousNSString(test.scalars)
     checkUTF16View(expected, nss as String, test.loc.withCurrentLoc())
@@ -2331,7 +2331,7 @@ StringCookedViews.test("UTF16") {
     let output: (UInt16) -> Void = { expected.append($0) }
     var expectedScalars = test.scalarsHead + test.scalarsRepairedTail
     var iterator = expectedScalars.iterator()
-    transcode(UTF32.self, UTF16.self, iterator, output, stopOnError: false)
+    transcode(UTF32.self, UTF16.self, iterator, output, stoppingOnError: false)
 
     checkUTF16View(expected, subject, test.loc.withCurrentLoc())
   }
diff --git a/validation-test/stdlib/UnicodeTrie.swift.gyb b/validation-test/stdlib/UnicodeTrie.swift.gyb
index 501ceb1a7cb95..74a8f580d6f94 100644
--- a/validation-test/stdlib/UnicodeTrie.swift.gyb
+++ b/validation-test/stdlib/UnicodeTrie.swift.gyb
@@ -96,7 +96,7 @@ class NonContiguousNSString : NSString {
     var iter = scalars.iterator()
     let output: (UInt16) -> Void = { encoded.append($0) }
     let hadError =
-      transcode(UTF32.self, UTF16.self, iter, output, stopOnError: true)
+      transcode(UTF32.self, UTF16.self, iter, output, stoppingOnError: true)
     expectFalse(hadError)
     self.init(encoded)
   }