Skip to content

Commit

Permalink
Merge pull request swiftlang#23834 from milseman/a_line_scaler
Browse files Browse the repository at this point in the history
[String] Scalar-alignment bug fixes.
  • Loading branch information
milseman authored Jun 27, 2019
2 parents 3e3770f + bd5a40f commit 1de84fc
Show file tree
Hide file tree
Showing 18 changed files with 634 additions and 244 deletions.
8 changes: 8 additions & 0 deletions stdlib/public/core/LegacyABI.swift
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,11 @@ internal func _branchHint(_ actual: Bool, expected: Bool) -> Bool {
// value without any branch hint.
return actual
}

extension String {
@usableFromInline // Never actually used in inlinable code...
internal func _nativeCopyUTF16CodeUnits(
into buffer: UnsafeMutableBufferPointer<UInt16>,
range: Range<String.Index>
) { fatalError() }
}
14 changes: 7 additions & 7 deletions stdlib/public/core/StringBridge.swift
Original file line number Diff line number Diff line change
Expand Up @@ -173,17 +173,17 @@ internal enum _KnownCocoaString {
#if !(arch(i386) || arch(arm))
case tagged
#endif

@inline(__always)
init(_ str: _CocoaString) {

#if !(arch(i386) || arch(arm))
if _isObjCTaggedPointer(str) {
self = .tagged
return
}
#endif

switch unsafeBitCast(_swift_classOfObjCHeapObject(str), to: UInt.self) {
case unsafeBitCast(__StringStorage.self, to: UInt.self):
self = .storage
Expand Down Expand Up @@ -272,21 +272,21 @@ internal func _bridgeCocoaString(_ cocoaString: _CocoaString) -> _StringGuts {
// 3) If it's mutable with associated information, must make the call
let immutableCopy
= _stdlib_binary_CFStringCreateCopy(cocoaString) as AnyObject

#if !(arch(i386) || arch(arm))
if _isObjCTaggedPointer(immutableCopy) {
return _StringGuts(_SmallString(taggedCocoa: immutableCopy))
}
#endif

let (fastUTF8, isASCII): (Bool, Bool)
switch _getCocoaStringPointer(immutableCopy) {
case .ascii(_): (fastUTF8, isASCII) = (true, true)
case .utf8(_): (fastUTF8, isASCII) = (true, false)
default: (fastUTF8, isASCII) = (false, false)
}
let length = _stdlib_binary_CFStringGetLength(immutableCopy)

return _StringGuts(
cocoa: immutableCopy,
providesFastUTF8: fastUTF8,
Expand Down Expand Up @@ -420,6 +420,6 @@ extension String {
) {
_internalInvariant(buffer.count >= range.count)
let indexRange = self._toUTF16Indices(range)
self._nativeCopyUTF16CodeUnits(into: buffer, range: indexRange)
self.utf16._nativeCopy(into: buffer, alignedRange: indexRange)
}
}
15 changes: 11 additions & 4 deletions stdlib/public/core/StringCharacterView.swift
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,14 @@ extension String: BidirectionalCollection {
_precondition(i < endIndex, "String index is out of bounds")

// TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
let i = _guts.scalarAlign(i)
let stride = _characterStride(startingAt: i)
let nextOffset = i._encodedOffset &+ stride
let nextStride = _characterStride(
startingAt: Index(_encodedOffset: nextOffset))
startingAt: Index(_encodedOffset: nextOffset)._aligned)

return Index(
encodedOffset: nextOffset, characterStride: nextStride)
encodedOffset: nextOffset, characterStride: nextStride)._aligned
}

/// Returns the position immediately before the given index.
Expand All @@ -78,9 +79,10 @@ extension String: BidirectionalCollection {
_precondition(i > startIndex, "String index is out of bounds")

// TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
let i = _guts.scalarAlign(i)
let stride = _characterStride(endingAt: i)
let priorOffset = i._encodedOffset &- stride
return Index(encodedOffset: priorOffset, characterStride: stride)
return Index(encodedOffset: priorOffset, characterStride: stride)._aligned
}
/// Returns an index that is the specified distance from the given index.
///
Expand Down Expand Up @@ -167,7 +169,7 @@ extension String: BidirectionalCollection {
@inlinable @inline(__always)
public func distance(from start: Index, to end: Index) -> IndexDistance {
// TODO: known-ASCII and single-scalar-grapheme fast path, etc.
return _distance(from: start, to: end)
return _distance(from: _guts.scalarAlign(start), to: _guts.scalarAlign(end))
}

/// Accesses the character at the given position.
Expand All @@ -191,12 +193,15 @@ extension String: BidirectionalCollection {

let i = _guts.scalarAlign(i)
let distance = _characterStride(startingAt: i)

return _guts.errorCorrectedCharacter(
startingAt: i._encodedOffset, endingAt: i._encodedOffset &+ distance)
}

@inlinable @inline(__always)
internal func _characterStride(startingAt i: Index) -> Int {
_internalInvariant(i._isAligned)

// Fast check if it's already been measured, otherwise check resiliently
if let d = i.characterStride { return d }

Expand All @@ -207,6 +212,8 @@ extension String: BidirectionalCollection {

@inlinable @inline(__always)
internal func _characterStride(endingAt i: Index) -> Int {
_internalInvariant(i._isAligned)

if i == startIndex { return 0 }

return _guts._opaqueCharacterStride(endingAt: i._encodedOffset)
Expand Down
59 changes: 34 additions & 25 deletions stdlib/public/core/StringGraphemeBreaking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -86,44 +86,50 @@ private func _hasGraphemeBreakBetween(
private func _measureCharacterStrideICU(
of utf8: UnsafeBufferPointer<UInt8>, startingAt i: Int
) -> Int {
let iterator = _ThreadLocalStorage.getUBreakIterator(utf8)
let offset = __swift_stdlib_ubrk_following(
iterator, Int32(truncatingIfNeeded: i))
// ICU will gives us a different result if we feed in the whole buffer, so
// slice it appropriately.
let utf8Slice = UnsafeBufferPointer(rebasing: utf8[i...])
let iterator = _ThreadLocalStorage.getUBreakIterator(utf8Slice)
let offset = __swift_stdlib_ubrk_following(iterator, 0)

// ubrk_following returns -1 (UBRK_DONE) when it hits the end of the buffer.
if _fastPath(offset != -1) {
// The offset into our buffer is the distance.
_internalInvariant(offset > i, "zero-sized grapheme?")
return Int(truncatingIfNeeded: offset) &- i
}
_internalInvariant(utf8.count > i)
return utf8.count &- i
guard _fastPath(offset != -1) else { return utf8Slice.count }

// The offset into our buffer is the distance.
_internalInvariant(offset > 0, "zero-sized grapheme?")
return Int(truncatingIfNeeded: offset)
}

@inline(never) // slow-path
@_effects(releasenone)
private func _measureCharacterStrideICU(
of utf16: UnsafeBufferPointer<UInt16>, startingAt i: Int
) -> Int {
let iterator = _ThreadLocalStorage.getUBreakIterator(utf16)
let offset = __swift_stdlib_ubrk_following(
iterator, Int32(truncatingIfNeeded: i))
// ICU will gives us a different result if we feed in the whole buffer, so
// slice it appropriately.
let utf16Slice = UnsafeBufferPointer(rebasing: utf16[i...])
let iterator = _ThreadLocalStorage.getUBreakIterator(utf16Slice)
let offset = __swift_stdlib_ubrk_following(iterator, 0)

// ubrk_following returns -1 (UBRK_DONE) when it hits the end of the buffer.
if _fastPath(offset != -1) {
// The offset into our buffer is the distance.
_internalInvariant(offset > i, "zero-sized grapheme?")
return Int(truncatingIfNeeded: offset) &- i
}
return utf16.count &- i
guard _fastPath(offset != -1) else { return utf16Slice.count }

// The offset into our buffer is the distance.
_internalInvariant(offset > 0, "zero-sized grapheme?")
return Int(truncatingIfNeeded: offset)
}

@inline(never) // slow-path
@_effects(releasenone)
private func _measureCharacterStrideICU(
of utf8: UnsafeBufferPointer<UInt8>, endingAt i: Int
) -> Int {
let iterator = _ThreadLocalStorage.getUBreakIterator(utf8)
let offset = __swift_stdlib_ubrk_preceding(
iterator, Int32(truncatingIfNeeded: i))
// Slice backwards as well, even though ICU currently seems to give the same
// answer as unsliced.
let utf8Slice = UnsafeBufferPointer(rebasing: utf8[..<i])
let iterator = _ThreadLocalStorage.getUBreakIterator(utf8Slice)
let offset = __swift_stdlib_ubrk_preceding(iterator, Int32(utf8Slice.count))

// ubrk_following returns -1 (UBRK_DONE) when it hits the end of the buffer.
if _fastPath(offset != -1) {
// The offset into our buffer is the distance.
Expand All @@ -138,9 +144,12 @@ private func _measureCharacterStrideICU(
private func _measureCharacterStrideICU(
of utf16: UnsafeBufferPointer<UInt16>, endingAt i: Int
) -> Int {
let iterator = _ThreadLocalStorage.getUBreakIterator(utf16)
let offset = __swift_stdlib_ubrk_preceding(
iterator, Int32(truncatingIfNeeded: i))
// Slice backwards as well, even though ICU currently seems to give the same
// answer as unsliced.
let utf16Slice = UnsafeBufferPointer(rebasing: utf16[..<i])
let iterator = _ThreadLocalStorage.getUBreakIterator(utf16Slice)
let offset = __swift_stdlib_ubrk_preceding(iterator, Int32(utf16Slice.count))

// ubrk_following returns -1 (UBRK_DONE) when it hits the end of the buffer.
if _fastPath(offset != -1) {
// The offset into our buffer is the distance.
Expand Down
4 changes: 2 additions & 2 deletions stdlib/public/core/StringGuts.swift
Original file line number Diff line number Diff line change
Expand Up @@ -268,11 +268,11 @@ extension _StringGuts {

@inlinable @inline(__always)
internal var startIndex: String.Index {
return Index(_encodedOffset: 0)
return Index(_encodedOffset: 0)._aligned
}
@inlinable @inline(__always)
internal var endIndex: String.Index {
return Index(_encodedOffset: self.count)
return Index(_encodedOffset: self.count)._aligned
}
}

Expand Down
94 changes: 76 additions & 18 deletions stdlib/public/core/StringIndex.swift
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,25 @@ import SwiftShims

String's Index has the following layout:

┌──────────┬───────────────────┬────────────────┬──────────┐
│ b63:b16 │ b15:b14 │ b13:b8 │ b7:b0 │
├──────────┼───────────────────┼────────────────┼──────────┤
│ position │ transcoded offset │ grapheme cache │ reserved │
└──────────┴───────────────────┴────────────────┴──────────┘

- grapheme cache: A 6-bit value remembering the distance to the next grapheme
┌──────────┬───────────────────┬─────────╥────────────────┬──────────┐
│ b63:b16 │ b15:b14 │ b13 ║ b12:b8 │ b6:b0 │
├──────────┼───────────────────┼─────────╫────────────────┼──────────┤
│ position │ transcoded offset │ aligned ║ grapheme cache │ reserved │
└──────────┴───────────────────┴─────────╨────────────────┴──────────┘

Position, transcoded offset, and aligned are fully exposed in the ABI. Grapheme
cache and reserved are partially resilient: the fact that there are 13 bits with
a default value of `0` is ABI, but not the layout, construction, or
interpretation of those bits. All use of grapheme cache should be behind
non-inlinable function calls.

- position aka `encodedOffset`: A 48-bit offset into the string's code units
- transcoded offset: a 2-bit sub-scalar offset, derived from transcoding
- aligned, whether this index is known to be scalar-aligned (see below)
<resilience barrier>
- grapheme cache: A 5-bit value remembering the distance to the next grapheme
boundary
- position aka `encodedOffset`: An offset into the string's code units
- transcoded offset: a sub-scalar offset, derived from transcoding

The use and interpretation of both `reserved` and `grapheme cache` is not part
of Index's ABI; it should be hidden behind non-inlinable calls. However, the
position of the sequence of 14 bits allocated is part of Index's ABI, as well as
the default value being `0`.
- reserved: 8-bit for future use.

*/
extension String {
Expand Down Expand Up @@ -82,7 +86,7 @@ extension String.Index {

@usableFromInline
internal var characterStride: Int? {
let value = (_rawBits & 0x3F00) &>> 8
let value = (_rawBits & 0x1F00) &>> 8
return value > 0 ? Int(truncatingIfNeeded: value) : nil
}

Expand Down Expand Up @@ -132,9 +136,7 @@ extension String.Index {
encodedOffset: Int, transcodedOffset: Int, characterStride: Int
) {
self.init(encodedOffset: encodedOffset, transcodedOffset: transcodedOffset)
if _slowPath(characterStride > 63) { return }

_internalInvariant(characterStride == characterStride & 0x3F)
if _slowPath(characterStride > 0x1F) { return }
self._rawBits |= UInt64(truncatingIfNeeded: characterStride &<< 8)
self._invariantCheck()
}
Expand All @@ -150,6 +152,9 @@ extension String.Index {
@usableFromInline @inline(never) @_effects(releasenone)
internal func _invariantCheck() {
_internalInvariant(_encodedOffset >= 0)
if self._isAligned {
_internalInvariant(transcodedOffset == 0)
}
}
#endif // INTERNAL_CHECKS_ENABLED
}
Expand Down Expand Up @@ -188,6 +193,7 @@ extension String.Index {
transcodedOffset: self.transcodedOffset &- 1)
}


// Get an index with an encoded offset relative to this one.
// Note: strips any transcoded offset.
@inlinable @inline(__always)
Expand All @@ -200,7 +206,59 @@ extension String.Index {
_internalInvariant(self.transcodedOffset == 0)
return String.Index(encodedOffset: self._encodedOffset, transcodedOffset: n)
}
}

/*
Index Alignment

SE-0180 unifies the Index type of String and all its views and allows
non-scalar-aligned indices to be used across views. In order to guarantee
behavior, we often have to check and perform scalar alignment. To speed up
these checks, we allocate a bit denoting known-to-be-aligned, so that the
alignment check can skip the load. The below shows what views need to check
for alignment before they can operate, and whether the indices they produce
are aligned.

┌───────────────╥────────────────────┬──────────────────────────┐
│ View ║ Requires Alignment │ Produces Aligned Indices │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ Native UTF8 ║ no │ no │
├───────────────╫────────────────────┼──────────────────────────┤
│ Native UTF16 ║ yes │ no │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ Foreign UTF8 ║ yes │ no │
├───────────────╫────────────────────┼──────────────────────────┤
│ Foreign UTF16 ║ no │ no │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ UnicodeScalar ║ yes │ yes │
├───────────────╫────────────────────┼──────────────────────────┤
│ Character ║ yes │ yes │
└───────────────╨────────────────────┴──────────────────────────┘

The "requires alignment" applies to any operation taking a String.Index that's
not defined entirely in terms of other operations taking a String.Index. These
include:

* index(after:)
* index(before:)
* subscript
* distance(from:to:) (since `to` is compared against directly)
* UTF16View._nativeGetOffset(for:)

*/
extension String.Index {
@_alwaysEmitIntoClient // Swift 5.1
@inline(__always)
internal var _isAligned: Bool { return 0 != _rawBits & 0x2000 }

@_alwaysEmitIntoClient // Swift 5.1
@inline(__always)
internal var _aligned: String.Index {
var idx = self
idx._rawBits |= 0x2000
idx._invariantCheck()
return idx
}
}

extension String.Index: Equatable {
Expand Down
8 changes: 4 additions & 4 deletions stdlib/public/core/StringStorage.swift
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,9 @@ extension _AbstractStringStorage {
// one of ours.

defer { _fixLifetime(other) }

let otherUTF16Length = _stdlib_binary_CFStringGetLength(other)

// CFString will only give us ASCII bytes here, but that's fine.
// We already handled non-ASCII UTF8 strings earlier since they're Swift.
if let otherStart = _cocoaUTF8Pointer(other) {
Expand All @@ -154,11 +154,11 @@ extension _AbstractStringStorage {
return (start == otherStart ||
(memcmp(start, otherStart, count) == 0)) ? 1 : 0
}

if UTF16Length != otherUTF16Length {
return 0
}

/*
The abstract implementation of -isEqualToString: falls back to -compare:
immediately, so when we run out of fast options to try, do the same.
Expand Down
Loading

0 comments on commit 1de84fc

Please sign in to comment.