Skip to content

Commit

Permalink
Merge pull request #8 from weissi/jw-utf8-validation-tests
Browse files Browse the repository at this point in the history
first couple of utf8 validation tests
  • Loading branch information
milseman authored Oct 4, 2018
2 parents 861f652 + 0eabdb6 commit 117de1a
Show file tree
Hide file tree
Showing 4 changed files with 257 additions and 1 deletion.
30 changes: 30 additions & 0 deletions Tests/UTF8StringTests/UTF8StringTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,36 @@ final class UTF8StringTests: XCTestCase {
}
}

/// Decodes byte sequences that contain ill-formed UTF-8 and checks that the UTF-8 view of the
/// resulting string equals the expected repaired bytes, where broken parts are replaced by
/// U+FFFD (`EF BF BD`).
func testStringsThatNeedRepairing() {
    /// One table entry: the ill-formed input, the bytes expected after repair, and a label
    /// used in the failure message.
    struct ExampleThatNeedsRepairing {
        let brokenBytes: [UInt8]
        let expectedRepairedBytes: [UInt8]
        let desc: Swift.String

        // Unlabelled initializer so the table below stays compact.
        init(_ brokenBytes: [UInt8], _ expectedRepairedBytes: [UInt8], _ desc: Swift.String) {
            self.brokenBytes = brokenBytes
            self.expectedRepairedBytes = expectedRepairedBytes
            self.desc = desc
        }
    }

    // U+FFFD REPLACEMENT CHARACTER encoded as UTF-8.
    let replacement: [UInt8] = [0xEF, 0xBF, 0xBD]
    let seeNoEvil: [UInt8] = Array("🙈".utf8)
    // Black flag followed by six tag scalars. Spelled with escapes because the tag scalars are
    // invisible in most editors and easily lost; the runtime bytes are the same as the literal.
    let flag: [UInt8] = Array("\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}".utf8)
    let zanyFace: [UInt8] = Array("🤪".utf8)

    // NOTE(review): in the last entry the first fragment consists of three stray continuation
    // bytes. Under the Unicode "maximal subpart" replacement practice each of those would get
    // its own U+FFFD (five in total) — confirm that three is what this implementation intends.
    let strings: [ExampleThatNeedsRepairing] = [
        .init([0xc3], replacement, "half an ä"),
        .init(seeNoEvil + [0xc3] + flag, seeNoEvil + replacement + flag, "half an ä sandwiched by emojis"),
        .init(Array(zanyFace.dropLast(1)), replacement, "almost emoji"),
        .init(Array(zanyFace.dropFirst(1)) + Array(zanyFace.dropLast(1)) + Array(zanyFace.dropLast(2)),
              replacement + replacement + replacement,
              "three broken emoji back to back")
    ]

    for s in strings {
        let repairedBytes = Array(UTF8String.String(decoding: s.brokenBytes, as: UTF8.self).utf8)
        // Interpolate the example's description so a failure identifies the offending entry.
        // (The previous message interpolated a string literal and always printed the same text.)
        XCTAssertEqual(s.expectedRepairedBytes,
                       repairedBytes,
                       "\(s.desc) failed")
    }
}



}

// The most simple subclass of NSString that CoreFoundation does not know
Expand Down
217 changes: 217 additions & 0 deletions Tests/UTF8StringTests/UTF8ValidationTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
import XCTest
@testable import UTF8String

/// Tests for the UTF-8 validation entry point (`_betterValidateUTF8`) and the repair entry point
/// (`utf8Repair`).
///
/// NOTE(review): the XCTest manifest registers `UTF8ValidationTests.allTests`, while this class is
/// named `UTF8ValidationTest` and declares no `allTests` — confirm the non-Xcode build.
public class UTF8ValidationTest: XCTestCase {
    /// Carries a failed validation result out of `run(_:)`.
    struct TestError: Error {
        let result: UTF8ValidationResult
    }

    /// Validates `bytes` and throws a `TestError` wrapping the reported range when the validator
    /// reports an error; returns normally otherwise.
    func run(_ bytes: UnsafeBufferPointer<UInt8>) throws {
        if case .error(let range) = _betterValidateUTF8(bytes) {
            throw TestError(result: .error(toBeReplaced: range))
        }
    }

    /// Prefixes `failure` with the caller-supplied context, omitting the prefix when it is empty.
    private func failureText(_ message: Swift.String, _ failure: Swift.String) -> Swift.String {
        return message.isEmpty ? failure : "\(message): \(failure)"
    }

    /// Records a test failure (attributed to `file`/`line`) if `bytes` does not validate.
    ///
    /// - Parameters:
    ///   - bytes: The bytes to validate.
    ///   - message: Optional context included in the failure text.
    func assertValidUTF8(_ bytes: UnsafeBufferPointer<UInt8>, _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) {
        do {
            try self.run(bytes)
        } catch {
            // Include the caller's context; it used to be accepted but never reported.
            XCTFail(self.failureText(message, "not valid: \(error)"), file: file, line: line)
        }
    }

    /// Asserts that `bytes` fails validation with exactly `expectedErrorRange`, and that repairing
    /// from that range yields `expectedRepairedString`.
    ///
    /// - Parameters:
    ///   - bytes: The bytes to validate.
    ///   - expectedErrorRange: The range the validator is expected to report.
    ///   - expectedRepairedString: The expected result of `utf8Repair`.
    ///   - message: Optional context included in failure texts.
    func assertInvalidUTF8(_ bytes: UnsafeBufferPointer<UInt8>,
                           expectedErrorRange: Range<Int>,
                           expectedRepairedString: UTF8String.String,
                           _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) {
        do {
            try self.run(bytes)
            XCTFail("\(message) is valid", file: file, line: line)
        } catch let e as TestError {
            XCTAssertEqual(UTF8ValidationResult.error(toBeReplaced: expectedErrorRange), e.result, message, file: file, line: line)
            let repaired = utf8Repair(bytes, firstKnownBrokenRange: expectedErrorRange)
            XCTAssertEqual(expectedRepairedString, repaired, self.failureText(message, "repaired string wrong"), file: file, line: line)
        } catch {
            // Fail this test only; aborting the process would hide the results of all other tests.
            XCTFail(self.failureText(message, "unexpected error \(error)"), file: file, line: line)
        }
    }

    /// Array convenience: forwards to the buffer-pointer overload.
    func assertValidUTF8(_ bytes: [UInt8], _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) {
        bytes.withUnsafeBufferPointer { ptr in
            self.assertValidUTF8(ptr, message, file: file, line: line)
        }
    }

    /// Array convenience: forwards to the buffer-pointer overload.
    func assertInvalidUTF8(_ bytes: [UInt8], expectedErrorRange: Range<Int>, expectedRepairedString: UTF8String.String, _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) {
        bytes.withUnsafeBufferPointer { ptr in
            self.assertInvalidUTF8(ptr, expectedErrorRange: expectedErrorRange, expectedRepairedString: expectedRepairedString, message, file: file, line: line)
        }
    }

    /// Collection convenience: copies into an array and forwards to the array overload.
    func assertValidUTF8<Bytes: Collection>(_ bytes: Bytes, _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) where Bytes.Element == UInt8 {
        self.assertValidUTF8(Array(bytes), message, file: file, line: line)
    }

    /// Collection convenience: copies into an array and forwards to the array overload.
    func assertInvalidUTF8<Bytes: Collection>(_ bytes: Bytes, expectedErrorRange: Range<Int>, expectedRepairedString: UTF8String.String, _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) where Bytes.Element == UInt8 {
        self.assertInvalidUTF8(Array(bytes), expectedErrorRange: expectedErrorRange, expectedRepairedString: expectedRepairedString, message, file: file, line: line)
    }

    // MARK: Utils

    /// Builds the bit skeleton of a UTF-8 sequence of `totalNumberBytes` bytes with every payload
    /// bit cleared.
    ///
    /// - Parameter totalNumberBytes: Length of the sequence; must be positive.
    /// - Returns: `[0]` for a length of 1. Otherwise a lead byte whose top `totalNumberBytes` bits
    ///   are set (`0b1100_0000`, `0b1110_0000`, `0b1111_0000` for 2, 3, 4), followed by
    ///   `totalNumberBytes - 1` bytes of `0b1000_0000`.
    func makeUTF8ContinuationSequence(totalNumberBytes: Int) -> [UInt8] {
        precondition(totalNumberBytes > 0)
        guard totalNumberBytes > 1 else {
            return [0]
        }
        // Set the top `totalNumberBytes` bits. The count is capped at 7 so that out-of-range
        // requests keep producing 0b1111_1110, as the previous shift loop did.
        let allOnes: UInt8 = 0xFF
        let firstByte: UInt8 = ~(allOnes >> min(totalNumberBytes, 7))
        return [firstByte] + [UInt8](repeating: 0b1000_0000, count: totalNumberBytes - 1)
    }

    private let replacementChar: UTF8String.String = String(decoding: utf8ReplacementCharacter, as: UTF8.self)

    // MARK: Tests

    func testValid_Empty() {
        self.assertValidUTF8([])
    }

    func testValid_OneCharacterASCII() {
        self.assertValidUTF8(" ".utf8)
    }

    /// A lone continuation byte is rejected.
    func testInvalid_ContinuationEndByteWithoutContinuation() {
        self.assertInvalidUTF8([0b1000_0000], expectedErrorRange: 0..<1, expectedRepairedString: self.replacementChar)
    }

    /// A lone lead byte (2-, 3- and 4-byte forms) is rejected.
    func testInvalid_ContinuationStartByteOnly() {
        self.assertInvalidUTF8([0b1100_0000], expectedErrorRange: 0..<1, expectedRepairedString: self.replacementChar)
        self.assertInvalidUTF8([0b1110_0000], expectedErrorRange: 0..<1, expectedRepairedString: self.replacementChar)
        self.assertInvalidUTF8([0b1111_0000], expectedErrorRange: 0..<1, expectedRepairedString: self.replacementChar)
    }

    /// Sequences with the right shape but all payload bits zero are rejected as a whole.
    func testInvalid_OnlyTheRawContinuationBytesNoActualData() {
        for i in 2...4 {
            let bytes = makeUTF8ContinuationSequence(totalNumberBytes: i)
            self.assertInvalidUTF8(bytes,
                                   expectedErrorRange: 0..<i,
                                   expectedRepairedString: self.replacementChar,
                                   "\(bytes) (\(i) bytes)")
        }
    }

    /// The same skeletons with bit 1 set in every byte are accepted.
    func testValid_SomeContinuations() {
        for i in 2...4 {
            let bytes = makeUTF8ContinuationSequence(totalNumberBytes: i).map { $0 | 0b10 /* make them valid */ }
            self.assertValidUTF8(bytes, "\(bytes) (\(i) bytes)")
        }
    }

    /// Valid sequences with trailing bytes removed are rejected over the bytes that remain.
    func testInvalid_ContinuationsMissingBytes() {
        for i in 2...4 {
            for dropLast in 1..<i {
                let bytes = makeUTF8ContinuationSequence(totalNumberBytes: i)
                    .map { $0 | 0b10 /* make them valid */ }
                    .dropLast(dropLast) /* and invalid again */
                self.assertInvalidUTF8(bytes,
                                       expectedErrorRange: 0..<(i-dropLast),
                                       expectedRepairedString: self.replacementChar,
                                       "\(bytes) (\(i) bytes), i=\(i), dropLast=\(dropLast)")
            }
        }
    }

    func testValid_MultipleValidsConcatenated() {
        let allBytes: [UInt8] = (1...4).flatMap { i in
            return makeUTF8ContinuationSequence(totalNumberBytes: i).map { $0 | 0b10 /* make them valid */ }
        }
        self.assertValidUTF8(allBytes, "\(allBytes)")
    }

    /// Inserts illegal bytes at every scalar boundary of a valid 1+2+3+4 byte string.
    func testInvalid_MultipleValidsInterspersedWithInvalids() {
        // NOTE(review): the third entry is a single 11-byte run (0xF5...0xFF) that is expected to
        // be reported as one range and repaired to one replacement character — confirm this is
        // intended rather than eleven separate one-byte entries.
        let illegalSequences: [[UInt8]] = [[0xC0], [0xC1], Array(0xF5...0xFF)]
        let validBytes: [UInt8] = (1...4).flatMap { i in
            return makeUTF8ContinuationSequence(totalNumberBytes: i).map { $0 | 0b10 /* make them valid */ }
        }
        self.assertValidUTF8(validBytes, "\(validBytes)")

        for illegalSequence in illegalSequences {
            // Offsets 0, 1, 3, 6 and 10 are the boundaries between the 1-, 2-, 3- and 4-byte parts.
            for illegalStarterIndex in [0, 1, 3, 6, 10] {
                var invalidBytes = validBytes
                var expectedRepairedBytes = validBytes
                invalidBytes.insert(contentsOf: illegalSequence, at: illegalStarterIndex)
                expectedRepairedBytes.insert(contentsOf: utf8ReplacementCharacter, at: illegalStarterIndex)
                self.assertInvalidUTF8(invalidBytes,
                                       expectedErrorRange: illegalStarterIndex..<(illegalStarterIndex + illegalSequence.count),
                                       expectedRepairedString: UTF8String.String(decoding: expectedRepairedBytes, as: UTF8.self),
                                       "\(invalidBytes); illegalSequence=\(illegalSequence), illegalStarterIndex=\(illegalStarterIndex)")
            }
        }
    }

    func testValid_replacementChracterIsValid() {
        let bytes: [UInt8] = [0xEF, 0xBF, 0xBD]
        self.assertValidUTF8(bytes, "\(bytes)")
    }

    /// 1000 lone 4-byte lead bytes are expected to repair to 1000 replacement characters.
    func testInvalid_longSequenceOfTruncatedBytes() {
        let loneLeadByte = makeUTF8ContinuationSequence(totalNumberBytes: 4)[0]
        let longSequence = [UInt8](repeating: loneLeadByte, count: 1000)
        let expectedOutput = UTF8String.String(decoding: (0..<1000).flatMap { _ in utf8ReplacementCharacter }, as: UTF8.self)
        longSequence.withUnsafeBufferPointer { ptr in
            // One range shared by the assertion and the direct repair call below.
            let firstBrokenSequence = 0..<1
            self.assertInvalidUTF8(ptr,
                                   expectedErrorRange: firstBrokenSequence,
                                   expectedRepairedString: expectedOutput)
            let string = utf8Repair(ptr, firstKnownBrokenRange: firstBrokenSequence)
            XCTAssertEqual(expectedOutput, string)
        }
    }

    func testInvalid_asciiLeftOfSomethingBroken() {
        let brokenBytes = ["a".utf8.first!, "🔥".utf8.first!]
        self.assertInvalidUTF8(brokenBytes,
                               expectedErrorRange: 1..<2,
                               expectedRepairedString: "a" + self.replacementChar)
    }

    func testInvalid_asciiRightOfSomethingBroken() {
        let brokenBytes = ["🔥".utf8.first!, "a".utf8.first!]
        self.assertInvalidUTF8(brokenBytes,
                               expectedErrorRange: 0..<1,
                               expectedRepairedString: self.replacementChar + "a")
    }

    func testInvalid_somethingBrokenSandwichedInASCII() {
        let brokenBytes = ["A".utf8.first!, "🔥".utf8.first!, "Z".utf8.first!]
        self.assertInvalidUTF8(brokenBytes,
                               expectedErrorRange: 1..<2,
                               expectedRepairedString: "A" + self.replacementChar + "Z")
    }

    // The flag used below is a black flag followed by six tag scalars: seven scalars of four
    // UTF-8 bytes each, 28 bytes in total. It is spelled with escapes because the tag scalars are
    // invisible in most editors; the runtime value is the same as the emoji literal.

    // Not picked up by XCTest because the name does not start with `test`.
    func causesCrash_testInvalid_flagLeftOfSomethingBroken() {
        let brokenBytes = Array("\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}".utf8) + ["🔥".utf8.first!]
        self.assertInvalidUTF8(brokenBytes,
                               expectedErrorRange: 28..<29,
                               expectedRepairedString: "\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}" + self.replacementChar)
    }

    func testInvalid_flagRightOfSomethingBroken() {
        let brokenBytes = Array("🤞".utf8.dropLast()) + Array("\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}".utf8)
        self.assertInvalidUTF8(brokenBytes,
                               expectedErrorRange: 0..<3,
                               expectedRepairedString: self.replacementChar + "\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}")
    }

    // Not picked up by XCTest because the name does not start with `test`.
    func causesCrash_testInvalid_somethingBrokenSandwichedInFlags() {
        let brokenBytes = Array("\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}".utf8) + ["🔥".utf8.first!] + Array("\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}".utf8)
        self.assertInvalidUTF8(brokenBytes,
                               expectedErrorRange: 28..<29,
                               expectedRepairedString: "\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}" + self.replacementChar + "\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}")
    }

}
1 change: 1 addition & 0 deletions Tests/UTF8StringTests/XCTestManifests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import XCTest
/// Returns the XCTest case entries registered for this test target.
public func allTests() -> [XCTestCaseEntry] {
    // NOTE(review): the validation test class visible in this change is declared as
    // `UTF8ValidationTest` and shows no `allTests` member — confirm the second entry resolves.
    let entries: [XCTestCaseEntry] = [
        testCase(UTF8StringTests.allTests),
        testCase(UTF8ValidationTests.allTests),
    ]
    return entries
}
#endif
10 changes: 9 additions & 1 deletion UTF8String.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
18CD7FCF20FE8F290092F0D9 /* StringCreate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 18CD7FCE20FE8F290092F0D9 /* StringCreate.swift */; };
18CD7FD32101043B0092F0D9 /* ContiguouslyStored.swift in Sources */ = {isa = PBXBuildFile; fileRef = 18CD7FD22101043B0092F0D9 /* ContiguouslyStored.swift */; };
18CD7FD52101516E0092F0D9 /* IntegerParsing.swift in Sources */ = {isa = PBXBuildFile; fileRef = 18CD7FD42101516E0092F0D9 /* IntegerParsing.swift */; };
8F5A16EF215E590F00197AB8 /* UTF8ValidationTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8F5A16EE215E590F00197AB8 /* UTF8ValidationTests.swift */; };
8FDC0C8F215E4EB600DC1711 /* StringUTF8Validation.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8FDC0C8E215E4EB600DC1711 /* StringUTF8Validation.swift */; };
OBJ_46 /* Scratch.swift in Sources */ = {isa = PBXBuildFile; fileRef = OBJ_9 /* Scratch.swift */; };
OBJ_47 /* StdlibMockUp.swift in Sources */ = {isa = PBXBuildFile; fileRef = OBJ_10 /* StdlibMockUp.swift */; };
OBJ_48 /* UTF8String.swift in Sources */ = {isa = PBXBuildFile; fileRef = OBJ_11 /* UTF8String.swift */; };
Expand Down Expand Up @@ -126,9 +128,11 @@
18CD7FCE20FE8F290092F0D9 /* StringCreate.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = StringCreate.swift; path = "../../../utf8-swift/swift/stdlib/public/core/StringCreate.swift"; sourceTree = "<group>"; };
18CD7FD22101043B0092F0D9 /* ContiguouslyStored.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = ContiguouslyStored.swift; path = "../../../utf8-swift/swift/stdlib/public/core/ContiguouslyStored.swift"; sourceTree = "<group>"; };
18CD7FD42101516E0092F0D9 /* IntegerParsing.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = IntegerParsing.swift; path = "../../../utf8-swift/swift/stdlib/public/core/IntegerParsing.swift"; sourceTree = "<group>"; };
8F5A16EE215E590F00197AB8 /* UTF8ValidationTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = UTF8ValidationTests.swift; sourceTree = "<group>"; };
8FDC0C8E215E4EB600DC1711 /* StringUTF8Validation.swift */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.swift; name = StringUTF8Validation.swift; path = "../../../utf8-swift/swift/stdlib/public/core/StringUTF8Validation.swift"; sourceTree = "<group>"; tabWidth = 2; };
OBJ_10 /* StdlibMockUp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StdlibMockUp.swift; sourceTree = "<group>"; };
OBJ_11 /* UTF8String.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = UTF8String.swift; sourceTree = "<group>"; };
OBJ_15 /* UTF8StringTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = UTF8StringTests.swift; sourceTree = "<group>"; };
OBJ_15 /* UTF8StringTests.swift */ = {isa = PBXFileReference; indentWidth = 2; lastKnownFileType = sourcecode.swift; path = UTF8StringTests.swift; sourceTree = "<group>"; tabWidth = 2; };
OBJ_16 /* XCTestManifests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = XCTestManifests.swift; sourceTree = "<group>"; };
OBJ_6 /* Package.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; path = Package.swift; sourceTree = "<group>"; };
OBJ_9 /* Scratch.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Scratch.swift; sourceTree = "<group>"; };
Expand Down Expand Up @@ -167,6 +171,7 @@
isa = PBXGroup;
children = (
OBJ_15 /* UTF8StringTests.swift */,
8F5A16EE215E590F00197AB8 /* UTF8ValidationTests.swift */,
186B4F2920F4027800E9E9B1 /* StdlibUnittestShims.swift */,
OBJ_16 /* XCTestManifests.swift */,
);
Expand Down Expand Up @@ -235,6 +240,7 @@
188CD3D72152CB8D0039DCD4 /* StringGutsRangeReplaceable.swift */,
18A8969420EC45EB0097A770 /* StringStorage.swift */,
18A8969520EC45EB0097A770 /* StringUnicodeScalarView.swift */,
8FDC0C8E215E4EB600DC1711 /* StringUTF8Validation.swift */,
18A8968C20EC45EA0097A770 /* StringUTF8View.swift */,
188CD3D3211E13BD0039DCD4 /* StringTesting.swift */,
18A8969620EC45EB0097A770 /* StringUTF16View.swift */,
Expand Down Expand Up @@ -355,6 +361,7 @@
18A8969F20EC45EC0097A770 /* StringBridge.swift in Sources */,
1846AEE42141E6D0004B8D9B /* UnicodeHelpers.swift in Sources */,
18A8968120EC45CD0097A770 /* Character.swift in Sources */,
8FDC0C8F215E4EB600DC1711 /* StringUTF8Validation.swift in Sources */,
18A8967E20EC45CD0097A770 /* OutputStream.swift in Sources */,
OBJ_48 /* UTF8String.swift in Sources */,
18A8969D20EC45EC0097A770 /* StringHashable.swift in Sources */,
Expand Down Expand Up @@ -390,6 +397,7 @@
buildActionMask = 0;
files = (
OBJ_69 /* UTF8StringTests.swift in Sources */,
8F5A16EF215E590F00197AB8 /* UTF8ValidationTests.swift in Sources */,
OBJ_70 /* XCTestManifests.swift in Sources */,
186B4F2A20F4027800E9E9B1 /* StdlibUnittestShims.swift in Sources */,
);
Expand Down

0 comments on commit 117de1a

Please sign in to comment.