Merge pull request #8 from weissi/jw-utf8-validation-tests

first couple of utf8 validation tests
milseman · Oct 4, 2018 · 117de1a · 117de1a
2 parents 861f652 + 0eabdb6
commit 117de1a
Show file tree

Hide file tree

Showing 4 changed files with 257 additions and 1 deletion.
diff --git a/Tests/UTF8StringTests/UTF8StringTests.swift b/Tests/UTF8StringTests/UTF8StringTests.swift
@@ -725,6 +725,36 @@ final class UTF8StringTests: XCTestCase {
     }
   }
 
+  /// Decodes several byte sequences that are not valid UTF-8 and checks that
+  /// the UTF-8 view of the resulting string equals the expected repaired bytes.
+  /// In every example the broken region is expected to come back as
+  /// `[0xEF, 0xBF, 0xBD]`.
+  func testStringsThatNeedRepairing() {
+    // One malformed input, the bytes expected after decoding, and a label
+    // used in the failure message.
+    struct ExampleThatNeedsRepairing {
+      let brokenBytes: [UInt8]
+      let expectedRepairedBytes: [UInt8]
+      let desc: Swift.String
+      init(_ brokenBytes: [UInt8], _ expectedRepairedBytes: [UInt8], _ desc: Swift.String) {
+        self.brokenBytes = brokenBytes
+        self.expectedRepairedBytes = expectedRepairedBytes
+        self.desc = desc
+      }
+    }
+
+    let strings: [ExampleThatNeedsRepairing] = [
+        .init([0xc3], [0xEF, 0xBF, 0xBD], "half an ä"),
+        .init(Array("🙈".utf8) + [0xc3] + Array("🏴󠁧󠁢󠁳󠁣󠁴󠁿".utf8), Array("🙈".utf8) + [0xEF, 0xBF, 0xBD] + Array("🏴󠁧󠁢󠁳󠁣󠁴󠁿".utf8), "half an ä sandwiched by emojis"),
+        .init(Array("🤪".utf8.dropLast(1)), [0xEF, 0xBF, 0xBD], "almost emoji"),
+        .init(Array("🤪".utf8.dropFirst(1)) + Array("🤪".utf8.dropLast(1)) + Array("🤪".utf8.dropLast(2)),
+              [0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD],
+              "three broken emoji back to back")
+    ]
+
+    for s in strings {
+      // Interpolate the example's description so a failure names the case
+      // that broke (the previous literal interpolated a fixed string instead).
+      XCTAssertEqual(s.expectedRepairedBytes,
+                     Array(UTF8String.String(decoding: s.brokenBytes, as: UTF8.self).utf8),
+                     "\(s.desc) failed")
+    }
+  }
+
+
+
 }
 
 // The most simple subclass of NSString that CoreFoundation does not know

diff --git a/Tests/UTF8StringTests/UTF8ValidationTests.swift b/Tests/UTF8StringTests/UTF8ValidationTests.swift
@@ -0,0 +1,217 @@
+import XCTest
+@testable import UTF8String
+
+public class UTF8ValidationTest: XCTestCase {
+    /// Error thrown by `run(_:)` to carry a failed validation result out to
+    /// the assertion helpers.
+    struct TestError: Error {
+        /// The validation result that caused the throw.
+        let result: UTF8ValidationResult
+    }
+
+    /// Validates `bytes` with `_betterValidateUTF8`.
+    ///
+    /// - Throws: `TestError` wrapping `.error(toBeReplaced:)` with the reported
+    ///   range when validation reports an error; returns normally otherwise.
+    func run(_ bytes: UnsafeBufferPointer<UInt8>) throws {
+        guard case .error(let brokenRange) = _betterValidateUTF8(bytes) else {
+            return
+        }
+        throw TestError(result: .error(toBeReplaced: brokenRange))
+    }
+
+    /// Records a test failure unless `bytes` passes validation.
+    ///
+    /// - Parameters:
+    ///   - bytes: The buffer to validate.
+    ///   - message: Optional context; when non-empty it is prepended to the
+    ///     failure text so the failing input can be identified.
+    ///   - file: Source file reported for a failure (defaults to the caller's).
+    ///   - line: Source line reported for a failure (defaults to the caller's).
+    func assertValidUTF8(_ bytes: UnsafeBufferPointer<UInt8>, _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) {
+        do {
+            try self.run(bytes)
+        } catch {
+            // Previously `message` was accepted but never reported.
+            let context: Swift.String = message.isEmpty ? "" : "\(message): "
+            XCTFail("\(context)not valid: \(error)", file: file, line: line)
+        }
+    }
+
+    /// Asserts that `bytes` fails validation with exactly `expectedErrorRange`
+    /// and that `utf8Repair`, started from that range, produces
+    /// `expectedRepairedString`.
+    ///
+    /// - Parameters:
+    ///   - bytes: The buffer expected to be invalid.
+    ///   - expectedErrorRange: The range validation is expected to report.
+    ///   - expectedRepairedString: The string repair is expected to produce.
+    ///   - message: Optional context included in failure output.
+    ///   - file: Source file reported for a failure (defaults to the caller's).
+    ///   - line: Source line reported for a failure (defaults to the caller's).
+    func assertInvalidUTF8(_ bytes: UnsafeBufferPointer<UInt8>,
+                           expectedErrorRange: Range<Int>,
+                           expectedRepairedString: UTF8String.String,
+                           _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) {
+        do {
+            try self.run(bytes)
+            XCTFail("\(message) is valid", file: file, line: line)
+        } catch let e as TestError {
+            XCTAssertEqual(UTF8ValidationResult.error(toBeReplaced: expectedErrorRange), e.result, message, file: file, line: line)
+            let repaired = utf8Repair(bytes, firstKnownBrokenRange: expectedErrorRange)
+            XCTAssertEqual(expectedRepairedString, repaired, "repaired string wrong", file: file, line: line)
+        } catch {
+            // Fail only this test; `fatalError` would take down the whole
+            // test process and hide every other result.
+            XCTFail("\(message) unexpected error \(error)", file: file, line: line)
+        }
+    }
+
+    /// Array convenience: exposes `bytes` as an unsafe buffer and forwards to
+    /// the buffer-pointer overload.
+    func assertValidUTF8(_ bytes: [UInt8], _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) {
+        bytes.withUnsafeBufferPointer { buffer in
+            self.assertValidUTF8(buffer,
+                                 message,
+                                 file: file,
+                                 line: line)
+        }
+    }
+
+    /// Array convenience: exposes `bytes` as an unsafe buffer and forwards to
+    /// the buffer-pointer overload.
+    func assertInvalidUTF8(_ bytes: [UInt8], expectedErrorRange: Range<Int>, expectedRepairedString: UTF8String.String, _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) {
+        bytes.withUnsafeBufferPointer { buffer in
+            self.assertInvalidUTF8(buffer,
+                                   expectedErrorRange: expectedErrorRange,
+                                   expectedRepairedString: expectedRepairedString,
+                                   message,
+                                   file: file,
+                                   line: line)
+        }
+    }
+
+    /// Generic convenience: copies any `UInt8` collection into an array and
+    /// forwards to the array overload.
+    func assertValidUTF8<Bytes: Collection>(_ bytes: Bytes, _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) where Bytes.Element == UInt8 {
+        let byteArray: [UInt8] = Array(bytes)
+        self.assertValidUTF8(byteArray, message, file: file, line: line)
+    }
+
+    /// Generic convenience: copies any `UInt8` collection into an array and
+    /// forwards to the array overload.
+    func assertInvalidUTF8<Bytes: Collection>(_ bytes: Bytes, expectedErrorRange: Range<Int>, expectedRepairedString: UTF8String.String, _ message: Swift.String = "", file: StaticString = #file, line: UInt = #line) where Bytes.Element == UInt8 {
+        let byteArray: [UInt8] = Array(bytes)
+        self.assertInvalidUTF8(byteArray,
+                               expectedErrorRange: expectedErrorRange,
+                               expectedRepairedString: expectedRepairedString,
+                               message,
+                               file: file,
+                               line: line)
+    }
+
+    // MARK: Utils
+    /// Builds a multi-byte sequence skeleton of `totalNumberBytes` bytes: a
+    /// lead byte followed by `totalNumberBytes - 1` bytes of `0b1000_0000`.
+    ///
+    /// For 2, 3 and 4 the lead byte comes out as `0b1100_0000`, `0b1110_0000`
+    /// and `0b1111_0000` respectively. For 1 the result is `[0]`.
+    ///
+    /// - Precondition: `totalNumberBytes > 0`.
+    func makeUTF8ContinuationSequence(totalNumberBytes: Int) -> [UInt8] {
+        precondition(totalNumberBytes > 0)
+        if totalNumberBytes <= 1 {
+            return [0]
+        }
+        // Shift in a 1 for every position up to and including
+        // `totalNumberBytes`, then zeros; bits pushed past the top are dropped.
+        var leadByte: UInt8 = 0
+        for bitIndex in 0..<8 {
+            if bitIndex <= totalNumberBytes {
+                leadByte |= 1
+            }
+            leadByte <<= 1
+        }
+        let trailingBytes = [UInt8](repeating: 0b1000_0000, count: totalNumberBytes - 1)
+        return [leadByte] + trailingBytes
+    }
+
+    /// The string obtained by decoding `utf8ReplacementCharacter` as UTF-8;
+    /// used as the expected output for a single repaired region.
+    private let replacementChar: UTF8String.String = String(decoding: utf8ReplacementCharacter, as: UTF8.self)
+
+    // MARK: Tests
+    /// An empty byte sequence must validate.
+    func testValid_Empty() {
+        assertValidUTF8([])
+    }
+
+    /// The UTF-8 bytes of a single space must validate.
+    func testValid_OneCharacterASCII() {
+        assertValidUTF8(" ".utf8)
+    }
+
+    /// A lone `0b1000_0000` byte must be reported as broken at `0..<1` and
+    /// repaired to a single replacement character.
+    func testInvalid_ContinuationEndByteWithoutContinuation() {
+        assertInvalidUTF8([0b1000_0000],
+                          expectedErrorRange: 0..<1,
+                          expectedRepairedString: replacementChar)
+    }
+
+    /// Each of the lead bytes `0b1100_0000`, `0b1110_0000` and `0b1111_0000`
+    /// on its own must be reported as broken at `0..<1` and repaired to a
+    /// single replacement character.
+    func testInvalid_ContinuationStartByteOnly() {
+        assertInvalidUTF8([0b1100_0000], expectedErrorRange: 0..<1, expectedRepairedString: replacementChar)
+        assertInvalidUTF8([0b1110_0000], expectedErrorRange: 0..<1, expectedRepairedString: replacementChar)
+        assertInvalidUTF8([0b1111_0000], expectedErrorRange: 0..<1, expectedRepairedString: replacementChar)
+    }
+
+    /// The unmodified output of `makeUTF8ContinuationSequence` for lengths
+    /// 2 through 4 must be rejected as one broken range covering the whole
+    /// sequence and repaired to a single replacement character.
+    func testInvalid_OnlyTheRawContinuationBytesNoActualData() {
+        for length in 2...4 {
+            let rawSequence = makeUTF8ContinuationSequence(totalNumberBytes: length)
+            let wholeSequence = 0..<length
+            assertInvalidUTF8(rawSequence,
+                              expectedErrorRange: wholeSequence,
+                              expectedRepairedString: replacementChar,
+                              "\(rawSequence) (\(length) bytes)")
+        }
+    }
+
+    /// Sequences of length 2 through 4 with `0b10` OR-ed into every byte must
+    /// validate.
+    func testValid_SomeContinuations() {
+        for length in 2...4 {
+            let skeleton = makeUTF8ContinuationSequence(totalNumberBytes: length)
+            // Setting bit 1 in every byte is what makes the sequence valid.
+            let validSequence = skeleton.map { $0 | 0b10 }
+            assertValidUTF8(validSequence, "\(validSequence) (\(length) bytes)")
+        }
+    }
+
+    /// Takes the valid sequences of length 2 through 4, removes between one
+    /// byte and all-but-the-first byte from the end, and expects the remaining
+    /// prefix to be reported as a single broken range that repairs to one
+    /// replacement character.
+    func testInvalid_ContinuationsMissingBytes() {
+        for fullLength in 2...4 {
+            for missingCount in 1..<fullLength {
+                let truncated = makeUTF8ContinuationSequence(totalNumberBytes: fullLength)
+                    .map { $0 | 0b10 /* make them valid */ }
+                    .dropLast(missingCount) /* and invalid again */
+                let remaining = fullLength - missingCount
+                assertInvalidUTF8(truncated,
+                                  expectedErrorRange: 0..<remaining,
+                                  expectedRepairedString: replacementChar,
+                                  "\(truncated) (\(fullLength) bytes), i=\(fullLength), dropLast=\(missingCount)")
+            }
+        }
+    }
+
+    /// Concatenates the valid sequences of length 1 through 4 and expects the
+    /// combined bytes to validate.
+    func testValid_MultipleValidsConcatenated() {
+        let concatenated: [UInt8] = (1...4).flatMap { length in
+            makeUTF8ContinuationSequence(totalNumberBytes: length).map { $0 | 0b10 /* make them valid */ }
+        }
+        assertValidUTF8(concatenated, "\(concatenated)")
+    }
+
+    /// Inserts each illegal byte run at several offsets of a known-valid byte
+    /// string and expects validation to report exactly the inserted range,
+    /// and repair to put `utf8ReplacementCharacter` in its place.
+    func testInvalid_MultipleValidsInterspersedWithInvalids() {
+        let illegalSequences: [[UInt8]] = [[0xC0], [0xC1], Array(0xF5...0xFF)]
+        let validBytes: [UInt8] = (1...4).flatMap { length in
+            makeUTF8ContinuationSequence(totalNumberBytes: length).map { $0 | 0b10 /* make them valid */ }
+        }
+        // Sanity check: the baseline must be valid before anything is inserted.
+        assertValidUTF8(validBytes, "\(validBytes)")
+
+        let insertionOffsets = [0, 1, 3, 6, 10]
+        for illegalSequence in illegalSequences {
+            for illegalStarterIndex in insertionOffsets {
+                var invalidBytes = validBytes
+                invalidBytes.insert(contentsOf: illegalSequence, at: illegalStarterIndex)
+
+                var expectedRepairedBytes = validBytes
+                expectedRepairedBytes.insert(contentsOf: utf8ReplacementCharacter, at: illegalStarterIndex)
+
+                let brokenRange = illegalStarterIndex..<(illegalStarterIndex + illegalSequence.count)
+                let expectedString = UTF8String.String(decoding: expectedRepairedBytes, as: UTF8.self)
+                assertInvalidUTF8(invalidBytes,
+                                  expectedErrorRange: brokenRange,
+                                  expectedRepairedString: expectedString,
+                                  "\(invalidBytes); illegalSequence=\(illegalSequence), illegalStarterIndex=\(illegalStarterIndex)")
+            }
+        }
+    }
+
+    /// The bytes `EF BF BD` must themselves validate.
+    func testValid_replacementChracterIsValid() {
+        let replacementBytes: [UInt8] = [0xEF, 0xBF, 0xBD]
+        assertValidUTF8(replacementBytes, "\(replacementBytes)")
+    }
+
+    /// Repeats the lead byte of a 4-byte sequence 1000 times and expects the
+    /// first byte to be reported as broken and the repaired string to consist
+    /// of 1000 copies of `utf8ReplacementCharacter`.
+    func testInvalid_longSequenceOfTruncatedBytes() {
+        let loneLeadByte = makeUTF8ContinuationSequence(totalNumberBytes: 4).first!
+        let input: [UInt8] = Array(repeating: loneLeadByte, count: 1000)
+        let expectedBytes: [UInt8] = (0..<1000).flatMap { _ in utf8ReplacementCharacter }
+        let expectedOutput = UTF8String.String(decoding: expectedBytes, as: UTF8.self)
+        input.withUnsafeBufferPointer { buffer in
+            let firstBrokenSequence = 0..<1
+            self.assertInvalidUTF8(buffer,
+                                   expectedErrorRange: firstBrokenSequence,
+                                   expectedRepairedString: expectedOutput)
+            // Repair is also invoked directly, independent of the helper above.
+            let repaired = utf8Repair(buffer, firstKnownBrokenRange: firstBrokenSequence)
+            XCTAssertEqual(expectedOutput, repaired)
+        }
+    }
+
+    /// `a` followed by only the first byte of 🔥: the broken range is the
+    /// second byte and the repaired string is `a` plus one replacement
+    /// character.
+    func testInvalid_asciiLeftOfSomethingBroken() {
+        let asciiByte = "a".utf8.first!
+        let truncatedEmojiByte = "🔥".utf8.first!
+        let brokenBytes = [asciiByte, truncatedEmojiByte]
+        assertInvalidUTF8(brokenBytes,
+                          expectedErrorRange: 1..<2,
+                          expectedRepairedString: "a" + replacementChar)
+    }
+
+    /// Only the first byte of 🔥 followed by `a`: the broken range is the
+    /// first byte and the repaired string is one replacement character plus
+    /// `a`.
+    func testInvalid_asciiRightOfSomethingBroken() {
+        let truncatedEmojiByte = "🔥".utf8.first!
+        let asciiByte = "a".utf8.first!
+        let brokenBytes = [truncatedEmojiByte, asciiByte]
+        assertInvalidUTF8(brokenBytes,
+                          expectedErrorRange: 0..<1,
+                          expectedRepairedString: replacementChar + "a")
+    }
+
+    /// `A`, the first byte of 🔥, then `Z`: the broken range is the middle
+    /// byte and the repaired string is `A`, one replacement character, `Z`.
+    func testInvalid_somethingBrokenSandwichedInASCII() {
+        let leadingASCII = "A".utf8.first!
+        let truncatedEmojiByte = "🔥".utf8.first!
+        let trailingASCII = "Z".utf8.first!
+        let brokenBytes = [leadingASCII, truncatedEmojiByte, trailingASCII]
+        assertInvalidUTF8(brokenBytes,
+                          expectedErrorRange: 1..<2,
+                          expectedRepairedString: "A" + replacementChar + "Z")
+    }
+
+    /// A flag emoji followed by only the first byte of 🔥. The expected range
+    /// `28..<29` places the broken byte directly after the flag's UTF-8 bytes.
+    ///
+    /// NOTE(review): the name does not start with `test`, so this is
+    /// presumably disabled on purpose because it crashes — confirm before
+    /// renaming it.
+    func causesCrash_testInvalid_flagLeftOfSomethingBroken() {
+        let brokenBytes = Array("🏴󠁧󠁢󠁳󠁣󠁴󠁿".utf8) + ["🔥".utf8.first!]
+        assertInvalidUTF8(brokenBytes,
+                          expectedErrorRange: 28..<29,
+                          expectedRepairedString: "🏴󠁧󠁢󠁳󠁣󠁴󠁿" + replacementChar)
+    }
+
+    /// 🤞 with its last byte removed, followed by a flag emoji: the first
+    /// three bytes are the expected broken range and the repaired string is
+    /// one replacement character followed by the flag.
+    func testInvalid_flagRightOfSomethingBroken() {
+        let brokenBytes = Array("🤞".utf8.dropLast()) + Array("🏴󠁧󠁢󠁳󠁣󠁴󠁿".utf8)
+        assertInvalidUTF8(brokenBytes,
+                          expectedErrorRange: 0..<3,
+                          expectedRepairedString: replacementChar + "🏴󠁧󠁢󠁳󠁣󠁴󠁿")
+    }
+
+    /// Only the first byte of 🔥 placed between two flag emoji. The expected
+    /// range `28..<29` places the broken byte directly after the first flag's
+    /// UTF-8 bytes.
+    ///
+    /// NOTE(review): the name does not start with `test`, so this is
+    /// presumably disabled on purpose because it crashes — confirm before
+    /// renaming it.
+    func causesCrash_testInvalid_somethingBrokenSandwichedInFlags() {
+        let brokenBytes = Array("🏴󠁧󠁢󠁳󠁣󠁴󠁿".utf8) + ["🔥".utf8.first!] + Array("🏴󠁧󠁢󠁳󠁣󠁴󠁿".utf8)
+        assertInvalidUTF8(brokenBytes,
+                          expectedErrorRange: 28..<29,
+                          expectedRepairedString: "🏴󠁧󠁢󠁳󠁣󠁴󠁿" + replacementChar + "🏴󠁧󠁢󠁳󠁣󠁴󠁿")
+    }
+
+}
diff --git a/Tests/UTF8StringTests/XCTestManifests.swift b/Tests/UTF8StringTests/XCTestManifests.swift
@@ -4,6 +4,7 @@ import XCTest
 public func allTests() -> [XCTestCaseEntry] {
   return [
     testCase(UTF8StringTests.allTests),
+    // NOTE(review): the class added in UTF8ValidationTests.swift is declared
+    // as `UTF8ValidationTest` (singular) and no `allTests` member is visible
+    // on it in this change — confirm this entry resolves when the manifest
+    // is compiled.
+    testCase(UTF8ValidationTests.allTests),
   ]
 }
 #endif
diff --git a/UTF8String.xcodeproj/project.pbxproj b/UTF8String.xcodeproj/project.pbxproj
@@ -60,6 +60,8 @@
 		18CD7FCF20FE8F290092F0D9 /* StringCreate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 18CD7FCE20FE8F290092F0D9 /* StringCreate.swift */; };
 		18CD7FD32101043B0092F0D9 /* ContiguouslyStored.swift in Sources */ = {isa = PBXBuildFile; fileRef = 18CD7FD22101043B0092F0D9 /* ContiguouslyStored.swift */; };
 		18CD7FD52101516E0092F0D9 /* IntegerParsing.swift in Sources */ = {isa = PBXBuildFile; fileRef = 18CD7FD42101516E0092F0D9 /* IntegerParsing.swift */; };
+		8F5A16EF215E590F00197AB8 /* UTF8ValidationTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8F5A16EE215E590F00197AB8 /* UTF8ValidationTests.swift */; };
+		8FDC0C8F215E4EB600DC1711 /* StringUTF8Validation.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8FDC0C8E215E4EB600DC1711 /* StringUTF8Validation.swift */; };
 		OBJ_46 /* Scratch.swift in Sources */ = {isa = PBXBuildFile; fileRef = OBJ_9 /* Scratch.swift */; };
 		OBJ_47 /* StdlibMockUp.swift in Sources */ = {isa = PBXBuildFile; fileRef = OBJ_10 /* StdlibMockUp.swift */; };
 		OBJ_48 /* UTF8String.swift in Sources */ = {isa = PBXBuildFile; fileRef = OBJ_11 /* UTF8String.swift */; };
@@ -126,9 +128,11 @@
 		18CD7FCE20FE8F290092F0D9 /* StringCreate.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = StringCreate.swift; path = "../../../utf8-swift/swift/stdlib/public/core/StringCreate.swift"; sourceTree = "<group>"; };
 		18CD7FD22101043B0092F0D9 /* ContiguouslyStored.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = ContiguouslyStored.swift; path = "../../../utf8-swift/swift/stdlib/public/core/ContiguouslyStored.swift"; sourceTree = "<group>"; };
 		18CD7FD42101516E0092F0D9 /* IntegerParsing.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = IntegerParsing.swift; path = "../../../utf8-swift/swift/stdlib/public/core/IntegerParsing.swift"; sourceTree = "<group>"; };
+		8F5A16EE215E590F00197AB8 /* UTF8ValidationTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = UTF8ValidationTests.swift; sourceTree = "<group>"; };
+		8FDC0C8E215E4EB600DC1711 /* StringUTF8Validation.swift */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.swift; name = StringUTF8Validation.swift; path = "../../../utf8-swift/swift/stdlib/public/core/StringUTF8Validation.swift"; sourceTree = "<group>"; tabWidth = 2; };
 		OBJ_10 /* StdlibMockUp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StdlibMockUp.swift; sourceTree = "<group>"; };
 		OBJ_11 /* UTF8String.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = UTF8String.swift; sourceTree = "<group>"; };
-		OBJ_15 /* UTF8StringTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = UTF8StringTests.swift; sourceTree = "<group>"; };
+		OBJ_15 /* UTF8StringTests.swift */ = {isa = PBXFileReference; indentWidth = 2; lastKnownFileType = sourcecode.swift; path = UTF8StringTests.swift; sourceTree = "<group>"; tabWidth = 2; };
 		OBJ_16 /* XCTestManifests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = XCTestManifests.swift; sourceTree = "<group>"; };
 		OBJ_6 /* Package.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; path = Package.swift; sourceTree = "<group>"; };
 		OBJ_9 /* Scratch.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Scratch.swift; sourceTree = "<group>"; };
@@ -167,6 +171,7 @@
 			isa = PBXGroup;
 			children = (
 				OBJ_15 /* UTF8StringTests.swift */,
+				8F5A16EE215E590F00197AB8 /* UTF8ValidationTests.swift */,
 				186B4F2920F4027800E9E9B1 /* StdlibUnittestShims.swift */,
 				OBJ_16 /* XCTestManifests.swift */,
 			);
@@ -235,6 +240,7 @@
 				188CD3D72152CB8D0039DCD4 /* StringGutsRangeReplaceable.swift */,
 				18A8969420EC45EB0097A770 /* StringStorage.swift */,
 				18A8969520EC45EB0097A770 /* StringUnicodeScalarView.swift */,
+				8FDC0C8E215E4EB600DC1711 /* StringUTF8Validation.swift */,
 				18A8968C20EC45EA0097A770 /* StringUTF8View.swift */,
 				188CD3D3211E13BD0039DCD4 /* StringTesting.swift */,
 				18A8969620EC45EB0097A770 /* StringUTF16View.swift */,
@@ -355,6 +361,7 @@
 				18A8969F20EC45EC0097A770 /* StringBridge.swift in Sources */,
 				1846AEE42141E6D0004B8D9B /* UnicodeHelpers.swift in Sources */,
 				18A8968120EC45CD0097A770 /* Character.swift in Sources */,
+				8FDC0C8F215E4EB600DC1711 /* StringUTF8Validation.swift in Sources */,
 				18A8967E20EC45CD0097A770 /* OutputStream.swift in Sources */,
 				OBJ_48 /* UTF8String.swift in Sources */,
 				18A8969D20EC45EC0097A770 /* StringHashable.swift in Sources */,
@@ -390,6 +397,7 @@
 			buildActionMask = 0;
 			files = (
 				OBJ_69 /* UTF8StringTests.swift in Sources */,
+				8F5A16EF215E590F00197AB8 /* UTF8ValidationTests.swift in Sources */,
 				OBJ_70 /* XCTestManifests.swift in Sources */,
 				186B4F2A20F4027800E9E9B1 /* StdlibUnittestShims.swift in Sources */,
 			);