Skip to content

Commit

Permalink
Implemented support for PDF documents
Browse files Browse the repository at this point in the history
  • Loading branch information
insidegui committed Mar 24, 2024
1 parent 6336ae8 commit a9d6884
Show file tree
Hide file tree
Showing 7 changed files with 205 additions and 48 deletions.
26 changes: 26 additions & 0 deletions .swiftpm/xcode/xcshareddata/xcschemes/ocrit.xcscheme
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,32 @@
ReferencedContainer = "container:">
</BuildableReference>
</BuildableProductRunnable>
<CommandLineArguments>
<CommandLineArgument
argument = "&quot;~/Downloads/test/gov.uscourts.njd.544402.1.0_3 2.pdf&quot;"
isEnabled = "NO">
</CommandLineArgument>
<CommandLineArgument
argument = "/Users/insidegui/Developer/_projects/ocrit/Tests/ocritTests/Resources/test-en.png"
isEnabled = "YES">
</CommandLineArgument>
<CommandLineArgument
argument = "-o"
isEnabled = "YES">
</CommandLineArgument>
<CommandLineArgument
argument = "~/Downloads/output"
isEnabled = "YES">
</CommandLineArgument>
<CommandLineArgument
argument = "-l"
isEnabled = "YES">
</CommandLineArgument>
<CommandLineArgument
argument = "en-US"
isEnabled = "YES">
</CommandLineArgument>
</CommandLineArguments>
</LaunchAction>
<ProfileAction
buildConfiguration = "Release"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,38 +1,30 @@
import Vision
import Cocoa

final class OCROperation {
let imageURL: URL
final class CGImageOCR {

let image: CGImage
let customLanguages: [String]
init(imageURL: URL, customLanguages: [String]) {
self.imageURL = imageURL

init(image: CGImage, customLanguages: [String]) {
self.image = image
self.customLanguages = customLanguages
}

private var request: VNRecognizeTextRequest?
private var handler: VNImageRequestHandler?

func run() async throws -> String {
guard let image = NSImage(contentsOf: imageURL) else {
throw Failure("Couldn't read image at \(imageURL.path)")
}

guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
throw Failure("Couldn't read CGImage fir \(imageURL.lastPathComponent)")
}

func run() async throws -> String {
return try await withCheckedThrowingContinuation { (continuation: CheckedContinuation<String, Error>) -> Void in
performRequest(with: cgImage) { request, error in
performRequest(with: image) { request, error in
if let error = error {
continuation.resume(throwing: error)
} else {
guard let observations = request.results as? [VNRecognizedTextObservation] else {
continuation.resume(throwing: Failure("No results"))
return
}

var transcript: String = ""
for observation in observations {
transcript.append(observation.topCandidates(1)[0].string)
Expand All @@ -44,13 +36,13 @@ final class OCROperation {
}
}
}
private func performRequest(with image: CGImage, completion: @escaping VNRequestCompletionHandler) {

func performRequest(with image: CGImage, completion: @escaping VNRequestCompletionHandler) {
let newHandler = VNImageRequestHandler(cgImage: image)

let newRequest = VNRecognizeTextRequest(completionHandler: completion)
newRequest.recognitionLevel = .accurate

do {
if let customLanguages = try resolveLanguages(for: newRequest) {
newRequest.recognitionLanguages = customLanguages
Expand All @@ -62,26 +54,26 @@ final class OCROperation {

request = newRequest
handler = newHandler

do {
try newHandler.perform([newRequest])
} catch {
completion(newRequest, error)
}
}

/// Validates the user-supplied recognition languages against what `request` supports.
///
/// - Parameter request: The text recognition request whose supported languages are queried.
/// - Returns: `nil` when no custom languages were supplied, otherwise `customLanguages` unchanged.
/// - Throws: `Failure` naming the first unsupported language, or any error raised while
///   querying the supported languages.
private func resolveLanguages(for request: VNRecognizeTextRequest) throws -> [String]? {
    if customLanguages.isEmpty {
        return nil
    }

    let available = try request.supportedRecognitionLanguages()

    // Report the first requested language that the request can't handle.
    if let rejected = customLanguages.first(where: { !available.contains($0) }) {
        throw Failure("Unsupported language \"\(rejected)\". Supported languages are \(available.joined(separator: ", "))")
    }

    return customLanguages
}

}
11 changes: 11 additions & 0 deletions Sources/ocrit/Implementation/Base/OCROperation.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import Foundation

/// A single unit of output produced by an OCR operation.
struct OCRResult {
/// The text produced for this result.
var text: String
/// File name (without extension) suggested for storing `text`.
var suggestedFilename: String
}

/// Common interface for operations that run OCR on a file and produce `OCRResult` values.
protocol OCROperation {
/// Creates an operation for the given file.
/// - Parameters:
///   - fileURL: Location of the file to process.
///   - customLanguages: Language codes to use for recognition.
init(fileURL: URL, customLanguages: [String])
/// Starts the operation.
/// - Returns: A stream of results; a conforming type may yield more than one result per file.
/// - Throws: An error if the operation can't be started.
func run() throws -> AsyncThrowingStream<OCRResult, Error>
}
31 changes: 31 additions & 0 deletions Sources/ocrit/Implementation/CGPDFDocument+CGImage.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import Quartz

extension CGPDFDocument {
    /// Rasterizes one page of the document into a `CGImage` over a white background.
    ///
    /// - Parameter pageNumber: Page number, forwarded as-is to `page(at:)`.
    /// - Returns: The rendered page, sized to the page's media box.
    /// - Throws: `Failure` when the page doesn't exist or no `CGImage` could be produced.
    func cgImage(at pageNumber: Int) throws -> CGImage {
        guard let pdfPage = page(at: pageNumber) else {
            throw Failure("Page #\(pageNumber) not found.")
        }

        let mediaBox = pdfPage.getBoxRect(.mediaBox)

        let rendered = NSImage(size: mediaBox.size, flipped: true) { bounds in
            guard let context = NSGraphicsContext.current?.cgContext else {
                return false
            }

            // Paint an opaque white backdrop before drawing the page on top of it.
            NSColor.white.setFill()
            bounds.fill()

            // The image is created flipped, so mirror the context vertically
            // before handing it to the PDF renderer.
            context.translateBy(x: 0, y: mediaBox.size.height)
            context.scaleBy(x: 1.0, y: -1.0)
            context.drawPDFPage(pdfPage)

            return true
        }

        if let bitmap = rendered.cgImage(forProposedRect: nil, context: nil, hints: nil) {
            return bitmap
        }

        throw Failure("Failed to create CGImage.")
    }
}
39 changes: 39 additions & 0 deletions Sources/ocrit/Implementation/ImageOCROperation.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import Vision
import Cocoa

/// OCR operation for a single image file; yields exactly one `OCRResult` on success.
final class ImageOCROperation: OCROperation {

    /// Location of the image to recognize.
    let imageURL: URL
    /// Language codes forwarded to the recognizer.
    let customLanguages: [String]

    /// Creates an operation for the image at `fileURL`.
    /// - Parameters:
    ///   - fileURL: Location of the image file.
    ///   - customLanguages: Language codes to use for recognition.
    init(fileURL: URL, customLanguages: [String]) {
        self.imageURL = fileURL
        self.customLanguages = customLanguages
    }

    /// Loads the image and starts recognition.
    ///
    /// - Returns: A stream that yields one result named after the image file (without its
    ///   extension) and then finishes, or finishes with the error raised by the recognizer.
    /// - Throws: `Failure` when the image can't be read or converted to a `CGImage`.
    func run() throws -> AsyncThrowingStream<OCRResult, Error> {
        guard let image = NSImage(contentsOf: imageURL) else {
            throw Failure("Couldn't read image at \(imageURL.path)")
        }

        guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            throw Failure("Couldn't read CGImage for \(imageURL.lastPathComponent)")
        }

        let filename = imageURL.deletingPathExtension().lastPathComponent

        let ocr = CGImageOCR(image: cgImage, customLanguages: customLanguages)

        return AsyncThrowingStream { continuation in
            Task {
                do {
                    let text = try await ocr.run()

                    continuation.yield(OCRResult(text: text, suggestedFilename: filename))
                    continuation.finish()
                } catch {
                    // The stream must always be terminated: an error escaping the task
                    // would be discarded and leave the consumer waiting forever.
                    continuation.finish(throwing: error)
                }
            }
        }
    }

}
49 changes: 49 additions & 0 deletions Sources/ocrit/Implementation/PDFOCROperation.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import Vision
import Quartz

/// OCR operation for a PDF document; yields one `OCRResult` per successfully processed page.
final class PDFOCROperation: OCROperation {

    /// Location of the PDF document to recognize.
    let documentURL: URL
    /// Language codes forwarded to the recognizer.
    let customLanguages: [String]

    /// Creates an operation for the PDF at `fileURL`.
    /// - Parameters:
    ///   - fileURL: Location of the PDF file.
    ///   - customLanguages: Language codes to use for recognition.
    init(fileURL: URL, customLanguages: [String]) {
        self.documentURL = fileURL
        self.customLanguages = customLanguages
    }

    /// Opens the document and starts recognizing its pages in order.
    ///
    /// Pages that fail are reported on stderr and skipped, so one bad page doesn't abort
    /// the rest of the document. Processing stops early when the stream is terminated
    /// (for example because the consumer stopped iterating).
    ///
    /// - Returns: A stream of results named `<basename>-<page>`, one per processed page.
    /// - Throws: `Failure` when the document can't be opened or has no pages.
    func run() throws -> AsyncThrowingStream<OCRResult, Error> {
        let basename = documentURL.deletingPathExtension().lastPathComponent

        guard let document = CGPDFDocument(documentURL as CFURL) else {
            throw Failure("Failed to read PDF at \(documentURL.path)")
        }

        let pageCount = document.numberOfPages

        guard pageCount > 0 else {
            throw Failure("PDF has no pages at \(documentURL.path)")
        }

        // Copy what the task needs into locals so it doesn't have to reach back into `self`.
        let languages = customLanguages
        let path = documentURL.path

        return AsyncThrowingStream { continuation in
            let task = Task {
                for page in 1...pageCount {
                    // Nobody is listening anymore: don't render and OCR the remaining pages.
                    guard !Task.isCancelled else { break }

                    do {
                        let cgImage = try document.cgImage(at: page)

                        let ocr = CGImageOCR(image: cgImage, customLanguages: languages)

                        let text = try await ocr.run()

                        let result = OCRResult(text: text, suggestedFilename: basename + "-\(page)")

                        continuation.yield(result)
                    } catch {
                        /// Don't want to interrupt processing if a single page fails, so don't terminate the stream here.
                        fputs("WARN: Error processing PDF page #\(page) at \(path): \(error)\n", stderr)
                    }
                }

                continuation.finish()
            }

            // Propagate stream termination to the page loop above.
            continuation.onTermination = { _ in task.cancel() }
        }
    }

}
49 changes: 29 additions & 20 deletions Sources/ocrit/OCRIT.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ import Foundation
import ArgumentParser
import UniformTypeIdentifiers

struct Failure: LocalizedError {
/// Lightweight error type that carries a human-readable message.
struct Failure: LocalizedError, CustomStringConvertible {
/// The message supplied at creation, exposed through `LocalizedError`.
var errorDescription: String?
/// Creates a failure with `desc` as its message.
init(_ desc: String) { self.errorDescription = desc }
/// The message, or an empty string when none is set, so interpolating the error prints the message.
var description: String { errorDescription ?? "" }
}

@main
Expand All @@ -18,20 +19,24 @@ struct ocrit: AsyncParsableCommand {

@Option(name: .shortAndLong, help: "Language code to use for the recognition, can be repeated to select multiple languages")
var language: [String] = []

private var shouldOutputToStdout: Bool { output == "-" }

func run() async throws {
let outputDirectoryURL = URL(fileUrlWithTildePath: output)

if !shouldOutputToStdout {
guard URL(fileURLWithPath: output).isExistingDirectory else {
guard outputDirectoryURL.isExistingDirectory else {
throw Failure("Output path doesn't exist (or is not a directory) at \(output)")
}
}

let imageURLs = imagePaths.map(URL.init(fileUrlWithTildePath:))

fputs("Validating images…\n", stderr)


var operationType: OCROperation.Type = ImageOCROperation.self

do {
for url in imageURLs {
guard FileManager.default.fileExists(atPath: url.path) else {
Expand All @@ -42,8 +47,12 @@ struct ocrit: AsyncParsableCommand {
throw Failure("Unable to determine file type at \(url.path)")
}

guard type.conforms(to: .image) else {
throw Failure("File at \(url.path) is not an image")
if type.conforms(to: .image) {
operationType = ImageOCROperation.self
} else if type.conforms(to: .pdf) {
operationType = PDFOCROperation.self
} else {
throw Failure("File type at \(url.path) is not supported: \(type.identifier)")
}
}
} catch {
Expand All @@ -61,34 +70,34 @@ struct ocrit: AsyncParsableCommand {
}

for url in imageURLs {
let operation = OCROperation(imageURL: url, customLanguages: language)
let operation = operationType.init(fileURL: url, customLanguages: language)

do {
let text = try await operation.run()

try writeOutput(text, for: url)
for try await result in try operation.run() {
try writeResult(result, for: url, outputDirectoryURL: outputDirectoryURL)
}
} catch {
fputs("OCR failed for \(url.lastPathComponent): \(error.localizedDescription)\n", stderr)
}
}
}

private func writeOutput(_ text: String, for imageURL: URL) throws {
guard output != "-" else {
private func writeResult(_ result: OCRResult, for imageURL: URL, outputDirectoryURL: URL) throws {
guard !shouldOutputToStdout else {
print(imageURL.lastPathComponent + ":")
print(text + "\n")
print(result.text + "\n")
return
}

var outputURL = URL(fileURLWithPath: output)
.appendingPathComponent(imageURL.deletingPathExtension().lastPathComponent)
var outputFileURL = outputDirectoryURL
.appendingPathComponent(result.suggestedFilename)
.appendingPathExtension("txt")

try text.write(to: outputURL, atomically: true, encoding: .utf8)
try result.text.write(to: outputFileURL, atomically: true, encoding: .utf8)

if let attributes = try? imageURL.resourceValues(forKeys: [.creationDateKey, .contentModificationDateKey])
{
try outputURL.setResourceValues(attributes)
try outputFileURL.setResourceValues(attributes)
}
}

Expand Down

0 comments on commit a9d6884

Please sign in to comment.