Skip to content

Commit

Permalink
Tests from pdf.link files
Browse files Browse the repository at this point in the history
  • Loading branch information
joepio authored and jrmuizel committed Mar 23, 2023
1 parent 2f449cb commit 96cb66e
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 7 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
target
.idea/*
Cargo.lock
Cargo.lock
tests/docs_cache
15 changes: 9 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
[package]
authors = ["Jeff Muizelaar <[email protected]>"]
name = "pdf-extract"
version = "0.6.5-alpha.0"
license = "MIT"
documentation = "https://docs.rs/crate/pdf-extract/"
description = "A library to extract content from pdfs"
documentation = "https://docs.rs/crate/pdf-extract/"
edition = "2018"
keywords = ["pdf2text", "text", "pdf", "pdf2txt"]
license = "MIT"
name = "pdf-extract"
repository = "https://github.com/jrmuizel/pdf-extract"
edition = "2018"
version = "0.6.5-alpha.0"

[profile.release]
debug = true
Expand All @@ -16,8 +16,11 @@ debug = true
adobe-cmap-parser = "0.3.3"
encoding = "0.2.33"
euclid = "0.20.5"
lopdf = { version = "0.29", default-features = false, features = [ "pom_parser" ] }
linked-hash-map = "=0.5.3"
lopdf = {version = "0.29", default-features = false, features = ["pom_parser"]}
postscript = "0.14"
type1-encoding-parser = "0.1.0"
unicode-normalization = "0.1.19"

[dev-dependencies]
ureq = "2.6.2"
File renamed without changes.
1 change: 1 addition & 0 deletions tests/docs/documents_stack.pdf.link
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://web.archive.org/web/20160112115354/http://www.fao.org/fileadmin/user_upload/tci/docs/2_About%20Stacks.pdf
77 changes: 77 additions & 0 deletions tests/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
use pdf_extract::extract_text;

// Builds an `ExpectedText` from a file name and the snippet its extracted text must contain.
// example: expected!("atomic.pdf", "Atomic Data");
macro_rules! expected {
    ($file:expr, $needle:expr) => {
        ExpectedText {
            filename: $file,
            text: $needle,
        }
    };
}

// Table of documents paired with a snippet their extracted text must contain.
// Each entry is built with `expected!` and then verified in turn.
#[test]
fn extract_expected_text() {
    let cases = vec![expected!("documents_stack.pdf.link", "mouse button until")];
    cases.into_iter().for_each(|case| case.test());
}

#[test]
// Smoke test: run the extraction on every entry of the `tests/docs` directory.
fn extract_all_docs() {
    for entry in std::fs::read_dir("tests/docs").unwrap() {
        let entry_path = entry.unwrap().path();
        let name = entry_path.file_name().unwrap().to_string_lossy();
        // An empty snippet is contained in any text, so only a failing extraction can fail here.
        expected!(&name, "").test();
    }
}

/// A test document paired with a snippet that its extracted text must contain.
///
/// e.g. `ExpectedText { filename: "atomic.pdf", text: "Atomic Data" }`
#[derive(Debug, PartialEq)]
struct ExpectedText<'doc> {
    /// Name of the file to test.
    filename: &'doc str,
    /// Snippet that has to appear in the extracted text.
    text: &'doc str,
}

impl ExpectedText<'_> {
    /// Extracts the text of `filename` and asserts that it contains `text`.
    ///
    /// Plain file names are read from `tests/docs`. A name ending in `.pdf.link` refers to a
    /// file in `tests/docs` that holds a URL; the PDF behind that URL is downloaded into the
    /// `tests/docs_cache` directory and reused from there once it exists.
    ///
    /// # Panics
    ///
    /// Panics if the document cannot be fetched or stored, if the text extraction fails, or if
    /// the extracted text does not contain `text`.
    fn test(self) {
        let ExpectedText { filename, text } = self;
        let file_path = match filename.strip_suffix(".link") {
            // Only the trailing `.link` is dropped, so the cached file keeps its `.pdf` name.
            Some(pdf_name) if pdf_name.ends_with(".pdf") => {
                Self::cached_download(filename, pdf_name)
            }
            _ => format!("tests/docs/{}", filename),
        };
        let out = extract_text(file_path)
            .unwrap_or_else(|e| panic!("Failed to extract text from {}, {}", filename, e));
        println!("{}", out);
        assert!(
            out.contains(text),
            "Text {} does not contain '{}'",
            filename,
            text
        );
    }

    /// Returns the path of `pdf_name` inside `tests/docs_cache`, first downloading it from the
    /// URL stored in `tests/docs/<link_name>` when it is not cached yet.
    fn cached_download(link_name: &str, pdf_name: &str) -> String {
        let docs_cache = "tests/docs_cache";
        // Tests run in parallel threads: `create_dir_all` tolerates an already existing
        // directory, whereas an `exists()` check followed by `create_dir` can race and panic.
        std::fs::create_dir_all(docs_cache)
            .unwrap_or_else(|e| panic!("Failed to create {}: {}", docs_cache, e));

        let file_path = format!("{}/{}", docs_cache, pdf_name);
        if std::path::Path::new(&file_path).exists() {
            return file_path;
        }

        let link_path = format!("tests/docs/{}", link_name);
        let url = std::fs::read_to_string(&link_path)
            .unwrap_or_else(|e| panic!("Failed to read {}: {}", link_path, e));
        // The link file usually ends with a newline, which is not part of the URL.
        let url = url.trim();
        let resp = ureq::get(url)
            .call()
            .unwrap_or_else(|e| panic!("Failed to download {}: {}", url, e));

        // Write to a private temporary file and rename it into place afterwards, so that a
        // concurrently running test or a later run never picks up a partially written PDF.
        let partial_path = format!(
            "{}.{}.{:?}.part",
            file_path,
            std::process::id(),
            std::thread::current().id()
        );
        let mut file = std::fs::File::create(&partial_path)
            .unwrap_or_else(|e| panic!("Failed to create {}: {}", partial_path, e));
        std::io::copy(&mut resp.into_reader(), &mut file)
            .unwrap_or_else(|e| panic!("Failed to store {}: {}", partial_path, e));
        drop(file);

        if let Err(e) = std::fs::rename(&partial_path, &file_path) {
            // Another test may have published the same document in the meantime; that copy is
            // just as good, so only give up when no cached file exists at all.
            let _ = std::fs::remove_file(&partial_path);
            assert!(
                std::path::Path::new(&file_path).exists(),
                "Failed to move {} to {}: {}",
                partial_path,
                file_path,
                e
            );
        }
        file_path
    }
}

0 comments on commit 96cb66e

Please sign in to comment.