forked from jrmuizel/pdf-extract
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
89 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
target | ||
.idea/* | ||
Cargo.lock | ||
Cargo.lock | ||
tests/docs_cache |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,13 @@ | ||
[package] | ||
authors = ["Jeff Muizelaar <[email protected]>"] | ||
name = "pdf-extract" | ||
version = "0.6.5-alpha.0" | ||
license = "MIT" | ||
documentation = "https://docs.rs/crate/pdf-extract/" | ||
description = "A library to extract content from pdfs" | ||
documentation = "https://docs.rs/crate/pdf-extract/" | ||
edition = "2018" | ||
keywords = ["pdf2text", "text", "pdf", "pdf2txt"] | ||
license = "MIT" | ||
name = "pdf-extract" | ||
repository = "https://github.com/jrmuizel/pdf-extract" | ||
edition = "2018" | ||
version = "0.6.5-alpha.0" | ||
|
||
[profile.release] | ||
debug = true | ||
|
@@ -16,8 +16,11 @@ debug = true | |
adobe-cmap-parser = "0.3.3" | ||
encoding = "0.2.33" | ||
euclid = "0.20.5" | ||
lopdf = { version = "0.29", default-features = false, features = [ "pom_parser" ] } | ||
linked-hash-map = "=0.5.3" | ||
lopdf = {version = "0.29", default-features = false, features = ["pom_parser"]} | ||
postscript = "0.14" | ||
type1-encoding-parser = "0.1.0" | ||
unicode-normalization = "0.1.19" | ||
|
||
[dev-dependencies] | ||
ureq = "2.6.2" |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
https://web.archive.org/web/20160112115354/http://www.fao.org/fileadmin/user_upload/tci/docs/2_About%20Stacks.pdf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
use pdf_extract::extract_text; | ||
|
||
// Builds an `ExpectedText` from a file name and the snippet expected in that file.
// Usage: expected!("atomic.pdf", "Atomic Data");
macro_rules! expected {
    ($file:expr, $needle:expr) => {
        ExpectedText {
            filename: $file,
            text: $needle,
        }
    };
}
|
||
// Each entry pairs a document with a snippet that its extracted text must contain;
// every entry is checked in turn.
#[test]
fn extract_expected_text() {
    vec![expected!("documents_stack.pdf.link", "mouse button until")]
        .into_iter()
        .for_each(|case| case.test());
}
|
||
// Runs the extractor over every entry of the `tests/docs` directory. The expected text is
// empty, so the only way to fail is a panic while reading or extracting a document.
#[test]
fn extract_all_docs() {
    for entry in std::fs::read_dir("tests/docs").unwrap() {
        // `path` must outlive `name`, which may borrow from it.
        let path = entry.unwrap().path();
        let name = path.file_name().unwrap().to_string_lossy();
        expected!(&name, "").test();
    }
}
|
||
/// A document under test together with a snippet its extracted text must contain,
/// e.g. `ExpectedText { filename: "atomic.pdf", text: "Atomic Data" }`.
#[derive(Debug, PartialEq)]
struct ExpectedText<'doc> {
    /// Name of the document file, looked up under `tests/docs`.
    filename: &'doc str,
    /// Snippet that has to appear somewhere in the extracted text.
    text: &'doc str,
}
|
||
impl ExpectedText<'_> { | ||
/// Opens the `filename` from `tests/docs`, extracts the text and checks if it contains `text` | ||
/// If the file ends with `_link`, it will download the file from the url in the file to the `tests/docs_cache` directory | ||
fn test(self) { | ||
let ExpectedText { filename, text } = self; | ||
let file_path = if filename.ends_with(".pdf.link") { | ||
let docs_cache = "tests/docs_cache"; | ||
if !std::path::Path::new(docs_cache).exists() { | ||
std::fs::create_dir(docs_cache).unwrap(); | ||
} | ||
let file_path = format!("{}/{}", docs_cache, filename.replace(".link", "")); | ||
if std::path::Path::new(&file_path).exists() { | ||
file_path | ||
} else { | ||
let url = std::fs::read_to_string(format!("tests/docs/{}", filename)).unwrap(); | ||
let resp = ureq::get(&url).call().unwrap(); | ||
let mut file = std::fs::File::create(&file_path).unwrap(); | ||
std::io::copy(&mut resp.into_reader(), &mut file).unwrap(); | ||
file_path | ||
} | ||
} else { | ||
format!("tests/docs/{}", filename) | ||
}; | ||
let out = extract_text(file_path) | ||
.unwrap_or_else(|e| panic!("Failed to extract text from {}, {}", filename, e)); | ||
println!("{}", out); | ||
assert!( | ||
out.contains(text), | ||
"Text {} does not contain '{}'", | ||
filename, | ||
text | ||
); | ||
} | ||
} |