Tests from pdf.link files

GQAdonis · Mar 23, 2023 · 96cb66e · 96cb66e
1 parent 2f449cb
commit 96cb66e
Show file tree

Hide file tree

Showing 5 changed files with 89 additions and 7 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 target
 .idea/*
-Cargo.lock
+Cargo.lock
+tests/docs_cache
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,13 +1,13 @@
 [package]
 authors = ["Jeff Muizelaar <[email protected]>"]
-name = "pdf-extract"
-version = "0.6.5-alpha.0"
-license = "MIT"
-documentation = "https://docs.rs/crate/pdf-extract/"
 description = "A library to extract content from pdfs"
+documentation = "https://docs.rs/crate/pdf-extract/"
+edition = "2018"
 keywords = ["pdf2text", "text", "pdf", "pdf2txt"]
+license = "MIT"
+name = "pdf-extract"
 repository = "https://github.com/jrmuizel/pdf-extract"
-edition = "2018"
+version = "0.6.5-alpha.0"
 
 [profile.release]
 debug = true
@@ -16,8 +16,11 @@ debug = true
 adobe-cmap-parser = "0.3.3"
 encoding = "0.2.33"
 euclid = "0.20.5"
-lopdf = { version = "0.29", default-features = false, features = [ "pom_parser" ] }
 linked-hash-map = "=0.5.3"
+lopdf = {version = "0.29", default-features = false, features = ["pom_parser"]}
 postscript = "0.14"
 type1-encoding-parser = "0.1.0"
 unicode-normalization = "0.1.19"
+
+[dev-dependencies]
+ureq = "2.6.2"
diff --git a/tests/alternate-color-space.pdf.link → tests/docs/alternate-color-space.pdf.link b/tests/alternate-color-space.pdf.link → tests/docs/alternate-color-space.pdf.link
diff --git a/tests/docs/documents_stack.pdf.link b/tests/docs/documents_stack.pdf.link
@@ -0,0 +1 @@
+https://web.archive.org/web/20160112115354/http://www.fao.org/fileadmin/user_upload/tci/docs/2_About%20Stacks.pdf
diff --git a/tests/tests.rs b/tests/tests.rs
@@ -0,0 +1,77 @@
+use pdf_extract::extract_text;
+
+// Shorthand for building an `ExpectedText` from a file name and the text
+// snippet that is expected to appear in that file.
+// example: expected!("atomic.pdf", "Atomic Data");
+macro_rules! expected {
+    ($file:expr, $needle:expr) => {
+        ExpectedText { filename: $file, text: $needle }
+    };
+}
+
+// Builds the list of expectations with the `expected!` macro and verifies
+// that each document's extracted text contains its expected snippet.
+#[test]
+fn extract_expected_text() {
+    vec![expected!("documents_stack.pdf.link", "mouse button until")]
+        .into_iter()
+        .for_each(|case| case.test());
+}
+
+// Runs every entry of the `tests/docs` directory through text extraction.
+// The expected text is empty, so the test only fails if something panics.
+#[test]
+fn extract_all_docs() {
+    for entry in std::fs::read_dir("tests/docs").unwrap() {
+        let doc_path = entry.unwrap().path();
+        let name = doc_path.file_name().unwrap().to_string_lossy();
+        expected!(&name, "").test();
+    }
+}
+
+/// Data structure to make it easy to check if certain files are correctly parsed,
+/// e.g. `ExpectedText { filename: "atomic.pdf", text: "Atomic Data" }`.
+#[derive(Debug, PartialEq)]
+struct ExpectedText<'a> {
+    /// File name inside `tests/docs`: either a PDF or a `.pdf.link` file
+    /// whose content is the URL of the PDF to download.
+    filename: &'a str,
+    /// Snippet that must occur in the extracted text. An empty string only
+    /// checks that extraction succeeds.
+    text: &'a str,
+}
+
+impl ExpectedText<'_> {
+    /// Opens `filename` from `tests/docs`, extracts the text and checks that it contains `text`.
+    ///
+    /// If the file name ends with `.pdf.link`, the PDF is downloaded from the URL stored in
+    /// that file into the `tests/docs_cache` directory (unless it is already cached there)
+    /// and the cached copy is used instead.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the document cannot be downloaded or read, if text extraction fails, or if
+    /// the extracted text does not contain `text`.
+    fn test(self) {
+        let ExpectedText { filename, text } = self;
+        let source = std::path::Path::new("tests/docs").join(filename);
+        let file_path = match filename.strip_suffix(".link") {
+            // Only the trailing `.link` is removed, so names that contain
+            // `.link` elsewhere keep their cache file name intact.
+            Some(pdf_name) if pdf_name.ends_with(".pdf") => {
+                Self::cached_download(&source, pdf_name)
+            }
+            _ => source,
+        };
+        let out = extract_text(&file_path)
+            .unwrap_or_else(|e| panic!("Failed to extract text from {}, {}", filename, e));
+        println!("{}", out);
+        assert!(
+            out.contains(text),
+            "Text extracted from {} does not contain '{}'",
+            filename,
+            text
+        );
+    }
+
+    /// Returns the path of `pdf_name` inside `tests/docs_cache`, downloading it first from
+    /// the URL stored in `link_file` when it is not cached yet.
+    fn cached_download(link_file: &std::path::Path, pdf_name: &str) -> std::path::PathBuf {
+        let docs_cache = std::path::Path::new("tests/docs_cache");
+        // `create_dir_all` succeeds when the directory already exists, so tests
+        // running in parallel cannot race between an existence check and the creation.
+        std::fs::create_dir_all(docs_cache)
+            .unwrap_or_else(|e| panic!("Failed to create {}: {}", docs_cache.display(), e));
+        let cached = docs_cache.join(pdf_name);
+        if cached.exists() {
+            return cached;
+        }
+
+        let url = std::fs::read_to_string(link_file)
+            .unwrap_or_else(|e| panic!("Failed to read {}: {}", link_file.display(), e));
+        // Link files usually end with a newline that is not part of the URL.
+        let url = url.trim();
+        let resp = ureq::get(url)
+            .call()
+            .unwrap_or_else(|e| panic!("Failed to download {}: {}", url, e));
+
+        // Download to a temporary file that is unique per process and thread, then rename
+        // it into place: an interrupted or concurrent download never leaves a truncated
+        // file under the final cache name.
+        let partial = docs_cache.join(format!(
+            "{}.{}.{:?}.part",
+            pdf_name,
+            std::process::id(),
+            std::thread::current().id()
+        ));
+        {
+            let mut file = std::fs::File::create(&partial)
+                .unwrap_or_else(|e| panic!("Failed to create {}: {}", partial.display(), e));
+            std::io::copy(&mut resp.into_reader(), &mut file)
+                .unwrap_or_else(|e| panic!("Failed to save {}: {}", url, e));
+        } // the file handle is closed here, before the rename
+        std::fs::rename(&partial, &cached)
+            .unwrap_or_else(|e| panic!("Failed to move download to {}: {}", cached.display(), e));
+        cached
+    }
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		https://web.archive.org/web/20160112115354/http://www.fao.org/fileadmin/user_upload/tci/docs/2_About%20Stacks.pdf