Skip to content

Commit

Permalink
Create namespaced metadata keywords
Browse files Browse the repository at this point in the history
  • Loading branch information
lenaschoenburg committed Mar 1, 2016
1 parent 25dc231 commit 4e81879
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 18 deletions.
14 changes: 12 additions & 2 deletions src/clojure/pantomime/extract.clj
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
(ns pantomime.extract
(:require [pantomime.internal :refer :all]
[clojure.string :as string]
[clojure.java.io :refer [input-stream copy]])
(:import [java.io File InputStream ByteArrayInputStream]
[org.apache.tika Tika]
[java.net URL]
[org.apache.tika.metadata Metadata]
[org.apache.tika.sax BodyContentHandler XHTMLContentHandler]
[org.apache.tika.sax BodyContentHandler]
[org.apache.tika.parser Parser AbstractParser
AutoDetectParser ParseContext]))

(defn convert-key
  "Converts a metadata name string `k` into a keyword.

  Underscores are replaced with hyphens and the name is lower-cased. The
  result is then split on \":\"; the last segment becomes the keyword name
  and any preceding segments, joined with \".\", become its namespace.

    (convert-key \"pdf:PDFVersion\")                ;=> :pdf/pdfversion
    (convert-key \"access_permission:can_modify\")  ;=> :access-permission/can-modify
    (convert-key \"Content-Type\")                  ;=> :content-type
    (convert-key \"a:b:c\")                         ;=> :a.b/c"
  [k]
  (let [;; Lower-case with Locale/ROOT so the resulting keys do not depend on
        ;; the JVM default locale (e.g. under tr-TR, \"I\" would otherwise
        ;; become a dotless \"ı\" and :title would no longer match).
        lisp-case (.toLowerCase ^String (string/replace k \_ \-)
                                java.util.Locale/ROOT)
        segments  (string/split lisp-case #":")
        nspace    (butlast segments)
        n         (last segments)]
    (if (seq nspace)
      ;; Multiple namespace segments collapse into one dotted namespace.
      (keyword (string/join "." nspace) n)
      (keyword n))))

(defn conv-metadata
[^Metadata mdata]
(let [names (.names mdata)]
(zipmap (map #(keyword (.toLowerCase ^String %1)) names)
(zipmap (map convert-key names)
(map #(seq (.getValues mdata ^String %1)) names))))

(def ^{:private true} tika-class (Tika.))
Expand Down
25 changes: 11 additions & 14 deletions test/pantomime/test/extract_test.clj
Original file line number Diff line number Diff line change
@@ -1,37 +1,34 @@
(ns pantomime.test.extract-test
(:require [clojure.java.io :as io]
[pantomime.extract :as extract]
[clj-http.client :as http]
[clojure.test :refer :all])
(:import [java.io File FileInputStream]
java.net.URL))
[clojure.test :refer :all]))


(deftest test-extract-metadata
(let [parsed (-> "resources/pdf/qrl.pdf"
io/resource
extract/parse)]
(are [x y] (= (x parsed) (list y))
:pdf:pdfversion "1.2"
:dc:title "main.dvi")))
:pdf/pdfversion "1.2"
:dc/title "main.dvi")))

(deftest test-extract-metadata-input-stream
(let [parsed (-> "resources/pdf/qrl.pdf"
io/resource
io/input-stream
extract/parse)]
(are [x y] (= (x parsed) (list y))
:pdf:pdfversion "1.2"
:dc:title "main.dvi")))
:pdf/pdfversion "1.2"
:dc/title "main.dvi")))

(deftest test-extract-metadata-file
(let [parsed (-> "resources/pdf/qrl.pdf"
io/resource
io/as-file
extract/parse)]
(are [x y] (= (x parsed) (list y))
:pdf:pdfversion "1.2"
:dc:title "main.dvi")))
:pdf/pdfversion "1.2"
:dc/title "main.dvi")))

;; http://stackoverflow.com/questions/7181658/byte-collection-to-string-on-clojure
;; http://alexander-hill.tumblr.com/post/88883810180/working-with-binary-files-in-clojure
Expand All @@ -44,8 +41,8 @@
_ (.read reader buffer 0 length)
parsed (extract/parse buffer)]
(are [x y] (= (x parsed) (list y))
:pdf:pdfversion "1.2"
:dc:title "main.dvi")))))
:pdf/pdfversion "1.2"
:dc/title "main.dvi")))))

(deftest test-extract-metadata-string
(let [parsed (extract/parse "test/resources/txt/english.txt")]
Expand Down Expand Up @@ -74,5 +71,5 @@
io/as-url
extract/parse)]
(are [x y] (= (x parsed) (list y))
:pdf:pdfversion "1.4"
:dc:title "Advanced Message Queuing Protocol Specification")))
:pdf/pdfversion "1.4"
:dc/title "Advanced Message Queuing Protocol Specification")))
3 changes: 1 addition & 2 deletions test/pantomime/test/web_test.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
(ns pantomime.test.web-test
(:require [clojure.java.io :as io]
[clj-http.client :as http]
(:require [clj-http.client :as http]
[clojure.test :refer :all]
[pantomime.web :refer :all]))

Expand Down

0 comments on commit 4e81879

Please sign in to comment.