Commit
fix tokenizer for opennlp 1.5
dakrone committed Oct 27, 2010
1 parent f33c953 commit 53f6efe
Showing 3 changed files with 18 additions and 18 deletions.
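In OpenNLP 1.5 the tokenizer is loaded from a TokenizerModel read off an input stream rather than through the old maxent SuffixSensitiveGISModelReader, so the diff below rewrites make-tokenizer around that API and the tests pick up the newly added models/en-token.bin. A minimal interop sketch of the 1.5-style loading (assumes opennlp-tools 1.5 on the classpath and the model file from this commit on disk):

(import '[java.io FileInputStream]
        '[opennlp.tools.tokenize TokenizerModel TokenizerME])

;; Read the 1.5-format model, wrap it in a TokenizerME, and tokenize one
;; sentence; with-open closes the model stream once the model is built.
(with-open [model-stream (FileInputStream. "models/en-token.bin")]
  (let [model     (TokenizerModel. model-stream)
        tokenizer (TokenizerME. model)]
    (into [] (.tokenize tokenizer "First sentence."))))
;; => ["First" "sentence" "."]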
Binary file added models/en-token.bin
src/opennlp/nlp.clj: 30 changes (15 additions, 15 deletions)
@@ -7,7 +7,7 @@
#_(:import [opennlp.maxent.io PooledGISModelReader SuffixSensitiveGISModelReader])
#_(:import [opennlp.tools.util Span])
#_(:import [opennlp.tools.dictionary Dictionary])
#_(:import [opennlp.tools.tokenize TokenizerME])
(:import [opennlp.tools.tokenize TokenizerModel TokenizerME])
(:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME])
#_(:import [opennlp.tools.namefind NameFinderME])
#_(:import [opennlp.tools.chunker ChunkerME])
@@ -32,30 +32,30 @@
(reduce 'and (map file-exist? filenames)))

(defn make-sentence-detector
"Return a function for splitting sentences given a model file."
[modelfile]
(if-not (file-exist? modelfile)
(throw (FileNotFoundException.))
(fn sentencizer
(fn sentence-detector
[text]
(let [model-stream (FileInputStream. modelfile)
model (SentenceModel. model-stream)
detector (SentenceDetectorME. model)
sentences (.sentDetect detector text)]
(into [] sentences)))))
(with-open [model-stream (FileInputStream. modelfile)]
(let [model (SentenceModel. model-stream)
detector (SentenceDetectorME. model)
sentences (.sentDetect detector text)]
(into [] sentences))))))


#_(defn make-tokenizer
(defn make-tokenizer
"Return a function for tokenizing a sentence based on a given model file."
[modelfile]
(if-not (file-exist? modelfile)
(throw (FileNotFoundException. "Model file does not exist."))
(throw (FileNotFoundException.))
(fn tokenizer
[sentence]
(let [model (.getModel (SuffixSensitiveGISModelReader. (File. modelfile)))
tokenizer (TokenizerME. model)
tokens (.tokenize tokenizer sentence)]
(into [] tokens)))))

(with-open [model-stream (FileInputStream. modelfile)]
(let [model (TokenizerModel. model-stream)
tokenizer (TokenizerME. model)
tokens (.tokenize tokenizer sentence)]
(into [] tokens))))))

#_(defn make-pos-tagger
"Return a function for tagging tokens based on a given model file."
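As a usage sketch, the rewritten wrappers are built and called exactly as before; only the tokenizer's model file changes (this mirrors the updated test file below):

(use 'opennlp.nlp)

(def get-sentences (make-sentence-detector "models/en-sent.bin"))
(def tokenize (make-tokenizer "models/en-token.bin"))

(tokenize "Mr. Smith gave a car to his son on Friday.")
;; => ["Mr." "Smith" "gave" "a" "car" "to" "his" "son" "on" "Friday" "."]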
test/opennlp/test.clj: 6 changes (3 additions, 3 deletions)
@@ -5,7 +5,7 @@
(:import [java.io File FileNotFoundException]))

(def get-sentences (make-sentence-detector "models/en-sent.bin"))
#_(def tokenize (make-tokenizer "models/EnglishTok.bin.gz"))
(def tokenize (make-tokenizer "models/en-token.bin"))
#_(def pos-tag (make-pos-tagger "models/tag.bin.gz"))
#_(def name-find (make-name-finder "models/namefind/person.bin.gz"))
#_(def chunker (make-treebank-chunker "models/EnglishChunk.bin.gz"))
@@ -17,13 +17,13 @@
(is (= (get-sentences "'Hmmm.... now what?' Mr. Green said to H.A.L.")
["'Hmmm.... now what?'" "Mr. Green said to H.A.L."])))

(comment

(deftest tokenizer-test
(is (= (tokenize "First sentence.")
["First" "sentence" "."]))
(is (= (tokenize "Mr. Smith gave a car to his son on Friday.")
["Mr." "Smith" "gave" "a" "car" "to" "his" "son" "on" "Friday" "."])))

(comment
(deftest pos-tag-test
(is (= (pos-tag (tokenize "Mr. Smith gave a car to his son on Friday."))
'(["Mr." "NNP"] ["Smith" "NNP"] ["gave" "VBD"] ["a" "DT"] ["car" "NN"] ["to" "TO"] ["his" "PRP$"] ["son" "NN"] ["on" "IN"] ["Friday" "NNP"] ["." "."]))))
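With tokenizer-test moved out of the (comment ...) block it now runs with the rest of the suite; a quick REPL sketch (assumes the test namespace and the models above are on the classpath):

(require 'opennlp.test)
(clojure.test/run-tests 'opennlp.test)
;; Runs the sentence-detection test plus the re-enabled tokenizer-test;
;; pos-tag-test remains wrapped in (comment ...) and is skipped.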
