Skip to content

Commit

Permalink
start work on moving to opennlp 1.5
Browse files Browse the repository at this point in the history
  • Loading branch information
dakrone committed Oct 27, 2010
1 parent a3dbd6a commit f33c953
Show file tree
Hide file tree
Showing 12 changed files with 94 additions and 92 deletions.
2 changes: 1 addition & 1 deletion README.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ Treebank-parsing
----------------

<b>Note: Treebank parsing is very memory intensive; make sure your JVM has
a sufficient amount of memory available (using something like -Xmx1024m)
a sufficient amount of memory available (using something like -Xmx512m)
or you will run out of heap space when using a treebank parser.</b>

Treebank parsing gets its own section due to how complex it is. One difference
Expand Down
Binary file removed models/EnglishChunk.bin.gz
Binary file not shown.
Binary file removed models/EnglishSD.bin.gz
Binary file not shown.
Binary file removed models/EnglishTok.bin.gz
Binary file not shown.
Binary file added models/en-sent.bin
Binary file not shown.
1 change: 0 additions & 1 deletion models/namefind/README

This file was deleted.

Binary file removed models/namefind/organization.bin.gz
Binary file not shown.
Binary file removed models/namefind/person.bin.gz
Binary file not shown.
Binary file removed models/tag.bin.gz
Binary file not shown.
6 changes: 4 additions & 2 deletions project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
:description "Natural Language Processing with Clojure, library for opennlp. http://github.com/dakrone/clojure-opennlp"
:dependencies [[org.clojure/clojure "1.2.0"]
[org.clojure/clojure-contrib "1.2.0"]
[org.clojars.thnetos/opennlp-tools "1.4.3"]]
:dev-dependencies [[lein-clojars "0.5.0-SNAPSHOT"]])
;; [org.clojars.thnetos/opennlp-tools "1.4.3"]
[opennlp/tools "1.5.0"]]
:dev-dependencies [[lein-clojars "0.5.0-SNAPSHOT"]]
:repositories {"opennlp.sf.net" "http://opennlp.sourceforge.net/maven2"})

64 changes: 31 additions & 33 deletions src/opennlp/nlp.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@
(ns opennlp.nlp
(:use [clojure.contrib.seq-utils :only [indexed]])
(:use [clojure.contrib.pprint :only [pprint]])
(:import [java.io File FileNotFoundException])
(:import [opennlp.maxent DataStream GISModel])
(:import [opennlp.maxent.io PooledGISModelReader SuffixSensitiveGISModelReader])
(:import [opennlp.tools.util Span])
(:import [opennlp.tools.dictionary Dictionary])
(:import [opennlp.tools.tokenize TokenizerME])
(:import [opennlp.tools.sentdetect SentenceDetectorME])
(:import [opennlp.tools.namefind NameFinderME])
(:import [opennlp.tools.chunker ChunkerME])
(:import [opennlp.tools.coref LinkerMode])
(:import [opennlp.tools.coref.mention Mention DefaultParse])
(:import [opennlp.tools.lang.english ParserTagger ParserChunker HeadRules TreebankLinker CorefParse])
(:import [opennlp.tools.parser.chunking Parser])
(:import [opennlp.tools.parser AbstractBottomUpParser Parse])
(:import [opennlp.tools.postag POSTaggerME DefaultPOSContextGenerator POSContextGenerator]))
(:import [java.io File FileNotFoundException FileInputStream])
#_(:import [opennlp.maxent DataStream GISModel])
#_(:import [opennlp.maxent.io PooledGISModelReader SuffixSensitiveGISModelReader])
#_(:import [opennlp.tools.util Span])
#_(:import [opennlp.tools.dictionary Dictionary])
#_(:import [opennlp.tools.tokenize TokenizerME])
(:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME])
#_(:import [opennlp.tools.namefind NameFinderME])
#_(:import [opennlp.tools.chunker ChunkerME])
#_(:import [opennlp.tools.coref LinkerMode])
#_(:import [opennlp.tools.coref.mention Mention DefaultParse])
#_(:import [opennlp.tools.lang.english ParserTagger ParserChunker HeadRules TreebankLinker CorefParse])
#_(:import [opennlp.tools.parser.chunking Parser])
#_(:import [opennlp.tools.parser AbstractBottomUpParser Parse])
#_(:import [opennlp.tools.postag POSTaggerME DefaultPOSContextGenerator POSContextGenerator]))


;;; OpenNLP property for pos-tagging. Meant to be rebound before
Expand All @@ -27,26 +27,24 @@
[filename]
(.exists (File. filename)))


(defn- files-exist?
  "Return true only if every path in filenames names an existing file.

  filenames is a sequence of path strings; each one is checked with
  file-exist?. An empty sequence yields true (there is nothing missing).

  The earlier implementation used (reduce 'and ...). `and` is a macro and
  cannot be passed as a value, and the quoted symbol 'and, when invoked,
  performs a map lookup with a default rather than a logical AND. As a
  result it effectively returned the existence of the last file only, and
  threw on an empty sequence."
  [filenames]
  (every? file-exist? filenames))


(defn make-sentence-detector
  "Return a function for detecting sentences based on a given model file.

  modelfile is the path to an OpenNLP sentence model (for example
  \"models/en-sent.bin\"). Throws FileNotFoundException if the file does
  not exist.

  The returned function takes a string of text and returns a vector of
  the sentences found by the detector."
  [modelfile]
  (if-not (file-exist? modelfile)
    (throw (FileNotFoundException. "Model file does not exist."))
    ;; Read the model once, when the detector function is created, instead
    ;; of re-reading the file on every call. with-open guarantees the
    ;; stream is closed even if model loading throws.
    (let [model (with-open [model-stream (FileInputStream. modelfile)]
                  (SentenceModel. model-stream))]
      (fn sentencizer
        [text]
        ;; A fresh detector is still built per call, as before; only the
        ;; loaded model is shared between calls.
        (let [detector  (SentenceDetectorME. model)
              sentences (.sentDetect detector text)]
          (into [] sentences))))))


(defn make-tokenizer
#_(defn make-tokenizer
"Return a function for tokenizing a sentence based on a given model file."
[modelfile]
(if-not (file-exist? modelfile)
Expand All @@ -59,7 +57,7 @@
(into [] tokens)))))


(defn make-pos-tagger
#_(defn make-pos-tagger
"Return a function for tagging tokens based on a given model file."
[modelfile]
(if-not (file-exist? modelfile)
Expand All @@ -74,7 +72,7 @@
(map #(vector %1 %2) tokens (first tags))))))


(defn make-name-finder
#_(defn make-name-finder
"Return a function for finding names from tokens based on given model file(s)."
[& modelfiles]
(if-not (files-exist? modelfiles)
Expand Down Expand Up @@ -130,7 +128,7 @@

(defstruct treebank-phrase :phrase :tag)

(defn make-treebank-chunker
#_(defn make-treebank-chunker
"Return a function for chunking phrases from pos-tagged tokens based on
a given model file."
[modelfile]
Expand Down Expand Up @@ -193,7 +191,7 @@
(.replaceAll "\\}" "-RCB-")))


(defn- parse-line
#_(defn- parse-line
"Given a line and Parser object, return a list of Parses."
[line parser]
(let [line (strip-parens line)
Expand All @@ -216,7 +214,7 @@
(.toString results)))


(defn make-treebank-parser
#_(defn make-treebank-parser
"Return a function for treebank parsing a sequence of sentences, based on
given build, check, tag, chunk models and a set of head rules."
[buildmodel checkmodel tagmodel chunkmodel headrules & opts]
Expand Down Expand Up @@ -297,7 +295,7 @@
(reset! start (.getEnd s))))

;;; This is broken, don't use this.
(defn print-parse
#_(defn print-parse
"Given a parse and the EntityMentions-to-parse map, print out the parse."
[p parse-map]
(let [start (atom (.getStart (.getSpan p)))
Expand Down Expand Up @@ -346,7 +344,7 @@
(map #(print-parse % parse-map) parses)))


(defn coref-extent
#_(defn coref-extent
[extent p index]
(if (nil? extent)
(let [snp (Parse. (.getText p) (.getSpan extent) "NML" 1.0 0)]
Expand All @@ -355,7 +353,7 @@
nil))


(defn coref-sentence
#_(defn coref-sentence
[sentence parses index tblinker]
(let [p (Parse/parseParse sentence)
extents (.getMentions (.getMentionFinder tblinker) (DefaultParse. p index))]
Expand All @@ -365,7 +363,7 @@


; Second Attempt
(defn make-treebank-linker
#_(defn make-treebank-linker
"Make a TreebankLinker, given a model directory."
[modeldir]
(let [tblinker (TreebankLinker. modeldir LinkerMode/TEST)]
Expand Down
113 changes: 58 additions & 55 deletions test/opennlp/test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,72 +4,75 @@
(:use [clojure.test])
(:import [java.io File FileNotFoundException]))

(def get-sentences (make-sentence-detector "models/EnglishSD.bin.gz"))
(def tokenize (make-tokenizer "models/EnglishTok.bin.gz"))
(def pos-tag (make-pos-tagger "models/tag.bin.gz"))
(def name-find (make-name-finder "models/namefind/person.bin.gz"))
(def chunker (make-treebank-chunker "models/EnglishChunk.bin.gz"))
(def get-sentences (make-sentence-detector "models/en-sent.bin"))
#_(def tokenize (make-tokenizer "models/EnglishTok.bin.gz"))
#_(def pos-tag (make-pos-tagger "models/tag.bin.gz"))
#_(def name-find (make-name-finder "models/namefind/person.bin.gz"))
#_(def chunker (make-treebank-chunker "models/EnglishChunk.bin.gz"))


(deftest sentence-split-test
(is (= (get-sentences "First sentence. Second sentence? Here is another one. And so on and so forth - you get the idea...")
["First sentence. " "Second sentence? " "Here is another one. " "And so on and so forth - you get the idea..."]))
["First sentence." "Second sentence?" "Here is another one." "And so on and so forth - you get the idea..."]))
(is (= (get-sentences "'Hmmm.... now what?' Mr. Green said to H.A.L.")
["'Hmmm.... now what?' Mr. Green said to H.A.L."])))
["'Hmmm.... now what?'" "Mr. Green said to H.A.L."])))

(deftest tokenizer-test
(is (= (tokenize "First sentence.")
["First" "sentence" "."]))
(is (= (tokenize "Mr. Smith gave a car to his son on Friday.")
["Mr." "Smith" "gave" "a" "car" "to" "his" "son" "on" "Friday" "."])))
(comment
(deftest tokenizer-test
(is (= (tokenize "First sentence.")
["First" "sentence" "."]))
(is (= (tokenize "Mr. Smith gave a car to his son on Friday.")
["Mr." "Smith" "gave" "a" "car" "to" "his" "son" "on" "Friday" "."])))

(deftest pos-tag-test
(is (= (pos-tag (tokenize "Mr. Smith gave a car to his son on Friday."))
'(["Mr." "NNP"] ["Smith" "NNP"] ["gave" "VBD"] ["a" "DT"] ["car" "NN"] ["to" "TO"] ["his" "PRP$"] ["son" "NN"] ["on" "IN"] ["Friday" "NNP"] ["." "."]))))
(deftest pos-tag-test
(is (= (pos-tag (tokenize "Mr. Smith gave a car to his son on Friday."))
'(["Mr." "NNP"] ["Smith" "NNP"] ["gave" "VBD"] ["a" "DT"] ["car" "NN"] ["to" "TO"] ["his" "PRP$"] ["son" "NN"] ["on" "IN"] ["Friday" "NNP"] ["." "."]))))

(deftest name-finder-test
(is (= (name-find (tokenize "My name is Lee, not John"))
'("Lee" "John"))))
(deftest name-finder-test
(is (= (name-find (tokenize "My name is Lee, not John"))
'("Lee" "John"))))

(deftest chunker-test
(is (= (chunker (pos-tag (tokenize "The override system is meant to deactivate the accelerator when the brake pedal is pressed.")))
'({:phrase ["The" "override" "system"] :tag "NP"}
{:phrase ["is" "meant" "to" "deactivate"] :tag "VP"}
{:phrase ["the" "accelerator"] :tag "NP"}
{:phrase ["when"] :tag "ADVP"}
{:phrase ["the" "brake" "pedal"] :tag "NP"}
{:phrase ["is" "pressed"] :tag "VP"}))))
(deftest chunker-test
(is (= (chunker (pos-tag (tokenize "The override system is meant to deactivate the accelerator when the brake pedal is pressed.")))
'({:phrase ["The" "override" "system"] :tag "NP"}
{:phrase ["is" "meant" "to" "deactivate"] :tag "VP"}
{:phrase ["the" "accelerator"] :tag "NP"}
{:phrase ["when"] :tag "ADVP"}
{:phrase ["the" "brake" "pedal"] :tag "NP"}
{:phrase ["is" "pressed"] :tag "VP"}))))

(try
(do
(def parser (make-treebank-parser "parser-models/build.bin.gz" "parser-models/check.bin.gz" "parser-models/tag.bin.gz" "parser-models/chunk.bin.gz" "parser-models/head_rules"))
(deftest parser-test
(is (= (parser ["This is a sentence ."])
["(TOP (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))"]))))
(catch FileNotFoundException e
(println "Unable to execute treebank-parser tests. Download the model files to $PROJECT_ROOT/parser-models.")))
(try
(do
(def parser (make-treebank-parser "parser-models/build.bin.gz" "parser-models/check.bin.gz" "parser-models/tag.bin.gz" "parser-models/chunk.bin.gz" "parser-models/head_rules"))
(deftest parser-test
(is (= (parser ["This is a sentence ."])
["(TOP (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))"]))))
(catch FileNotFoundException e
(println "Unable to execute treebank-parser tests. Download the model files to $PROJECT_ROOT/parser-models.")))

(deftest no-model-file-test
(is (thrown? FileNotFoundException (make-sentence-detector "nonexistantfile")))
(is (thrown? FileNotFoundException (make-tokenizer "nonexistantfile")))
(is (thrown? FileNotFoundException (make-pos-tagger "nonexistantfile")))
(is (thrown? FileNotFoundException (make-name-finder "nonexistantfile" "anotherfilethatdoesnotexist")))
(is (thrown? FileNotFoundException (make-treebank-chunker "nonexistantfile")))
(is (thrown? FileNotFoundException (make-treebank-parser "nonexistantfile" "asdf" "fdsa" "qwer" "rewq"))))
(deftest no-model-file-test
(is (thrown? FileNotFoundException (make-sentence-detector "nonexistantfile")))
(is (thrown? FileNotFoundException (make-tokenizer "nonexistantfile")))
(is (thrown? FileNotFoundException (make-pos-tagger "nonexistantfile")))
(is (thrown? FileNotFoundException (make-name-finder "nonexistantfile" "anotherfilethatdoesnotexist")))
(is (thrown? FileNotFoundException (make-treebank-chunker "nonexistantfile")))
(is (thrown? FileNotFoundException (make-treebank-parser "nonexistantfile" "asdf" "fdsa" "qwer" "rewq"))))

(deftest laziness-test
(let [s (get-sentences "First sentence. Second sentence?")]
(is (= (type (lazy-tokenize s tokenize))
clojure.lang.LazySeq))
(is (= (first (lazy-tokenize s tokenize))
["First" "sentence" "."]))
(is (= (type (lazy-tag s tokenize pos-tag))
clojure.lang.LazySeq))
(is (= (first (lazy-tag s tokenize pos-tag))
'(["First" "RB"] ["sentence" "NN"] ["." "."])))
(is (= (type (lazy-chunk s tokenize pos-tag chunker))
clojure.lang.LazySeq))
(is (= (first (lazy-chunk s tokenize pos-tag chunker))
'({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"})))))
(deftest laziness-test
(let [s (get-sentences "First sentence. Second sentence?")]
(is (= (type (lazy-tokenize s tokenize))
clojure.lang.LazySeq))
(is (= (first (lazy-tokenize s tokenize))
["First" "sentence" "."]))
(is (= (type (lazy-tag s tokenize pos-tag))
clojure.lang.LazySeq))
(is (= (first (lazy-tag s tokenize pos-tag))
'(["First" "RB"] ["sentence" "NN"] ["." "."])))
(is (= (type (lazy-chunk s tokenize pos-tag chunker))
clojure.lang.LazySeq))
(is (= (first (lazy-chunk s tokenize pos-tag chunker))
'({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"})))))

)


0 comments on commit f33c953

Please sign in to comment.