First commit of the opennlp name-finder code. The name-finder is very…

… rudimentary right now. Names like "John" work, but not "Jon".
crisweber · Feb 22, 2010 · 31e2cfb · 31e2cfb
1 parent 0c04741
commit 31e2cfb
Show file tree

Hide file tree

Showing 5 changed files with 41 additions and 5 deletions.
diff --git a/README.markdown b/README.markdown
@@ -6,7 +6,7 @@ A library to interface with the OpenNLP (Open Natural Language Processing) libra
 Basic Example usage (from a REPL):
 ----------------------------------
 
-    (use 'clojure.contrib.pprint)
+    (use 'clojure.contrib.pprint) ; just for this documentation
     (use 'opennlp.nlp)
 
 You will need to make the processing functions using the model files. These assume you're running
@@ -16,7 +16,13 @@ at http://opennlp.sourceforge.net/models/
     user=> (def get-sentences (make-sentence-detector "models/EnglishSD.bin.gz"))
     user=> (def tokenize (make-tokenizer "models/EnglishTok.bin.gz"))
     user=> (def pos-tag (make-pos-tagger "models/tag.bin.gz"))
+
+For name-finders in particular, it's possible to have multiple model files:
+
+    user=> (def name-find (make-name-finder "models/namefind/person.bin.gz" "models/namefind/organization.bin.gz"))
 
+Then, use the functions you've created to perform operations on text:
+
     user=> (pprint (get-sentences "First sentence. Second sentence? Here is another one. And so on and so forth - you get the idea..."))
     ["First sentence. ", "Second sentence? ", "Here is another one. ",
      "And so on and so forth - you get the idea..."]
@@ -40,6 +46,9 @@ at http://opennlp.sourceforge.net/models/
      ["Friday." "NNP"])
     nil
 
+    user=> (name-find (tokenize "My name is Lee, not John."))
+    ("Lee" "John")
+
 Filtering pos-tagged sequences
 ------------------------------
 

diff --git a/models/namefind/README b/models/namefind/README
@@ -0,0 +1 @@
+There are more namefind models than the two here. Check http://opennlp.sourceforge.net/models/ for more.
diff --git a/models/namefind/organization.bin.gz b/models/namefind/organization.bin.gz
diff --git a/models/namefind/person.bin.gz b/models/namefind/person.bin.gz
diff --git a/src/opennlp/nlp.clj b/src/opennlp/nlp.clj
@@ -1,11 +1,15 @@
 ; Clojure opennlp tools
 (ns opennlp.nlp
+  (:use [clojure.contrib.pprint])
+  (:use [clojure.contrib.seq-utils])
   (:import [java.io File FileNotFoundException])
   (:import [opennlp.maxent DataStream GISModel])
-  (:import [opennlp.maxent.io SuffixSensitiveGISModelReader])
+  (:import [opennlp.maxent.io PooledGISModelReader SuffixSensitiveGISModelReader])
+  (:import [opennlp.tools.util Span])
   (:import [opennlp.tools.dictionary Dictionary])
   (:import [opennlp.tools.tokenize TokenizerME])
   (:import [opennlp.tools.sentdetect SentenceDetectorME])
+  (:import [opennlp.tools.namefind NameFinderME])
   (:import [opennlp.tools.postag POSTaggerME DefaultPOSContextGenerator POSContextGenerator]))
 
 ; OpenNLP property for pos-tagging
@@ -56,6 +60,21 @@
         (map #(vector %1 %2) tokens (first tags))))))
 
 
+(defn make-name-finder
+  "Return a function for finding names from tokens based on given model file(s).
+
+  Each model file is read once, when the finder function is created, so
+  repeated calls do not go back to disk. The returned function takes the
+  tokens of a sentence and returns the distinct names found by all of the
+  models combined. Only the token at the start of each matched span is
+  returned, so a multi-token name yields just its first token."
+  [& modelfiles]
+  ; Load every model eagerly (doall) so that a bad path fails here, at
+  ; construction time, instead of on the first lookup.
+  (let [models (doall
+                 (map #(.getModel (PooledGISModelReader. (File. %)))
+                      modelfiles))]
+    (fn
+      [tokens]
+      (distinct
+        (mapcat
+          (fn [model]
+            ; A fresh NameFinderME per call, as before; only the loaded
+            ; model is shared between calls.
+            (let [finder  (NameFinderME. model)
+                  matches (.find finder tokens)]
+              (map #(nth tokens (.getStart %)) matches)))
+          models)))))
+
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -68,24 +87,25 @@
 (def get-sentences (make-sentence-detector "models/EnglishSD.bin.gz"))
 (def tokenize (make-tokenizer "models/EnglishTok.bin.gz"))
 (def pos-tag (make-pos-tagger "models/tag.bin.gz"))
+(def name-find (make-name-finder "models/namefind/person.bin.gz" "models/namefind/organization.bin.gz"))
 
 (pprint (get-sentences "First sentence. Second sentence? Here is another one. And so on and so forth - you get the idea..."))
 
-;opennlp=> (pprint (get-sentences "First sentence. Second sentence? Here is another one. And so on and so forth - you get the idea..."))
+;opennlp.nlp=> (pprint (get-sentences "First sentence. Second sentence? Here is another one. And so on and so forth - you get the idea..."))
 ;["First sentence. ", "Second sentence? ", "Here is another one. ",
 ; "And so on and so forth - you get the idea..."]
 ;nil
 
 (pprint (tokenize "Mr. Smith gave a car to his son on Friday"))
 
-;opennlp=> (pprint (tokenize "Mr. Smith gave a car to his son on Friday"))
+;opennlp.nlp=> (pprint (tokenize "Mr. Smith gave a car to his son on Friday"))
 ;["Mr.", "Smith", "gave", "a", "car", "to", "his", "son", "on",
 ; "Friday"]
 ;nil
 
 (pprint (pos-tag (tokenize "Mr. Smith gave a car to his son on Friday.")))
 
-;opennlp=> (pprint (pos-tag (tokenize "Mr. Smith gave a car to his son on Friday.")))
+;opennlp.nlp=> (pprint (pos-tag (tokenize "Mr. Smith gave a car to his son on Friday.")))
 ;(["Mr." "NNP"]
 ; ["Smith" "NNP"]
 ; ["gave" "VBD"]
@@ -98,5 +118,11 @@
 ; ["Friday." "NNP"])
 ;nil
 
+(name-find (tokenize "My name is Lee, not John."))
+
+;opennlp.nlp=> (name-find (tokenize "My name is Lee, not John."))
+;("Lee" "John")
+
+
 )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		There are more namefind models than the two here. Check http://opennlp.sourceforge.net/models/ for more.