Use the correct way to pull Spans into a seq, allowing capture of multi-word names
hellcoderz · Feb 12, 2012 · 887add2 · 887add2
1 parent 493ab08
commit 887add2
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ docs/*
 parser-model/en-parser-chunking.bin
 .lein-failures
 multi-lib/*
+.lein-deps-sum
diff --git a/src/opennlp/nlp.clj b/src/opennlp/nlp.clj
@@ -11,7 +11,8 @@
                            Detokenizer$DetokenizationOperation
                            DictionaryDetokenizer
                            TokenizerME
-                           TokenizerModel)))
+                           TokenizerModel)
+   (opennlp.tools.util Span)))
 
 ;; OpenNLP property for pos-tagging. Meant to be rebound before
 ;; calling the tagging creators
@@ -108,7 +109,7 @@
           matches (.find finder (into-array String tokens))
           probs (seq (.probs finder))]
       (with-meta
-        (distinct (map #(get tokens (.getStart %)) matches))
+        (distinct (Span/spansToStrings matches (into-array String tokens)))
         {:probabilities probs}))))
 
 (defmulti make-detokenizer

diff --git a/test/opennlp/test/nlp.clj b/test/opennlp/test/nlp.clj
@@ -35,7 +35,10 @@
   (is (= (name-find (tokenize "My name is Lee, not John"))
          '("Lee" "John")))
   (is (= (name-find ["adsf"])
-         '())))
+         '()))
+  (is (= (name-find (tokenize "My name is James Brown"))
+         '("James Brown"))
+      "should find names with two words"))
 
 (deftest detokenizer-test
   (is (= (detokenize (tokenize "I don't think he would've."))