From fa9ece994fbdcd81462649b712a62fa130e7b9f0 Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Mon, 21 Mar 2011 22:29:37 -0600 Subject: [PATCH] add a test from corpus that wasn't passing due to improper tokenization --- src/opennlp/nlp.clj | 12 ++++++------ test/opennlp/test/nlp.clj | 8 +++++++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/opennlp/nlp.clj b/src/opennlp/nlp.clj index 5ea51c6..6aa4081 100644 --- a/src/opennlp/nlp.clj +++ b/src/opennlp/nlp.clj @@ -132,10 +132,10 @@ (loop [ts tokens dt-ops detoken-ops] (let [op (first dt-ops) op2 (second dt-ops)] - ;;(println :op op) - ;;(println :op2 op) - ;;(println :ts (first ts)) - ;;(println :sb (.toString sb)) + ;; (println :op op) + ;; (println :op2 op) + ;; (println :ts (first ts)) + ;; (println :sb (.toString sb)) (cond (or (= op2 nil) (= op2 Detokenizer$DetokenizationOperation/MERGE_TO_LEFT)) @@ -148,8 +148,8 @@ (= op DetokenizationDictionary$Operation/RIGHT_LEFT_MATCHING) (if (contains? @token-set (first ts)) (do - ;;(println :token-set @token-set) - ;;(println :ts (first ts)) + ;; (println :token-set @token-set) + ;; (println :ts (first ts)) (swap! token-set disj (first ts)) (.append sb (first ts))) (do diff --git a/test/opennlp/test/nlp.clj b/test/opennlp/test/nlp.clj index 5a04741..696c452 100644 --- a/test/opennlp/test/nlp.clj +++ b/test/opennlp/test/nlp.clj @@ -53,7 +53,13 @@ (is (= (detokenize (tokenize "Mary likes cows (Mary is a cow).")) "Mary likes cows (Mary is a cow).")) (is (= (detokenize (tokenize "Mary exclaimed \"I am a cow!\"")) - "Mary exclaimed \"I am a cow!\""))) + "Mary exclaimed \"I am a cow!\"")) + (is (= (detokenize ["I" "know" "what" "\"" "it" "\"" "means" "well" "enough" + "," "when" "I" "find" "a" "thing" "," "said" "the" "Duck" + ":" "its" "generally" "a" "frog" "or" "a" "worm" "."]) + (str "I know what \"it\" means well enough, when" + " I find a thing, said the Duck: its" + " generally a frog or a worm.")))) (deftest precondition-test (is (thrown? java.lang.AssertionError (get-sentences 1)))