Skip to content

Commit

Permalink
much work on detokenization to finally get parentheses working
Browse files Browse the repository at this point in the history
  • Loading branch information
dakrone committed Mar 22, 2011
1 parent 48797fe commit 79277f2
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 38 deletions.
30 changes: 21 additions & 9 deletions models/english-detokenizer.xml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ under the License.
-->

<dictionary>
<entry operation="RIGHT_LEFT_MATCHING">
<token>"</token>
</entry>
<entry operation="RIGHT_LEFT_MATCHING">
<token>'</token>
</entry>
<entry operation="MOVE_LEFT">
<token>.</token>
</entry>
Expand All @@ -38,12 +44,12 @@ under the License.
<entry operation="MOVE_LEFT">
<token>:</token>
</entry>
<entry operation="MOVE_LEFT">
<token>)</token>
</entry>
<entry operation="MOVE_RIGHT">
<token>(</token>
</entry>
<entry operation="MOVE_LEFT">
<token>)</token>
</entry>
<entry operation="MOVE_LEFT">
<token>}</token>
</entry>
Expand All @@ -65,12 +71,6 @@ under the License.
<entry operation="MOVE_LEFT">
<token>%</token>
</entry>
<entry operation="RIGHT_LEFT_MATCHING">
<token>"</token>
</entry>
<entry operation="RIGHT_LEFT_MATCHING">
<token>"</token>
</entry>
<entry operation="MOVE_LEFT">
<token>n't</token>
</entry>
Expand All @@ -92,4 +92,16 @@ under the License.
<entry operation="MOVE_LEFT">
<token>'m</token>
</entry>
<entry operation="MOVE_LEFT">
<token>.org</token>
</entry>
<entry operation="MOVE_LEFT">
<token>.com</token>
</entry>
<entry operation="MOVE_LEFT">
<token>.net</token>
</entry>
<entry operation="MOVE_RIGHT">
<token>#</token>
</entry>
</dictionary>
62 changes: 34 additions & 28 deletions src/opennlp/nlp.clj
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
(:import [opennlp.tools.util Span])
(:import [opennlp.tools.tokenize TokenizerModel TokenizerME
DictionaryDetokenizer DetokenizationDictionary Detokenizer
Detokenizer$DetokenizationOperation])
Detokenizer$DetokenizationOperation
DetokenizationDictionary$Operation])
(:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME])
(:import [opennlp.tools.namefind TokenNameFinderModel NameFinderME])
(:import [opennlp.tools.postag POSModel POSTaggerME]))
Expand Down Expand Up @@ -122,38 +123,43 @@
(make-detokenizer (DetokenizationDictionary. model-stream))))

;; TODO: clean this up, recursion is a smell
;; TODO: remove debug printlns once I'm satisfied
(defn- collapse-tokens
[tokens detoken-ops]
(let [sb (StringBuilder.)]
(let [sb (StringBuilder.)
token-set (atom #{})]
;;(println :ops detoken-ops)
(loop [ts tokens dt-ops detoken-ops]
(let [op (first dt-ops)
op2 (second dt-ops)]
(if (or (= op2 nil)
(= op2 Detokenizer$DetokenizationOperation/MERGE_TO_LEFT))
(.append sb (first ts))
(.append sb (str (first ts) " ")))
(when (and op op2)
(recur (next ts) (next dt-ops)))))
(.toString sb)))

;; older, cruddier version
#_(defn- collapse-tokens
[tokens detoken-ops]
(let [sb (StringBuilder.)]
(loop [ts tokens dt-ops detoken-ops]
(let [op (first dt-ops)
op2 (second dt-ops)]
(println :ts ts)
(println :op op)
(println :op2 op2)
(if (and op
(or op2
(= op2 Detokenizer$DetokenizationOperation/MERGE_TO_LEFT)
(= op Detokenizer$DetokenizationOperation/MERGE_TO_RIGHT)))
(.append sb (first ts))
(if (> (count dt-ops) 1)
(.append sb (str (first ts) " "))
(.append sb (str (first ts)))))
;;(println :op op)
;;(println :op2 op2)
;;(println :ts (first ts))
;;(println :sb (.toString sb))
(cond
(or (= op2 nil)
(= op2 Detokenizer$DetokenizationOperation/MERGE_TO_LEFT))
(.append sb (first ts))

(or (= op nil)
(= op Detokenizer$DetokenizationOperation/MERGE_TO_RIGHT))
(.append sb (first ts))

(= op DetokenizationDictionary$Operation/RIGHT_LEFT_MATCHING)
(if (contains? @token-set (first ts))
(do
;;(println :token-set @token-set)
;;(println :ts (first ts))
(swap! token-set disj (first ts))
(.append sb (first ts)))
(do
;;(println :token-set @token-set)
;;(println :ts (first ts))
(swap! token-set conj (first ts))
(.append sb (str (first ts) " "))))

:else
(.append sb (str (first ts) " ")))
(when (and op op2)
(recur (next ts) (next dt-ops)))))
(.toString sb)))
Expand Down
6 changes: 5 additions & 1 deletion test/opennlp/test/nlp.clj
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,11 @@
(is (= (detokenize (tokenize "She's the best."))
"She's the best."))
(is (= (detokenize (tokenize "I'm not sure."))
"I'm not sure.")))
"I'm not sure."))
(is (= (detokenize (tokenize "Mary likes cows (Mary is a cow)."))
"Mary likes cows (Mary is a cow)."))
(is (= (detokenize (tokenize "Mary exclaimed \"I am a cow!\""))
"Mary exclaimed \"I am a cow!\"")))

(deftest precondition-test
(is (thrown? java.lang.AssertionError (get-sentences 1)))
Expand Down

0 comments on commit 79277f2

Please sign in to comment.