Skip to content

Commit

Permalink
Merge remote-tracking branch 'diligenceengine/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
dakrone committed Oct 2, 2012
2 parents 2206661 + 7be9da4 commit 6336bf0
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 17 deletions.
2 changes: 1 addition & 1 deletion project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
:url "http://github.com/dakrone/clojure-opennlp"
:min-lein-version "2.0.0"
:dependencies [[org.clojure/clojure "1.4.0"]
[org.apache.opennlp/opennlp-tools "1.5.1-incubating"]]
[org.apache.opennlp/opennlp-tools "1.5.2-incubating"]]
:profiles {:1.2 {:dependencies [[org.clojure/clojure "1.2.1"]]}
:1.3 {:dependencies [[org.clojure/clojure "1.3.0"]]}}
:aliases {"all" ["with-profile" "dev,1.2:dev,1.3:dev"]}
Expand Down
84 changes: 68 additions & 16 deletions src/opennlp/nlp.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"The main namespace for the clojure-opennlp project. Functions for
creating NLP performers can be created with the tools in this namespace."
(:use [clojure.java.io :only [input-stream]])
(:require [opennlp.span :as nspan])
(:import
(opennlp.tools.doccat DoccatModel
DocumentCategorizerME)
Expand All @@ -12,6 +13,7 @@
DetokenizationDictionary$Operation
Detokenizer$DetokenizationOperation
DictionaryDetokenizer
TokenSample
TokenizerME
TokenizerModel)
(opennlp.tools.util Span)))
Expand All @@ -23,6 +25,20 @@
;; Caching to use for pos-tagging
(def #^{:dynamic true} *cache-size* 1024)

(defn- opennlp-span-strings
"Takes a collection of spans and the data they refer to. Returns a list of substrings
corresponding to spans."
[span-col data]
(if (seq span-col)
(seq (Span/spansToStrings (into-array span-col) (if (string? data) data (into-array data))))
[]))

(defn- to-native-span
"Take an OpenNLP span object and return a pair [i j] where i and j are the
start and end positions of the span."
[span]
(nspan/make-span (.getStart span) (.getEnd span) (.getType span)))

(defmulti make-sentence-detector
"Return a function for splitting sentences given a model file."
class)
Expand All @@ -37,12 +53,14 @@
(fn sentence-detector
[text]
{:pre [(string? text)]}
(let [detector (SentenceDetectorME. model)
sentences (.sentDetect detector text)
probs (seq (.getSentenceProbabilities detector))]
(let [detector (SentenceDetectorME. model)
spans (.sentPosDetect detector text)
sentences (opennlp-span-strings spans text)
probs (seq (.getSentenceProbabilities detector))]
(with-meta
(into [] sentences)
{:probabilities probs}))))
{:probabilities probs
:spans (map to-native-span spans)}))))

(defmulti make-tokenizer
"Return a function for tokenizing a sentence based on a given model file."
Expand All @@ -59,15 +77,13 @@
[sentence]
{:pre [(string? sentence)]}
(let [tokenizer (TokenizerME. model)
tokens (.tokenize tokenizer sentence)
spans (map #(hash-map :start (.getStart %)
:end (.getEnd %))
(seq (.tokenizePos tokenizer sentence)))
probs (seq (.getTokenProbabilities tokenizer))]
spans (.tokenizePos tokenizer sentence)
probs (seq (.getTokenProbabilities tokenizer))
tokens (opennlp-span-strings spans sentence)]
(with-meta
(into [] tokens)
{:probabilities probs
:spans spans}))))
:spans (map to-native-span spans)}))))

(defmulti make-pos-tagger
"Return a function for tagging tokens based on a givel model file."
Expand All @@ -82,7 +98,7 @@
[model]
(fn pos-tagger
[tokens]
{:pre [(vector? tokens)]}
{:pre [(seq tokens)]}
(let [token-array (into-array tokens)
tagger (POSTaggerME. model *beam-size* *cache-size*)
tags (.tag tagger token-array)
Expand Down Expand Up @@ -112,7 +128,8 @@
probs (seq (.probs finder))]
(with-meta
(distinct (Span/spansToStrings matches (into-array String tokens)))
{:probabilities probs}))))
{:probabilities probs
:spans (map to-native-span matches)}))))

(defmulti make-detokenizer
"Return a function for taking tokens and recombining them into a sentence
Expand All @@ -126,7 +143,7 @@

;; TODO: clean this up, recursion is a smell
;; TODO: remove debug printlns once I'm satisfied
(defn- collapse-tokens
#_(defn- collapse-tokens
[tokens detoken-ops]
(let [sb (StringBuilder.)
token-set (atom #{})]
Expand Down Expand Up @@ -166,15 +183,50 @@
(recur (next ts) (next dt-ops)))))
(.toString sb)))

(defmethod make-detokenizer DetokenizationDictionary
;; In the current documentation there is no RIGHT_LEFT_MATCHING and
;; I've never seen such an operation in practice.
;; http://opennlp.apache.org/documentation/apidocs/opennlp-tools/opennlp/tools/tokenize/Detokenizer.DetokenizationOperation.html
(defn- detokenize*
"Given a sequence of DetokenizationOperations, produce a string."
[tokens ops]
(loop [toks (seq tokens)
ops (seq ops)
result-toks []]
(if toks
(let [op (first ops)
rtoks (cond
(= op Detokenizer$DetokenizationOperation/MERGE_TO_LEFT)
(if (not-empty result-toks)
(conj (pop result-toks) (first toks) " ")
(conj result-toks (first toks) " "))

(= op Detokenizer$DetokenizationOperation/MERGE_TO_RIGHT)
(conj result-toks (first toks))

:else
(conj result-toks (first toks) " "))]
(recur (next toks) (next ops) rtoks))
(apply str (butlast result-toks)))))

#_(defmethod make-detokenizer DetokenizationDictionary
[model]
(fn detokenizer
[tokens]
{:pre [(seq tokens)
(every? #(= (class %) String) tokens)]}
(let [detoken (DictionaryDetokenizer. model)
ops (.detokenize detoken (into-array String tokens))]
(collapse-tokens tokens ops))))
ops (.detokenize detoken (into-array String tokens))]
(detokenize* tokens ops))))

(defmethod make-detokenizer DetokenizationDictionary
[model]
(fn detokenizer
[tokens]
{:pre [(seq tokens)
(every? #(= (class %) String) tokens)]}
(-> (DictionaryDetokenizer. model)
(TokenSample. (into-array String tokens))
(.getText))))

(defn parse-categories [outcomes-string outcomes]
"Given a string that represents the opennlp outcomes and an array of
Expand Down
72 changes: 72 additions & 0 deletions src/opennlp/span.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
(ns opennlp.span)

(defrecord Span [start end type])

(defn make-span
"Make a native span object."
[start end type]
(Span. start end type))

(defn in-span?
"Return true if location k is in span. We assume span is [i,j)."
[span k]
(and (>= k (:start span)) (< k (:end span))))

(defn contains-span?
"Return true if s1 is contains spans s2."
[s1 s2]
(and (>= (:start s2) (:start s1))
(<= (:start s2) (:end s1))
(>= (:end s2) (:start s1))
(<= (:end s2) (:end s1))))

(defn right-of-span?
"Return true if location k is to the right of span."
[span k]
(>= k (:end span)))

(defn end-of-span?
"Return true if location k is the end of span."
[span k]
(== (dec (:end span)) k))

(defn merge-spans
"Given two overlapping spans where the first comes before the second, return a merged span
with the type of the first."
[A B] (assoc A :end (:end B)))

(defn span-disjoint?
"Return true of A does not overlap B."
[A B]
(or (<= (:end A) (:start B)) (>= (:start A) (:end B))))

(defn span-overlaps?
"Return true if A overlaps B."
[A B] (not (span-disjoint? A B)))

(defn intersection-span
"Return the intersection of two spans as a span."
[A B]
{:pre [(not (span-disjoint? A B))]}
(make-span (max (:start A) (:start B)) (min (:end A) (:end B)) nil))

(defn span-length
"Return the length of the span."
[s]
(- (:end s) (:start s)))

(defn subs-span
"Return the substring corresponding to the span."
[s span]
(subs s (:start span) (:end span)))

(defn shift-span
"Shift a span by i positions."
[span i]
(make-span (+ (:start span) i) (+ (:end span) i) (:type span)))

(defn between-span
"Return a span of the area between two spans A and B. Precondition: (:end A) < (:start B)."
[a b]
{:pre [(< (:end a) (:start b))]}
(make-span (:end a) (:start b) :between))

0 comments on commit 6336bf0

Please sign in to comment.