Skip to content

Commit

Permalink
formatting fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
dakrone committed Apr 27, 2011
1 parent ac87503 commit 97430a0
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 58 deletions.
42 changes: 21 additions & 21 deletions src/opennlp/nlp.clj
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
(ns #^{:author "Lee Hinman"}
opennlp.nlp
(ns opennlp.nlp
"The main namespace for the clojure-opennlp project. Functions for
creating NLP performers can be created with the tools in this namespace."
(:use [clojure.contrib.seq-utils :only [indexed]])
(:use [clojure.java.io :only [file input-stream]])
(:import [opennlp.tools.util Span])
(:import [opennlp.tools.tokenize TokenizerModel TokenizerME
DictionaryDetokenizer DetokenizationDictionary Detokenizer
Detokenizer$DetokenizationOperation
DetokenizationDictionary$Operation])
(:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME])
(:import [opennlp.tools.namefind TokenNameFinderModel NameFinderME])
(:import [opennlp.tools.postag POSModel POSTaggerME]))

(:use [clojure.java.io :only [input-stream]])
(:import
(opennlp.tools.namefind NameFinderME TokenNameFinderModel)
(opennlp.tools.postag POSModel POSTaggerME)
(opennlp.tools.sentdetect SentenceDetectorME SentenceModel)
(opennlp.tools.tokenize
DetokenizationDictionary
DetokenizationDictionary$Operation
Detokenizer$DetokenizationOperation
DictionaryDetokenizer
TokenizerME
TokenizerModel)))

;; OpenNLP property for pos-tagging. Meant to be rebound before
;; calling the tagging creators
Expand All @@ -33,14 +33,14 @@
(defmethod make-sentence-detector SentenceModel
[model]
(fn sentence-detector
[text]
{:pre [(string? text)]}
(let [detector (SentenceDetectorME. model)
sentences (.sentDetect detector text)
probs (seq (.getSentenceProbabilities detector))]
(with-meta
(into [] sentences)
{:probabilities probs}))))
[text]
{:pre [(string? text)]}
(let [detector (SentenceDetectorME. model)
sentences (.sentDetect detector text)
probs (seq (.getSentenceProbabilities detector))]
(with-meta
(into [] sentences)
{:probabilities probs}))))

(defmulti make-tokenizer
"Return a function for tokenizing a sentence based on a given model file."
Expand Down
4 changes: 1 addition & 3 deletions src/opennlp/tools/filters.clj
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
[elements#]
(filter (fn [t#] (re-find ~r (second t#))) elements#))))


(defmacro chunk-filter
"Declare a filter for treebank-chunked lists with the given name and regex."
[n r]
Expand All @@ -21,8 +20,7 @@
[elements#]
(filter (fn [t#] (re-find ~r (:tag t#))) elements#))))


; It's easy to define your own filters!
;; It's easy to define your own filters!
(pos-filter nouns #"^NN")
(pos-filter nouns-and-verbs #"^(NN|VB)")
(pos-filter proper-nouns #"^NNP")
Expand Down
23 changes: 12 additions & 11 deletions src/opennlp/tools/lazy.clj
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,16 @@
given sentence-finder. rdr must implement java.io.BufferedReader."
[^java.io.BufferedReader rdr sentence-finder]
(.mark rdr 0)
(loop [c (.read rdr) sb (StringBuilder.)]
(if-not (= -1 c)
(do (.append sb (char c))
(let [sents (sentence-finder (.toString sb))]
(if (> (count sents) 1)
(do (.reset rdr)
(cons (first sents)
(lazy-seq (sentence-seq rdr sentence-finder))))
(do (.mark rdr 0)
(recur (.read rdr) sb)))))
[(.trim (.toString sb))])))
(let [sb (StringBuilder.)]
(loop [c (.read rdr)]
(if-not (= -1 c)
(do (.append sb (char c))
(let [sents (sentence-finder (.toString sb))]
(if (> (count sents) 1)
(do (.reset rdr)
(cons (first sents)
(lazy-seq (sentence-seq rdr sentence-finder))))
(do (.mark rdr 0)
(recur (.read rdr))))))
[(.trim (.toString sb))]))))

39 changes: 16 additions & 23 deletions src/opennlp/treebank.clj
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,6 @@
;; So, B-* starts a sequence and I-* continues it. A new phrase starts when
;; B-* is encountered.



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Treebank parsing
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Expand All @@ -131,10 +129,10 @@
"Treebank-parser does not like parens and braces, so replace them."
[s]
(-> s
(.replaceAll "\\(" "-LRB-")
(.replaceAll "\\)" "-RRB-")
(.replaceAll "\\{" "-LCB-")
(.replaceAll "\\}" "-RCB-")))
(.replaceAll "\\(" "-LRB-")
(.replaceAll "\\)" "-RRB-")
(.replaceAll "\\{" "-LCB-")
(.replaceAll "\\}" "-RCB-")))


(defn- parse-line
Expand Down Expand Up @@ -167,24 +165,22 @@
parses (map #(parse-line % parser) text)]
(vec parses))))


(defn- strip-funny-chars
"Strip out some characters that might cause trouble parsing the tree."
[s]
(-> s
(.replaceAll "'" "-SQUOTE-")
(.replaceAll "\"" "-DQUOTE-")
(.replaceAll "~" "-TILDE-")
(.replaceAll "`" "-BACKTICK-")
(.replaceAll "," "-COMMA-")
(.replaceAll "\\\\" "-BSLASH-")
(.replaceAll "\\/" "-FSLASH-")
(.replaceAll "\\^" "-CARROT-")
(.replaceAll "@" "-ATSIGN-")
(.replaceAll "#" "-HASH-")))


; Credit for this function goes to carkh in #clojure
(.replaceAll "'" "-SQUOTE-")
(.replaceAll "\"" "-DQUOTE-")
(.replaceAll "~" "-TILDE-")
(.replaceAll "`" "-BACKTICK-")
(.replaceAll "," "-COMMA-")
(.replaceAll "\\\\" "-BSLASH-")
(.replaceAll "\\/" "-FSLASH-")
(.replaceAll "\\^" "-CARROT-")
(.replaceAll "@" "-ATSIGN-")
(.replaceAll "#" "-HASH-")))

;; Credit for this function goes to carkh in #clojure
(defn- tr
"Generate a tree from the string output of a treebank-parser."
[to-parse]
Expand All @@ -194,15 +190,12 @@
(map tr body)
(tr (first body)))})))


(defn make-tree
  "Build a tree from the string output of a treebank-parser.
  The text is first passed through strip-funny-chars, the result is read
  as Clojure data with read-string, and that form is handed to tr."
  [tree-text]
  (-> tree-text
      strip-funny-chars
      read-string
      tr))



;;------------------------------------------------------------------------
;;------------------------------------------------------------------------
;; Treebank Linking
Expand Down

0 comments on commit 97430a0

Please sign in to comment.