Skip to content

Commit

Permalink
Merge remote-tracking branch 'jimpil/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
dakrone committed Oct 16, 2012
2 parents ddc251a + 9e5b46a commit 71d7252
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 34 deletions.
15 changes: 8 additions & 7 deletions src/opennlp/nlp.clj
Original file line number Diff line number Diff line change
Expand Up @@ -108,29 +108,30 @@ start and end positions of the span."
(map vector tokens tags)
{:probabilities probs}))))

(defmulti make-name-finder
(defmulti make-name-finder
"Return a function for finding names from tokens based on a given
model file."
class)
(fn [model & args] (class model)))

(defmethod make-name-finder :default
[modelfile]
[modelfile & args]
(with-open [model-stream (input-stream modelfile)]
(make-name-finder (TokenNameFinderModel. model-stream))))

(defmethod make-name-finder TokenNameFinderModel
[model]
[model & {:keys [feature-generator beam] :or {beam *beam-size*}}]
(fn name-finder
[tokens & contexts]
{:pre [(coll? tokens)
{:pre [(seq tokens)
(every? #(= (class %) String) tokens)]}
(let [finder (NameFinderME. model)
(let [finder (NameFinderME. model feature-generator beam)
matches (.find finder (into-array String tokens))
probs (seq (.probs finder))]
(with-meta
(distinct (Span/spansToStrings matches (into-array String tokens)))
{:probabilities probs
:spans (map to-native-span matches)}))))
:spans (map to-native-span matches)}))))


(defmulti make-detokenizer
"Return a function for taking tokens and recombining them into a sentence
Expand Down
71 changes: 44 additions & 27 deletions src/opennlp/tools/train.clj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
(ns opennlp.tools.train
"This namespace contains tools used to train OpenNLP models"
(:use [clojure.java.io :only [output-stream reader]])
(:import (opennlp.tools.util PlainTextByLineStream)
(:import (opennlp.tools.util PlainTextByLineStream TrainingParameters)
(opennlp.tools.util.model BaseModel ModelType)
(opennlp.tools.dictionary Dictionary)
(opennlp.tools.tokenize TokenizerME
Expand Down Expand Up @@ -30,106 +30,123 @@
(defn write-model
"Write a model to disk"
[#^BaseModel model out-stream]
(.serialize model (output-stream out-stream)))
(with-open [out (output-stream out-stream)]
(.serialize model out)))

(defn build-dictionary
"Build a Dictionary based on file in appropriate format"
[in]
(Dictionary/parseOneEntryPerLine (reader in)))
(with-open [rdr (reader in)]
(Dictionary/parseOneEntryPerLine rdr)))

(defn build-posdictionary
"Build a POSDictionary based on file in appropriate format
A POSDictionary records which part-of-speech tags a word
may be assigned"
[in]
(POSDictionary/create (reader in)))
(with-open [rdr (reader in)]
(POSDictionary/create rdr)))

(defn ^ChunkerModel train-treebank-chunker
"Returns a treebank chunker based on given training file"
([in] (train-treebank-chunker "en" in))
([lang in] (train-treebank-chunker lang in 100 5))
([lang in iter cut] (ChunkerME/train
lang
(ChunkSampleStream.
(PlainTextByLineStream.
(reader in)))
cut iter)))
([lang in iter cut]
(with-open [rdr (reader in)]
(ChunkerME/train
lang
(ChunkSampleStream.
(PlainTextByLineStream. rdr))
cut iter))))

(defn ^ParserModel train-treebank-parser
"Returns a treebank parser based on a training file and a set of head rules"
([in headrules] (train-treebank-parser "en" in headrules))
([lang in headrules] (train-treebank-parser lang in headrules 100 5))
([lang in headrules iter cut]
(with-open [rdr (reader headrules)
fis (java.io.FileInputStream. in)]
(Parser/train
lang
(ParseSampleStream.
(PlainTextByLineStream.
(.getChannel (java.io.FileInputStream. in)) "UTF-8"))
(HeadRules. (reader headrules)) iter cut)))
(.getChannel fis) "UTF-8"))
(HeadRules. rdr) iter cut))))


(defn ^TokenNameFinderModel train-name-finder
"Returns a name finder based on a given training file"
"Returns a trained name finder based on a given training file. Uses a non-deprecated train() method that allows
for perceptron training with minimal modification. Optional arguments include the type of entity (e.g. \"person\"), custom feature generation, and
a knob for switching to perceptron training (maxent is the default). For perceptron, prefer a cutoff of 0; for maxent, prefer 5."
([in] (train-name-finder "en" in))
([lang in] (train-name-finder lang in 100 5))
([lang in iter cut]
([lang in iter cut & {:keys [entity-type feature-gen classifier]
:or {entity-type "default" classifier "MAXENT"}}] ;;MUST be either "MAXENT" or "PERCEPTRON"
(with-open [rdr (reader in)]
(NameFinderME/train
lang
"default"
(->> (reader in)
entity-type
(->> rdr
(PlainTextByLineStream.)
(NameSampleDataStream.))
{}
iter
cut)))
(doto (TrainingParameters.)
(.put TrainingParameters/ALGORITHM_PARAM classifier)
(.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
(.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut)))
feature-gen {}))))

(defn ^TokenizerModel train-tokenizer
"Returns a tokenizer based on given training file"
([in] (train-tokenizer "en" in))
([lang in] (train-tokenizer lang in 100 5))
([lang in iter cut]
(with-open [rdr (reader in)]
(TokenizerME/train
lang
(->> (reader in)
(->> rdr
(PlainTextByLineStream.)
(TokenSampleStream.))
false
cut
iter)))
iter))))

(defn ^POSModel train-pos-tagger
"Returns a pos-tagger based on given training file"
([in] (train-pos-tagger "en" in))
([lang in] (train-pos-tagger lang in nil))
([lang in tagdict] (train-pos-tagger lang in tagdict 100 5))
([lang in tagdict iter cut]
(with-open [rdr (reader in)]
(POSTaggerME/train
lang
(WordTagSampleStream. (reader in))
(WordTagSampleStream. rdr)
(ModelType/MAXENT)
tagdict
nil
cut
iter)))
iter))))

(defn ^SentenceModel train-sentence-detector
"Returns a sentence model based on a given training file"
([in] (train-sentence-detector "en" in))
([lang in]
(with-open [rdr (reader in)]
(SentenceDetectorME/train lang
(->> (reader in)
(->> rdr
(PlainTextByLineStream.)
(SentenceSampleStream.))
true
nil)))
nil))))

(defn ^DoccatModel train-document-categorization
"Returns a classification model based on a given training file"
([in] (train-document-categorization "en" in 1 100))
([lang in] (train-document-categorization "en" in 1 100))
([lang in cutoff iterations]
(with-open [rdr (reader in)]
(DocumentCategorizerME/train lang
(->> (reader in)
(->> rdr
(PlainTextByLineStream.)
(DocumentSampleStream.))
cutoff iterations)))
cutoff iterations))))

0 comments on commit 71d7252

Please sign in to comment.