Skip to content

Commit

Permalink
Implement Treebank expression parser
Browse files Browse the repository at this point in the history
I've used the Instaparse library to implement the parser.

This should fix dakrone#11.
  • Loading branch information
alexott committed May 11, 2013
1 parent d789cdd commit 554b056
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 46 deletions.
3 changes: 2 additions & 1 deletion project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
:url "http://github.com/dakrone/clojure-opennlp"
:min-lein-version "2.0.0"
:dependencies [[org.clojure/clojure "1.5.1"]
[org.apache.opennlp/opennlp-tools "1.5.3"]]
[org.apache.opennlp/opennlp-tools "1.5.3"]
[instaparse "1.0.1"]]
:profiles {:1.3 {:dependencies [[org.clojure/clojure "1.3.0"]]}
:1.4 {:dependencies [[org.clojure/clojure "1.4.0"]]}}
:aliases {"all" ["with-profile" "dev,1.3:dev,1.4:dev"]}
Expand Down
54 changes: 11 additions & 43 deletions src/opennlp/treebank.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
opennlp.treebank
(:use [opennlp.nlp :only [*beam-size*]]
[clojure.java.io :only [input-stream]])
(:require [clojure.string :as str])
(:require [clojure.string :as str]
[instaparse.core :as insta])
(:import (opennlp.tools.chunker ChunkerModel ChunkerME)
(opennlp.tools.cmdline.parser ParserTool)
(opennlp.tools.parser Parse ParserModel
Expand Down Expand Up @@ -165,55 +166,22 @@
parses (map #(parse-line % parser) text)]
(vec parses))))

(defn- strip-funny-chars
  "Swap characters that might cause trouble when parsing the tree for
  textual placeholders (e.g. ' becomes -SQUOTE-). Returns the new string."
  [s]
  ;; Each pair is applied in order as a literal (non-regex) replacement.
  ;; Note that the backslash, slash and caret entries are two-character
  ;; literals: they match `\\\\`, `\\/` and `\\^` as written, i.e. a doubled
  ;; backslash, backslash-slash and backslash-caret respectively.
  (reduce (fn [acc [target placeholder]]
            (str/replace acc target placeholder))
          s
          [["'" "-SQUOTE-"]
           ["\"" "-DQUOTE-"]
           ["~" "-TILDE-"]
           ["`" "-BACKTICK-"]
           ["," "-COMMA-"]
           ["\\\\" "-BSLASH-"]
           ["\\/" "-FSLASH-"]
           ["\\^" "-CARROT-"]
           ["@" "-ATSIGN-"]
           ["#" "-HASH-"]
           [";" "-SEMICOLON-"]
           [":" "-COLON-"]]))

(defn- unstrip-funny-chars
  "Turn the textual placeholders (e.g. -SQUOTE-) back into the characters
  they stand for. Returns the new string."
  [s]
  ;; Each pair is applied in order as a literal (non-regex) replacement.
  ;; The backslash, slash and caret placeholders expand to the two-character
  ;; literals written below, not to a single character.
  (reduce (fn [acc [placeholder original]]
            (str/replace acc placeholder original))
          s
          [["-SQUOTE-" "'"]
           ["-DQUOTE-" "\""]
           ["-TILDE-" "~"]
           ["-BACKTICK-" "`"]
           ["-COMMA-" ","]
           ["-BSLASH-" "\\\\"]
           ["-FSLASH-" "\\/"]
           ["-CARROT-" "\\^"]
           ["-ATSIGN-" "@"]
           ["-HASH-" "#"]
           ["-SEMICOLON-" ";"]
           ["-COLON-" ":"]]))
;; Instaparse grammar for a bracketed treebank expression.
;;   E  - "(" followed by a token T, whitespace, then either a single token T
;;        or one or more nested E expressions, then ")" and optional
;;        trailing whitespace.
;;   T  - a run of one or more characters that are neither ")" nor whitespace.
;;   WS - a run of whitespace.
;; Elements wrapped in <...> are hidden from the resulting parse tree
;; (instaparse notation), so the parentheses and whitespace do not appear
;; in the output.
(def ^:private s-parser
(insta/parser
"E = <'('> T <WS> (T | (E <WS?>)+) <')'> <WS?> ; T = #'[^)\\s]+' ; WS = #'\\s+'"))

;; Credit for this function goes to carkh in #clojure
(defn- tr
  "Convert a form read from treebank-parser output into a tree: a seq
  becomes {:tag <first element> :chunk <converted remaining elements>},
  anything else becomes its string representation."
  [to-parse]
  (if-not (seq? to-parse)
    ;; Leaf: stringify whatever the reader produced (symbol, number, ...).
    (str to-parse)
    ;; Node: head is the tag, the tail holds the children (converted lazily).
    (let [[label & children] to-parse]
      {:tag label :chunk (map tr children)})))
"Transforms treebank string into series of s-like expressions."
[ptree]
(if (= :E (first ptree))
{:tag (symbol (second (second ptree))) :chunk (map tr (drop 2 ptree))}
(second ptree)))

(defn make-tree
  "Build a tree from the string output of a treebank-parser."
  [tree-text]
  ;; Replace troublesome characters with placeholders, read the result as a
  ;; Clojure form, then convert that form with `tr`.
  (-> tree-text
      strip-funny-chars
      read-string
      tr))


(tr (s-parser tree-text)))

;;------------------------------------------------------------------------
;;------------------------------------------------------------------------
Expand Down
19 changes: 18 additions & 1 deletion test/opennlp/test/treebank.clj
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
(is (thrown? FileNotFoundException (make-treebank-chunker "nonexistantfile")))
(is (thrown? FileNotFoundException (make-treebank-parser "nonexistantfile"))))

(deftest parser-test
(deftest parser-test-normal
(try
(let [parser (make-treebank-parser "parser-model/en-parser-chunking.bin")]
(is (= (parser ["This is a sentence ."])
Expand All @@ -46,6 +46,23 @@
(println "Unable to execute treebank-parser tests."
"Download the model files to $PROJECT_ROOT/parser-models."))))

(deftest parser-test-with-bad-chars
  ;; A sentence containing ':' and an apostrophe must come through both the
  ;; parser and make-tree with those characters intact.
  (try
    (let [parse (make-treebank-parser "parser-model/en-parser-chunking.bin")
          sentences ["2:30 isn't bad"]]
      (is (= (parse sentences)
             ["(TOP (NP (CD 2:30) (RB isn't) (JJ bad)))"]))
      (is (= (make-tree (first (parse sentences)))
             '{:tag TOP
               :chunk ({:tag NP
                        :chunk ({:tag CD :chunk ("2:30")}
                                {:tag RB :chunk ("isn't")}
                                {:tag JJ :chunk ("bad")})})})))
    ;; Without the model file the test only prints a hint instead of failing.
    (catch FileNotFoundException _
      (println "Unable to execute treebank-parser tests."
               "Download the model files to $PROJECT_ROOT/parser-models."))))

#_(deftest treebank-coref-test
(try
(let [tbl (make-treebank-linker "coref")
Expand Down
2 changes: 1 addition & 1 deletion test/opennlp/test/treebank_tree.clj
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
:chunk ({:tag NP
:chunk ({:tag NN
:chunk
("-DQUOTE-2-COLON-30-DQUOTE-")})}
("\"2:30\"")})}
{:tag VP
:chunk ({:tag VBZ :chunk ("is")}
{:tag NP
Expand Down

0 comments on commit 554b056

Please sign in to comment.