forked from dakrone/clojure-opennlp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
web-tag.clj
38 lines (31 loc) · 1007 Bytes
/
web-tag.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
(ns examples.web-tag
(:use [opennlp.nlp])
(:use [clojure.pprint :only [pprint]]))
;; NLP helper functions, each created once when this namespace is loaded by
;; handing a model file path to the corresponding opennlp.nlp constructor.
;; The paths are relative -- presumably resolved against the process working
;; directory; confirm before running from another location.

;; Applied to the plain page text in tag-page; its result is treated as a
;; sequence of sentences.
(def get-sentences (make-sentence-detector "models/en-sent.bin"))
;; Applied to a single sentence in tag-sentences; its result is fed to pos-tag.
(def tokenize (make-tokenizer "models/en-token.bin"))
;; Applied to the output of tokenize in tag-sentences.
(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))
(defn strip-html-tags
  "Messily strip html tags from a web page.

  Removes <script> and <style> elements together with their contents, then
  HTML comments, then every remaining tag, and finally collapses runs of
  spaces into a single space. Each removed piece is replaced by a space so
  that neighbouring words do not get glued together.

  string - the HTML text; nil is treated as the empty string.

  Returns the stripped text as a String."
  [string]
  (-> (str string) ; (str nil) => \"\", so a missing page does not throw an NPE
      ;; \\b[^>]*> also matches tags without attributes (e.g. a bare <script>),
      ;; (?i) handles upper-case tags, (?s) lets '.' span line breaks.
      (.replaceAll "(?is)<script\\b[^>]*>.*?</script\\s*>" " ")
      (.replaceAll "(?is)<style\\b[^>]*>.*?</style\\s*>" " ")
      ;; Comments may contain '>', which the generic tag pattern would cut short.
      (.replaceAll "(?s)<!--.*?-->" " ")
      (.replaceAll "(?s)<.*?>" " ")
      (.replaceAll "[ ]+" " ")))
(defn fetch-page
  "Fetch the resource at url and return the markup of its body element.

  Tabs, newlines and carriage returns are each replaced by a space first, so
  the result is a single line. The body element is located
  case-insensitively (so <BODY> works too). When the document has no body
  element, the whole normalized document is returned instead of nil, so
  callers always receive a String.

  url - anything slurp accepts (a URL string or a file path)."
  [url]
  (let [html (.replaceAll (slurp url) "[\t\n\r]" " ")]
    (or (re-find #"(?i)<body\b.*?</body>" html)
        html)))
(defn fetch-plain-page
  "Fetch url with fetch-page and pass the result through strip-html-tags,
  returning whatever strip-html-tags produces."
  [url]
  (-> url
      fetch-page
      strip-html-tags))
(defn- tag-sentences
  "Return a lazy sequence holding, for every sentence in sent-seq, the result
  of tokenizing the sentence and handing the tokens to pos-tag."
  [sent-seq]
  (map (fn [sentence]
         (-> sentence
             tokenize
             pos-tag))
       sent-seq))
(defn tag-page
  "Fetch the page at url, split its plain text into sentences and tag them.

  Sentences are grouped into batches of up to 10 and the batches are
  processed with pmap. Returns a sequence with one element per batch, each
  element being the sequence of pos-tag results for that batch's sentences.

  NOTE(review): batches are now tagged on pmap's worker threads, so the
  functions returned by the opennlp.nlp constructors must be safe to call
  concurrently -- confirm against the library version in use."
  [url]
  (let [page (fetch-plain-page url)
        sentences (get-sentences page)
        batches (partition-all 10 sentences)]
    ;; tag-sentences returns an unrealized lazy seq. Without doall each pmap
    ;; worker would return immediately and the tagging would run serially on
    ;; whichever thread later consumes the result.
    (pmap (fn [batch] (doall (tag-sentences batch)))
          batches)))
;; Example invocation, evaluated whenever this file is loaded; it reads the
;; given URL via slurp, so it needs network access.
;; NOTE(review): the return value is not printed or stored here, and pprint is
;; referred in the ns form but unused -- possibly this was meant to be wrapped
;; in pprint; confirm intent.
(tag-page "http://writequit.org")