Skip to content

Commit

Permalink
Added ngrams and language-models. Added *stopwords-en*, find-collocat…
Browse files Browse the repository at this point in the history
…ions and bin-search. Added NLTK ch1-2.
  • Loading branch information
vseloved committed Feb 26, 2013
1 parent c45666c commit 3cc7760
Show file tree
Hide file tree
Showing 16 changed files with 1,345 additions and 192 deletions.
30 changes: 24 additions & 6 deletions cl-nlp.asd
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
(in-package #:asdf)

(defsystem #:cl-nlp
:version "0.0.2"
:description "NLP toolkit for Common Lisp"
:version "0.0.3"
:description "NLP toolkit for Common Lisp."
:author "Vsevolod Dyomkin <[email protected]>"
:maintainer "Vsevolod Dyomkin <[email protected]>"
:license "Apache 2.0"
Expand All @@ -26,12 +26,30 @@
(:module #:core
:serial t
:components
((:file "tokenization")
(:file "freq")
#+nil (:file "distance")
#+nil (:file "ngram")))
((:file "measures")
(:file "tokenization")
(:file "ngrams")
(:file "language-models")
(:file "indexing")))
(:module #:generation
:serial t
:components
((:file "markov-chain")))
(:file "user")))))

(defsystem #:cl-nlp.contib
:version "0.0.1"
:description "CL-NLP additional packages."
:author "Vsevolod Dyomkin <[email protected]>"
:maintainer "Vsevolod Dyomkin <[email protected]>"
:license "Apache 2.0"
:depends-on (#:cl-nlp #:drakma)
:serial t
:components
((:module #:src
:components
((:module #:contrib
:serial t
:components
((:file "packages")
(:file "ms-ngrams")))))))
5 changes: 3 additions & 2 deletions cl-nltk.asd
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
(in-package #:asdf)

(defsystem #:cl-nltk
:version "0.0.1"
:version "0.0.2"
:description "Implementation of the examples from the NLTK book."
:author "Vsevolod Dyomkin <[email protected]>"
:maintainer "Vsevolod Dyomkin <[email protected]>"
Expand All @@ -15,4 +15,5 @@
:serial t
:components
((:file "package")
(:file "ch1-1")))))
(:file "ch1-1")
(:file "ch1-2")))))
4 changes: 0 additions & 4 deletions nltk/ch1-1.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,6 @@
(set# elt uniqs t))
(if raw uniqs (ht-keys uniqs))))

(defun sorted-ht-keys (test ht)
"Return hash-table keys of HT in sorted order accroding to TEST."
(sort (ht-keys ht) test :key #`(get# % ht)))

(defun match-ctxs (word1-ctx word2-ctx)
"Find the intersection between WORD1-CTX and WORD2-CTX tables
and for each common context calculate the commonality weight."
Expand Down
163 changes: 127 additions & 36 deletions nltk/ch1-1.md

Large diffs are not rendered by default.

75 changes: 75 additions & 0 deletions nltk/ch1-2.lisp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nltk)
(named-readtables:in-readtable rutils-readtable)

(defclass text ()
((name :initarg :name)
(raw :initarg :raw :accessor text-raw)
(words :accessor text-words)
(ctxs :accessor text-ctxs)
(transitions :accessor text-transitions)
(dispersion :accessor text-dispersion)
(ugrams :accessor text-ugrams)
(bigrams :accessor text-bigrams)
(trigrams :accessor text-trigrams)))

(defmethod slot-unbound (class (obj text) (slot (eql 'ugrams)))
(with-slots (words ugrams) obj
(format t "~&Indexing unigrams...~%")
(prog1 (setf ugrams (index-ngrams 1 words))
(format t "Number of ugrams: ~A~%" (ngrams-count ugrams)))))

(defmethod slot-unbound (class (obj text) (slot (eql 'bigrams)))
(with-slots (words bigrams) obj
(format t "~&Indexing bigrams...~%")
(prog1 (setf bigrams (index-ngrams 2 words))
(format t "Number of bigrams: ~A~%" (ngrams-count bigrams)))))

(defmethod slot-unbound (class (obj text) (slot (eql 'trigrams)))
(with-slots (words trigrams) obj
(format t "~&Indexing trigrams...~%")
(prog1 (setf trigrams (index-ngrams 3 words))
(format t "Number of trigrams: ~A~%" (ngrams-count trigrams)))))

(defun collocations (text)
(find-collocations (text-bigrams text) :n 30))

(defun generate (text &key (n 20) (order 2))
"Generate random text of N words, based on TEXT."
(with-slots (transitions) text
(string-trim (append +white-chars+ +newline-chars+)
(fmt "~{~A ~}"
(generate-text (make 'markov-chain-generator :order order)
(make-lm 'stupid-backoff-lm
:1g (text-ugrams text)
:2g (when (> order 1)
(text-bigrams text))
:3g (when (> order 2)
(text-trigrams text)))
n)))))

;; Plotting

(defun dump-counts (ngrams n order-by cumulative)
"Dump N NGRAMS counts (or CUMULATIVE counts) orderd by ORDER-BY."
(let ((filename (fmt "/tmp/~A" (gensym)))
(total 0))
(with-out-file (out filename)
(doindex (i pair (ngrams-pairs ngrams :order-by order-by))
(when (and n (> i n))
(return))
(format out "~A~t~S~t~A~%" (1+ i) (car pair)
(if cumulative
(incf total (cdr pair))
(cdr pair))))
filename)))

(defun plot (ngrams &key n (order-by '>) cumulative)
"Plot NGRAMS counts."
(cgn:with-gnuplot (t)
(cgn:format-gnuplot "set xtics rotate 90")
(cgn:format-gnuplot "set ylabel \"~@[Cumulative ~]Counts\"" cumulative)
(cgn:format-gnuplot
"plot \"~A\" using 1:3:xtic(2) with lines title \"\""
(dump-cumulative-counts ngrams n order-by cumulative))))
Loading

0 comments on commit 3cc7760

Please sign in to comment.