forked from vseloved/cl-nlp
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added ngrams and language-models. Added *stopwords-en*, find-collocat…
…ions and bin-search. Added NLTK ch1-2.
- Loading branch information
Showing
16 changed files
with
1,345 additions
and
192 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,8 +3,8 @@ | |
(in-package #:asdf) | ||
|
||
(defsystem #:cl-nlp | ||
:version "0.0.2" | ||
:description "NLP toolkit for Common Lisp" | ||
:version "0.0.3" | ||
:description "NLP toolkit for Common Lisp." | ||
:author "Vsevolod Dyomkin <[email protected]>" | ||
:maintainer "Vsevolod Dyomkin <[email protected]>" | ||
:license "Apache 2.0" | ||
|
@@ -26,12 +26,30 @@ | |
(:module #:core | ||
:serial t | ||
:components | ||
((:file "tokenization") | ||
(:file "freq") | ||
#+nil (:file "distance") | ||
#+nil (:file "ngram"))) | ||
((:file "measures") | ||
(:file "tokenization") | ||
(:file "ngrams") | ||
(:file "language-models") | ||
(:file "indexing"))) | ||
(:module #:generation | ||
:serial t | ||
:components | ||
((:file "markov-chain"))) | ||
(:file "user"))))) | ||
|
||
(defsystem #:cl-nlp.contib | ||
:version "0.0.1" | ||
:description "CL-NLP additional packages." | ||
:author "Vsevolod Dyomkin <[email protected]>" | ||
:maintainer "Vsevolod Dyomkin <[email protected]>" | ||
:license "Apache 2.0" | ||
:depends-on (#:cl-nlp #:drakma) | ||
:serial t | ||
:components | ||
((:module #:src | ||
:components | ||
((:module #:contrib | ||
:serial t | ||
:components | ||
((:file "packages") | ||
(:file "ms-ngrams"))))))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ | |
(in-package #:asdf) | ||
|
||
(defsystem #:cl-nltk | ||
:version "0.0.1" | ||
:version "0.0.2" | ||
:description "Implementation of the examples from the NLTK book." | ||
:author "Vsevolod Dyomkin <[email protected]>" | ||
:maintainer "Vsevolod Dyomkin <[email protected]>" | ||
|
@@ -15,4 +15,5 @@ | |
:serial t | ||
:components | ||
((:file "package") | ||
(:file "ch1-1"))))) | ||
(:file "ch1-1") | ||
(:file "ch1-2"))))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
;;; (c) 2013 Vsevolod Dyomkin | ||
|
||
(in-package #:nltk) | ||
(named-readtables:in-readtable rutils-readtable) | ||
|
||
(defclass text () | ||
((name :initarg :name) | ||
(raw :initarg :raw :accessor text-raw) | ||
(words :accessor text-words) | ||
(ctxs :accessor text-ctxs) | ||
(transitions :accessor text-transitions) | ||
(dispersion :accessor text-dispersion) | ||
(ugrams :accessor text-ugrams) | ||
(bigrams :accessor text-bigrams) | ||
(trigrams :accessor text-trigrams))) | ||
|
||
(defmethod slot-unbound (class (obj text) (slot (eql 'ugrams))) | ||
(with-slots (words ugrams) obj | ||
(format t "~&Indexing unigrams...~%") | ||
(prog1 (setf ugrams (index-ngrams 1 words)) | ||
(format t "Number of ugrams: ~A~%" (ngrams-count ugrams))))) | ||
|
||
(defmethod slot-unbound (class (obj text) (slot (eql 'bigrams))) | ||
(with-slots (words bigrams) obj | ||
(format t "~&Indexing bigrams...~%") | ||
(prog1 (setf bigrams (index-ngrams 2 words)) | ||
(format t "Number of bigrams: ~A~%" (ngrams-count bigrams))))) | ||
|
||
(defmethod slot-unbound (class (obj text) (slot (eql 'trigrams))) | ||
(with-slots (words trigrams) obj | ||
(format t "~&Indexing trigrams...~%") | ||
(prog1 (setf trigrams (index-ngrams 3 words)) | ||
(format t "Number of trigrams: ~A~%" (ngrams-count trigrams))))) | ||
|
||
(defun collocations (text) | ||
(find-collocations (text-bigrams text) :n 30)) | ||
|
||
(defun generate (text &key (n 20) (order 2)) | ||
"Generate random text of N words, based on TEXT." | ||
(with-slots (transitions) text | ||
(string-trim (append +white-chars+ +newline-chars+) | ||
(fmt "~{~A ~}" | ||
(generate-text (make 'markov-chain-generator :order order) | ||
(make-lm 'stupid-backoff-lm | ||
:1g (text-ugrams text) | ||
:2g (when (> order 1) | ||
(text-bigrams text)) | ||
:3g (when (> order 2) | ||
(text-trigrams text))) | ||
n))))) | ||
|
||
;; Plotting | ||
|
||
(defun dump-counts (ngrams n order-by cumulative) | ||
"Dump N NGRAMS counts (or CUMULATIVE counts) orderd by ORDER-BY." | ||
(let ((filename (fmt "/tmp/~A" (gensym))) | ||
(total 0)) | ||
(with-out-file (out filename) | ||
(doindex (i pair (ngrams-pairs ngrams :order-by order-by)) | ||
(when (and n (> i n)) | ||
(return)) | ||
(format out "~A~t~S~t~A~%" (1+ i) (car pair) | ||
(if cumulative | ||
(incf total (cdr pair)) | ||
(cdr pair)))) | ||
filename))) | ||
|
||
(defun plot (ngrams &key n (order-by '>) cumulative) | ||
"Plot NGRAMS counts." | ||
(cgn:with-gnuplot (t) | ||
(cgn:format-gnuplot "set xtics rotate 90") | ||
(cgn:format-gnuplot "set ylabel \"~@[Cumulative ~]Counts\"" cumulative) | ||
(cgn:format-gnuplot | ||
"plot \"~A\" using 1:3:xtic(2) with lines title \"\"" | ||
(dump-cumulative-counts ngrams n order-by cumulative)))) |
Oops, something went wrong.