From 3cc7760ba23743ad587a873d8a2c8b62627a36eb Mon Sep 17 00:00:00 2001
From: vseloved <vseloved@gmail.com>
Date: Tue, 26 Feb 2013 10:02:28 +0200
Subject: [PATCH] Added ngrams and language-models. Added *stopwords-en*,
 find-collocations and bin-search. Added NLTK ch1-2.

---
 cl-nlp.asd                       |  30 ++-
 cl-nltk.asd                      |   5 +-
 nltk/ch1-1.lisp                  |   4 -
 nltk/ch1-1.md                    | 163 +++++++++++----
 nltk/ch1-2.lisp                  |  75 +++++++
 nltk/ch1-2.md                    | 340 +++++++++++++++++++++++++++++++
 src/contrib/ms-ngrams.lisp       |  65 ++++++
 src/contrib/packages.lisp        |   8 +
 src/core/freq.lisp               |  77 -------
 src/core/indexing.lisp           | 149 ++++++++++++++
 src/core/language-models.lisp    | 132 ++++++++++++
 src/core/measures.lisp           |  29 +++
 src/core/ngrams.lisp             | 242 ++++++++++++++++++++++
 src/generation/markov-chain.lisp |  61 ++++--
 src/packages.lisp                | 103 +++++-----
 src/util.lisp                    |  54 ++++-
 16 files changed, 1345 insertions(+), 192 deletions(-)
 create mode 100644 nltk/ch1-2.lisp
 create mode 100644 nltk/ch1-2.md
 create mode 100644 src/contrib/ms-ngrams.lisp
 create mode 100644 src/contrib/packages.lisp
 delete mode 100644 src/core/freq.lisp
 create mode 100644 src/core/indexing.lisp
 create mode 100644 src/core/language-models.lisp
 create mode 100644 src/core/measures.lisp
 create mode 100644 src/core/ngrams.lisp

diff --git a/cl-nlp.asd b/cl-nlp.asd
index 02e30fb..63fb250 100644
--- a/cl-nlp.asd
+++ b/cl-nlp.asd
@@ -3,8 +3,8 @@
 (in-package #:asdf)

 (defsystem #:cl-nlp
-  :version "0.0.2"
-  :description "NLP toolkit for Common Lisp"
+  :version "0.0.3"
+  :description "NLP toolkit for Common Lisp."
   :author "Vsevolod Dyomkin <vseloved@gmail.com>"
   :maintainer "Vsevolod Dyomkin <vseloved@gmail.com>"
   :license "Apache 2.0"
@@ -26,12 +26,30 @@
          (:module #:core
           :serial t
           :components
-          ((:file "tokenization")
-           (:file "freq")
-           #+nil (:file "distance")
-           #+nil (:file "ngram")))
+          ((:file "measures")
+           (:file "tokenization")
+           (:file "ngrams")
+           (:file "language-models")
+           (:file "indexing")))
          (:module #:generation
           :serial t
           :components
          ((:file "markov-chain")))
         (:file "user")))))
+
+(defsystem #:cl-nlp.contrib
+  :version "0.0.1"
+  :description "CL-NLP additional packages."
+  :author "Vsevolod Dyomkin <vseloved@gmail.com>"
+  :maintainer "Vsevolod Dyomkin <vseloved@gmail.com>"
+  :license "Apache 2.0"
+  :depends-on (#:cl-nlp #:drakma)
+  :serial t
+  :components
+  ((:module #:src
+    :components
+    ((:module #:contrib
+      :serial t
+      :components
+      ((:file "packages")
+       (:file "ms-ngrams")))))))
\ No newline at end of file

diff --git a/cl-nltk.asd b/cl-nltk.asd
index 1fd2b2a..2bb75d4 100644
--- a/cl-nltk.asd
+++ b/cl-nltk.asd
@@ -3,7 +3,7 @@
 (in-package #:asdf)

 (defsystem #:cl-nltk
-  :version "0.0.1"
+  :version "0.0.2"
   :description "Implementation of the examples from the NLTK book."
   :author "Vsevolod Dyomkin <vseloved@gmail.com>"
   :maintainer "Vsevolod Dyomkin <vseloved@gmail.com>"
@@ -15,4 +15,5 @@
    :serial t
    :components
    ((:file "package")
-    (:file "ch1-1")))))
+    (:file "ch1-1")
+    (:file "ch1-2")))))

diff --git a/nltk/ch1-1.lisp b/nltk/ch1-1.lisp
index af83474..8d27d5c 100644
--- a/nltk/ch1-1.lisp
+++ b/nltk/ch1-1.lisp
@@ -136,10 +136,6 @@
       (set# elt uniqs t))
     (if raw uniqs (ht-keys uniqs))))

-(defun sorted-ht-keys (test ht)
-  "Return hash-table keys of HT in sorted order accroding to TEST."
-  (sort (ht-keys ht) test :key #`(get# % ht)))
-
 (defun match-ctxs (word1-ctx word2-ctx)
   "Find the intersection between WORD1-CTX and WORD2-CTX tables
    and for each common context calculate the commonality weight."

diff --git a/nltk/ch1-1.md b/nltk/ch1-1.md
index 208630c..f43c3bf 100644
--- a/nltk/ch1-1.md
+++ b/nltk/ch1-1.md
@@ -1,20 +1,32 @@
 # NLTK - Computing with Language: Texts and Words
-OK, let's get started with the NLTK book. Its first chapter tries to impress the reader with how simple it is to accomplish some neat things with texts using it. Actually, the underlying algorithms that allow to achieve these results are mostly quite basic. We'll discuss them in this post and the code for the first part of the chapter can be found in [nltk/ch1-1.lisp](https://github.com/vseloved/cl-nlp/blob/master/nltk/ch1-1.lisp).
+OK, let's get started with the NLTK book. Its first chapter tries to
+impress the reader with how simple it is to accomplish some neat
+things with texts using it. Actually, the underlying algorithms that
+allow us to achieve these results are mostly quite basic. We'll discuss
+them in this post, and the code for the first part of the chapter can
+be found in [nltk/ch1-1.lisp](https://github.com/vseloved/cl-nlp/blob/master/nltk/ch1-1.lisp).

 ## Setting up texts for processing

-For the purpose of this demonstration we'll need several texts which can be downloaded from [NLTK data](http://nltk.org/nltk_data/). Namely, we'll use the following 5 texts:
+For the purpose of this demonstration we'll need several texts which
+can be downloaded from [NLTK data](http://nltk.org/nltk_data/).
+Namely, we'll use the following 5 texts:

 - Moby Dick (can be found inside Project Gutenberg)
 - Sense and Sensibility (likewise from Project Gutenberg)
 - The Book of Genesis
-- Inaugural Address Corpus (this one comes as a collection of separate texts, that you'll need to cat together into one file)
+- Inaugural Address Corpus (this one comes as a collection of separate
+  texts that you'll need to cat together into one file)
 - NPS Chat Corpus

 These texts are in the `nltk/data/` directory in `CL-NLP`.

-NLTK guys have created a special `Text` class and have defined all the operations in this chapter as its methods. We'll employ a slightly simpler approach and implement them as ordinary functions. Yet we'll also have a special-purpose `text` class to cache reusable results of long-running operations, like tokenization.
+NLTK guys have created a special `Text` class and have defined all the
+operations in this chapter as its methods. We'll employ a slightly
+simpler approach and implement them as ordinary functions. Yet we'll
+also have a special-purpose `text` class to cache reusable results of
+long-running operations, like tokenization.

    NLTK> (load-nltk-texts "data/")
    #<MOBY ...>
    #<SENSE ...>
    ...

-As you've already guessed, we've just loaded all the texts. The number in the last column is each text's character count.
+As you've already guessed, we've just loaded all the texts.
+The number in the last column is each text's character count.

-Now they are stored in `*texts*` hash-table. This is how we can access an individual text and name them for future usage:
+Now they are stored in the `*texts*` hash-table.
+This is how we can access an individual text and name it for future usage:

    (defparameter *sense* (get# :sense *texts*))

-(`get#` is one of the shorthand functions for operating on hash-tables defined in rutils)
+(`get#` is one of the shorthand functions for operating on hash-tables
+defined in [rutils](https://github.com/vseloved/rutils/blob/master/core/packages.lisp))

-Now we have a variable pointing to "Sense and Sensibility". If we examine it, this is what we'll see:
+Now we have a variable pointing to "Sense and Sensibility".
+If we examine it, this is what we'll see:

    NLTK> (describe *sense*)
    #<TEXT ...>
      [standard-object]
    Slots with :INSTANCE allocation:
      NAME = :SENSE
      ...
      TRANSITIONS = #<unbound slot>
      DISPERSION = #<unbound slot>

-As you see, there are some unbound slots in this structure: `words` will hold every word in the text after tokenization, `ctxs` will be a table of contexts for each word with their probabilities. By analogy, `transitons` will be a table of transition probabilities between words. Finally, `dispersion` will be a table of indices of word occurences in text. We'll use a lazy initialization strategy for them by defining `slot-unbound` CLOS methods, that will be called on first access to each slot. For example, here's how `words` is initialized:
+As you see, there are some unbound slots in this structure: `words`
+will hold every word in the text after tokenization, `ctxs` will be a
+table of contexts for each word with their probabilities. By analogy,
+`transitions` will be a table of transition probabilities between
+words. Finally, `dispersion` will be a table of indices of word
+occurrences in the text. We'll use a lazy initialization strategy for
+them by defining `slot-unbound` CLOS methods that will be called on
+first access to each slot. For example, here's how `words` is
+initialized:

    (defmethod slot-unbound (class (obj text) (slot (eql 'words)))
      (with-slots (raw words) obj
        (format t "Tokenizing text...~%")
        (prog1 (setf words (mapcan #`(cons "¶" (tokenize <word-chunker> %))
                                   (tokenize <parag-splitter> raw)))
          (format t "Number of words: ~A~%" (length words)))))

-First we split the raw text in paragraphs, because we'd like to preserve paragraph information. Splitting is slightly involved as paragraphs are separated by double newlines, while single newlines end every line in the text, and we have to distinguish this. We insert pillcrow signs paragraph boundaries. Then we tokenize the paragraphs into separate words (real words, punctuation marks, symbols, etc).
+First we split the raw text into paragraphs, because we'd like to
+preserve paragraph information. Splitting is slightly involved, as
+paragraphs are separated by double newlines, while single newlines end
+every line in the text, and we have to distinguish the two. We insert
+pilcrow signs ("¶") at paragraph boundaries. Then we tokenize the
+paragraphs into separate words (real words, punctuation marks,
+symbols, etc).

-NB. I consider tokenization the crucial function of the NLP toolkit, and we'll explore it in more detail in one of the future posts.
+NB. I consider tokenization the crucial function of the NLP toolkit,
+and we'll explore it in more detail in one of the future posts.

 ## Implementing the examples

-OK, now we are ready to start churning out examples from the first chapter.
+OK, now we are ready to start churning out examples from the first
+chapter.
-The first one finds occurences of certain words in the text. NLTK guys perform the search on the tokenized texts. But I think, it's quite OK to do it on raw strings with regexes. This has an added benefit of preserving text structure.
+The first one finds occurrences of certain words in the text. NLTK guys
+perform the search on the tokenized texts. But I think it's quite OK
+to do it on raw strings with regexes. This has an added benefit of
+preserving text structure.

    NLTK> (concordance *moby* "monstrous")
    Displaying 11 of 11 matches
    ong the former , one was of a most monstrous size . ... This came towards us ,
    ave been rummaged out of this monstrous
    Whale-Bones ; for Whales of a monstrous size are

-With `:pass-newlines` on we can get the output similar to NLTK's. Let's try one of the homework tasks:
+With `:pass-newlines` on we can get output similar to
+NLTK's. Let's try one of the homework tasks:

    NLTK> (concordance *genesis* "lived" :pass-newlines t)
    Displaying 75 of 75 matches

 Now let's try similarity. Here we won't do without proper tokenization.

    NLTK> (similar *moby* "monstrous")
    ("amazingly" "vast" "heartily" "extremely" "remarkably" "great"
     "exceedingly" "sweet" "very" "so" "good" "a" "as")

-We mostly get the same words as NLTK's result, but with a slightly different ordering. It turns out, that the reason for this is very simple. The function `similar` matches words based on the contexts, where they occur. According to the famous quote by John Rupert Firth:
+We mostly get the same words as NLTK's result, but with a slightly
+different ordering. It turns out that the reason for this is very
+simple. The function `similar` matches words based on the contexts
+where they occur. According to the famous quote by John Rupert Firth:

 > You shall know a word by the company it keeps

-But if we look at context overlap between various words from our list we'll see that the similarity relation between all these words is extremely weak: the decision is based on the match of a single context in which both words appeared in text. In fact, all the listed words are similar to the same extent.
+But if we look at the context overlap between various words from our
+list, we'll see that the similarity relation between all these words is
+extremely weak: the decision is based on the match of a single context
+in which both words appeared in the text. In fact, all the listed words
+are similar to the same extent.

    NLTK> (common-contexts *moby* "monstrous" "loving")
    ("most_and")

    NLTK> (apply #'common-contexts *moby* (similar *moby* "monstrous"))
    ("most_and")

-Actually, the next NLTK example is, probably, the best context overlap you can get from those texts:
+Actually, the next NLTK example is, probably, the best context overlap
+you can get from those texts:

    NLTK> (common-contexts *sense* "monstrous" "very")
    ("am_glad" "is_pretty" "a_pretty" "a_lucky")
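+
+(A note on the notation: a context here is the pair of the word's left
+and right neighbors joined with an underscore, so `most_and` means
+that both words were seen between "most" and "and".)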
+Now let's draw a dispersion plot of the words from the inaugural corpus.
+This task may seem difficult to approach at first, because the authors
+use a Python library `matplotlib` for drawing the graph. Fortunately,
+there's a language-agnostic tool to achieve similar goals, which is
+called `gnuplot`. There are a couple of Lisp wrapper libraries for it,
+and the actual code you need to write to drive it amounts to 2 lines
+(not counting the code to format the data for consumption). There are,
+actually, numerous language-agnostic tools on the Unix platform —
+don't forget to look for them when you have this kind of specific need :)

![gnuplot dispersion graph with cl-nlp](http://img.photobucket.com/albums/v473/pufpuf/dispersion_zps7447117e.jpg)

+The next problem in this part also seems pretty hard. And, in fact, it
+is extremely hard if framed correctly — to generate a meaningful text
+based on some other text. But the example solves an easier task: to
+generate _a somewhat_ meaningful text. And the approach taken to
+solve it is a very simple one — it is the baseline method in this area
+and is based on Markov chains. There was even a famous Markov-chain
+mock program in the Usenet times, called [Mark V. Shaney](http://en.wikipedia.org/wiki/Mark_V_Shaney).
+Markov models have one principal parameter — _order_.
+Mark V. Shaney was an order 2 chain.

 Let's try to generate something with it:

 And what if we raise the order?

    In the beginning God created the large sea creatures , and every bird , whatever moves on the earth . He stayed yet another seven days , and sent over that which he had gained in Paddan Aram . Esau saw that the interpretation was good , he said to them , they conspired against him to kill him . They took captive all their little ones , and for days and years ; and let it divide the waters from the waters . God said to the younger , and his seed

-The text starts to resemble the original more and more. Also you may notice, that the text will always start with "In". That's because Genesis isn't split in paragraphs, and our generation starts from paragraph beginnings, of which there's only one here.
+The text starts to resemble the original more and more. Also you may
+notice that the text will always start with "In". That's because
+Genesis isn't split into paragraphs, and our generation starts from
+paragraph beginnings, of which there's only one here.

 OK, this seems to work, but with probabilities you never know for sure... ;)

 Now, we're left with very simple tasks. Let's just do them:

    NLTK> (length (text-words *genesis*))
    44671

-In the book they had a slightly different number: 44764. This is because of the different tokenization scheme. The differences can be seen in the next snippet (we have a cleaner version for this use case :)
+In the book they had a slightly different number: 44764. This is
+because of the different tokenization scheme. The differences can be
+seen in the next snippet (we have a cleaner version for this use case :)

    NLTK> (take 20 (sort (remove-duplicates (text-words *genesis*)
                                            :test 'string=)
                         'string<))
    ("!" "\"" "'" "(" ")" "," "-" "." ":" ";" "?" "A" "Abel" "Abida" "Abimael"
     "Abimelech" "About" "Abraham" "Abram" "Accad")

-What about the vocabulary size? Well, once again very similar to the NLTK number (2789).
+What about the vocabulary size? Well, once again very similar to the
+NLTK number (2789).
    NLTK> (length (remove-duplicates (text-words *genesis*) :test 'string=))
    2634

    NLTK> (count "smote" (text-words *genesis*) :test 'string=)
    0

 Hmm... What about some other word?

    NLTK> (count "Abraham" (text-words *genesis*) :test 'string=)
    134

-This seems to work. What's the problem with `"smote"`? Turns out, there's no such word in the Genesis text: at least the examination of the text doesn't show any traces of it. Looks like we've found a bug in the book :)
+This seems to work. What's the problem with `"smote"`? Turns out,
+there's no such word in the Genesis text: at least the examination of
+the text doesn't show any traces of it. Looks like we've found a bug
+in the book :)

    (defun percentage (count total)
      (/ (* 100.0 count) total))

    (defun lexical-diversity (text)
      (with-slots (words) text
        (/ (length words)
           (length (remove-duplicates words :test 'string=)))))

    NLTK> (lexical-diversity *chat*)
    6.9837084

-Interestingly, the results for `*chat*` corpus differ from the NLTK ones, although they are calculated based on tokens, provided in the corpus and not extracted by our tokenization algorithms. This text is special, because it is extracted from the XML-structured document, which also contains the full tokenization. To use it we swap `words` in `*chat*` corpus:
+Interestingly, the results for the `*chat*` corpus differ from the NLTK
+ones, although they are calculated based on tokens provided in the
+corpus and not extracted by our tokenization algorithms. This text is
+special, because it is extracted from an XML-structured document,
+which also contains the full tokenization.
+To use it we swap `words` in the `*chat*` corpus:

    NLTK> (setf (text-words *chat*)
                (mapcar #'token-word
                        (flatten (corpus-text-tokens ncorp:+nps-chat-corpus+))))

-But first we need to get the corpus and extract the data from it — see `corpora/nps-chat.lisp` for details.
+But first we need to get the corpus and extract the data from it —
+see `corpora/nps-chat.lisp` for details.

 And, finally, we can examine the Brown Corpus.

    GENRE | TOKENS | TYPES | LEXICAL DIVERSITY
    ...
    HUMOR | 21695 | 5016  | 4.3

-OK, seems like we're done with this chapter. So far there was no rocket science involved, but it was interesting...
+OK, seems like we're done with this chapter. So far there was no
+rocket science involved, but it was interesting...

 ## Implementation details

 So, what are the interesting bits we haven't discussed?

-First, let's look at a small optimization trick for calculating `lexical-diversity`. Our initial variant uses a library function `remove-duplicates` which is highly inefficient for this case.
+First, let's look at a small optimization trick for calculating
+`lexical-diversity`. Our initial variant uses the library function
+`remove-duplicates`, which is highly inefficient for this case.

    NLTK> (time (lexical-diversity *chat*))
    Evaluation took:
    23,687,560,947 processor cycles
    229,392 bytes consed

-What we'd like to do is something similar to the Python's version, which puts everything in a set and calculates its size. A set is easily represented with a hash-table:
+What we'd like to do is something similar to the Python version,
+which puts everything in a set and calculates its size.
+A set is easily represented with a hash-table:

    (defun uniq (list &key raw case-insensitive)
      "Return only unique elements from LIST either as a new list
       or as hash-table if RAW is set. Can be CASE-INSENSITIVE."
      (let ((uniqs (make-hash-table :test (if case-insensitive 'equalp 'equal))))
        (dolist (elt list)
          (set# elt uniqs t))
        (if raw uniqs (ht-keys uniqs))))

 Here's the time of the same calculation using `uniq`:

 A 1000x speed increase!

+Now, let's return to text generation. It is accomplished with the
+following loop (a simplified version):

    (loop :for i :from 1 :to length :do
       (let ((r (random 1.0))
             (total 0))
         (dotable (word prob (get# prefix transitions))
           (when (> (incf total prob) r)
             (push word rez)
             (setf prefix (cons word (butlast prefix)))
             (return)))))

-On each iteration it places all possible continuations of the current prefix on a segment from 0 to 1 and generates a random number that points to one of the variants. If there's no continuation it starts anew.
+On each iteration it places all possible continuations of the current
+prefix on a segment from 0 to 1 and generates a random number that
+points to one of the variants. If there's no continuation, it starts
+anew.

-NLTK book, actually, uses a slightly more complicated model: first, it builds a probability distribution on top of the transition frequencies and then generated the text from the probabilities. As of now I don't see why this is needed and if it makes any difference in the results.
+The NLTK book, actually, uses a slightly more complicated model:
+first, it builds a probability distribution on top of the transition
+frequencies and then generates the text from the probabilities. As of
+now I don't see why this is needed or if it makes any difference in
+the results.

 And, finally, here's how we draw the dispersion plot:

    (defun plot-dispersion (words file)
      "Plot WORDS dispersion data from FILE."
      (cgn:format-gnuplot "set title \"Lexical Dispersion Plot\"")
      (cgn:format-gnuplot "plot \"~A\" using 1:2:yticlabels(3) title \"\"" file))

-It's just 1 line of gnuplot code, actually, but we also need to prepare the data in a tab-separated text file:
+It's just 1 line of `gnuplot` code, actually, but we also need to
+prepare the data in a tab-separated text file:

-    (defun dump-data (words dispersion-table)7
+    (defun dump-data (words dispersion-table)
       "Dump data from DISPERSION-TABLE for WORDS into a temporary file
        and return its name."
       (let ((filename (fmt "/tmp/~A" (gensym))))
         (with-out-file (out filename)
           (doindex (i word words)
             (dolist (idx (get# word dispersion-table))
               (format out "~A~t~A~t~A~%" idx (1+ i) word)))
           (format out "0~t~A~t~%" (1+ (length words))))
-        filename))`
+        filename))

-To wrap up, we've seen a demonstration of a lot of useful tools for text processing, and also discussed how they can be built. Among all of them I want to outline the utility of a seemingly simplistic `concordance` that is actually kind of a `grep` tool that is indispensable for any text exploration. I even used it a couple of times debugging issues in more complex functions from this pack.
+To wrap up, we've seen a demonstration of a lot of useful tools for
+text processing, and also discussed how they can be built. Among all
+of them I want to highlight the utility of the seemingly simplistic
+`concordance`, which is actually a kind of `grep` tool that is
+indispensable for any text exploration. I even used it a couple of
+times to debug issues in more complex functions from this pack.
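+
+For the curious, here's roughly what such a regex-based `concordance`
+boils down to: a minimal sketch assuming `cl-ppcre` (the real version
+in [nltk/ch1-1.lisp](https://github.com/vseloved/cl-nlp/blob/master/nltk/ch1-1.lisp)
+supports more options, like `:pass-newlines` above):
+
+    (defun concordance-sketch (text word &key (width 25))
+      "Print the occurrences of WORD in TEXT's raw string
+       with WIDTH characters of context on each side."
+      (let ((raw (text-raw text)))
+        (ppcre:do-matches (start end (fmt "\\b~A\\b" word) raw)
+          (format t "~A~%"
+                  (substitute #\Space #\Newline
+                              (subseq raw (max 0 (- start width))
+                                      (min (length raw) (+ end width))))))))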
diff --git a/nltk/ch1-2.lisp b/nltk/ch1-2.lisp
new file mode 100644
index 0000000..022ab4e
--- /dev/null
+++ b/nltk/ch1-2.lisp
@@ -0,0 +1,75 @@
+;;; (c) 2013 Vsevolod Dyomkin
+
+(in-package #:nltk)
+(named-readtables:in-readtable rutils-readtable)
+
+(defclass text ()
+  ((name :initarg :name)
+   (raw :initarg :raw :accessor text-raw)
+   (words :accessor text-words)
+   (ctxs :accessor text-ctxs)
+   (transitions :accessor text-transitions)
+   (dispersion :accessor text-dispersion)
+   (ugrams :accessor text-ugrams)
+   (bigrams :accessor text-bigrams)
+   (trigrams :accessor text-trigrams)))
+
+(defmethod slot-unbound (class (obj text) (slot (eql 'ugrams)))
+  (with-slots (words ugrams) obj
+    (format t "~&Indexing unigrams...~%")
+    (prog1 (setf ugrams (index-ngrams 1 words))
+      (format t "Number of ugrams: ~A~%" (ngrams-count ugrams)))))
+
+(defmethod slot-unbound (class (obj text) (slot (eql 'bigrams)))
+  (with-slots (words bigrams) obj
+    (format t "~&Indexing bigrams...~%")
+    (prog1 (setf bigrams (index-ngrams 2 words))
+      (format t "Number of bigrams: ~A~%" (ngrams-count bigrams)))))
+
+(defmethod slot-unbound (class (obj text) (slot (eql 'trigrams)))
+  (with-slots (words trigrams) obj
+    (format t "~&Indexing trigrams...~%")
+    (prog1 (setf trigrams (index-ngrams 3 words))
+      (format t "Number of trigrams: ~A~%" (ngrams-count trigrams)))))
+
+(defun collocations (text)
+  (find-collocations (text-bigrams text) :n 30))
+
+(defun generate (text &key (n 20) (order 2))
+  "Generate random text of N words, based on TEXT."
+  (with-slots (transitions) text
+    (string-trim (append +white-chars+ +newline-chars+)
+                 (fmt "~{~A ~}"
+                      (generate-text (make 'markov-chain-generator :order order)
+                                     (make-lm 'stupid-backoff-lm
+                                              :1g (text-ugrams text)
+                                              :2g (when (> order 1)
+                                                    (text-bigrams text))
+                                              :3g (when (> order 2)
+                                                    (text-trigrams text)))
+                                     n)))))
+
+;; Plotting
+
+(defun dump-counts (ngrams n order-by cumulative)
+  "Dump N NGRAMS counts (or CUMULATIVE counts) ordered by ORDER-BY."
+  (let ((filename (fmt "/tmp/~A" (gensym)))
+        (total 0))
+    (with-out-file (out filename)
+      (doindex (i pair (ngrams-pairs ngrams :order-by order-by))
+        (when (and n (> i n))
+          (return))
+        (format out "~A~t~S~t~A~%" (1+ i) (car pair)
+                (if cumulative
+                    (incf total (cdr pair))
+                    (cdr pair))))
+      filename)))
+
+(defun plot (ngrams &key n (order-by '>) cumulative)
+  "Plot NGRAMS counts."
+  (cgn:with-gnuplot (t)
+    (cgn:format-gnuplot "set xtics rotate 90")
+    (cgn:format-gnuplot "set ylabel \"~@[Cumulative ~]Counts\"" cumulative)
+    (cgn:format-gnuplot
+     "plot \"~A\" using 1:3:xtic(2) with lines title \"\""
+     (dump-counts ngrams n order-by cumulative))))
\ No newline at end of file

diff --git a/nltk/ch1-2.md b/nltk/ch1-2.md
new file mode 100644
index 0000000..c29218c
--- /dev/null
+++ b/nltk/ch1-2.md
@@ -0,0 +1,340 @@
+# NLTK 1.3 - Computing with Language: Simple Statistics
+
+Most of the remaining parts of the first chapter of the NLTK book serve
+as an introduction to Python in the context of text processing. I won't
+translate that to Lisp, because there are much better resources
+explaining how to use Lisp properly.
+First and foremost I'd refer anyone interested to the appropriate
+chapters of [Practical Common Lisp](http://gigamonkeys.com/book):
+
+- [List Processing](http://gigamonkeys.com/book/they-called-it-lisp-for-a-reason-list-processing.html)
+- [Collections](http://gigamonkeys.com/book/collections.html)
+- [Variables](http://gigamonkeys.com/book/variables.html)
+- [Macros: Standard Control Constructs](http://gigamonkeys.com/book/macros-standard-control-constructs.html)
+
+It's only worth noting that Lisp has a different notion of lists than
+Python. Lisp's lists are linked lists, while Python's are essentially
+vectors. Lisp also has vectors as a separate data-structure, and it
+also has multidimensional arrays (something Python mostly lacks). And
+the set of Lisp's list operations is somewhat different from
+Python's. The list is the default sequence data-structure, but you should
+understand its limitations and know when to switch to vectors (when
+you have a lot of elements and often access them at random). Also
+Lisp doesn't provide Python-style syntactic sugar for slicing and
+dicing lists, although all the operations are there in the form of
+functions. The only thing which isn't easily reproducible in Lisp is
+assigning to a slice:
+
+    >>> sent[1:9] = ['Second', 'Third']
+    >>> sent
+    ['First', 'Second', 'Third', 'Last']
+
+There's `replace`, but it can't shrink a sequence:
+
+    CL-USER> (defvar sent (list 1 2 3 4 5 6 7 8 9 0))
+    CL-USER> (replace sent '("Second" "Third") :start1 1 :end1 9)
+    (1 "Second" "Third" 4 5 6 7 8 9 0)
+
+## Ngrams
+
+So, the only part worth discussing here is statistics.
+
+Let's start with a __frequency distribution__. We have already used
+something similar in the previous part for text generation, but it was
+very basic and tailored to the task. Now it's time to get into some
+serious language modeling and discuss a more general-purpose
+implementation.
+
+Such modeling is accomplished by collecting large amounts of
+statistical data about the appearance of words and their sequences in
+texts. These sequences are called __ngrams__. In a nutshell, you can
+think of an ngrams distribution as a table mapping ngram sequences to
+numbers.
+
+    (defclass ngrams ()
+      ((order :initarg :order :reader ngrams-order)
+       (count :reader ngrams-count)
+       (max-freq :reader ngrams-max-freq)
+       (min-freq :reader ngrams-min-freq)
+       (total-freq :reader ngrams-total-freq)))
+
+The crucial parameter of this class is `order`, which defines the
+length of a sequence. In practice, ngrams of orders from 1 to 5 may be
+used.
+
+`ngrams` is an abstract class. In Lisp you don't have to specify this
+property in any special way; you just don't implement methods for it. The
+simplest `ngrams` implementation — `table-ngrams` — uses an in-memory
+hash-table as a store. You can get ngram frequency and "probability"
+(the maximum likelihood estimation) from it, as well as the log of
+probability, which is used more often in calculations because it
+allows us to avoid the floating-point rounding errors that occur when
+multiplying many probabilities that are rather small values.
+
+    NLTK> (freq (text-bigrams *moby*) "The whale")
+    Indexing bigrams...
+    Number of bigrams: 116727
+    14
+    NLTK> (logprob (text-bigrams *moby*) "The whale")
+    -14.255587
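+
+As a quick sanity check of these numbers: `logprob` is just the base-2
+log of the maximum likelihood estimate, so freq and logprob should be
+related by logprob = log2(freq / total-freq). Working backwards,
+14 * 2^14.2556 ≈ 274,000, a plausible total count of bigram
+occurrences for a novel of Moby Dick's size.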
+So how do we get bigrams of Moby Dick? For that we just have to count
+all of them in text (this is a simplified version — some additional
+processing for sentence start/ends is needed):
+
+    (defun index-ngrams (order words &key ignore-case)
+      (make 'table-ngrams :order order
+            :table
+            (let ((ht (make-hash-table :test (if ignore-case 'equalp 'equal))))
+              (do ((tail words (rest tail)))
+                  ((shorter? tail order))
+                (incf (get# (if (= order 1)
+                                (car tail)
+                                (sub tail 0 order))
+                            ht 0)))
+              ht)))
+
+`table-ngrams` will be useful for simple experimentation and prototyping,
+like we do in our NLTK examples.
+
+    NLTK> (defvar *1grams* (text-ugrams *moby*))
+    Indexing unigrams...
+    Number of ugrams: 19244
+    NLTK> (freq *1grams* "whale")
+    906
+    NLTK> (take 50 (vocab *1grams* :order-by '>))
+    ("," "the" "<S>" "</S>" "." "of" "and" "-" "a" "to" ";" "in" "\"" "that" "'"
+     "his" "it" "I" "!" "s" "is" "he" "with" "was" "as" "all" "for" "this" "at"
+     "by" "but" "not" "him" "from" "be" "on" "?" "so" "whale" "one" "you" "had"
+     "have" "there" "But" "or" "were" "now" "which" "me")
+
+The strings "<S>" and "</S>" here denote special symbols for sentence
+start and end.
+
+Here's a cumulative plot of them:
+
+![Cumulative Frequency Plot for 50 Most Frequent Words in Moby Dick](http://img.photobucket.com/albums/v473/pufpuf/ccounts_zpsbc41c690.png)
+
+And here's just the counts graph:
+
+![Frequency Plot for 50 Most Frequent Words in Moby Dick](http://img.photobucket.com/albums/v473/pufpuf/counts_zpsa3d96079.png)
+
+And, finally, here's hapaxes:
+
+    NLTK> (take 50 (hapaxes (text-ugrams *moby*)))
+    ("orphan" "retracing" "sheathed" "padlocks" "dirgelike" "Buoyed" "liberated"
+     "Till" "Ixion" "closing" "suction" "halfspent" "THEE" "ESCAPED" "ONLY"
+     "Epilogue" "thrill" "etherial" "intercept" "incommoding" "tauntingly"
+     "backwardly" "coincidings" "ironical" "intermixingly" "whelmings" "inanimate"
+     "animate" "lookouts" "infatuation" "Morgana" "Fata" "gaseous" "mediums"
+     "bewildering" "bowstring" "mutes" "voicelessly" "THUS" "grapple"
+     "unconquering" "comber" "foregone" "bullied" "uncracked" "unsurrendered"
+     "Diving" "flume" "dislodged" "buttress")
+
+The next Python feature showcased here is __list comprehensions__.
+The idea behind them is to resemble set-theoretic notation in list
+definition. There's no such thing out of the box in Lisp (although you
+can implement a variant even closer to set notation in
+[just 24 lines](http://lisp-univ-etc.blogspot.com/2013/01/real-list-comprehensions-in-lisp.html)),
+and the general approach is to favor functional-style filtering with
+variants of `map` and `remove-if`.
+
+    NLTK> (sort (remove-if #`(< (length %) 15)
+                           (uniq (text-words *moby*)))
+                'string<)
+    ("CIRCUMNAVIGATION" "Physiognomically" "apprehensiveness" "cannibalistically" "characteristically" "circumnavigating" "circumnavigation" "circumnavigations" "comprehensiveness" "hermaphroditical" "indiscriminately" "indispensableness" "irresistibleness" "physiognomically" "preternaturalness" "responsibilities" "simultaneousness" "subterraneousness" "supernaturalness" "superstitiousness" "uncomfortableness" "uncompromisedness" "undiscriminating" "uninterpenetratingly")
+    NLTK> (sort (remove-if #`(or (<= (length %) 7)
+                                 (<= (freq (text-ugrams *chat*) %) 7))
+                           (vocab (text-ugrams *chat*)))
+                'string<)
+    ("20sUser104" <... another 130 users ...> "Question" "actually" "anything" "computer" "everyone" "football" "innocent" "listening" "remember" "seriously" "something" "talkcity_adults" "thinking" "together" "watching")
+
+In the NLTK variant all the users are removed from the corpus with some
+pre-processing.
+
+## Language Modeling
+
+But to be useful for real-world scenarios ngrams have to be large,
+really large (on the order of tens of gigabytes of data for
+trigrams). This means that you won't be able to simply store them in
+memory and will have to use some external storage: a general-purpose
+data-store, like a relational database, or special-purpose
+software.
+
+One such ngrams service that is available on the internet is
+[Microsoft Web N-gram Services](http://web-ngram.research.microsoft.com/).
+If you have a developer token, you can query it over HTTP. The service
+returns only log-probabilities (joint, as well as conditional) and
+runs really slowly, but it is capable of serving batch requests,
+i.e. returning probabilities for several ngrams at once. The
+implementation of the `ngrams` interface for this service is provided in
+[contrib/ms-ngrams.lisp](https://github.com/vseloved/cl-nlp/blob/master/src/contrib/ms-ngrams.lisp).
+
+We have already encountered conditional probabilities in the previous
+part. They have the following relationship with regular (so called
+"joint") probabilities (shown for bigrams):
+
+    p(A,B) = p(B|A) * p(A)
+    where p(A,B) is the joint probability and p(B|A) is the conditional one
+
+I.e. they can be calculated from the current ngrams plus the ngrams of
+the preceding order. So this operation is performed not on a single
+`ngrams` object, but on a pair of such objects. And they serve an
+important role, which we'll see below. But first we need to talk about
+language models.
+
+A __language model__ is, basically, a collection of ngrams of
+different orders. By combining these ngrams we're able to obtain some
+other measures beyond a simple frequency value or probability
+estimate. The biggest added value of such a model is in the smoothing
+capabilities that it implements. The problem smoothing solves is that
+you'll almost never be able to have all possible ngrams in your
+data-store — there are just too many of them, and the language users
+keep adding more. But it's very nasty to get 0 probability for some
+ngram. A language model allows us to find a balance between the number
+of ngrams you have to store and the ability to get meaningful
+probability numbers for any ngram. This is achieved with various
+smoothing techniques: interpolation and discounting. Some of the
+smoothing methods are:
+
+- +1 smoothing
+- Kneser-Ney smoothing
+- and Stupid Backoff
+
+A good general compilation of various smoothing methods is assembled in
+[this presentation](http://courses.washington.edu/ling570/fei_fall09/10_26_Smoothing.pdf).
+
+Let's look at the simplified implementation of scoring a sentence
+with the Stupid Backoff model:
+
+    (defmethod logprob ((model language-model) (sentence list))
+      (with-slots (order) model
+        (let ((rez 0)
+              (s (append (cons "<S>" sentence) (list "</S>"))))
+          (when (shorter? s order)
+            (return-from logprob (logprob (get-ngrams (length s) model) s)))
+          ;; start of the sentence: p(A|<S>) * p(B|<S>,A) * ...
+          (do ((i 2 (1+ i)))
+              ((= i order))
+            (incf rez (cond-logprob model (sub s 0 i))))
+          ;; middle of the sentence
+          (do ((tail s (rest tail)))
+              ((shorter? tail order))
+            (incf rez (cond-logprob model (sub tail 0 order))))
+          rez)))
+
+Eventually, the language model is able to return the estimated
+probability of any sequence of words, not limited to the maximum order
+of ngrams in it. This is usually calculated using the Markov assumption
+with the following formula (here, for a trigram language model):
+
+    p(s) = p(A) * p(B|A) * p(C|A,B) * p(D|B,C) ... * p(Z|X,Y)
+    where s = A B ... Z
+
+    NLTK> (defvar *moby-lm2*
+            (make-lm 'stupid-backoff-lm
+                     :1g (text-ugrams *moby*)
+                     :2g (text-bigrams *moby*)))
+    NLTK> (prob *moby-lm2* "This is a test sentence.")
+    6.139835e-20
+
+That was, by the way, the probability of an unseen sentence, with the
+word "sentence" completely missing from the vocabulary.
+
+    NLTK> (prob *moby-lm2* '("<S>" "Moby" "Dick" "." "</S>"))
+    5.084481e-9
+    NLTK> (float (prob (text-bigrams *moby*) '("Moby" "Dick")))
+    3.0310333e-4
+
+As you see, it's much more likely to encounter the sentence "Moby Dick."
+in this text, although not as likely as the phrase "Moby Dick". :)
+
+Such a model is also able to generate random texts just like we did in
+the previous part. But because of the smoothing capability it's much
+more general, i.e. it can generate sequences with any word from the
+vocabulary, even phrases unseen before. At the same time it's much
+more computationally expensive, because now generating each new word
+takes `O(vocabulary size)` while it was `O(average number of words
+following any particular word)`.
+
+    NLTK> (princ (generate *genesis* :order 2 :n 93))
+    burial to judged eaten sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung foreign longed them ought up Temanites Aran earth earth blessings surface surface surface surface surface surface surface surface surface floated Darkness Now homage earth Now In princes said vengeance It passed said divide In beginning earth Asenath said re The peaceful kind Calah said blameless mistress Chaldees said hunter said middle surface surface surface surface yonder earth rib said said smoking smoking smoking
+
+And, as you see, this example totally doesn't resemble the one in the
+previous part. Is this a bug? No, just a trick that is played with us
+because we aren't following the basic math principles. In the Stupid
+Backoff model the probabilities don't add up to 1, and the conditional
+probability of an unseen ngram may be larger than the largest
+probability of any recorded one! This is the reason we end up producing
+sequences of repeated words. The problem is much less obvious for the
+trigram model, although the text remains complete gibberish.
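+
+For reference, this is the scoring rule of Stupid Backoff (after
+Brants et al., 2007), sketched here for bigrams with our default
+backoff constant of 0.4:
+
+    S(B|A) = freq(A B) / freq(A)           if freq(A B) > 0
+    S(B|A) = 0.4 * freq(B) / total-freq    otherwise
+
+The backed-off branch is never renormalized, so nothing forces the
+scores of all possible continuations to sum to 1. Here's the trigram
+version:
+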
+    NLTK> (princ (generate *genesis* :order 3 :n 93))
+    brink time wagons fourth Besides south darkness listen foreigner Stay possessor lentils backwards be call dignity Kenizzites tar witness strained Yes appear colts bodies Reuel burn inheritance Galeed Hadar money touches conceal mighty foreigner spices Set pit straw son hurry yoke numbered gutters Dedan honest drove Magdiel Nod life assembly your Massa iniquity Tola still fifteen ascending wilderness everywhere shepherd harm bore Elah Jebusites Assyria butler Euphrates sinners gave Nephilim Stay garments find lifted communing closed Ir lights doing weeping shortly disobedience possessions drank peoples fifteen bless talked songs lamb far Shaveh heavens
+
+What this example shows us are at least two things:
+
+- we should always check that the mathematical properties of our models
+  still hold as we tweak them
+- although the major use-case for a language model is scoring, you can
+  get a feel for how well it will perform by looking at the texts it
+  generates
+
+## Finding collocations
+
+This is another interesting and useful NLP problem with a very elegant
+baseline solution, which is explained in this
+[article](http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html).
+Hopefully, we'll get back to it in more detail in the future chapters.
+And for now here are the results of implementing the algorithm from the article:
+
+    NLTK> (collocations *inaugural*)
+    (("United" "States") ("fellow" "citizens") ("four" "years") ("years" "ago")
+     ("Federal" "Government") ("General" "Government") ("American" "people")
+     ("Vice" "President") ("Old" "World") ("Almighty" "God") ("Fellow" "citizens")
+     ("Chief" "Magistrate") ("Chief" "Justice") ("God" "bless") ("go" "forward")
+     ("every" "citizen") ("Indian" "tribes") ("public" "debt") ("one" "another")
+     ("foreign" "nations") ("political" "parties") ("State" "governments")
+     ("National" "Government") ("United" "Nations") ("public" "money")
+     ("national" "life") ("beloved" "country") ("upon" "us") ("fellow" "Americans")
+     ("Western" "Hemisphere"))
+
+I'm surprised at how similar they are to NLTK's, considering that I
+didn't look at their implementation. In fact, they are the same up to
+the difference in the list of __stopwords__ (the dirty secret of every
+NLP application :) The collocation extraction function itself can be
+found in [core/indexing.lisp](https://github.com/vseloved/cl-nlp/blob/master/src/core/indexing.lisp),
+and the log-likelihood ratio it relies on in
+[core/measures.lisp](https://github.com/vseloved/cl-nlp/blob/master/src/core/measures.lisp).
+
+## Other uses of ngrams
+
+Ngrams are also sometimes used for individual characters to build
+character language models. And here's another usage from NLTK — for
+counting word lengths.
+
+    NLTK> (defvar *moby-lengths*
+            (index-ngrams 1 (mapcar #'length (text-words *moby*))))
+    NLTK> (vocab *moby-lengths*)
+    (1 4 2 6 8 9 11 5 7 3 10 12 13 14 16 15 17 18 20)
+    NLTK> (ngrams-pairs *moby-lengths*)
+    ((1 . 58368) (4 . 42273) (2 . 35726) (6 . 17111) (8 . 9966) (9 . 6428)
+     (11 . 1873) (5 . 26595) (7 . 14399) (3 . 49633) (10 . 3528) (12 . 1053)
+     (13 . 567) (14 . 177) (16 . 22) (15 . 70) (17 . 12) (18 . 1) (20 . 1))
+    NLTK> (ngrams-max-freq *moby-lengths*)
+    58368
+    NLTK> (freq *moby-lengths* 3)
+    49633
+
+## Final thoughts
+
+Language modeling is really the foundation of any serious NLP
+work. Having access to ngrams expands your possibilities immensely,
+but the problem with them is that moving from a prototype to a
+production implementation becomes tricky due to the problems of
+collecting a representative data-set and, consequently, efficiently
+storing it.
+Yet, there are solutions: the [Google Books Ngrams](http://storage.googleapis.com/books/ngrams/books/datasetsv2.html)
+and [Google Web1T](http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html)
+are examples of web-scale ngrams data-sets, and there's also
+special-purpose software for storing large ngrams corpora and
+obtaining language models from them. The notable examples are
+[BerkeleyLM](http://code.google.com/p/berkeleylm/) and
+[KenLM](http://kheafield.com/code/kenlm/).

diff --git a/src/contrib/ms-ngrams.lisp b/src/contrib/ms-ngrams.lisp
new file mode 100644
index 0000000..016d9cb
--- /dev/null
+++ b/src/contrib/ms-ngrams.lisp
@@ -0,0 +1,65 @@
+;;; (c) 2013 Vsevolod Dyomkin
+
+(in-package #:nlp.contrib.ngrams)
+(named-readtables:in-readtable rutils-readtable)
+
+
+;;; Microsoft ngrams
+
+(defclass ms-ngrams (ngrams)
+  ((count :initform -1)  ; special value to indicate that we don't know it :)
+   (url :initarg :url :accessor ms-ngrams-url
+        :initform "http://web-ngram.research.microsoft.com/rest/lookup.svc")
+   (user-token :initarg :user-token :accessor ms-ngrams-user-token)
+   (catalog :initarg :catalog :initform "bing-body/apr10"
+            :accessor ms-ngrams-catalog))
+  (:documentation
+   "Frontend to Microsoft Ngrams service.
+    See http://web-ngram.research.microsoft.com/info/"))
+
+(flet ((to-string (ngram)
+         "If NGRAM is a list, convert it to string."
+         (if (listp ngram) (strjoin " " ngram) ngram)))
+
+(macrolet ((query-ngrams (op)
+             `(with-slots (url user-token catalog order) ms-ngrams
+                (let ((*read-eval* nil))
+                  (read-from-string
+                   (drakma:http-request
+                    (fmt "~A/~A/~A/~A?u=~A&p=~A"
+                         url catalog order ,op user-token
+                         (to-string ngram))))))))
+
+(defmethod logprob ((ngrams ms-ngrams) ngram)
+  (query-ngrams "jp"))
+
+(defmethod cond-logprob ((ngrams ms-ngrams) ngram)
+  (query-ngrams "cp"))
+
+) ; end of macrolet
+
+(macrolet ((query-ngrams (op)
+             `(with-slots (url user-token catalog order) ms-ngrams
+                (let ((*read-eval* nil))
+                  (mapcar #'read-from-string
+                          (split-sequence
+                           #/Newline (drakma:http-request
+                                      (fmt "~A/~A/~A/~A?u=~A"
+                                           url catalog order ,op user-token)
+                                      :method :post
+                                      :content (fmt "~{~A~%~}"
+                                                    (mapcar #'to-string
+                                                            ngrams-list)))))))))
+
+(defmethod logprobs ((ngrams ms-ngrams) &rest ngrams-list)
+  (query-ngrams "jp"))
+
+(defmethod cond-logprobs ((ngrams ms-ngrams) &rest ngrams-list)
+  (query-ngrams "cp"))
+
+) ; end of macrolet
+) ; end of flet
+
+
+(defmethod ngrams-eq ((ngrams ms-ngrams))
+  #'equalp)
\ No newline at end of file

diff --git a/src/contrib/packages.lisp b/src/contrib/packages.lisp
new file mode 100644
index 0000000..dc82abc
--- /dev/null
+++ b/src/contrib/packages.lisp
@@ -0,0 +1,8 @@
+;;; (c) 2013 Vsevolod Dyomkin
+
+(cl:defpackage #:nlp.contrib.ngrams
+  (:use #:common-lisp #:rutil #:nlp)
+  (:export #:ms-ngrams
+           #:ms-ngrams-url
+           #:ms-ngrams-user-token
+           #:ms-ngrams-catalog))
\ No newline at end of file

diff --git a/src/core/freq.lisp b/src/core/freq.lisp
deleted file mode 100644
index 112d7b2..0000000
--- a/src/core/freq.lisp
+++ /dev/null
@@ -1,77 +0,0 @@
-;;; (c) 2013 Vsevolod Dyomkin
-
-(in-package #:nlp.core)
-(named-readtables:in-readtable rutils-readtable)
-
-
-(defun index-context-freqs (words &key ignore-order)
-  "Create a table of weighted conditional frequencies
-   of 1-word contexts to each side of a word
-   (if IGNORE-ORDER both left_right and right_left
-   are normalized and treated as the same context)
-   for each distinct word in WORDS."
-  ;; TODO: generalize for broader contexts
-  (let ((ctxs (make-hash-table :test 'equal)))
-    (loop :for (prev cur next) :on (cons "<S>" (append words (list "</S>")))
-          :while next :do
-       (unless (get# cur ctxs)
-         (set# cur ctxs (make-hash-table :test 'equal)))
-       (when (and (upper-case-p (char cur 0))
-                  (ending-word-p prev))
-         (setf prev "<S>"))
-       (when (and (upper-case-p (char next 0))
-                  (ending-word-p cur))
-         (setf next "</S>"))
-       (let ((prev_next (if (and ignore-order (string< next prev))
-                            (strcat next "_" prev)
-                            (strcat prev "_" next))))
-         (set# prev_next (get# cur ctxs)
-               (1+ (get# prev_next (get# cur ctxs) 0)))))
-    (normalize-freqs ctxs)))
-
-(defun index-prefix-transition-freqs (words &key (n 1))
-  "Create a table of weighted conditional frequencies
-   of next words for each distinct reversed N-word sequence in WORDS."
-  (let ((transitions (make-hash-table :test 'equalp))
-        (limit (length words))
-        (count 0))
-    ;; traversing the list of words from end
-    (loop :for tail :on (reverse (append (make-list n) words))
-          :while (< count limit) :do
-       (incf count)
-       (let* ((word (car tail))
-              (prefix (subseq tail 1 (1+ n))))
-         (when (and (> n 1) (string= "¶" (car prefix)))
-           (setf prefix (cons "¶" (make-list (1- n)))))
-         (unless (get# prefix transitions)
-           (set# prefix transitions (make-hash-table :test 'equal)))
-         (set# word (get# prefix transitions)
-               (1+ (get# word (get# prefix transitions) 0)))))
-    (normalize-freqs transitions)))
-
-(defun index-word-transition-freqs (words)
-  "Create a table of weighted conditional frequencies
-   of next words for each distinct word in WORDS."
-  (let ((transitions (make-hash-table :test 'equalp))
-        (word-vec (make-array (1+ (length words))
-                              :initial-contents (cons "¶" words))))
-    (dotimes (i (1- (length words)))
-      (let ((prev (elt word-vec i))
-            (cur (elt word-vec (+ i 1))))
-        (unless (get# prev transitions)
-          (set# prev transitions (make-hash-table :test 'equal)))
-        (set# cur (get# prev transitions)
-              (1+ (get# cur (get# prev transitions) 0)))))
-    (normalize-freqs transitions)))
-
-
-;;; Helpers
-
-(defun normalize-freqs (ht-of-hts)
-  "For each table in HT-OF-HTS normalize all the values.
-   Returns the modified HT-OF-HTS."
-  (maphash #`(let ((total (reduce '+ (ht-vals %%))))
-               (dotable (k v %%)
-                 (set# k %% (/ v total))))
-           ht-of-hts)
-  ht-of-hts)
\ No newline at end of file

diff --git a/src/core/indexing.lisp b/src/core/indexing.lisp
new file mode 100644
index 0000000..cb84bfa
--- /dev/null
+++ b/src/core/indexing.lisp
@@ -0,0 +1,149 @@
+;;; (c) 2013 Vsevolod Dyomkin
+
+(in-package #:nlp.core)
+(named-readtables:in-readtable rutils-readtable)
+(declaim (optimize (compilation-speed 2) (speed 3) (space 2) (debug 1)))
+
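+;; INDEX-NGRAMS below walks WORDS with an ORDER-wide sliding window,
+;; rewriting the paragraph marker "¶" and sentence-final periods into
+;; the explicit <S>/</S> sentence-boundary tokens as it counts.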
+(defun index-ngrams (order words &key ignore-case)
+  "Make an ngrams-table of ORDER from a list of WORDS.
+   May IGNORE-CASE."
+  (make 'table-ngrams :order order
+        :table
+        (let ((ht (make-hash-table :test (if ignore-case 'equalp 'equal)
+                                   :rehash-size 10.0))
+              (last-idx (1- order)))
+          (do ((tail words (rest tail)))
+              ((shorter? tail order))
+            (let ((cur (car tail)))
+              (if (= 1 order)
+                  (progn
+                    (when (stringp cur)
+                      (cond ((string= "¶" cur)
+                             (setf cur "<S>"))
+                            ((and (every #'period-char-p cur)
+                                  (or (null (rest tail))
+                                      (upper-case-p (char (second tail) 0))))
+                             (setf tail (cons nil (cons "</S>" (rest tail)))))))
+                    (incf (get# cur ht 0)))
+                  (incf (get# (let* ((ngram (sub tail 0 order))
+                                     (suffix (rest ngram)))
+                                (cond
+                                  ;; paragraph start is sentence start
+                                  ((string= "¶" cur)
+                                   (cons "<S>" suffix))
+                                  ;; paragraph end
+                                  ((string= "¶" (nth last-idx ngram))
+                                   (setf tail (nthcdr (1- last-idx) tail))
+                                   (append (butlast ngram) (list "</S>")))
+                                  ;; sentence end
+                                  ((and (upper-case-p
+                                         (char (nth last-idx ngram) 0))
+                                        (every #'period-char-p
+                                               (nth (1- last-idx) ngram)))
+                                   (setf tail (append (list nil "<S>")
+                                                      (nthcdr last-idx tail)))
+                                   (append (butlast ngram) (list "</S>")))
+                                  ;; inside sentence
+                                  (t ngram)))
+                              ht 0)))))
+          (when (= order 1)
+            (when (get# "</S>" ht)
+              (set# "</S>" ht (decf (get# "</S>" ht)))))
+          ht)))
+
+(defun index-context-freqs (words &key ignore-order)
+  "Create a table of weighted conditional frequencies
+   of 1-word contexts to each side of a word
+   (if IGNORE-ORDER both left_right and right_left
+   are normalized and treated as the same context)
+   for each distinct word in WORDS."
+  ;; TODO: generalize for broader contexts
+  (let ((ctxs (make-hash-table :test 'equal)))
+    (loop :for (prev cur next) :on (cons "<S>" (append words (list "</S>")))
+          :while next :do
+       (unless (get# cur ctxs)
+         (set# cur ctxs (make-hash-table :test 'equal)))
+       (when (and (upper-case-p (char cur 0))
+                  (ending-word-p prev))
+         (setf prev "<S>"))
+       (when (and (upper-case-p (char next 0))
+                  (ending-word-p cur))
+         (setf next "</S>"))
+       (let ((prev_next (if (and ignore-order (string< next prev))
+                            (strcat next "_" prev)
+                            (strcat prev "_" next))))
+         (set# prev_next (get# cur ctxs)
+               (1+ (get# prev_next (get# cur ctxs) 0)))))
+    (normalize-freqs ctxs)))
+
+(defun index-prefix-transition-freqs (words &key (n 1))
+  "Create a table of weighted conditional frequencies
+   of next words for each distinct reversed N-word sequence in WORDS."
+  (let ((transitions (make-hash-table :test 'equalp))
+        (limit (length words))
+        (count 0))
+    ;; traversing the list of words from end
+    (loop :for tail :on (reverse (append (make-list n) words))
+          :while (< count limit) :do
+       (incf count)
+       (let* ((word (car tail))
+              (prefix (sub tail 1 (1+ n))))
+         (when (and (> n 1) (string= "¶" (car prefix)))
+           (setf prefix (cons "¶" (make-list (1- n)))))
+         (unless (get# prefix transitions)
+           (set# prefix transitions (make-hash-table :test 'equal)))
+         (set# word (get# prefix transitions)
+               (1+ (get# word (get# prefix transitions) 0)))))
+    (normalize-freqs transitions)))
+
+(defun index-word-transition-freqs (words)
+  "Create a table of weighted conditional frequencies
+   of next words for each distinct word in WORDS."
+  (let ((transitions (make-hash-table :test 'equalp))
+        (word-vec (make-array (1+ (length words))
+                              :initial-contents (cons "¶" words))))
+    (dotimes (i (1- (length words)))
+      (let ((prev (elt word-vec i))
+            (cur (elt word-vec (+ i 1))))
+        (unless (get# prev transitions)
+          (set# prev transitions (make-hash-table :test 'equal)))
+        (set# cur (get# prev transitions)
+              (1+ (get# cur (get# prev transitions) 0)))))
+    (normalize-freqs transitions)))
+
+
+;;; Collocations
+
+(defun find-collocations (bigrams &key (n 20))
+  "Find up to N strongest collocations in BIGRAMS."
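+  ;; The scheme: first collect left and right marginal frequencies of
+  ;; every word, then score each bigram that is free of stopwords by
+  ;; the log-likelihood ratio of its 2x2 contingency table.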
+  (let ((rez (make-hash-table :test 'equal))
+        (left (make-hash-table :test 'equal))
+        (right (make-hash-table :test 'equal))
+        (total (ngrams-total-freq bigrams)))
+    (dotable (ngram freq (ngrams-table bigrams))
+      (ds-bind (l r) ngram
+        (set# l left (+ freq (get# l left 0)))
+        (set# r right (+ freq (get# r right 0)))))
+    (dotable (ngram freq (ngrams-table bigrams))
+      (unless (reduce #'or2
+                      (mapcar #`(member % *stopwords-en* :test 'string-equal)
+                              ngram))
+        (let ((lfreq (- (get# (car ngram) left) freq))
+              (rfreq (- (get# (cadr ngram) right) freq)))
+          (set# ngram rez
+                (log-likelihood-ratio freq lfreq
+                                      rfreq (- total lfreq rfreq freq))))))
+    (take n (sorted-ht-keys '> rez))))
+
+
+;;; Helpers
+
+(defun normalize-freqs (ht-of-hts)
+  "For each table in HT-OF-HTS normalize all the values.
+   Returns the modified HT-OF-HTS."
+  (maphash #`(let ((total (reduce '+ (ht-vals %%))))
+               (dotable (k v %%)
+                 (set# k %% (/ v total))))
+           ht-of-hts)
+  ht-of-hts)
\ No newline at end of file

diff --git a/src/core/language-models.lisp b/src/core/language-models.lisp
new file mode 100644
index 0000000..dc9d49e
--- /dev/null
+++ b/src/core/language-models.lisp
@@ -0,0 +1,132 @@
+;;; (c) 2013 Vsevolod Dyomkin
+
+(in-package #:nlp.core)
+(named-readtables:in-readtable rutils-readtable)
+
+
+(defclass language-model ()
+  ((order :initarg :order :reader lm-order)
+   (ngrams :initarg :ngrams :reader lm-ngrams))
+  (:documentation
+   "A language model is a collection of NGRAMS of all orders from 1 up to ORDER."))
+
+(defmethod vocab ((model language-model) &key order-by)
+  (vocab (get-ngrams 1 model) :order-by order-by))
+
+(defgeneric make-lm (class &key 1g 2g 3g 4g 5g &allow-other-keys)
+  (:documentation
+   "Make an instance of a language model of a certain CLASS
+    with provided unigrams (1G), ...
up to fivegrams (5G).")) + +(defmethod make-lm (class &key 1g 2g 3g 4g 5g &allow-other-keys) + (let ((order (cond (5g 5) + (4g 4) + (3g 3) + (2g 2) + (1g 1) + (t (error "No ngrams supplied"))))) + (make class + :order order + :ngrams + (make-array + (1+ order) + :initial-contents + (case order + (1 (list nil + 1g)) + (2 (list nil + (or 1g (error "No unigrams supplied for LM of order 2")) + 2g)) + (3 (list nil + (or 1g (error "No unigrams supplied for LM of order 3")) + (or 2g (error "No bigrams supplied for LM of order 3")) + 3g)) + (4 (list nil + (or 1g (error "No unigrams supplied for LM of order 4")) + (or 2g (error "No bigrams supplied for LM of order 4")) + (or 3g (error "No trigrams supplied for LM of order 4")) + 4g)) + (5 (list nil + (or 1g (error "No unigrams supplied for LM of order 5")) + (or 2g (error "No bigrams supplied for LM of order 5")) + (or 3g (error "No trigrams supplied for LM of order 5")) + (or 4g (error "No fourgrams supplied for LM of order 5")) + 5g))))))) + +(defgeneric perplexity (model test-sentences) + (:documentation + "Calculate perplexity of the MODEL on the list of TEST-SENTENCES.")) + +(defmethod perplexity ((model language-model) test-sentences) + (expt 2 (- (/ (reduce #'+ (mapcar #`(logprob ngrams %) test-sentences)) + (reduce #'+ (mapcar #'length test-sentences)))))) + + +;;; Stupid Backoff LM + +(defclass stupid-backoff-lm (language-model) + ((backoff :initarg backoff :initform 0.4 :reader lm-backoff)) + (:documentation + "Stupid Backoff language model.")) + +(defmethod prob ((lm language-model) (sentence string)) + (prob lm (tokenize sentence))) + +(defmethod prob ((lm language-model) (sentence list)) + (expt 2 (logprob lm sentence))) + +(defmethod logprob ((lm language-model) (sentence string)) + (logprob lm (tokenize sentence))) + +(defmethod logprob ((model language-model) (sentence list)) + (unless sentence (return-from logprob nil)) + (with-slots (order) model + (let ((rez 0)) + (if (= 1 order) + (dolist (word sentence rez) + (incf rez (logprob (get-ngrams 1 model) word))) + (let ((s (append (cons "" sentence) (list "")))) + (if (shorter? s order) + (logprob (get-ngrams (length s) model) s) + (progn + (do ((i 2 (1+ i))) + ((= i order)) + (incf rez (cond-logprob model (sub s 0 i)))) + (do ((tail s (rest tail))) + ((shorter? tail order)) + (let ((ngram (sub tail 0 order))) + (unless (search '("" "") ngram :test 'equal) + (incf rez (cond-logprob model ngram))))) + rez))))))) + +(defmethod cond-prob ((model stupid-backoff-lm) ngram) + (with-accessors ((ngrams lm-ngrams) (backoff lm-backoff)) model + (let ((coef 1) + (len (length ngram))) + (loop :for i :from len :downto 1 :do + (let* ((cur (butlast ngram (- len i))) + (freq (freq (elt ngrams i) + (if (cdr cur) cur (car cur))))) + (if (zerop freq) + (setf coef (* coef backoff)) + (return-from cond-prob + (* coef (/ freq + (case i + (1 (ngrams-total-freq (elt ngrams 1))) + (2 (freq (elt ngrams 1) (car ngram))) + (otherwise + (freq (elt ngrams (1- i)) (butlast ngram)))))))))) + (* coef (/ (ngrams-min-freq (elt ngrams 1)) + (ngrams-total-freq (elt ngrams 1))))))) + +(defmethod cond-logprob ((model stupid-backoff-lm) ngram) + (log2 (cond-prob model ngram))) + + +;;; Helper functions + +(declaim (inline get-ngrams)) +(defun get-ngrams (order model) + "Get ngrams of a given ORDER from MODEL." 
diff --git a/src/core/ngrams.lisp b/src/core/ngrams.lisp
new file mode 100644
index 0000000..b448acb
--- /dev/null
+++ b/src/core/ngrams.lisp
@@ -0,0 +1,242 @@
+;;; (c) 2013 Vsevolod Dyomkin
+
+(in-package #:nlp.core)
+(named-readtables:in-readtable rutils-readtable)
+
+
+;;; Abstract Ngrams
+
+(defclass ngrams ()
+  ((order :initarg :order :reader ngrams-order)
+   (count :reader ngrams-count)
+   (max-freq :reader ngrams-max-freq)
+   (min-freq :reader ngrams-min-freq)
+   (total-freq :reader ngrams-total-freq))
+  (:documentation
+   "An abstract ngrams interface."))
+
+(defmethod print-object ((ngrams ngrams) stream)
+  (print-unreadable-object (ngrams stream :type t :identity t)
+    (if (slot-boundp ngrams 'order)
+        (with-accessors ((order ngrams-order) (count ngrams-count)
+                         (total-freq ngrams-total-freq)) ngrams
+          (format stream "order:~A count:~A outcomes:~A"
+                  order count total-freq))
+        (format stream "not initialized"))))
+
+(defgeneric ngrams-eq (ngrams)
+  (:documentation
+   "Get the equality predicate of NGRAMS (can be EQUAL or EQUALP)."))
+
+(defgeneric ngrams-pairs (ngrams &key order-by)
+  (:documentation
+   "Get the alist of all ngrams with their frequencies in NGRAMS,
+    possibly ordered by ORDER-BY predicate (e.g. < or >)."))
+
+(defgeneric vocab (ngrams &key order-by)
+  (:documentation
+   "Get the list of all ngrams in NGRAMS,
+    possibly ordered by ORDER-BY predicate (e.g. < or >)."))
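To make the protocol concrete, here is a small illustration (not part of the
patch; toy counts again) of TABLE-NGRAMS, defined at the end of this file:

    (let ((table (make-hash-table :test 'equal)))
      (dolist (pair '(("the" . 3) ("cat" . 1) ("mat" . 1)))
        (set# (car pair) table (cdr pair)))
      (let ((1g (make 'table-ngrams :order 1 :table table)))
        (list (freq 1g "the")    ; => 3
              (prob 1g "cat")    ; => 1/5
              (top-ngram 1g)     ; => "the"
              (hapaxes 1g))))    ; => ("cat" "mat") in some order, freq 1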
+(defgeneric freq (ngrams ngram)
+  (:documentation
+   "Get the NGRAM frequency in NGRAMS.")
+  (:method :around ((ngrams ngrams) (ngram string))
+    (if (> (ngrams-order ngrams) 1)
+        (freq ngrams (tokenize-ngram ngrams ngram))
+        (call-next-method))))
+
+(defgeneric prob (ngrams ngram)
+  (:documentation
+   "Get the NGRAM probability in NGRAMS.")
+  (:method ((ngrams ngrams) ngram)
+    (/ (freq ngrams ngram)
+       (ngrams-total-freq ngrams)))
+  (:method :around ((ngrams ngrams) (ngram string))
+    (if (> (ngrams-order ngrams) 1)
+        (prob ngrams (tokenize-ngram ngrams ngram))
+        (call-next-method))))
+
+(defgeneric logprob (ngrams ngram)
+  (:documentation
+   "Get the log (to base 2) of NGRAM probability in NGRAMS.")
+  (:method ((ngrams ngrams) ngram)
+    (let ((prob (prob ngrams ngram)))
+      (if (zerop prob)
+          nil
+          (* (log prob) #.(/ 1 (log 2))))))
+  (:method :around ((ngrams ngrams) (ngram string))
+    (if (> (ngrams-order ngrams) 1)
+        (logprob ngrams (tokenize-ngram ngrams ngram))
+        (call-next-method))))
+
+(defgeneric cond-prob (ngrams ngram)
+  (:documentation
+   "Get the NGRAM conditional probability in NGRAMS.
+    By conditional probability we mean the probability of occurrence
+    of the last word given the previous words.")
+  (:method :around ((ngrams ngrams) ngram)
+    (if (= 1 (ngrams-order ngrams))
+        1
+        (call-next-method)))
+  (:method :around ((ngrams ngrams) (ngram string))
+    (if (> (ngrams-order ngrams) 1)
+        (cond-prob ngrams (tokenize-ngram ngrams ngram))
+        (call-next-method))))
+
+(defgeneric cond-logprob (ngrams ngram)
+  (:documentation
+   "Get the log of NGRAM conditional probability in NGRAMS.
+    By conditional probability we mean the probability of occurrence
+    of the last word given the previous words.")
+  (:method :around ((ngrams ngrams) (ngram string))
+    (if (> (ngrams-order ngrams) 1)
+        (cond-logprob ngrams (tokenize-ngram ngrams ngram))
+        (call-next-method))))
+
+(defgeneric freqs (ngrams &rest ngrams-list)
+  (:documentation
+   "Get the list of frequencies of ngrams from NGRAMS-LIST in NGRAMS.")
+  (:method (ngrams &rest ngrams-list)
+    (mapcar #`(freq ngrams %) ngrams-list)))
+
+(defgeneric probs (ngrams &rest ngrams-list)
+  (:documentation
+   "Get the list of probabilities of ngrams from NGRAMS-LIST in NGRAMS.")
+  (:method (ngrams &rest ngrams-list)
+    (mapcar #`(prob ngrams %) ngrams-list)))
+
+(defgeneric logprobs (ngrams &rest ngrams-list)
+  (:documentation
+   "Get the list of logs of probability of ngrams from NGRAMS-LIST in NGRAMS.")
+  (:method (ngrams &rest ngrams-list)
+    (mapcar #`(logprob ngrams %) ngrams-list)))
+
+(defgeneric cond-probs (ngrams &rest ngrams-list)
+  (:documentation
+   "Get the conditional probabilities of ngrams from NGRAMS-LIST in NGRAMS.
+    By conditional probability we mean the probability of occurrence
+    of the last word given the previous words.")
+  (:method ((ngrams ngrams) &rest ngrams-list)
+    (mapcar #`(cond-prob ngrams %) ngrams-list)))
+
+(defgeneric cond-logprobs (ngrams &rest ngrams-list)
+  (:documentation
+   "Get the logs of conditional probability of ngrams from NGRAMS-LIST in NGRAMS.
+    By conditional probability we mean the probability of occurrence
+    of the last word given the previous words.")
+  (:method ((ngrams ngrams) &rest ngrams-list)
+    (mapcar #`(cond-logprob ngrams %) ngrams-list)))
+
+(defgeneric top-ngram (ngrams)
+  (:documentation
+   "Get some ngram with the highest frequency in NGRAMS."))
+
+(defgeneric hapaxes (ngrams)
+  (:documentation
+   "Get all the ngrams with the lowest frequency in NGRAMS.
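Assuming TOY-LM is bound to the order-2 Stupid Backoff model from the sketch
after language-models.lisp (a made-up binding, for illustration only), the
new language-model method can drive the classic order-2 generator:

    (generate-text <mark-v-shaney> toy-lm 10)
    ;; => e.g. ("the" "dog" "sat" "." "the" "cat" "sat" "." "the" "cat")

(<MARK-V-SHANEY> is the symbol-macro produced by DEFINE-LAZY-SINGLETON above.)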
+    Second value is the frequency itself."))
+
+
+;;; Table-based ngrams
+
+(defclass table-ngrams (ngrams)
+  ((table :initform (make-hash-table :test 'equal) :initarg :table
+          :reader ngrams-table))
+  (:documentation
+   "Ngrams with hash-table source."))
+
+(defmethod initialize-instance :after ((ngrams table-ngrams) &key)
+  (with-slots (table order max-freq min-freq total-freq) ngrams
+    (check-type table hash-table)
+    (assert (member (hash-table-test table) '(equal equalp)))
+    (with-hash-table-iterator (gen-fn table)
+      (when-it (nth-value 2 (gen-fn))
+        (setf total-freq (setf max-freq (setf min-freq it)))
+        (loop
+           (mv-bind (next? _ freq) (gen-fn)
+             (unless next? (return))
+             (incf total-freq freq)
+             (when (< freq min-freq)
+               (setf min-freq freq))
+             (when (> freq max-freq)
+               (setf max-freq freq))))))))
+
+(defmethod ngrams-count ((ngrams table-ngrams))
+  (hash-table-count (ngrams-table ngrams)))
+
+(defmethod ngrams-eq ((ngrams table-ngrams))
+  (hash-table-test (ngrams-table ngrams)))
+
+(defmethod vocab ((ngrams table-ngrams) &key order-by)
+  (with-slots (table) ngrams
+    (if order-by
+        (mapcar #'car (sort (ngrams-pairs ngrams) order-by :key #'cdr))
+        (ht-keys table))))
+
+(defmethod ngrams-pairs ((ngrams table-ngrams) &key order-by)
+  (with-slots (table) ngrams
+    (if order-by
+        (sort (ht->alist table) order-by :key #'cdr)
+        (ht->alist table))))
+
+;; (defmethod ngrams-pairs ((ngrams table-ngrams))
+;;   (let ((total 0)
+;;         (freq 0)
+;;         (eq-test (ngrams-eq ngrams))
+;;         (prefix (butlast ngram)))
+;;     (dolist (ng (vocab ngrams))
+;;       (cond ((funcall eq-test ngram ng)
+;;              (incf total (setf freq (freq ng))))
+;;             ((funcall eq-test prefix (butlast ng))
+;;              (incf total (freq ng)))))
+;;     (if (zerop total) 0
+;;         (/ freq total))))
+
+(defmethod freq ((ngrams table-ngrams) ngram)
+  (get# ngram (ngrams-table ngrams) 0))
+
+(defmethod freqs ((ngrams table-ngrams) &rest ngrams-list)
+  (mapcar #`(freq ngrams %)
+          ngrams-list))
+
+(defmethod probs ((ngrams table-ngrams) &rest ngrams-list)
+  (mapcar #`(prob ngrams %)
+          ngrams-list))
+
+(defmethod cond-prob ((ngrams table-ngrams) ngram)
+  (let ((total 0)
+        (freq 0)
+        (eq-test (ngrams-eq ngrams))
+        (prefix (butlast ngram)))
+    (maphash #`(cond ((funcall eq-test ngram %)
+                      (incf total (setf freq %%)))
+                     ((funcall eq-test prefix (butlast %))
+                      (incf total %%)))
+             (ngrams-table ngrams))
+    (if (zerop total) 0
+        (/ freq total))))
+
+(defmethod top-ngram ((ngrams table-ngrams))
+  (with-slots (table max-freq) ngrams
+    (dotable (ngram freq table)
+      (when (= max-freq freq)
+        (return ngram)))))
+
+(defmethod hapaxes ((ngrams table-ngrams))
+  (let (rez)
+    (with-slots (table min-freq) ngrams
+      (dotable (ngram freq table)
+        (when (= min-freq freq)
+          (push ngram rez)))
+      (values rez
+              min-freq))))
+
+
+;;; Helper functions
+
+(defun tokenize-ngram (ngrams str)
+  "Transform string STR to a list if necessary (depending on the order of NGRAMS)."
+  (if (> (ngrams-order ngrams) 1)
+      (tokenize str)
+      str))
diff --git a/src/generation/markov-chain.lisp b/src/generation/markov-chain.lisp
index 6180ffd..4fcd3c2 100644
--- a/src/generation/markov-chain.lisp
+++ b/src/generation/markov-chain.lisp
@@ -4,11 +4,11 @@
 (named-readtables:in-readtable rutils-readtable)
 
 
-(defgeneric generate-text (generator data length &key skip-paragraphs)
+(defgeneric generate-text (generator data length &key)
   (:documentation
    "Generate random text of LENGTH words based on some DATA
    (usually, table of transition probabilities between tokens).
-   Unless SKIP-PARAGRAPHS is set, the text may include newlines.
+   The text may not end with a period.
    "))
 
 
@@ -28,15 +28,15 @@
   "Markov chain generator of the 1st order — it is defined, because:
    - this is the general and most useful case
    - it allows to use optimized data-structures and a simpler algorithm
-   - the name is well-known
-   "))
+   - the name is well-known"))
 
-(defmethod generate-text ((generator markov-chain-generator) transitions length
-                          &key skip-paragraphs)
-  "Generate text with a markov model of some MARKOV-ORDER described by
-   table TRANSITIONS of transition probabilities between reverse prefixes
+(defmethod generate-text ((generator markov-chain-generator)
+                          (transitions hash-table)
+                          length &key skip-paragraphs &allow-other-keys)
+  "Generate text of LENGTH words with a Markov model of some MARKOV-ORDER
+   described by the table TRANSITIONS of transition probabilities between reverse prefixes
    of MARKOV-ORDER length and words.
-   May not return period at the end."
+   Unless SKIP-PARAGRAPHS is set, the text may include newlines."
   (let* ((order (markov-order generator))
          (initial-prefix (if (> order 1)
                              (cons "¶" (make-list (1- order)))
@@ -44,8 +44,7 @@
          (prefix initial-prefix)
          rez)
     (loop :for i :from 1 :to length :do
-       (let ((r (random 1.0))
-             (total 0))
+       (let ((r (random 1.0)))
          (dotable (word prob
                    (or (get# prefix transitions)
                        ;; no continuation - start anew
@@ -54,7 +53,7 @@
                  (unless (every #'period-char-p (car rez))
                    (push "." rez)
                    (incf i)))))
-           (when (> (incf total prob) r)
+           (when (<= (decf r prob) 0)
              (if (string= "¶" word)
                  (if skip-paragraphs
                      (decf i) ; don't count newline if we don't include it
@@ -64,5 +63,43 @@
                  (return)))))
     (reverse rez)))
 
+(defmethod generate-text ((generator markov-chain-generator)
+                          (model language-model)
+                          length &key &allow-other-keys)
+  "Generate text of LENGTH words with a Markov model of some MARKOV-ORDER
+   using the given language MODEL.
+   The text may not end with a period."
+  (assert (<= (markov-order generator) (lm-order model)))
+  (let* ((order (markov-order generator))
+         (vocab (vocab model))
+         (len (length vocab))
+         (ngram (list "<S>"))
+         rez)
+    (loop :for i :from 1 :to length :do
+       (when (= (length ngram) order)
+         (setf ngram (rest ngram)))
+       (let ((total 0)
+             (cond-probs (list (cons "<S>" 0))))
+         (dolist (word vocab)
+           (unless (string= "<S>" word)
+             (push (cons word
+                         (incf total (cond-prob model (append ngram
+                                                              (list word)))))
+                   cond-probs)))
+         (let ((word (car (bin-search (random total)
+                                      (make-array len
+                                                  :initial-contents cond-probs)
+                                      #'> :key #'cdr))))
+           (if (string= "</S>" word)
+               (progn (if (every #'period-char-p (car rez))
+                          (unless (= i length)
+                            (decf i)) ; just skip
+                          (push "." rez))
+                      (setf ngram (list "<S>")))
+               (setf rez (cons word rez)
+                     ngram (append ngram (list word)))))))
+    (reverse rez)))
+
+
 (define-lazy-singleton mark-v-shaney (make 'markov-chain-generator :order 2)
   "The infamous Mark V. Shaney.")
\ No newline at end of file
diff --git a/src/packages.lisp b/src/packages.lisp
index de9a2d1..008ac71 100644
--- a/src/packages.lisp
+++ b/src/packages.lisp
@@ -3,25 +3,30 @@
 (cl:defpackage #:nlp.util
   (:nicknames #:nutil)
   (:use #:common-lisp #:rutil)
-  (:export #:+newline+
+  (:export #:cl-nlp-error
+           #:not-implemented-error
+
+           #:+newline+
            #:+newline-chars+
            #:newline-char-p
            #:+white-chars+
            #:white-char-p
            #:+period-chars+
            #:period-char-p
+
+           #:*stopwords-en*
            #:ending-word-p
            #:filler
 
            #:+project-root+
            #:data-file
-           #:list-from-file
-           ;; #:alist-from-file
-           ;; #:table-from-file
+
+           #:bin-search
 
            #:define-lazy-singleton
+           #:sorted-ht-keys
            ))
 
 (cl:defpackage #:nlp.corpora
@@ -57,9 +62,39 @@
 (cl:defpackage #:nlp.core
   (:nicknames #:ncore)
   (:use #:common-lisp #:rutil #:nlp.util)
-  (:export ;; #:ngram-freq
-           ;; #:ngram-prob
-
+  (:export #:ngrams
+           #:ngrams-eq
+           #:ngrams-order
+           #:ngrams-count
+           #:ngrams-total-freq
+           #:ngrams-max-freq
+           #:ngrams-min-freq
+           #:ngrams-pairs
+           #:vocab
+           #:freq
+           #:prob
+           #:logprob
+           #:cond-prob
+           #:cond-logprob
+           #:freqs
+           #:probs
+           #:logprobs
+           #:cond-probs
+           #:cond-logprobs
+           #:top-ngram
+           #:hapaxes
+           #:table-ngrams
+           #:ngrams-table
+
+           #:language-model
+           #:lm-order
+           #:lm-ngrams
+           #:make-lm
+           #:perplexity
+           #:stupid-backoff-lm
+           #:lm-backoff
+
+           #:index-ngrams
            #:index-context-freqs
            #:index-prefix-transition-freqs
            #:index-word-transition-freqs
@@ -67,7 +102,6 @@
 
            #:tokenize
            ;; #:stream-tokenize
-
            #:tokenizer
            #:regex-word-tokenizer
            #:baseline-sentence-tokenizer
@@ -77,6 +111,8 @@
            #:doublenewline-paragraph-splitter
            #:
+
+           #:find-collocations
            ))
 
 ;; (cl:defpackage #:nlp.phonetics
@@ -95,7 +131,7 @@
 
 (cl:defpackage #:nlp.generation
   (:nicknames #:ngen)
-  (:use #:common-lisp #:rutil #:nlp.util)
+  (:use #:common-lisp #:rutil #:nlp.util #:nlp.core)
   (:export #:generate-text
 
            #:text-generator
@@ -116,48 +152,11 @@
 
 (cl:defpackage #:nlp-user
+  (:nicknames #:nlp)
   (:use #:common-lisp #:rutil
-        #:nlp.util #:nlp.corpora #:nlp.core #:nlp.generation)
-  (:export #:+newline+
-           #:+newline-chars+
-           #:+white-chars+
-           #:+period-chars+
-           #:white-char-p
-           #:period-char-p
-           #:newline-char-p
-           #:ending-word-p
-           #:filler
+        #:nlp.util #:nlp.corpora #:nlp.core #:nlp.generation))
 
-           #:corpus-name
-           #:corpus-lang
-           #:corpus-raw-texts
-           #:corpus-clean-texts
-           #:corpus-text-tokens
-           #:token
-           #:token-word
-           #:token-beg
-           #:token-end
-           #:token-tag
-
-           #:index-context-freqs
-           #:index-prefix-transition-freqs
-           #:index-word-transition-freqs
-           #:normalize-freqs
-           #:tokenize
-           #:regex-word-tokenizer
-           #:baseline-sentence-tokenizer
-           #:doublenewline-paragraph-splitter
-           #:
-           #:
-           #:
-           #:
-
-           #:generate-text
-           #:text-generator
-           #:markov-chain-generator
-           #:mark-v-shaney-generator
-           #:
-           #:markov-order
-
-           #:print-word-in-contexts
-   ))
+(re-export-symbols '#:nutil '#:nlp-user)
+(re-export-symbols '#:ncore '#:nlp-user)
+(re-export-symbols '#:ncorp '#:nlp-user)
+(re-export-symbols '#:ngen '#:nlp-user)
\ No newline at end of file
diff --git a/src/util.lisp b/src/util.lisp
index 9c97aac..2e6e74c 100644
--- a/src/util.lisp
+++ b/src/util.lisp
@@ -3,12 +3,20 @@
 (in-package #:nlp.util)
 (named-readtables:in-readtable rutils-readtable)
 
+
 ;;; Some package renaming
 
 (rename-package "CL-PPCRE" "CL-PPCRE" '("PPCRE" "RE"))
 
-;;; Character utilities
+;;; Conditions
+
+(define-condition nlp-error (simple-error) ())
+
+(define-condition not-implemented-error (simple-error) ())
+
+
+;;; Working with chars
 
 (defparameter +newline+
   (fmt "~%")
@@ -43,13 +51,20 @@
   "Produce an N-element filler string of FILL-CHAR's."
   (make-string n :initial-element fill-char))
 
+
+;;; Working with words
+
+(defparameter *stopwords-en*
+  '("!" "\"" "'" "," "-" "." ":" ";" "<" ">" "?" "a" "about" "above" "after" "again" "against" "all" "am" "an" "and" "any" "are" "aren't" "as" "at" "be" "because" "been" "before" "being" "below" "between" "both" "but" "by" "can't" "cannot" "could" "couldn't" "d" "did" "didn't" "do" "does" "doesn't" "doing" "don't" "down" "during" "each" "few" "for" "from" "further" "had" "hadn't" "has" "hasn't" "have" "haven't" "having" "he" "he'd" "he'll" "he's" "her" "here" "here's" "hers" "herself" "him" "himself" "his" "how" "how's" "i" "i'd" "i'll" "i'm" "i've" "if" "in" "into" "is" "isn't" "it" "it's" "its" "itself" "let" "let's" "ll" "me" "more" "most" "mustn't" "my" "myself" "n't" "no" "nor" "not" "of" "off" "on" "once" "only" "or" "other" "ought" "our" "ours" "ourselves" "out" "over" "own" "s" "same" "shan't" "she" "she'd" "she'll" "she's" "should" "shouldn't" "so" "some" "such" "t" "than" "that" "that's" "the" "their" "theirs" "them" "themselves" "then" "there" "there's" "these" "they" "they'd" "they'll" "they're" "they've" "this" "those" "through" "to" "too" "under" "until" "up" "very" "was" "wasn't" "we" "we'd" "we'll" "we're" "we've" "were" "weren't" "what" "what's" "when" "when's" "where" "where's" "which" "while" "who" "who's" "whom" "why" "why's" "with" "won't" "would" "wouldn't" "you" "you'd" "you'll" "you're" "you've" "your" "yours" "yourself" "yourselves")
+  "List of English stopwords.")
+
 (defun ending-word-p (word)
   "Check if string WORD is some kind of a period char or a paragraph mark."
   (or (every #'period-char-p word)
       (string= "¶" word)))
 
-;;; Utilities for working with project files
+;;; Working with project files
 
 (eval-always
   (defparameter +project-root+ (asdf:system-relative-pathname 'cl-nlp "")
@@ -68,6 +83,28 @@
       (reverse rez)))
 
 
+;;; Search
+
+(defun bin-search (val vec test-less &key (start 0) end key test)
+  "Binary search for VAL in the sorted vector VEC (the order property
+   isn't checked); the sort order is defined by the TEST-LESS predicate.
+   Handles START, END, KEY as usual.
+   If TEST is provided, the value at the found position is additionally
+   checked against VAL with it, and NIL is returned if this check fails."
+  (let ((low start)
+        (high (or end (1- (length vec)))))
+    (do ()
+        ((= low high) (when (or (null test)
+                                (funcall test val (svref vec high)))
+                        (elt vec high)))
+      (let ((mid (floor (+ low high) 2)))
+        (if (funcall test-less (if key
+                                   (funcall key (svref vec mid))
+                                   (svref vec mid))
+                     val)
+            (setf low (1+ mid))
+            (setf high mid))))))
+
+
 ;;; Misc
 
 (defmacro define-lazy-singleton (name init &optional docstring)
@@ -80,4 +117,15 @@
        ,docstring
        (or ,singleton (setf ,singleton ,init)))
-     (define-symbol-macro ,(mksym name :format "<~A>") (,name)))))
\ No newline at end of file
+     (define-symbol-macro ,(mksym name :format "<~A>") (,name)))))
+
+(defun sorted-ht-keys (test ht)
+  "Return hash-table keys of HT in sorted order according to TEST."
+  (sort (ht-keys ht) test :key #`(get# % ht)))
+
+(defun shorter? (list n)
+  "Test if LIST has fewer than N elements."
+  (let ((tail list))
+    (loop :repeat (1- n) :do (setf tail (cdr tail)))
+    (null tail)))
+
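For instance, on an ascending vector (so TEST-LESS is #'<):

    (bin-search 30 #(10 20 30 40 50) #'<)            ; => 30
    (bin-search 35 #(10 20 30 40 50) #'< :test #'=)  ; => NIL, 35 is absent

In GENERATE-TEXT above it is used with #'> and :KEY #'CDR to sample a word
from a vector of descending cumulative probabilities.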