Merge branch 'master' of github.com:nltk/nltk
Showing 1 changed file with 48 additions and 27 deletions.
@@ -2,7 +2,7 @@
 #
 # Copyright (C) 2001-2012 NLTK Project
 # Authors: Steven Bird <[email protected]>
-#          Daniel Blanchard <[email protected]>
+#          Daniel Blanchard <[email protected]>
 # URL: <http://www.nltk.org/>
 # For license information, see LICENSE.TXT
 
@@ -30,31 +30,40 @@ class NgramModel(ModelI):
     """
 
     # add cutoff
-    def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
+    def __init__(self, n, train, pad_left=True, pad_right=False,
+                 estimator=None, *estimator_args, **estimator_kwargs):
         """
-        Creates an ngram language model to capture patterns in n consecutive
+        Create an ngram language model to capture patterns in n consecutive
         words of training text. An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.
        >>> from nltk.corpus import brown
        >>> from nltk.probability import LidstoneProbDist
-        >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
-        >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
+        >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
+        >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
        >>> lm
        <NgramModel with 91603 3-grams>
        >>> lm._backoff
        <NgramModel with 62888 2-grams>
        >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
        ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
        ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
        ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
        ... # doctest: +ELLIPSIS
-        1.682...
+        0.5776...
        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
-        :type train: list of string
+        :type train: list(str) or list(list(str))
+        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
+        :type pad_left: bool
+        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
+        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
-            returns a ConditionalProbDist
+            returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
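For orientation, here is a minimal usage sketch of the constructor as revised in this hunk. It mirrors the doctest above; brown.sents, the Lidstone parameter 0.2, and the import of NgramModel from nltk.model are assumptions about the NLTK-2.x-era API, not part of the commit.

    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist
    from nltk.model import NgramModel   # assumed import path for this era of NLTK

    # Lidstone-smoothed estimator, as in the doctest above
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

    # pad_left/pad_right are the flags added by this commit; their defaults
    # (True, False) pad each sentence with n-1 empty strings on the left only.
    lm = NgramModel(3, brown.sents(categories='news'),
                    pad_left=True, pad_right=False, estimator=est)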
@@ -63,33 +72,46 @@ def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_arg
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
-        :param estimator_kw_args: Extra keyword arguments for estimator.
-        :type estimator_kw_args: (any)
+        :param estimator_kwargs: Extra keyword arguments for the estimator
+        :type estimator_kwargs: (any)
        """
 
+        # protection from cryptic behavior for calling programs
+        # that use the pre-2.0.2 interface
+        assert(isinstance(pad_left, bool))
+        assert(isinstance(pad_right, bool))
+
         self._n = n
+        self._lpad = ('',) * (n - 1) if pad_left else ()
+        self._rpad = ('',) * (n - 1) if pad_right else ()
 
         if estimator is None:
             estimator = _estimator
 
         cfd = ConditionalFreqDist()
         self._ngrams = set()
-        self._prefix = ('',) * (n - 1)
 
-        for ngram in ingrams(chain(self._prefix, train), n):
-            self._ngrams.add(ngram)
-            context = tuple(ngram[:-1])
-            token = ngram[-1]
-            cfd[context].inc(token)
-
-        if (not estimator_args) and (not estimator_kw_args):
+        # If given a list of strings instead of a list of lists, create enclosing list
+        if (train is not None) and isinstance(train[0], basestring):
+            train = [train]
+
+        for sent in train:
+            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
+                self._ngrams.add(ngram)
+                context = tuple(ngram[:-1])
+                token = ngram[-1]
+                cfd[context].inc(token)
+
+        if not estimator_args and not estimator_kwargs:
             self._model = ConditionalProbDist(cfd, estimator, len(cfd))
         else:
-            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kw_args)
+            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)
 
         # recursively construct the lower-order models
         if n > 1:
-            self._backoff = NgramModel(n-1, train, estimator, *estimator_args, **estimator_kw_args)
+            self._backoff = NgramModel(n-1, train, pad_left, pad_right,
+                                       estimator, *estimator_args, **estimator_kwargs)
 
     def prob(self, word, context):
         """
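To see what the rewritten training loop counts, here is a small self-contained sketch in plain Python (no NLTK; count_ngrams is a hypothetical stand-in for the ConditionalFreqDist bookkeeping above):

    from itertools import chain
    from collections import defaultdict

    def count_ngrams(sents, n, pad_left=True, pad_right=False):
        # mirrors self._lpad / self._rpad from the constructor above
        lpad = ('',) * (n - 1) if pad_left else ()
        rpad = ('',) * (n - 1) if pad_right else ()
        counts = defaultdict(lambda: defaultdict(int))    # context -> token -> count
        for sent in sents:
            words = list(chain(lpad, sent, rpad))
            for i in range(len(words) - n + 1):           # the ngrams ingrams() would yield
                ngram = tuple(words[i:i + n])
                context, token = ngram[:-1], ngram[-1]
                counts[context][token] += 1
        return counts

    counts = count_ngrams([['the', 'cat', 'sat']], n=3)
    # ('', '')       -> {'the': 1}
    # ('', 'the')    -> {'cat': 1}
    # ('the', 'cat') -> {'sat': 1}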
@@ -138,8 +160,8 @@ def choose_random_word(self, context):
 
         return self.generate(1, context)[-1]
 
-    # NB, this will always start with same word since model
-    # is trained on a single text
+    # NB, this will always start with same word if the model
+    # was trained on a single text
     def generate(self, num_words, context=()):
         '''
         Generate random text based on the language model.
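A brief usage note for generate(), whose signature is unchanged by this hunk; lm is the hypothetical trigram model from the earlier sketch:

    words = lm.generate(20)                           # 20 words, starting from the empty (padded) context
    more = lm.generate(5, context=('the', 'jury'))    # seed generation with an explicit context
    # As the reworded comment notes, with the default empty context the first word
    # tends to repeat when the model was trained on a single text.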
@@ -156,7 +178,7 @@ def generate(self, num_words, context=()):
         return text
 
     def _generate_one(self, context):
-        context = (self._prefix + tuple(context))[-self._n+1:]
+        context = (self._lpad + tuple(context))[-self._n+1:]
         # print "Context (%d): <%s>" % (self._n, ','.join(context))
         if context in self:
             return self[context].generate()
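The slice in _generate_one keeps only the last n-1 items of the left-padded context; a tiny standalone illustration for n = 3:

    n = 3
    lpad = ('',) * (n - 1)                            # ('', ''), same shape as self._lpad

    short = (lpad + ('the',))[-n + 1:]                # ('', 'the'): short contexts stay left-padded
    full = (lpad + ('the', 'cat', 'sat'))[-n + 1:]    # ('cat', 'sat'): longer contexts are trimmed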
@@ -176,9 +198,8 @@ def entropy(self, text):
         """
 
         e = 0.0
-        # Add prefix to front to correctly handle first n-1 words
-        text = list(self._prefix) + text
-        for i in range(len(text)):
+        text = list(self._lpad) + text + list(self._rpad)
+        for i in range(self._n-1, len(text)):
             context = tuple(text[i-self._n+1:i])
             token = text[i]
             e += self.logprob(token, context)
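A compact sketch of what the revised loop accumulates: pad the word list on both sides, then sum the model's log-probabilities from position n-1 onward. The logprob, lpad and rpad arguments here stand in for the model's own; how e is turned into a per-word figure happens outside this hunk.

    def entropy_sum(logprob, text, n, lpad, rpad):
        # mirrors the padded loop in the hunk above
        padded = list(lpad) + list(text) + list(rpad)
        e = 0.0
        for i in range(n - 1, len(padded)):
            context = tuple(padded[i - n + 1:i])   # the n-1 preceding (possibly empty) words
            token = padded[i]
            e += logprob(token, context)           # model's log-probability for this position
        return e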