
Commit

Merge branch 'master' of github.com:nltk/nltk
stevenbird committed Jul 5, 2012
2 parents 2cded6f + 2584497 · commit 662d192
Showing 1 changed file with 48 additions and 27 deletions.
75 changes: 48 additions & 27 deletions nltk/model/ngram.py
@@ -2,7 +2,7 @@
#
# Copyright (C) 2001-2012 NLTK Project
# Authors: Steven Bird <[email protected]>
- # Daniel Blanchard <[email protected]>
+ # Daniel Blanchard <[email protected]>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

@@ -30,31 +30,40 @@ class NgramModel(ModelI):
"""

# add cutoff
- def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
+ def __init__(self, n, train, pad_left=True, pad_right=False,
+              estimator=None, *estimator_args, **estimator_kwargs):
"""
- Creates an ngram language model to capture patterns in n consecutive
+ Create an ngram language model to capture patterns in n consecutive
words of training text. An estimator smooths the probabilities derived
from the text and may allow generation of ngrams not seen during
training.
>>> from nltk.corpus import brown
>>> from nltk.probability import LidstoneProbDist
- >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
- >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
+ >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
+ >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
>>> lm
<NgramModel with 91603 3-grams>
>>> lm._backoff
<NgramModel with 62888 2-grams>
>>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
... 'primary', 'election', 'produced', '``', 'no', 'evidence',
... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
... # doctest: +ELLIPSIS
- 1.682...
+ 0.5776...
:param n: the order of the language model (ngram size)
:type n: int
:param train: the training text
- :type train: list of string
+ :type train: list(str) or list(list(str))
+ :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
+ :type pad_left: bool
+ :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
+ :type pad_right: bool
:param estimator: a function for generating a probability distribution
:type estimator: a function that takes a ConditionalFreqDist and
- returns a ConditionalProbDist
+ returns a ConditionalProbDist
:param estimator_args: Extra arguments for estimator.
These arguments are usually used to specify extra
properties for the probability distributions of individual
@@ -63,33 +72,46 @@ def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_arg
number of bins in the underlying ConditionalFreqDist are passed to
the estimator as an argument.
:type estimator_args: (any)
- :param estimator_kw_args: Extra keyword arguments for estimator.
- :type estimator_kw_args: (any)
+ :param estimator_kwargs: Extra keyword arguments for the estimator
+ :type estimator_kwargs: (any)
"""

+ # protection from cryptic behavior for calling programs
+ # that use the pre-2.0.2 interface
+ assert(isinstance(pad_left, bool))
+ assert(isinstance(pad_right, bool))

self._n = n
+ self._lpad = ('',) * (n - 1) if pad_left else ()
+ self._rpad = ('',) * (n - 1) if pad_right else ()

if estimator is None:
estimator = _estimator

cfd = ConditionalFreqDist()
self._ngrams = set()
- self._prefix = ('',) * (n - 1)

- for ngram in ingrams(chain(self._prefix, train), n):
-     self._ngrams.add(ngram)
-     context = tuple(ngram[:-1])
-     token = ngram[-1]
-     cfd[context].inc(token)

- if (not estimator_args) and (not estimator_kw_args):

+ # If given a list of strings instead of a list of lists, create enclosing list
+ if (train is not None) and isinstance(train[0], basestring):
+     train = [train]

+ for sent in train:
+     for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
+         self._ngrams.add(ngram)
+         context = tuple(ngram[:-1])
+         token = ngram[-1]
+         cfd[context].inc(token)

+ if not estimator_args and not estimator_kwargs:
self._model = ConditionalProbDist(cfd, estimator, len(cfd))
else:
- self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kw_args)
+ self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

# recursively construct the lower-order models
if n > 1:
- self._backoff = NgramModel(n-1, train, estimator, *estimator_args, **estimator_kw_args)
+ self._backoff = NgramModel(n-1, train, pad_left, pad_right,
+                            estimator, *estimator_args, **estimator_kwargs)

def prob(self, word, context):
"""
@@ -138,8 +160,8 @@ def choose_random_word(self, context):

return self.generate(1, context)[-1]

- # NB, this will always start with same word since model
- # is trained on a single text
+ # NB, this will always start with the same word if the model
+ # was trained on a single text
def generate(self, num_words, context=()):
'''
Generate random text based on the language model.
@@ -156,7 +178,7 @@ def generate(self, num_words, context=()):
return text

def _generate_one(self, context):
- context = (self._prefix + tuple(context))[-self._n+1:]
+ context = (self._lpad + tuple(context))[-self._n+1:]
# print "Context (%d): <%s>" % (self._n, ','.join(context))
if context in self:
return self[context].generate()
@@ -176,9 +198,8 @@ def entropy(self, text):
"""

e = 0.0
- # Add prefix to front to correctly handle first n-1 words
- text = list(self._prefix) + text
- for i in range(len(text)):
+ text = list(self._lpad) + text + list(self._rpad)
+ for i in range(self._n-1, len(text)):
context = tuple(text[i-self._n+1:i])
token = text[i]
e += self.logprob(token, context)
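For a concrete sense of what the new pad_left/pad_right options do, here is a minimal sketch in plain Python (no NLTK needed) that mirrors the patched training loop above; the example sentence and the model order n=3 are illustrative assumptions, not values taken from the commit:

from itertools import chain

def padded_ngrams(sent, n, pad_left=True, pad_right=False):
    # Mirror the patched loop: pad each sentence with (n-1) empty strings
    # on the requested side(s), then slide a window of size n over it.
    lpad = ('',) * (n - 1) if pad_left else ()
    rpad = ('',) * (n - 1) if pad_right else ()
    words = list(chain(lpad, sent, rpad))
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

print(padded_ngrams(['the', 'dog', 'barked'], 3))
# [('', '', 'the'), ('', 'the', 'dog'), ('the', 'dog', 'barked')]

With pad_left=True, every sentence contributes ngrams whose contexts are partly or wholly empty strings, which is what lets the model condition on a sentence-initial position.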

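A hedged usage sketch of the updated constructor, assuming NLTK 2.x with this commit applied; the corpus category and the Lidstone smoothing value are simply the ones used in the doctest above:

from nltk.corpus import brown
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel

est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

# A flat list of words is still accepted: it is wrapped in an enclosing
# list internally and treated as a single long sentence.
lm_flat = NgramModel(3, brown.words(categories='news'), estimator=est)

# A list of tokenized sentences now works as well; each sentence is
# padded and counted separately.
lm_sents = NgramModel(3, brown.sents(categories='news'),
                      pad_left=True, pad_right=False, estimator=est)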
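Finally, a small sketch of why the entropy loop now starts at self._n-1: after left-padding, positions 0 through n-2 of the padded text are the pad itself, so (with the default pad_right=False) only the real words are scored, each with a full (n-1)-word context. Plain Python, with an illustrative 3-word text:

n = 3
lpad = ('',) * (n - 1)
text = list(lpad) + ['the', 'dog', 'barked']
for i in range(n - 1, len(text)):        # i = 2, 3, 4
    context = tuple(text[i - n + 1:i])   # ('', ''), ('', 'the'), ('the', 'dog')
    token = text[i]                      # 'the', 'dog', 'barked'
    print(context, token)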