
Commit

Merge branch 'master' of github.com:nltk/nltk
stevenbird committed Jul 5, 2012
2 parents 2cded6f + 2584497 · commit 662d192
Showing 1 changed file with 48 additions and 27 deletions.
75 changes: 48 additions & 27 deletions nltk/model/ngram.py
@@ -2,7 +2,7 @@
#
# Copyright (C) 2001-2012 NLTK Project
# Authors: Steven Bird <[email protected]>
- # Daniel Blanchard <[email protected]>
+ # Daniel Blanchard <[email protected]>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

@@ -30,31 +30,40 @@ class NgramModel(ModelI):
"""

# add cutoff
- def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
+ def __init__(self, n, train, pad_left=True, pad_right=False,
+              estimator=None, *estimator_args, **estimator_kwargs):
"""
- Creates an ngram language model to capture patterns in n consecutive
+ Create an ngram language model to capture patterns in n consecutive
words of training text. An estimator smooths the probabilities derived
from the text and may allow generation of ngrams not seen during
training.
>>> from nltk.corpus import brown
>>> from nltk.probability import LidstoneProbDist
- >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
- >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
+ >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
+ >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
>>> lm
<NgramModel with 91603 3-grams>
>>> lm._backoff
<NgramModel with 62888 2-grams>
>>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
... 'primary', 'election', 'produced', '``', 'no', 'evidence',
... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
... # doctest: +ELLIPSIS
- 1.682...
+ 0.5776...
:param n: the order of the language model (ngram size)
:type n: int
:param train: the training text
- :type train: list of string
+ :type train: list(str) or list(list(str))
+ :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
+ :type pad_left: bool
+ :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
+ :type pad_right: bool
:param estimator: a function for generating a probability distribution
:type estimator: a function that takes a ConditionalFreqDist and
- returns a ConditionalProbDist
+ returns a ConditionalProbDist
:param estimator_args: Extra arguments for estimator.
These arguments are usually used to specify extra
properties for the probability distributions of individual
@@ -63,33 +72,46 @@ def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_arg
number of bins in the underlying ConditionalFreqDist are passed to
the estimator as an argument.
:type estimator_args: (any)
- :param estimator_kw_args: Extra keyword arguments for estimator.
- :type estimator_kw_args: (any)
+ :param estimator_kwargs: Extra keyword arguments for the estimator
+ :type estimator_kwargs: (any)
"""

+ # protection from cryptic behavior for calling programs
+ # that use the pre-2.0.2 interface
+ assert(isinstance(pad_left, bool))
+ assert(isinstance(pad_right, bool))

self._n = n
+ self._lpad = ('',) * (n - 1) if pad_left else ()
+ self._rpad = ('',) * (n - 1) if pad_right else ()

if estimator is None:
estimator = _estimator

cfd = ConditionalFreqDist()
self._ngrams = set()
- self._prefix = ('',) * (n - 1)

- for ngram in ingrams(chain(self._prefix, train), n):
-     self._ngrams.add(ngram)
-     context = tuple(ngram[:-1])
-     token = ngram[-1]
-     cfd[context].inc(token)

- if (not estimator_args) and (not estimator_kw_args):

+ # If given a list of strings instead of a list of lists, create enclosing list
+ if (train is not None) and isinstance(train[0], basestring):
+     train = [train]

+ for sent in train:
+     for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
+         self._ngrams.add(ngram)
+         context = tuple(ngram[:-1])
+         token = ngram[-1]
+         cfd[context].inc(token)

+ if not estimator_args and not estimator_kwargs:
self._model = ConditionalProbDist(cfd, estimator, len(cfd))
else:
- self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kw_args)
+ self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

# recursively construct the lower-order models
if n > 1:
- self._backoff = NgramModel(n-1, train, estimator, *estimator_args, **estimator_kw_args)
+ self._backoff = NgramModel(n-1, train, pad_left, pad_right,
+                            estimator, *estimator_args, **estimator_kwargs)

def prob(self, word, context):
"""
@@ -138,8 +160,8 @@ def choose_random_word(self, context):

return self.generate(1, context)[-1]

- # NB, this will always start with same word since model
- # is trained on a single text
+ # NB, this will always start with the same word if the model
+ # was trained on a single text
def generate(self, num_words, context=()):
'''
Generate random text based on the language model.
@@ -156,7 +178,7 @@ def generate(self, num_words, context=()):
return text

def _generate_one(self, context):
- context = (self._prefix + tuple(context))[-self._n+1:]
+ context = (self._lpad + tuple(context))[-self._n+1:]
# print "Context (%d): <%s>" % (self._n, ','.join(context))
if context in self:
return self[context].generate()
@@ -176,9 +198,8 @@ def entropy(self, text):
"""

e = 0.0
- # Add prefix to front to correctly handle first n-1 words
- text = list(self._prefix) + text
- for i in range(len(text)):
+ text = list(self._lpad) + text + list(self._rpad)
+ for i in range(self._n-1, len(text)):
context = tuple(text[i-self._n+1:i])
token = text[i]
e += self.logprob(token, context)
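For a concrete sense of what the new pad_left/pad_right options do, here is a minimal sketch in plain Python (no NLTK needed) that mirrors the patched training loop above; the example sentence and the model order n=3 are illustrative assumptions, not values taken from the commit:

from itertools import chain

def padded_ngrams(sent, n, pad_left=True, pad_right=False):
    # Mirror the patched loop: pad each sentence with (n-1) empty strings
    # on the requested side(s), then slide a window of size n over it.
    lpad = ('',) * (n - 1) if pad_left else ()
    rpad = ('',) * (n - 1) if pad_right else ()
    words = list(chain(lpad, sent, rpad))
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

print(padded_ngrams(['the', 'dog', 'barked'], 3))
# [('', '', 'the'), ('', 'the', 'dog'), ('the', 'dog', 'barked')]

With pad_left=True, every sentence contributes ngrams whose contexts are partly or wholly empty strings, which is what lets the model condition on a sentence-initial position.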

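A hedged usage sketch of the updated constructor, assuming NLTK 2.x with this commit applied; the corpus category and the Lidstone smoothing value are simply the ones used in the doctest above:

from nltk.corpus import brown
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel

est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

# A flat list of words is still accepted: it is wrapped in an enclosing
# list internally and treated as a single long sentence.
lm_flat = NgramModel(3, brown.words(categories='news'), estimator=est)

# A list of tokenized sentences now works as well; each sentence is
# padded and counted separately.
lm_sents = NgramModel(3, brown.sents(categories='news'),
                      pad_left=True, pad_right=False, estimator=est)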
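Finally, a small sketch of why the entropy loop now starts at self._n-1: after left-padding, positions 0 through n-2 of the padded text are the pad itself, so (with the default pad_right=False) only the real words are scored, each with a full (n-1)-word context. Plain Python, with an illustrative 3-word text:

n = 3
lpad = ('',) * (n - 1)
text = list(lpad) + ['the', 'dog', 'barked']
for i in range(n - 1, len(text)):        # i = 2, 3, 4
    context = tuple(text[i - n + 1:i])   # ('', ''), ('', 'the'), ('the', 'dog')
    token = text[i]                      # 'the', 'dog', 'barked'
    print(context, token)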