reworked imports and doctests for stem package
stevenbird committed Nov 10, 2011
1 parent 2d4d905 commit e1c800a
Showing 9 changed files with 576 additions and 536 deletions.
33 changes: 15 additions & 18 deletions nltk/stem/__init__.py
@@ -8,32 +8,29 @@
# For license information, see LICENSE.TXT

"""
NLTK Stemmers
Interfaces used to remove morphological affixes from words, leaving
only the word stem. Stemming algorithms aim to remove those affixes
required for eg. grammatical role, tense, derivational morphology
leaving only the stem of the word. This is a difficult problem due to
irregular words (eg. common verbs in English), complicated
morphological rules, and part-of-speech and sense ambiguities
(eg. C{ceil-} is not the stem of C{ceiling}).
(eg. ``ceil-`` is not the stem of ``ceiling``).
C{StemmerI} defines a standard interface for stemmers.
StemmerI defines a standard interface for stemmers.
"""

from api import *
from regexp import *
from lancaster import *
from isri import *
from snowball import *
from wordnet import *
from rslp import *

__all__ = [
# Stemmer interface
'StemmerI',
from nltk.stem.api import StemmerI
from nltk.stem.regexp import RegexpStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.isri import ISRIStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.rslp import RSLPStemmer

# Stemmers
'RegexpStemmer', 'PorterStemmer', 'LancasterStemmer',
'RSLPStemmer', 'WordNetLemmatizer',
'ISRIStemmer', 'SnowballStemmer'
]

if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
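
With the absolute imports above, the stemmers can be pulled straight from the
package. A minimal usage sketch (assuming an NLTK checkout with this commit
applied; the 'maximum' result mirrors the new Lancaster doctests further down,
and the Porter output is standard behaviour rather than anything recorded in
this diff):

    >>> from nltk.stem import PorterStemmer, LancasterStemmer
    >>> PorterStemmer().stem('running')
    'run'
    >>> LancasterStemmer().stem('maximum')
    'maxim'
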
10 changes: 7 additions & 3 deletions nltk/stem/api.py
@@ -10,15 +10,19 @@
class StemmerI(object):
"""
A processing interface for removing morphological affixes from
words. This process is known as X{stemming}.
words. This process is known as stemming.
"""
def stem(self, token):
"""
Strip affixes from the token and return the stem.
@param token: The token that should be stemmed.
@type token: C{str}
:param token: The token that should be stemmed.
:type token: str
"""
raise NotImplementedError()


if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
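
StemmerI itself only declares stem() and raises NotImplementedError, so a
concrete stemmer just overrides that one method. A toy illustration (the
SuffixStripper class is hypothetical, not part of NLTK):

    >>> from nltk.stem.api import StemmerI
    >>> class SuffixStripper(StemmerI):
    ...     """Hypothetical stemmer that drops a trailing 's'."""
    ...     def stem(self, token):
    ...         return token[:-1] if token.endswith('s') else token
    >>> SuffixStripper().stem('cats')
    'cat'
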
96 changes: 84 additions & 12 deletions nltk/stem/isri.py
@@ -8,7 +8,8 @@
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""ISRI Arabic Stemmer
"""
ISRI Arabic Stemmer
The algorithm for this stemmer is described in:
@@ -29,7 +30,8 @@
"""
import re
from api import *

from api import StemmerI

class ISRIStemmer(StemmerI):
'''
@@ -49,22 +51,87 @@ class ISRIStemmer(StemmerI):
def __init__(self):
self.stm = 'defult none'

self.p3 = [u'\u0643\u0627\u0644', u'\u0628\u0627\u0644', u'\u0648\u0644\u0644', u'\u0648\u0627\u0644'] # length three prefixes
self.p3 = [u'\u0643\u0627\u0644', u'\u0628\u0627\u0644',
u'\u0648\u0644\u0644', u'\u0648\u0627\u0644'] # length three prefixes
self.p2 = [u'\u0627\u0644', u'\u0644\u0644'] # length two prefixes
self.p1 = [u'\u0644', u'\u0628', u'\u0641', u'\u0633', u'\u0648', u'\u064a', u'\u062a', u'\u0646', u'\u0627'] # length one prefixes

self.s3 = [u'\u062a\u0645\u0644', u'\u0647\u0645\u0644', u'\u062a\u0627\u0646', u'\u062a\u064a\u0646', u'\u0643\u0645\u0644'] # length three suffixes
self.s2 = [u'\u0648\u0646', u'\u0627\u062a', u'\u0627\u0646', u'\u064a\u0646', u'\u062a\u0646', u'\u0643\u0645', u'\u0647\u0646', u'\u0646\u0627', u'\u064a\u0627', u'\u0647\u0627', u'\u062a\u0645', u'\u0643\u0646', u'\u0646\u064a', u'\u0648\u0627', u'\u0645\u0627', u'\u0647\u0645'] # length two suffixes
self.s1 = [u'\u0629', u'\u0647', u'\u064a', u'\u0643', u'\u062a', u'\u0627', u'\u0646'] # length one suffixes

self.pr4 = {0:[u'\u0645'], 1:[u'\u0627'], 2:[u'\u0627', u'\u0648', u'\u064A'], 3:[u'\u0629']} # groups of length four patterns
self.pr53 = {0:[u'\u0627', u'\u062a'], 1:[u'\u0627', u'\u064a', u'\u0648'], 2:[u'\u0627', u'\u062a', u'\u0645'], 3:[u'\u0645', u'\u064a', u'\u062a'], 4:[u'\u0645', u'\u062a'], 5:[u'\u0627', u'\u0648'], 6:[u'\u0627', u'\u0645']} # Groups of length five patterns and length three roots
self.p1 = [u'\u0644', u'\u0628', u'\u0641', u'\u0633', u'\u0648',
u'\u064a', u'\u062a', u'\u0646', u'\u0627'] # length one prefixes

self.s3 = [u'\u062a\u0645\u0644', u'\u0647\u0645\u0644',
u'\u062a\u0627\u0646', u'\u062a\u064a\u0646',
u'\u0643\u0645\u0644'] # length three suffixes
self.s2 = [u'\u0648\u0646', u'\u0627\u062a', u'\u0627\u0646',
u'\u064a\u0646', u'\u062a\u0646', u'\u0643\u0645',
u'\u0647\u0646', u'\u0646\u0627', u'\u064a\u0627',
u'\u0647\u0627', u'\u062a\u0645', u'\u0643\u0646',
u'\u0646\u064a', u'\u0648\u0627', u'\u0645\u0627',
u'\u0647\u0645'] # length two suffixes
self.s1 = [u'\u0629', u'\u0647', u'\u064a', u'\u0643', u'\u062a',
u'\u0627', u'\u0646'] # length one suffixes

self.pr4 = {0: [u'\u0645'], 1:[u'\u0627'],
2: [u'\u0627', u'\u0648', u'\u064A'], 3:[u'\u0629']} # groups of length four patterns
self.pr53 = {0: [u'\u0627', u'\u062a'],
1: [u'\u0627', u'\u064a', u'\u0648'],
2: [u'\u0627', u'\u062a', u'\u0645'],
3: [u'\u0645', u'\u064a', u'\u062a'],
4: [u'\u0645', u'\u062a'],
5: [u'\u0627', u'\u0648'],
6: [u'\u0627', u'\u0645']} # Groups of length five patterns and length three roots

self.re_short_vowels = re.compile(ur'[\u064B-\u0652]')
self.re_hamza = re.compile(ur'[\u0621\u0624\u0626]')
self.re_intial_hamza = re.compile(ur'^[\u0622\u0623\u0625]')

self.stop_words = [u'\u064a\u0643\u0648\u0646', u'\u0648\u0644\u064a\u0633', u'\u0648\u0643\u0627\u0646', u'\u0643\u0630\u0644\u0643', u'\u0627\u0644\u062a\u064a', u'\u0648\u0628\u064a\u0646', u'\u0639\u0644\u064a\u0647\u0627', u'\u0645\u0633\u0627\u0621', u'\u0627\u0644\u0630\u064a', u'\u0648\u0643\u0627\u0646\u062a', u'\u0648\u0644\u0643\u0646', u'\u0648\u0627\u0644\u062a\u064a', u'\u062a\u0643\u0648\u0646', u'\u0627\u0644\u064a\u0648\u0645', u'\u0627\u0644\u0644\u0630\u064a\u0646', u'\u0639\u0644\u064a\u0647', u'\u0643\u0627\u0646\u062a', u'\u0644\u0630\u0644\u0643', u'\u0623\u0645\u0627\u0645', u'\u0647\u0646\u0627\u0643', u'\u0645\u0646\u0647\u0627', u'\u0645\u0627\u0632\u0627\u0644', u'\u0644\u0627\u0632\u0627\u0644', u'\u0644\u0627\u064a\u0632\u0627\u0644', u'\u0645\u0627\u064a\u0632\u0627\u0644', u'\u0627\u0635\u0628\u062d', u'\u0623\u0635\u0628\u062d', u'\u0623\u0645\u0633\u0649', u'\u0627\u0645\u0633\u0649', u'\u0623\u0636\u062d\u0649', u'\u0627\u0636\u062d\u0649', u'\u0645\u0627\u0628\u0631\u062d', u'\u0645\u0627\u0641\u062a\u0626', u'\u0645\u0627\u0627\u0646\u0641\u0643', u'\u0644\u0627\u0633\u064a\u0645\u0627', u'\u0648\u0644\u0627\u064a\u0632\u0627\u0644', u'\u0627\u0644\u062d\u0627\u0644\u064a', u'\u0627\u0644\u064a\u0647\u0627', u'\u0627\u0644\u0630\u064a\u0646', u'\u0641\u0627\u0646\u0647', u'\u0648\u0627\u0644\u0630\u064a', u'\u0648\u0647\u0630\u0627', u'\u0644\u0647\u0630\u0627', u'\u0641\u0643\u0627\u0646', u'\u0633\u062a\u0643\u0648\u0646', u'\u0627\u0644\u064a\u0647', u'\u064a\u0645\u0643\u0646', u'\u0628\u0647\u0630\u0627', u'\u0627\u0644\u0630\u0649']
self.stop_words = [u'\u064a\u0643\u0648\u0646',
u'\u0648\u0644\u064a\u0633',
u'\u0648\u0643\u0627\u0646',
u'\u0643\u0630\u0644\u0643',
u'\u0627\u0644\u062a\u064a',
u'\u0648\u0628\u064a\u0646',
u'\u0639\u0644\u064a\u0647\u0627',
u'\u0645\u0633\u0627\u0621',
u'\u0627\u0644\u0630\u064a',
u'\u0648\u0643\u0627\u0646\u062a',
u'\u0648\u0644\u0643\u0646',
u'\u0648\u0627\u0644\u062a\u064a',
u'\u062a\u0643\u0648\u0646',
u'\u0627\u0644\u064a\u0648\u0645',
u'\u0627\u0644\u0644\u0630\u064a\u0646',
u'\u0639\u0644\u064a\u0647',
u'\u0643\u0627\u0646\u062a',
u'\u0644\u0630\u0644\u0643',
u'\u0623\u0645\u0627\u0645',
u'\u0647\u0646\u0627\u0643',
u'\u0645\u0646\u0647\u0627',
u'\u0645\u0627\u0632\u0627\u0644',
u'\u0644\u0627\u0632\u0627\u0644',
u'\u0644\u0627\u064a\u0632\u0627\u0644',
u'\u0645\u0627\u064a\u0632\u0627\u0644',
u'\u0627\u0635\u0628\u062d',
u'\u0623\u0635\u0628\u062d',
u'\u0623\u0645\u0633\u0649',
u'\u0627\u0645\u0633\u0649',
u'\u0623\u0636\u062d\u0649',
u'\u0627\u0636\u062d\u0649',
u'\u0645\u0627\u0628\u0631\u062d',
u'\u0645\u0627\u0641\u062a\u0626',
u'\u0645\u0627\u0627\u0646\u0641\u0643',
u'\u0644\u0627\u0633\u064a\u0645\u0627',
u'\u0648\u0644\u0627\u064a\u0632\u0627\u0644',
u'\u0627\u0644\u062d\u0627\u0644\u064a',
u'\u0627\u0644\u064a\u0647\u0627',
u'\u0627\u0644\u0630\u064a\u0646',
u'\u0641\u0627\u0646\u0647',
u'\u0648\u0627\u0644\u0630\u064a',
u'\u0648\u0647\u0630\u0627',
u'\u0644\u0647\u0630\u0627',
u'\u0641\u0643\u0627\u0646',
u'\u0633\u062a\u0643\u0648\u0646',
u'\u0627\u0644\u064a\u0647',
u'\u064a\u0645\u0643\u0646',
u'\u0628\u0647\u0630\u0627',
u'\u0627\u0644\u0630\u0649']


def stem(self, token):
@@ -308,3 +375,8 @@ def pre1(self):
if self.stm.startswith(sp1):
self.stm = self.stm[1:]
return self.stm


if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
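
For orientation, a short usage sketch of the Arabic stemmer. The comment about
the expected root is an assumption inferred from the prefix/suffix tables
above, not an output recorded in this commit:

    from nltk.stem.isri import ISRIStemmer

    st = ISRIStemmer()
    root = st.stem(u'\u0627\u0644\u0643\u062a\u0627\u0628')  # Arabic "the book"
    # The length-two prefix table (self.p2) strips the definite article, and
    # the pattern tables reduce the remainder towards its triliteral root,
    # roughly u'\u0643\u062a\u0628' (k-t-b).
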
62 changes: 32 additions & 30 deletions nltk/stem/lancaster.py
@@ -11,9 +11,37 @@
"""

import re
from api import *

from api import StemmerI

class LancasterStemmer(StemmerI):
"""
Lancaster Stemmer
>>> st = LancasterStemmer()
>>> st.stem('maximum') # Remove "-um" when word is intact
'maxim'
>>> st.stem('presumably') # Don't remove "-um" when word is not intact
'presum'
>>> st.stem('multiply') # No action taken if word ends with "-ply"
'multiply'
>>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules
'provid'
>>> st.stem('owed') # Word starting with vowel must contain at least 2 letters
'ow'
>>> st.stem('ear') # ditto
'ear'
>>> st.stem('saying') # Words starting with consonant must contain at least 3
'say'
>>> st.stem('crying') # letters and one of those letters must be a vowel
'cry'
>>> st.stem('string') # ditto
'string'
>>> st.stem('meant') # ditto
'meant'
>>> st.stem('cement') # ditto
'cem'
"""

# The rule list is static since it doesn't change between instances
rule_tuple = (
@@ -276,33 +304,7 @@ def __applyRule(self, word, remove_total, append_string):
def __repr__(self):
return '<LancasterStemmer>'

def demo():
"""A demonstration of the lancaster stemmer on a samples described in
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
from nltk import stem

stemmer = stem.LancasterStemmer()

print "%-20s%-20s" % ("Original Word", "Stemmed Word")
print "*" * 40

for word in (
'maximum', # Remove "-um" when word is intact
'presumably', # Don't remove "-um" when word is not intact
'multiply', # No action taken if word ends with "-ply"
'provision', # Replace "-sion" with "-j" to trigger "j" set of rules
'owed', # Word starting with vowel must contain at least 2 letters
'ear', # ditto.
'saying', # Words starting with consonant must contain at least 3
'crying', # letters and one of those letters must be a vowel
'string', # ditto.
'meant', # ditto.
'cement'): # ditto.
stemmed_word = stemmer.stem(word)
print "%-20s%-20s" % (word, stemmed_word)


if __name__ == '__main__':
demo()

if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
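
The removed demo() printed a two-column table for a fixed word list; with the
doctest-oriented API the equivalent batch call is just a comprehension (a
sketch, reusing words and outputs from the doctests above):

    >>> from nltk.stem.lancaster import LancasterStemmer
    >>> st = LancasterStemmer()
    >>> [st.stem(w) for w in ['maximum', 'multiply', 'crying']]
    ['maxim', 'multiply', 'cry']
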
30 changes: 17 additions & 13 deletions nltk/stem/porter.py
@@ -39,7 +39,8 @@
# are not obliged to do so. If you do not wish to do so, delete this
# exception statement from your version.

"""Porter Stemming Algorithm
"""
Porter Stemmer
This is the Porter stemming algorithm, ported to Python from the
version coded up in ANSI C by the author. It follows the algorithm
@@ -94,7 +95,7 @@

## --NLTK--
## Import the nltk.stemmer module, which defines the stemmer interface
from api import *
from api import StemmerI

class PorterStemmer(StemmerI):

@@ -113,16 +114,15 @@ class PorterStemmer(StemmerI):
The Porter Stemmer requires that all tokens have string types.
"""

# The main part of the stemming algorithm starts here.
# b is a buffer holding a word to be stemmed. The letters are in b[k0],
# b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
# readjusted downwards as the stemming progresses. Zero termination is
# not in fact used in the algorithm.
# Note that only lower case sequences are stemmed. Forcing to lower case
# should be done before stem(...) is called.

def __init__(self):
"""The main part of the stemming algorithm starts here.
b is a buffer holding a word to be stemmed. The letters are in b[k0],
b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
readjusted downwards as the stemming progresses. Zero termination is
not in fact used in the algorithm.
Note that only lower case sequences are stemmed. Forcing to lower case
should be done before stem(...) is called.
"""

self.b = "" # buffer for word to be stemmed
self.k = 0
@@ -612,5 +612,9 @@ def demo():
print '*'*70

##--NLTK--
## Call demo() if we're invoked directly.
if __name__ == '__main__': demo()


if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
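
The class-level comments above describe the b/k buffer bookkeeping; from the
caller's side the interface is still a single stem() call. A small sketch (the
two words are the canonical step-1a examples from Porter's original paper, not
outputs recorded in this diff):

    >>> from nltk.stem.porter import PorterStemmer
    >>> p = PorterStemmer()
    >>> p.stem('caresses')   # step 1a: sses -> ss
    'caress'
    >>> p.stem('ponies')     # step 1a: ies -> i
    'poni'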

47 changes: 22 additions & 25 deletions nltk/stem/regexp.py
@@ -9,24 +9,33 @@

import re

from api import *
from api import StemmerI

class RegexpStemmer(StemmerI):
"""
A stemmer that uses regular expressions to identify morphological
affixes. Any substrings that match the regular expressions will
be removed.
>>> st = RegexpStemmer('ing$|s$|e$', min=4)
>>> st.stem('cars')
'car'
>>> st.stem('mass')
'mas'
>>> st.stem('was')
'was'
>>> st.stem('bee')
'bee'
>>> st.stem('compute')
'comput'
:type regexp: str or regexp
:param regexp: The regular expression that should be used to
identify morphological affixes.
:type min: int
:param min: The minimum length of string to stem
"""
def __init__(self, regexp, min=0):
"""
Create a new regexp stemmer.
@type regexp: C{string} or C{regexp}
@param regexp: The regular expression that should be used to
identify morphological affixes.
@type min: int
@param min: The minimum length of string to stem
"""

if not hasattr(regexp, 'pattern'):
regexp = re.compile(regexp)
@@ -42,21 +51,9 @@ def stem(self, word):
def __repr__(self):
return '<RegexpStemmer: %r>' % self._regexp.pattern

def demo():
from nltk import tokenize, stem

# Create a simple regular expression based stemmer
stemmer = stem.RegexpStemmer('ing$|s$|e$', min=4)
text = "John was eating icecream"
tokens = text.split()

# Print the results.
print stemmer
for word in tokens:
print '%20s => %s' % (word, stemmer.stem(word))
print


if __name__ == '__main__': demo()
if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
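
The stem() body itself is collapsed in this view, but the docstring and
constructor pin down its behaviour: words shorter than min are returned
untouched, and every match of the pattern is deleted. A rough standalone
approximation (an inference, not the actual method body):

    import re

    def regexp_stem(word, pattern=re.compile('ing$|s$|e$'), min_len=4):
        # Below the length threshold, leave the word alone.
        if len(word) < min_len:
            return word
        # Otherwise strip out every substring matching the pattern.
        return pattern.sub('', word)

    regexp_stem('cars')   # -> 'car'
    regexp_stem('was')    # -> 'was'
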


