reworked imports and doctests for stem package
stevenbird committed Nov 10, 2011
1 parent 2d4d905 commit e1c800a
Showing 9 changed files with 576 additions and 536 deletions.
33 changes: 15 additions & 18 deletions nltk/stem/__init__.py
@@ -8,32 +8,29 @@
# For license information, see LICENSE.TXT

"""
NLTK Stemmers
Interfaces used to remove morphological affixes from words, leaving
only the word stem. Stemming algorithms aim to remove those affixes
required for eg. grammatical role, tense, derivational morphology
leaving only the stem of the word. This is a difficult problem due to
irregular words (eg. common verbs in English), complicated
morphological rules, and part-of-speech and sense ambiguities
(eg. C{ceil-} is not the stem of C{ceiling}).
(eg. ``ceil-`` is not the stem of ``ceiling``).
C{StemmerI} defines a standard interface for stemmers.
StemmerI defines a standard interface for stemmers.
"""

from api import *
from regexp import *
from lancaster import *
from isri import *
from snowball import *
from wordnet import *
from rslp import *

__all__ = [
# Stemmer interface
'StemmerI',
from nltk.stem.api import StemmerI
from nltk.stem.regexp import RegexpStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.isri import ISRIStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.rslp import RSLPStemmer

# Stemmers
'RegexpStemmer', 'PorterStemmer', 'LancasterStemmer',
'RSLPStemmer', 'WordNetLemmatizer',
'ISRIStemmer', 'SnowballStemmer'
]

if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
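
With the absolute imports above, the stemmers can be pulled straight from the
package. A minimal usage sketch (assuming an NLTK checkout with this commit
applied; the 'maximum' result mirrors the new Lancaster doctests further down,
and the Porter output is standard behaviour rather than anything recorded in
this diff):

    >>> from nltk.stem import PorterStemmer, LancasterStemmer
    >>> PorterStemmer().stem('running')
    'run'
    >>> LancasterStemmer().stem('maximum')
    'maxim'
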
10 changes: 7 additions & 3 deletions nltk/stem/api.py
@@ -10,15 +10,19 @@
class StemmerI(object):
"""
A processing interface for removing morphological affixes from
words. This process is known as X{stemming}.
words. This process is known as stemming.
"""
def stem(self, token):
"""
Strip affixes from the token and return the stem.
@param token: The token that should be stemmed.
@type token: C{str}
:param token: The token that should be stemmed.
:type token: str
"""
raise NotImplementedError()


if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
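
StemmerI itself only declares stem() and raises NotImplementedError, so a
concrete stemmer just overrides that one method. A toy illustration (the
SuffixStripper class is hypothetical, not part of NLTK):

    >>> from nltk.stem.api import StemmerI
    >>> class SuffixStripper(StemmerI):
    ...     """Hypothetical stemmer that drops a trailing 's'."""
    ...     def stem(self, token):
    ...         return token[:-1] if token.endswith('s') else token
    >>> SuffixStripper().stem('cats')
    'cat'
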
96 changes: 84 additions & 12 deletions nltk/stem/isri.py
@@ -8,7 +8,8 @@
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""ISRI Arabic Stemmer
"""
ISRI Arabic Stemmer
The algorithm for this stemmer is described in:
@@ -29,7 +30,8 @@
"""
import re
from api import *

from api import StemmerI

class ISRIStemmer(StemmerI):
'''
@@ -49,22 +51,87 @@ class ISRIStemmer(StemmerI):
def __init__(self):
self.stm = 'defult none'

self.p3 = [u'\u0643\u0627\u0644', u'\u0628\u0627\u0644', u'\u0648\u0644\u0644', u'\u0648\u0627\u0644'] # length three prefixes
self.p3 = [u'\u0643\u0627\u0644', u'\u0628\u0627\u0644',
u'\u0648\u0644\u0644', u'\u0648\u0627\u0644'] # length three prefixes
self.p2 = [u'\u0627\u0644', u'\u0644\u0644'] # length two prefixes
self.p1 = [u'\u0644', u'\u0628', u'\u0641', u'\u0633', u'\u0648', u'\u064a', u'\u062a', u'\u0646', u'\u0627'] # length one prefixes

self.s3 = [u'\u062a\u0645\u0644', u'\u0647\u0645\u0644', u'\u062a\u0627\u0646', u'\u062a\u064a\u0646', u'\u0643\u0645\u0644'] # length three suffixes
self.s2 = [u'\u0648\u0646', u'\u0627\u062a', u'\u0627\u0646', u'\u064a\u0646', u'\u062a\u0646', u'\u0643\u0645', u'\u0647\u0646', u'\u0646\u0627', u'\u064a\u0627', u'\u0647\u0627', u'\u062a\u0645', u'\u0643\u0646', u'\u0646\u064a', u'\u0648\u0627', u'\u0645\u0627', u'\u0647\u0645'] # length two suffixes
self.s1 = [u'\u0629', u'\u0647', u'\u064a', u'\u0643', u'\u062a', u'\u0627', u'\u0646'] # length one suffixes

self.pr4 = {0:[u'\u0645'], 1:[u'\u0627'], 2:[u'\u0627', u'\u0648', u'\u064A'], 3:[u'\u0629']} # groups of length four patterns
self.pr53 = {0:[u'\u0627', u'\u062a'], 1:[u'\u0627', u'\u064a', u'\u0648'], 2:[u'\u0627', u'\u062a', u'\u0645'], 3:[u'\u0645', u'\u064a', u'\u062a'], 4:[u'\u0645', u'\u062a'], 5:[u'\u0627', u'\u0648'], 6:[u'\u0627', u'\u0645']} # Groups of length five patterns and length three roots
self.p1 = [u'\u0644', u'\u0628', u'\u0641', u'\u0633', u'\u0648',
u'\u064a', u'\u062a', u'\u0646', u'\u0627'] # length one prefixes

self.s3 = [u'\u062a\u0645\u0644', u'\u0647\u0645\u0644',
u'\u062a\u0627\u0646', u'\u062a\u064a\u0646',
u'\u0643\u0645\u0644'] # length three suffixes
self.s2 = [u'\u0648\u0646', u'\u0627\u062a', u'\u0627\u0646',
u'\u064a\u0646', u'\u062a\u0646', u'\u0643\u0645',
u'\u0647\u0646', u'\u0646\u0627', u'\u064a\u0627',
u'\u0647\u0627', u'\u062a\u0645', u'\u0643\u0646',
u'\u0646\u064a', u'\u0648\u0627', u'\u0645\u0627',
u'\u0647\u0645'] # length two suffixes
self.s1 = [u'\u0629', u'\u0647', u'\u064a', u'\u0643', u'\u062a',
u'\u0627', u'\u0646'] # length one suffixes

self.pr4 = {0: [u'\u0645'], 1:[u'\u0627'],
2: [u'\u0627', u'\u0648', u'\u064A'], 3:[u'\u0629']} # groups of length four patterns
self.pr53 = {0: [u'\u0627', u'\u062a'],
1: [u'\u0627', u'\u064a', u'\u0648'],
2: [u'\u0627', u'\u062a', u'\u0645'],
3: [u'\u0645', u'\u064a', u'\u062a'],
4: [u'\u0645', u'\u062a'],
5: [u'\u0627', u'\u0648'],
6: [u'\u0627', u'\u0645']} # Groups of length five patterns and length three roots

self.re_short_vowels = re.compile(ur'[\u064B-\u0652]')
self.re_hamza = re.compile(ur'[\u0621\u0624\u0626]')
self.re_intial_hamza = re.compile(ur'^[\u0622\u0623\u0625]')

self.stop_words = [u'\u064a\u0643\u0648\u0646', u'\u0648\u0644\u064a\u0633', u'\u0648\u0643\u0627\u0646', u'\u0643\u0630\u0644\u0643', u'\u0627\u0644\u062a\u064a', u'\u0648\u0628\u064a\u0646', u'\u0639\u0644\u064a\u0647\u0627', u'\u0645\u0633\u0627\u0621', u'\u0627\u0644\u0630\u064a', u'\u0648\u0643\u0627\u0646\u062a', u'\u0648\u0644\u0643\u0646', u'\u0648\u0627\u0644\u062a\u064a', u'\u062a\u0643\u0648\u0646', u'\u0627\u0644\u064a\u0648\u0645', u'\u0627\u0644\u0644\u0630\u064a\u0646', u'\u0639\u0644\u064a\u0647', u'\u0643\u0627\u0646\u062a', u'\u0644\u0630\u0644\u0643', u'\u0623\u0645\u0627\u0645', u'\u0647\u0646\u0627\u0643', u'\u0645\u0646\u0647\u0627', u'\u0645\u0627\u0632\u0627\u0644', u'\u0644\u0627\u0632\u0627\u0644', u'\u0644\u0627\u064a\u0632\u0627\u0644', u'\u0645\u0627\u064a\u0632\u0627\u0644', u'\u0627\u0635\u0628\u062d', u'\u0623\u0635\u0628\u062d', u'\u0623\u0645\u0633\u0649', u'\u0627\u0645\u0633\u0649', u'\u0623\u0636\u062d\u0649', u'\u0627\u0636\u062d\u0649', u'\u0645\u0627\u0628\u0631\u062d', u'\u0645\u0627\u0641\u062a\u0626', u'\u0645\u0627\u0627\u0646\u0641\u0643', u'\u0644\u0627\u0633\u064a\u0645\u0627', u'\u0648\u0644\u0627\u064a\u0632\u0627\u0644', u'\u0627\u0644\u062d\u0627\u0644\u064a', u'\u0627\u0644\u064a\u0647\u0627', u'\u0627\u0644\u0630\u064a\u0646', u'\u0641\u0627\u0646\u0647', u'\u0648\u0627\u0644\u0630\u064a', u'\u0648\u0647\u0630\u0627', u'\u0644\u0647\u0630\u0627', u'\u0641\u0643\u0627\u0646', u'\u0633\u062a\u0643\u0648\u0646', u'\u0627\u0644\u064a\u0647', u'\u064a\u0645\u0643\u0646', u'\u0628\u0647\u0630\u0627', u'\u0627\u0644\u0630\u0649']
self.stop_words = [u'\u064a\u0643\u0648\u0646',
u'\u0648\u0644\u064a\u0633',
u'\u0648\u0643\u0627\u0646',
u'\u0643\u0630\u0644\u0643',
u'\u0627\u0644\u062a\u064a',
u'\u0648\u0628\u064a\u0646',
u'\u0639\u0644\u064a\u0647\u0627',
u'\u0645\u0633\u0627\u0621',
u'\u0627\u0644\u0630\u064a',
u'\u0648\u0643\u0627\u0646\u062a',
u'\u0648\u0644\u0643\u0646',
u'\u0648\u0627\u0644\u062a\u064a',
u'\u062a\u0643\u0648\u0646',
u'\u0627\u0644\u064a\u0648\u0645',
u'\u0627\u0644\u0644\u0630\u064a\u0646',
u'\u0639\u0644\u064a\u0647',
u'\u0643\u0627\u0646\u062a',
u'\u0644\u0630\u0644\u0643',
u'\u0623\u0645\u0627\u0645',
u'\u0647\u0646\u0627\u0643',
u'\u0645\u0646\u0647\u0627',
u'\u0645\u0627\u0632\u0627\u0644',
u'\u0644\u0627\u0632\u0627\u0644',
u'\u0644\u0627\u064a\u0632\u0627\u0644',
u'\u0645\u0627\u064a\u0632\u0627\u0644',
u'\u0627\u0635\u0628\u062d',
u'\u0623\u0635\u0628\u062d',
u'\u0623\u0645\u0633\u0649',
u'\u0627\u0645\u0633\u0649',
u'\u0623\u0636\u062d\u0649',
u'\u0627\u0636\u062d\u0649',
u'\u0645\u0627\u0628\u0631\u062d',
u'\u0645\u0627\u0641\u062a\u0626',
u'\u0645\u0627\u0627\u0646\u0641\u0643',
u'\u0644\u0627\u0633\u064a\u0645\u0627',
u'\u0648\u0644\u0627\u064a\u0632\u0627\u0644',
u'\u0627\u0644\u062d\u0627\u0644\u064a',
u'\u0627\u0644\u064a\u0647\u0627',
u'\u0627\u0644\u0630\u064a\u0646',
u'\u0641\u0627\u0646\u0647',
u'\u0648\u0627\u0644\u0630\u064a',
u'\u0648\u0647\u0630\u0627',
u'\u0644\u0647\u0630\u0627',
u'\u0641\u0643\u0627\u0646',
u'\u0633\u062a\u0643\u0648\u0646',
u'\u0627\u0644\u064a\u0647',
u'\u064a\u0645\u0643\u0646',
u'\u0628\u0647\u0630\u0627',
u'\u0627\u0644\u0630\u0649']


def stem(self, token):
@@ -308,3 +375,8 @@ def pre1(self):
if self.stm.startswith(sp1):
self.stm = self.stm[1:]
return self.stm


if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
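
For orientation, a short usage sketch of the Arabic stemmer. The comment about
the expected root is an assumption inferred from the prefix/suffix tables
above, not an output recorded in this commit:

    from nltk.stem.isri import ISRIStemmer

    st = ISRIStemmer()
    root = st.stem(u'\u0627\u0644\u0643\u062a\u0627\u0628')  # Arabic "the book"
    # The length-two prefix table (self.p2) strips the definite article, and
    # the pattern tables reduce the remainder towards its triliteral root,
    # roughly u'\u0643\u062a\u0628' (k-t-b).
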
62 changes: 32 additions & 30 deletions nltk/stem/lancaster.py
@@ -11,9 +11,37 @@
"""

import re
from api import *

from api import StemmerI

class LancasterStemmer(StemmerI):
"""
Lancaster Stemmer
>>> st = LancasterStemmer()
>>> st.stem('maximum') # Remove "-um" when word is intact
'maxim'
>>> st.stem('presumably') # Don't remove "-um" when word is not intact
'presum'
>>> st.stem('multiply') # No action taken if word ends with "-ply"
'multiply'
>>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules
'provid'
>>> st.stem('owed') # Word starting with vowel must contain at least 2 letters
'ow'
>>> st.stem('ear') # ditto
'ear'
>>> st.stem('saying') # Words starting with consonant must contain at least 3
'say'
>>> st.stem('crying') # letters and one of those letters must be a vowel
'cry'
>>> st.stem('string') # ditto
'string'
>>> st.stem('meant') # ditto
'meant'
>>> st.stem('cement') # ditto
'cem'
"""

# The rule list is static since it doesn't change between instances
rule_tuple = (
@@ -276,33 +304,7 @@ def __applyRule(self, word, remove_total, append_string):
def __repr__(self):
return '<LancasterStemmer>'

def demo():
"""A demonstration of the lancaster stemmer on a samples described in
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
from nltk import stem

stemmer = stem.LancasterStemmer()

print "%-20s%-20s" % ("Original Word", "Stemmed Word")
print "*" * 40

for word in (
'maximum', # Remove "-um" when word is intact
'presumably', # Don't remove "-um" when word is not intact
'multiply', # No action taken if word ends with "-ply"
'provision', # Replace "-sion" with "-j" to trigger "j" set of rules
'owed', # Word starting with vowel must contain at least 2 letters
'ear', # ditto.
'saying', # Words starting with consonant must contain at least 3
'crying', # letters and one of those letters must be a vowel
'string', # ditto.
'meant', # ditto.
'cement'): # ditto.
stemmed_word = stemmer.stem(word)
print "%-20s%-20s" % (word, stemmed_word)


if __name__ == '__main__':
demo()

if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
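
The removed demo() printed a two-column table for a fixed word list; with the
doctest-oriented API the equivalent batch call is just a comprehension (a
sketch, reusing words and outputs from the doctests above):

    >>> from nltk.stem.lancaster import LancasterStemmer
    >>> st = LancasterStemmer()
    >>> [st.stem(w) for w in ['maximum', 'multiply', 'crying']]
    ['maxim', 'multiply', 'cry']
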
30 changes: 17 additions & 13 deletions nltk/stem/porter.py
@@ -39,7 +39,8 @@
# are not obliged to do so. If you do not wish to do so, delete this
# exception statement from your version.

"""Porter Stemming Algorithm
"""
Porter Stemmer
This is the Porter stemming algorithm, ported to Python from the
version coded up in ANSI C by the author. It follows the algorithm
@@ -94,7 +95,7 @@

## --NLTK--
## Import the nltk.stemmer module, which defines the stemmer interface
from api import *
from api import StemmerI

class PorterStemmer(StemmerI):

@@ -113,16 +114,15 @@ class PorterStemmer(StemmerI):
The Porter Stemmer requires that all tokens have string types.
"""

# The main part of the stemming algorithm starts here.
# b is a buffer holding a word to be stemmed. The letters are in b[k0],
# b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
# readjusted downwards as the stemming progresses. Zero termination is
# not in fact used in the algorithm.
# Note that only lower case sequences are stemmed. Forcing to lower case
# should be done before stem(...) is called.

def __init__(self):
"""The main part of the stemming algorithm starts here.
b is a buffer holding a word to be stemmed. The letters are in b[k0],
b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
readjusted downwards as the stemming progresses. Zero termination is
not in fact used in the algorithm.
Note that only lower case sequences are stemmed. Forcing to lower case
should be done before stem(...) is called.
"""

self.b = "" # buffer for word to be stemmed
self.k = 0
@@ -612,5 +612,9 @@ def demo():
print '*'*70

##--NLTK--
## Call demo() if we're invoked directly.
if __name__ == '__main__': demo()


if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
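
The class-level comments above describe the b/k buffer bookkeeping; from the
caller's side the interface is still a single stem() call. A small sketch (the
two words are the canonical step-1a examples from Porter's original paper, not
outputs recorded in this diff):

    >>> from nltk.stem.porter import PorterStemmer
    >>> p = PorterStemmer()
    >>> p.stem('caresses')   # step 1a: sses -> ss
    'caress'
    >>> p.stem('ponies')     # step 1a: ies -> i
    'poni'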

47 changes: 22 additions & 25 deletions nltk/stem/regexp.py
@@ -9,24 +9,33 @@

import re

from api import *
from api import StemmerI

class RegexpStemmer(StemmerI):
"""
A stemmer that uses regular expressions to identify morphological
affixes. Any substrings that match the regular expressions will
be removed.
>>> st = RegexpStemmer('ing$|s$|e$', min=4)
>>> st.stem('cars')
'car'
>>> st.stem('mass')
'mas'
>>> st.stem('was')
'was'
>>> st.stem('bee')
'bee'
>>> st.stem('compute')
'comput'
:type regexp: str or regexp
:param regexp: The regular expression that should be used to
identify morphological affixes.
:type min: int
:param min: The minimum length of string to stem
"""
def __init__(self, regexp, min=0):
"""
Create a new regexp stemmer.
@type regexp: C{string} or C{regexp}
@param regexp: The regular expression that should be used to
identify morphological affixes.
@type min: int
@param min: The minimum length of string to stem
"""

if not hasattr(regexp, 'pattern'):
regexp = re.compile(regexp)
@@ -42,21 +51,9 @@ def stem(self, word):
def __repr__(self):
return '<RegexpStemmer: %r>' % self._regexp.pattern

def demo():
from nltk import tokenize, stem

# Create a simple regular expression based stemmer
stemmer = stem.RegexpStemmer('ing$|s$|e$', min=4)
text = "John was eating icecream"
tokens = text.split()

# Print the results.
print stemmer
for word in tokens:
print '%20s => %s' % (word, stemmer.stem(word))
print


if __name__ == '__main__': demo()
if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
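
The stem() body itself is collapsed in this view, but the docstring and
constructor pin down its behaviour: words shorter than min are returned
untouched, and every match of the pattern is deleted. A rough standalone
approximation (an inference, not the actual method body):

    import re

    def regexp_stem(word, pattern=re.compile('ing$|s$|e$'), min_len=4):
        # Below the length threshold, leave the word alone.
        if len(word) < min_len:
            return word
        # Otherwise strip out every substring matching the pattern.
        return pattern.sub('', word)

    regexp_stem('cars')   # -> 'car'
    regexp_stem('was')    # -> 'was'
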


