Merge pull request nltk#226 from aboSamoor/senna_polish1

Adding documentation and error handling to the senna module.
fyuval · Feb 19, 2012 · 96f0c36 · 96f0c36
2 parents 19dec81 + 667e3c4
commit 96f0c36
Showing 1 changed file with 184 additions and 41 deletions.
diff --git a/nltk/tag/senna.py b/nltk/tag/senna.py
@@ -1,149 +1,292 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Interface to the Senna tagger
 #
-# Copyright (C) 2001-2011 NLTK Project
+# Copyright (C) 2001-2012 NLTK Project
 # Author: Rami Al-Rfou' <[email protected]>
 # URL: <http://www.nltk.org/>
 # For license information, see LICENSE.TXT
 #
 # $Id: senna.py $
 
 """
-A module for interfacing with the SENNA tagger.
+A module for interfacing with the SENNA pipeline.
 """
 
-import os
-import subprocess
-import tempfile
-import nltk
+from os import path, sep
+from subprocess import Popen, PIPE 
 from platform import architecture, system
-from nltk.tag.api import *
+from nltk.tag.api import TaggerI
 
 _senna_url = 'http://ml.nec-labs.com/senna/'
 
+
+class Error(Exception):
+    """Basic error handling class to be extended by the module specific
+    exceptions"""
+
+
+class ExecutableNotFound(Error):
+    """Raised if the senna executable does not exist"""
+
+
+class RunFailure(Error):
+    """Raised if the pipeline fails to execute"""
+
+
+class SentenceMisalignment(Error):
+    """Raised if the output cannot be aligned with the input: a tagged sentence
+    has more tokens than its input, or the output has extra sentences."""
+
+
 class SennaTagger(TaggerI):
-    __OPS = ['pos', 'chk', 'ner']
+    """
+    A general interface of the SENNA pipeline that supports any of the
+    operations specified in SUPPORTED_OPERATIONS.
+
+    Applying multiple operations at once has a speed advantage. For example,
+    senna v2.0 will calculate the POS tags anyway when you are extracting the
+    named entities, so applying both operations costs only the time of
+    extracting the named entities.
+
+    The SENNA pipeline has a fixed maximum size for the sentences it can read.
+    By default it is 1024 tokens per sentence. If you have larger sentences,
+    consider changing the MAX_SENTENCE_SIZE value in SENNA_main.c and
+    rebuilding your system-specific binary. Otherwise this could introduce
+    misalignment errors.
+
+    The input is:
+    - path to the directory that contains SENNA executables.
+    - List of the operations needed to be performed.
+    - (optionally) the encoding of the input data (default:utf-8)
+
+    Example:
+
+    .. doctest::
+        :options: +SKIP
+
+        >>> from nltk.tag.senna import SennaTagger
+        >>> pipeline = SennaTagger('/usr/share/senna-v2.0', ['pos', 'chk', 'ner'])
+        >>> sent = u'Düsseldorf is an international business center'.split()
+        >>> pipeline.tag(sent)
+        [{'word': u'D\xfcsseldorf', 'chk': u'B-NP', 'ner': u'B-PER', 'pos': u'NNP'},
+        {'word': u'is', 'chk': u'B-VP', 'ner': u'O', 'pos': u'VBZ'},
+        {'word': u'an', 'chk': u'B-NP', 'ner': u'O', 'pos': u'DT'},
+        {'word': u'international', 'chk': u'I-NP', 'ner': u'O', 'pos': u'JJ'},
+        {'word': u'business', 'chk': u'I-NP', 'ner': u'O', 'pos': u'NN'},
+        {'word': u'center', 'chk': u'I-NP', 'ner': u'O','pos': u'NN'}]
+    """
 
-    def __init__(self, path, operations, encoding=None, verbose=False):
+    SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner']
+
+    def __init__(self, senna_path, operations, encoding='utf-8'):
         self._encoding = encoding
-        self._path = os.path.normpath(path) + os.sep
+        self._path = path.normpath(senna_path) + sep
         self.operations = operations
 
     @property
     def executable(self):
+        """
+        A property that determines the system-specific binary that should be
+        used in the pipeline. If the system is not known, the plain 'senna'
+        binary will be used.
+        """        
         os_name = system()
         if os_name == 'Linux':
             bits = architecture()[0]
             if bits == '64bit':
-                return os.path.join(self._path, 'senna-linux64')
-            return os.path.join(self._path, 'senna-linux32')
+                return path.join(self._path, 'senna-linux64')
+            return path.join(self._path, 'senna-linux32')
         if os_name == 'Windows':
-            return os.path.join(self._path, 'senna-win32.exe')
+            return path.join(self._path, 'senna-win32.exe')
         if os_name == 'Darwin':
-            return os.path.join(self._path, 'senna-osx')
-        return os.path.join(self._path, 'senna')
+            return path.join(self._path, 'senna-osx')
+        return path.join(self._path, 'senna')
 
     def _map(self):
-        _map = {'word':0}
+        """
+        A method that calculates the order of the columns that SENNA pipeline
+        will output the tags into. This depends on the operations being ordered.
+        """
+        _map = {}
         i = 1
-        for operation in SennaTagger.__OPS:
+        for operation in SennaTagger.SUPPORTED_OPERATIONS:
             if operation in self.operations:
                 _map[operation] = i
                 i+= 1
         return _map
 
     def tag(self, tokens):
+        """
+        Applies the specified operation(s) on a list of tokens.
+        """
         return self.batch_tag([tokens])[0]
 
     def batch_tag(self, sentences):
+        """
+        Applies the tag method over a list of sentences. This method will return,
+        for each sentence, a list of dictionaries. Every dictionary will contain a
+        word with its calculated annotations/tags.
+        """
         encoding = self._encoding
 
+        # Verifies the existence of the executable
+        if not path.isfile(self.executable):
+          raise ExecutableNotFound("Senna executable expected at %s but not found" %
+                                   self.executable)
+
         # Build the senna command to run the tagger
         _senna_cmd = [self.executable, '-path', self._path, '-usrtokens', '-iobtags']
         _senna_cmd.extend(['-'+op for op in self.operations])
 
-        # Write the actual sentences to the temporary input file
+        # Serialize the actual sentences to a temporary string
         _input = '\n'.join((' '.join(x) for x in sentences))+'\n'
         if isinstance(_input, unicode) and encoding:
             _input = _input.encode(encoding)
 
         # Run the tagger and get the output
-        p = subprocess.Popen(_senna_cmd,
-                             stdin=subprocess.PIPE,
-                             stdout=subprocess.PIPE,
-                             stderr=subprocess.PIPE)
+        p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
         (stdout, stderr) = p.communicate(input=_input)
         senna_output = stdout
 
         # Check the return code.
         if p.returncode != 0:
-            print stderr
-            raise OSError('Senna command failed!')
+            raise RunFailure('Senna command failed! Details: %s' % stderr)
 
         if encoding:
             senna_output = stdout.decode(encoding)
 
         # Output the tagged sentences
         map_ = self._map()
         tagged_sentences = [[]]
+        sentence_index = 0
+        token_index = 0
         for tagged_word in senna_output.strip().split("\n"):
             if not tagged_word:
                 tagged_sentences.append([])
+                sentence_index += 1
+                token_index = 0
                 continue
             tags = tagged_word.split('\t')
             result = {}
             for tag in map_:
               result[tag] = tags[map_[tag]].strip()
+            try:
+              result['word'] = sentences[sentence_index][token_index]
+            except IndexError:
+              raise SentenceMisalignment(
+                "Misalignment error occurred at sentence number %d. Possible reason"
+                " is that the sentence size exceeded the maximum size. Check the "
+                "documentation of SennaTagger class for more information."
+                % sentence_index)
             tagged_sentences[-1].append(result)
+            token_index += 1
         return tagged_sentences
 
 
 class POSTagger(SennaTagger):
     """
-    A class for pos tagging with Senna POSTagger. The input is the paths to:
-     - A path to the senna executables
+    A Part of Speech tagger.
+
+    The input is:
+    - path to the directory that contains SENNA executables.
+    - (optionally) the encoding of the input data (default:utf-8)
 
     Example:
 
-        >>> tagger = senna.POSTagger(path='/media/data/NER/senna-v2.0')
-        >>> tagger.tag('What is the airspeed of an unladen swallow ?'.split())
+    .. doctest::
+        :options: +SKIP
+
+        >>> from nltk.tag.senna import POSTagger
+        >>> postagger = POSTagger('/usr/share/senna-v2.0')
+        >>> postagger.tag('What is the airspeed of an unladen swallow ?'.split())
         [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),
         ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
     """
-    def __init__(self, path, encoding=None, verbose=False):
-        super(POSTagger, self).__init__(path, ['pos'], encoding, verbose)
+    def __init__(self, path, encoding='utf-8'):
+        super(POSTagger, self).__init__(path, ['pos'], encoding)
 
     def batch_tag(self, sentences):
+        """
+        Applies the tag method over a list of sentences. This method will return
+        for each sentence a list of tuples of (word, tag).
+        """
         tagged_sents = super(POSTagger, self).batch_tag(sentences)
         for i in range(len(tagged_sents)):
             for j in range(len(tagged_sents[i])):
-                tagged_sents[i][j] = (sentences[i][j], tagged_sents[i][j]['pos'])
+                annotations = tagged_sents[i][j]
+                tagged_sents[i][j] = (annotations['word'], annotations['pos'])
         return tagged_sents
 
 
 class NERTagger(SennaTagger):
-    def __init__(self, path, encoding=None, verbose=False):
-        super(NERTagger, self).__init__(path, ['ner'], encoding, verbose)
+    """
+    A named entity extractor.
+
+    The input is:
+    - path to the directory that contains SENNA executables.
+    - (optionally) the encoding of the input data (default:utf-8)
+
+    Example:
+
+    .. doctest::
+        :options: +SKIP
+
+        >>> from nltk.tag.senna import NERTagger
+        >>> nertagger = NERTagger('/usr/share/senna-v2.0')
+        >>> nertagger.tag('Shakespeare theatre was in London .'.split())
+        [('Shakespeare', u'B-PER'), ('theatre', u'O'), ('was', u'O'), ('in', u'O'),
+        ('London', u'B-LOC'), ('.', u'O')] 
+        >>> nertagger.tag('UN headquarters are in NY , USA .'.split())
+        [('UN', u'B-ORG'), ('headquarters', u'O'), ('are', u'O'), ('in', u'O'),
+        ('NY', u'B-LOC'), (',', u'O'), ('USA', u'B-LOC'), ('.', u'O')]
+    """
+    def __init__(self, path, encoding='utf-8'):
+        super(NERTagger, self).__init__(path, ['ner'], encoding)
 
     def batch_tag(self, sentences):
+        """
+        Applies the tag method over a list of sentences. This method will return
+        for each sentence a list of tuples of (word, tag).
+        """
         tagged_sents = super(NERTagger, self).batch_tag(sentences)
         for i in range(len(tagged_sents)):
             for j in range(len(tagged_sents[i])):
-                try:
-                    tagged_sents[i][j] = (sentences[i][j], tagged_sents[i][j]['ner'])
-                except:
-                    import pdb
-                    pdb.set_trace()
+                annotations = tagged_sents[i][j]
+                tagged_sents[i][j] = (annotations['word'], annotations['ner'])
         return tagged_sents
 
 
 class CHKTagger(SennaTagger):
-    def __init__(self, path, encoding=None, verbose=False):
-        super(CHKTagger, self).__init__(path, ['chk'], encoding, verbose)
+    """
+    A chunker.
+
+    The input is:
+    - path to the directory that contains SENNA executables.
+    - (optionally) the encoding of the input data (default:utf-8)
+
+    Example:
+
+    .. doctest::
+        :options: +SKIP
+
+        >>> from nltk.tag.senna import CHKTagger
+        >>> chktagger = CHKTagger('/usr/share/senna-v2.0')
+        >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split())
+        [('What', u'B-NP'), ('is', u'B-VP'), ('the', u'B-NP'), ('airspeed', u'I-NP'),
+        ('of', u'B-PP'), ('an', u'B-NP'), ('unladen', u'I-NP'), ('swallow',u'I-NP'),
+        ('?', u'O')]
+    """
+    def __init__(self, path, encoding='utf-8'):
+        super(CHKTagger, self).__init__(path, ['chk'], encoding)
 
     def batch_tag(self, sentences):
+        """
+        Applies the tag method over a list of sentences. This method will return
+        for each sentence a list of tuples of (word, tag).
+        """
         tagged_sents = super(CHKTagger, self).batch_tag(sentences)
         for i in range(len(tagged_sents)):
             for j in range(len(tagged_sents[i])):
-                tagged_sents[i][j] = (sentences[i][j], tagged_sents[i][j]['chk'])
+                annotations = tagged_sents[i][j]
+                tagged_sents[i][j] = (annotations['word'], annotations['chk'])
         return tagged_sents