documentation changed to numpy docstring format.

sujitpal · Dec 17, 2019 · b6e9e2a · b6e9e2a
1 parent 24b3b56
commit b6e9e2a
Show file tree

Hide file tree

Showing 7 changed files with 423 additions and 202 deletions.
diff --git a/nerds/models/base.py b/nerds/models/base.py
@@ -35,6 +35,19 @@ def load(self, file_path):
     def score(self, X, y, sample_weights=None):
         """ Returns score for the model based on predicting on (X, y).  This 
             method is needed for GridSearch like operations.
+
+            Parameters
+            ----------
+            X : list(list(str))
+                list of list of tokens.
+            y : list(list(str))
+                list of list of tags
+            sample_weights : list(float), not used
+
+            Returns
+            -------
+            score : float
+                numeric score for estimator.
         """
         y_pred = self.predict(X)
         return accuracy_score(flatten_list(y), flatten_list(y_pred))

diff --git a/nerds/models/bilstm.py b/nerds/models/bilstm.py
@@ -14,7 +14,6 @@
 
 log = get_logger()
 
-
 class BiLstmCrfNER(NERModel):
 
     def __init__(self,
@@ -34,19 +33,39 @@ def __init__(self,
             level embeddings as well as word embeddings by default. Implementation 
             is provided by the Anago project.
 
-            Args:
-                word_embedding_dim (int): word embedding dimensions.
-                char_embedding_dim (int): character embedding dimensions.
-                word_lstm_size (int): character LSTM feature extractor output dimensions.
-                char_lstm_size (int): word tagger LSTM output dimensions.
-                fc_dim (int): output fully-connected layer size.
-                dropout (float): dropout rate.
-                embeddings (numpy array): word embedding matrix.
-                use_char (boolean): add char feature.
-                use_crf (boolean): use crf as last layer.
-                batch_size (int): training batch size.
-                learning_rate (float): learning rate for Adam optimizer.
-                num_epochs (int): number of epochs of training.
+            Parameters
+            ----------
+            word_embedding_dim : int, optional, default 100
+                word embedding dimensions.
+            char_embedding_dim : int, optional, default 25
+                character embedding dimensions.
+            word_lstm_size : int, optional, default 100
+                word tagger LSTM output dimensions.
+            char_lstm_size : int, optional, default 25
+                character LSTM feature extractor output dimensions.
+            fc_dim : int, optional, default 100
+                output fully-connected layer size.
+            dropout : float, optional, default 0.5
+                dropout rate.
+            embeddings : numpy array
+                word embedding matrix.
+            use_char : bool, optional, default True
+                add char feature.
+            use_crf : bool, optional, default True
+                use crf as last layer.
+            batch_size : int, optional, default 16
+                training batch size.
+            learning_rate : float, optional, default 0.001
+                learning rate for Adam optimizer
+            max_iter : int
+                number of epochs of training
+
+            Attributes
+            ----------
+            preprocessor_ : reference to preprocessor
+            model_ : reference to generated model
+            trainer_ : internal reference to Anago Trainer (model)
+            tagger_ : internal reference to Anago Tagger (predictor)
         """
         super().__init__()
         self.word_embedding_dim = word_embedding_dim
@@ -71,9 +90,16 @@ def __init__(self,
     def fit(self, X, y):
         """ Trains the NER model. Input is list of AnnotatedDocuments.
 
-            Args:
-                X list(list(str)): list of list of tokens
-                y list(list(str)): list of list of BIO tags
+            Parameters
+            ----------
+            X : list(list(str))
+                list of list of tokens
+            y : list(list(str))
+                list of list of BIO tags
+
+            Returns
+            -------
+            self
         """
         log.info("Preprocessing dataset...")
         self.preprocessor_ = IndexTransformer(use_char=self.use_char)
@@ -112,10 +138,15 @@ def fit(self, X, y):
     def predict(self, X):
         """ Predicts using the NER model.
 
-            Args:
-                X list(list(str)): list of list of tokens.
-            Returns:
-                y list(list(str)): list of list of predicted BIO tags.
+            Parameters
+            ----------
+            X : list(list(str))
+                list of list of tokens.
+
+            Returns
+            -------
+            y : list(list(str))
+                list of list of predicted BIO tags.
         """
         if self.tagger_ is None:
             raise ValueError("No tagger found, either run fit() to train or load() a trained model")
@@ -128,10 +159,16 @@ def predict(self, X):
     def save(self, dirpath):
         """ Saves model to local disk, given a dirpath 
         
-        Args:
-            dirpath (str): a directory where model artifacts will be saved.
+            Parameters
+            ----------
+            dirpath : str
+                a directory where model artifacts will be saved.
                 Model saves a weights.h5 weights file, a params.json parameter
                 file, and a preprocessor.pkl preprocessor file.
+
+            Returns
+            -------
+            None
         """
         if self.model_ is None or self.preprocessor_ is None:
             raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model")
@@ -150,8 +187,14 @@ def save(self, dirpath):
     def load(self, dirpath):
         """ Loads a trained model from local disk, given the dirpath
 
-        Args:
-            dirpath (str): a directory where model artifacts are saved.
+            Parameters
+            ----------
+            dirpath : str
+                a directory where model artifacts are saved.
+
+            Returns
+            -------
+            self
         """
         if not os.path.exists(dirpath):
             raise ValueError("Model directory not found: {:s}".format(dirpath))

diff --git a/nerds/models/crf.py b/nerds/models/crf.py
@@ -19,17 +19,24 @@ def __init__(self,
         """ Construct a Conditional Random Fields (CRF) based NER. Implementation
             of CRF NER is provided by sklearn.crfsuite.CRF.
 
-            Args:
-                max_iter (int, default 100): maximum number of iterations to run
-                    CRF training
-                c1 (float, default 0.1): L1 regularization coefficient.
-                c2 (float, default 0.1): L2 regularization coefficient.
-                featurizer (function, default None): if None, the default featurizer
-                    _sent2features() is used to convert list of tokens for each
-                    sentence to a list of features, where each feature is a dictionary
-                    of name-value pairs. For custom features, a featurizer function must
-                    be provided that takes in a list of tokens (sentence) and returns a
-                    list of features.
+            Parameters
+            ----------
+            max_iter : int, optional, default 100
+                maximum number of iterations to run CRF training
+            c1 : float, optional, default 0.1
+                L1 regularization coefficient.
+            c2 : float, optional, default 0.1
+                L2 regularization coefficient.
+            featurizer : function, optional, default None
+                if None, the default featurizer _sent2features() is used to convert 
+                list of tokens for each sentence to a list of features, where each 
+                feature is a dictionary of name-value pairs. For custom features, a 
+                featurizer function must be provided that takes in a list of tokens 
+                (sentence) and returns a list of features.
+
+            Attributes
+            ----------
+            model_ : reference to the internal sklearn_crfsuite.CRF model.
         """
         super().__init__()
         self.max_iter = max_iter
@@ -42,16 +49,19 @@ def __init__(self,
 
     def fit(self, X, y):
         """ Build feature vectors and train CRF model. Wrapper for 
-            sklearn_crfsuite.CRF model. The underlying model takes many
-            parameters (for full list (and possible future enhancement), see
-            https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/estimator.html#CRF)
-
-            Args:
-                X (list(list(str))) or (list(list(dict(str, str)))): list of 
-                    sentences or features. Sentences are tokenized into list 
-                    of words, and features are a list of word features, each
-                    word feature is a dictionary of name-value pairs.
-                y (list(list(str))): list of list of BIO tags.
+            sklearn_crfsuite.CRF model.
+
+            Parameters
+            ----------
+            X : list(list(str))
+                list of sentences. Sentences are tokenized into list 
+                of words.
+            y : list(list(str))
+                list of list of BIO tags.
+
+            Returns
+            -------
+            self
         """
         if self.featurizer is None:
             features = [self._sent2features(sent) for sent in X]
@@ -76,13 +86,15 @@ def fit(self, X, y):
     def predict(self, X):
         """ Predicts using trained CRF model.
 
-            Args:
-                X (list(list(dict(str, str))) or list(list(str))): list
-                of sentences or features.
-                is_featurized (bool, default False): if True, X is a list
-                    of list of features, else X is a list of list of tokens.
-            Returns:
-                y (list(list(str))): list of list of predicted BIO tags.
+            Parameters
+            ----------
+            X : list(list(str))
+                list of sentences. Sentences are tokenized into list of words.
+
+            Returns
+            -------
+            y : list(list(str))
+                list of list of predicted BIO tags.
         """
         if self.model_ is None:
             raise ValueError("CRF model not found, run fit() to train or load() pre-trained model")
@@ -98,8 +110,14 @@ def predict(self, X):
     def save(self, dirpath):
         """ Save a trained CRF model at dirpath.
 
-            Args:
-                dirpath (str): path to model directory.
+            Parameters
+            ----------
+            dirpath : str
+                path to model directory.
+
+            Returns
+            -------
+            None
         """
         if self.model_ is None:
             raise ValueError("No model to save, run fit() to train or load() pre-trained model")
@@ -114,10 +132,14 @@ def save(self, dirpath):
     def load(self, dirpath):
         """ Load a pre-trained CRF model from dirpath.
 
-            Args:
-                dirpath (str): path to model directory.
-            Returns:
-                this object populated with pre-trained model.
+            Parameters
+            ----------
+            dirpath : str
+                path to model directory.
+            
+            Returns
+            -------
+            self
         """
         model_file = os.path.join(dirpath, "crf-model.pkl")
         if not os.path.exists(model_file):
@@ -135,12 +157,16 @@ def _sent2features(self, sent):
         """ Converts a list of tokens to a list of features for CRF.
             Each feature is a dictionary of feature name value pairs.
 
-            Args:
-                sent (list(str)): a list of tokens representing a sentence.
+            Parameters
+            ----------
+            sent : list(str)
+                a list of tokens representing a sentence.
 
-            Returns:
-                feats (list(dict(str, obj))): a list of features, where each
-                    feature is a dictionary of name-value pairs.
+            Returns
+            -------
+            feats : list(dict(str, obj))
+                a list of features, where each feature represents a token
+                as a dictionary of name-value pairs.
         """
         if self._nlp is None:
             self._nlp = self._load_language_model()