Skip to content

Commit

Permalink
documentation changed to numpy docstring format.
Browse files Browse the repository at this point in the history
  • Loading branch information
sujitpal committed Dec 17, 2019
1 parent 24b3b56 commit b6e9e2a
Show file tree
Hide file tree
Showing 7 changed files with 423 additions and 202 deletions.
13 changes: 13 additions & 0 deletions nerds/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,19 @@ def load(self, file_path):
def score(self, X, y, sample_weights=None):
""" Returns score for the model based on predicting on (X, y). This
method is needed for GridSearch like operations.
Parameters
----------
X : list(list(str))
list of list of tokens.
y : list(list(str))
list of list of tags
sample_weights : list(float), not used
Returns
-------
score : float
numeric score for estimator.
"""
y_pred = self.predict(X)
return accuracy_score(flatten_list(y), flatten_list(y_pred))
Expand Down
93 changes: 68 additions & 25 deletions nerds/models/bilstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

log = get_logger()


class BiLstmCrfNER(NERModel):

def __init__(self,
Expand All @@ -34,19 +33,39 @@ def __init__(self,
level embeddings as well as word embeddings by default. Implementation
is provided by the Anago project.
Args:
word_embedding_dim (int): word embedding dimensions.
char_embedding_dim (int): character embedding dimensions.
word_lstm_size (int): character LSTM feature extractor output dimensions.
char_lstm_size (int): word tagger LSTM output dimensions.
fc_dim (int): output fully-connected layer size.
dropout (float): dropout rate.
embeddings (numpy array): word embedding matrix.
use_char (boolean): add char feature.
use_crf (boolean): use crf as last layer.
batch_size (int): training batch size.
learning_rate (float): learning rate for Adam optimizer.
num_epochs (int): number of epochs of training.
Parameters
----------
word_embedding_dim : int, optional, default 100
word embedding dimensions.
char_embedding_dim : int, optional, default 25
character embedding dimensions.
word_lstm_size : int, optional, default 100
word tagger LSTM output dimensions.
char_lstm_size : int, optional, default 25
character LSTM feature extractor output dimensions.
fc_dim : int, optional, default 100
output fully-connected layer size.
dropout : float, optional, default 0.5
dropout rate.
embeddings : numpy array
word embedding matrix.
use_char : bool, optional, default True
add char feature.
use_crf : bool, optional, default True
use crf as last layer.
batch_size : int, optional, default 16
training batch size.
learning_rate : float, optional, default 0.001
learning rate for Adam optimizer
max_iter : int
number of epochs of training
Attributes
----------
preprocessor_ : reference to preprocessor
model_ : reference to generated model
trainer_ : internal reference to Anago Trainer (model)
tagger_ : internal reference to Anago Tagger (predictor)
"""
super().__init__()
self.word_embedding_dim = word_embedding_dim
Expand All @@ -71,9 +90,16 @@ def __init__(self,
def fit(self, X, y):
""" Trains the NER model. Input is a list of tokenized sentences and a parallel list of BIO tag sequences.
Args:
X list(list(str)): list of list of tokens
y list(list(str)): list of list of BIO tags
Parameters
----------
X : list(list(str))
list of list of tokens
y : list(list(str))
list of list of BIO tags
Returns
-------
self
"""
log.info("Preprocessing dataset...")
self.preprocessor_ = IndexTransformer(use_char=self.use_char)
Expand Down Expand Up @@ -112,10 +138,15 @@ def fit(self, X, y):
def predict(self, X):
""" Predicts using the NER model.
Args:
X list(list(str)): list of list of tokens.
Returns:
y list(list(str)): list of list of predicted BIO tags.
Parameters
----------
X : list(list(str))
list of list of tokens.
Returns
-------
y : list(list(str))
list of list of predicted BIO tags.
"""
if self.tagger_ is None:
raise ValueError("No tagger found, either run fit() to train or load() a trained model")
Expand All @@ -128,10 +159,16 @@ def predict(self, X):
def save(self, dirpath):
""" Saves model to local disk, given a dirpath
Args:
dirpath (str): a directory where model artifacts will be saved.
Parameters
----------
dirpath : str
a directory where model artifacts will be saved.
Model saves a weights.h5 weights file, a params.json parameter
file, and a preprocessor.pkl preprocessor file.
Returns
-------
None
"""
if self.model_ is None or self.preprocessor_ is None:
raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model")
Expand All @@ -150,8 +187,14 @@ def save(self, dirpath):
def load(self, dirpath):
""" Loads a trained model from local disk, given the dirpath
Args:
dirpath (str): a directory where model artifacts are saved.
Parameters
----------
dirpath : str
a directory where model artifacts are saved.
Returns
-------
self
"""
if not os.path.exists(dirpath):
raise ValueError("Model directory not found: {:s}".format(dirpath))
Expand Down
104 changes: 65 additions & 39 deletions nerds/models/crf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,24 @@ def __init__(self,
""" Construct a Conditional Random Fields (CRF) based NER. Implementation
of CRF NER is provided by sklearn_crfsuite.CRF.
Args:
max_iter (int, default 100): maximum number of iterations to run
CRF training
c1 (float, default 0.1): L1 regularization coefficient.
c2 (float, default 0.1): L2 regularization coefficient.
featurizer (function, default None): if None, the default featurizer
_sent2features() is used to convert list of tokens for each
sentence to a list of features, where each feature is a dictionary
of name-value pairs. For custom features, a featurizer function must
be provided that takes in a list of tokens (sentence) and returns a
list of features.
Parameters
----------
max_iter : int, optional, default 100
maximum number of iterations to run CRF training
c1 : float, optional, default 0.1
L1 regularization coefficient.
c2 : float, optional, default 0.1
L2 regularization coefficient.
featurizer : function, default None
if None, the default featurizer _sent2features() is used to convert
list of tokens for each sentence to a list of features, where each
feature is a dictionary of name-value pairs. For custom features, a
featurizer function must be provided that takes in a list of tokens
(sentence) and returns a list of features.
Attributes
----------
model_ : reference to the internal sklearn_crfsuite.CRF model.
"""
super().__init__()
self.max_iter = max_iter
Expand All @@ -42,16 +49,19 @@ def __init__(self,

def fit(self, X, y):
""" Build feature vectors and train CRF model. Wrapper for
sklearn_crfsuite.CRF model. The underlying model takes many
parameters (for full list (and possible future enhancement), see
https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/estimator.html#CRF)
Args:
X (list(list(str))) or (list(list(dict(str, str)))): list of
sentences or features. Sentences are tokenized into list
of words, and features are a list of word features, each
word feature is a dictionary of name-value pairs.
y (list(list(str))): list of list of BIO tags.
sklearn_crfsuite.CRF model.
Parameters
----------
X : list(list(str))
list of sentences. Sentences are tokenized into list
of words.
y : list(list(str))
list of list of BIO tags.
Returns
-------
self
"""
if self.featurizer is None:
features = [self._sent2features(sent) for sent in X]
Expand All @@ -76,13 +86,15 @@ def fit(self, X, y):
def predict(self, X):
""" Predicts using trained CRF model.
Args:
X (list(list(dict(str, str))) or list(list(str))): list
of sentences or features.
is_featurized (bool, default False): if True, X is a list
of list of features, else X is a list of list of tokens.
Returns:
y (list(list(str))): list of list of predicted BIO tags.
Parameters
----------
X : list(list(str))
list of sentences. Sentences are tokenized into list of words.
Returns
-------
y : list(list(str))
list of list of predicted BIO tags.
"""
if self.model_ is None:
raise ValueError("CRF model not found, run fit() to train or load() pre-trained model")
Expand All @@ -98,8 +110,14 @@ def predict(self, X):
def save(self, dirpath):
""" Save a trained CRF model at dirpath.
Args:
dirpath (str): path to model directory.
Parameters
----------
dirpath : str
path to model directory.
Returns
-------
None
"""
if self.model_ is None:
raise ValueError("No model to save, run fit() to train or load() pre-trained model")
Expand All @@ -114,10 +132,14 @@ def save(self, dirpath):
def load(self, dirpath):
""" Load a pre-trained CRF model from dirpath.
Args:
dirpath (str): path to model directory.
Returns:
this object populated with pre-trained model.
Parameters
----------
dirpath : str
path to model directory.
Returns
-------
self
"""
model_file = os.path.join(dirpath, "crf-model.pkl")
if not os.path.exists(model_file):
Expand All @@ -135,12 +157,16 @@ def _sent2features(self, sent):
""" Converts a list of tokens to a list of features for CRF.
Each feature is a dictionary of feature name value pairs.
Args:
sent (list(str)): a list of tokens representing a sentence.
Parameters
----------
sent : list(str)
a list of tokens representing a sentence.
Returns:
feats (list(dict(str, obj))): a list of features, where each
feature is a dictionary of name-value pairs.
Returns
-------
feats : list(dict(str, obj))
a list of features, where each feature represents a token
as a dictionary of name-value pairs.
"""
if self._nlp is None:
self._nlp = self._load_language_model()
Expand Down
Loading

0 comments on commit b6e9e2a

Please sign in to comment.