Skip to content

Commit

Permalink
Merge pull request nltk#461 from larsmans/sklearn
Browse files Browse the repository at this point in the history
Updates to scikit-learn wrapper in nltk.classify
  • Loading branch information
kmike committed Aug 18, 2013
2 parents 0e1d87d + 7b602bd commit 49c163d
Showing 1 changed file with 40 additions and 21 deletions.
61 changes: 40 additions & 21 deletions nltk/classify/scikitlearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,21 @@
# For license information, see LICENSE.TXT
"""
scikit-learn (http://scikit-learn.org) is a machine learning library for
Python, supporting most of the basic classification algorithms, including SVMs,
Naive Bayes, logistic regression and decision trees.
Python. It supports many classification algorithms, including SVMs,
Naive Bayes, logistic regression (MaxEnt) and decision trees.
This package implement a wrapper around scikit-learn classifiers. To use this
wrapper, construct a scikit-learn classifier, then use that to construct a
SklearnClassifier. E.g., to wrap a linear SVM classifier with default settings,
do
wrapper, construct a scikit-learn estimator object, then use that to construct
a SklearnClassifier. E.g., to wrap a linear SVM with default settings:
>>> from sklearn.svm.sparse import LinearSVC
>>> from nltk.classify.scikitlearn import SklearnClassifier
>>> classif = SklearnClassifier(LinearSVC())
The scikit-learn classifier may be arbitrarily complex. E.g., the following
constructs and wraps a Naive Bayes estimator with tf-idf weighting and
chi-square feature selection:
A scikit-learn classifier may include preprocessing steps when it's wrapped
in a Pipeline object. The following constructs and wraps a Naive Bayes text
classifier with tf-idf weighting and chi-square feature selection to get the
best 1000 features:
>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> from sklearn.feature_selection import SelectKBest, chi2
Expand All @@ -29,8 +29,6 @@
... ('chi2', SelectKBest(chi2, k=1000)),
... ('nb', MultinomialNB())])
>>> classif = SklearnClassifier(pipeline)
(Such a classifier could be trained on word counts for text classification.)
"""
from __future__ import print_function, unicode_literals

Expand All @@ -53,12 +51,13 @@ def __init__(self, estimator, dtype=float, sparse=True):
:param estimator: scikit-learn classifier object.
:param dtype: data type used when building feature array.
scikit-learn estimators work exclusively on numeric data; use bool
when all features are binary.
scikit-learn estimators work exclusively on numeric data. The
default value should be fine for almost all situations.
:param sparse: Whether to use sparse matrices. The estimator must
support these; not all scikit-learn classifiers do. The default
value is True, since most NLP problems involve sparse feature sets.
Setting this to False may take a great amount of memory.
:type sparse: boolean.
"""
self._clf = estimator
Expand All @@ -69,24 +68,42 @@ def __repr__(self):
return "<SklearnClassifier(%r)>" % self._clf

def batch_classify(self, featuresets):
"""Classify a batch of samples.
:param labeled_featuresets: A list of featuresets, each a dict
mapping strings to either numbers or booleans.
:return: The predicted class label for each input sample.
:rtype: list
"""
X = self._convert(featuresets)
y = self._clf.predict(X)
return [self._index_label[int(yi)] for yi in y]

def batch_prob_classify(self, featuresets):
"""Compute per-class probabilities for a batch of samples.
:param labeled_featuresets: A list of featuresets, each a dict
mapping strings to either numbers or booleans.
:rtype: list of ``ProbDistI``
"""
X = self._convert(featuresets)
y_proba_list = self._clf.predict_proba(X)
return [self._make_probdist(y_proba) for y_proba in y_proba_list]

def labels(self):
"""The class labels used by this classifier.
:rtype: list
"""
return self._label_index.keys()

def train(self, labeled_featuresets):
"""
Train (fit) the scikit-learn estimator.
:param labeled_featuresets: A list of classified featuresets,
i.e., a list of tuples ``(featureset, label)``.
:param labeled_featuresets: A list of ``(featureset, label)``
where each ``featureset`` is a dict mapping strings to either
numbers or booleans.
"""

self._feature_index = {}
Expand All @@ -110,13 +127,14 @@ def train(self, labeled_featuresets):
return self

def _convert(self, featuresets):
"""Convert featuresets to a format scikit-learn will grok."""
if self._sparse:
return self._featuresets_to_coo(featuresets)
else:
return self._featuresets_to_array(featuresets)

def _featuresets_to_coo(self, featuresets):
"""Convert featuresets to sparse matrix (COO format)."""
"""Convert featuresets to scipy.sparse matrix (COO format)."""

i_ind = []
j_ind = []
Expand All @@ -133,10 +151,11 @@ def _featuresets_to_coo(self, featuresets):
pass

shape = (i + 1, len(self._feature_index))
return coo_matrix((values, (i_ind, j_ind)), shape=shape, dtype=self._dtype)
return coo_matrix((values, (i_ind, j_ind)), shape=shape,
dtype=self._dtype)

def _featuresets_to_array(self, featuresets):
"""Convert featureset to Numpy array."""
"""Convert featureset to NumPy array."""

X = np.zeros((len(featuresets), len(self._feature_index)),
dtype=self._dtype)
Expand Down Expand Up @@ -165,13 +184,13 @@ def setup_module(module):

if __name__ == "__main__":
from nltk.classify.util import names_demo, binary_names_demo_features
try:
from sklearn.linear_model.sparse import LogisticRegression
except ImportError: # separate sparse LR to be removed in 0.12
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

print("scikit-learn Naive Bayes:")
# Bernoulli Naive Bayes is designed for binary classification. We set the
# binarize option to False since we know we're passing binary features
# (when binarize=False, scikit-learn does x>0 on the feature values x).
names_demo(SklearnClassifier(BernoulliNB(binarize=False), dtype=bool).train,
features=binary_names_demo_features)
print("scikit-learn logistic regression:")
Expand Down

0 comments on commit 49c163d

Please sign in to comment.