Commit 63f7dd1

Merge branch 'master' of github.com:scikit-learn/scikit-learn

mblondel committed Nov 6, 2013
2 parents d0cdcde + f719d09

Showing 82 changed files with 4,929 additions and 4,286 deletions.
7 changes: 5 additions & 2 deletions .travis.yml
@@ -14,8 +14,11 @@ install:
- if [ "${COVERAGE}" == "--with-coverage" ]; then sudo pip install coverage; fi
- if [ "${COVERAGE}" == "--with-coverage" ]; then sudo pip install coveralls; fi
script:
- make test
- if [ "${COVERAGE}" == "--with-coverage" ]; then make test-coverage; fi
- if [ "${COVERAGE}" == "--with-coverage" ]; then
- make test-coverage;
- else
- make test;
- fi
after_success:
- if [ "${COVERAGE}" == "--with-coverage" ]; then coveralls; fi

2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -3,7 +3,7 @@ Contributing code
=================

**Note: This document is just to get started, visit [**Contributing
page**](http://scikit-learn.org/stable/developers/index.html#coding-guidelines)
page**](http://scikit-learn.org/stable/developers/index.html)
for the full contributor's guide. Please be sure to read it carefully to make
the code review process go as smoothly as possible and maximize the
likelihood of your contribution being merged.**
13 changes: 12 additions & 1 deletion README.rst
@@ -71,13 +71,24 @@ GIT

You can check the latest sources with the command::

git clone git://github.com/scikit-learn/scikit-learn.git
git clone https://github.com/scikit-learn/scikit-learn.git

or if you have write privileges::

git clone [email protected]:scikit-learn/scikit-learn.git


Contributing
~~~~~~~~~~~~

Quick tutorial on how to go about setting up your environment to
contribute to scikit-learn: https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md

Before opening a Pull Request, have a look at the
full Contributing page to make sure your code complies
with our guidelines: http://scikit-learn.org/stable/developers/index.html


Testing
-------

36 changes: 21 additions & 15 deletions benchmarks/bench_plot_nmf.py
@@ -4,10 +4,12 @@

from __future__ import print_function

from collections import defaultdict
import gc
from time import time

import numpy as np
from collections import defaultdict
from scipy.linalg import norm

from sklearn.decomposition.nmf import NMF, _initialize_nmf
from sklearn.datasets.samples_generator import make_low_rank_matrix
@@ -27,7 +29,7 @@ def alt_nnmf(V, r, max_iter=1000, tol=1e-3, R=None):
r : integer
number of latent features
max_iter : integer, optional
maximum number of iterations (default: 10000)
maximum number of iterations (default: 1000)
tol : double
tolerance threshold for early exit (when the update factor is within
tol of 1., the function exits)
@@ -62,25 +64,29 @@ def alt_nnmf(V, r, max_iter=1000, tol=1e-3, R=None):
H *= updateH
updateW = np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps)
W *= updateW
if True or (i % 10) == 0:
if i % 10 == 0:
max_update = max(updateW.max(), updateH.max())
if abs(1. - max_update) < tol:
break
return W, H


def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
def report(error, time):
print("Frobenius loss: %.5f" % error)
print("Took: %.2fs" % time)
print()


def benchmark(samples_range, features_range, rank=50, tolerance=1e-5):
it = 0
timeset = defaultdict(lambda: [])
err = defaultdict(lambda: [])

max_it = len(samples_range) * len(features_range)
for n_samples in samples_range:
for n_features in features_range:
it += 1
print('====================')
print('Iteration %03d of %03d' % (it, max_it))
print('====================')
print("%2d samples, %2d features" % (n_samples, n_features))
print('=======================')
X = np.abs(make_low_rank_matrix(n_samples, n_features,
effective_rank=rank, tail_strength=0.2))

@@ -91,7 +97,7 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
tend = time() - tstart
timeset['nndsvd-nmf'].append(tend)
err['nndsvd-nmf'].append(m.reconstruction_err_)
print(m.reconstruction_err_, tend)
report(m.reconstruction_err_, tend)

gc.collect()
print("benchmarking nndsvda-nmf: ")
@@ -101,7 +107,7 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
tend = time() - tstart
timeset['nndsvda-nmf'].append(tend)
err['nndsvda-nmf'].append(m.reconstruction_err_)
print(m.reconstruction_err_, tend)
report(m.reconstruction_err_, tend)

gc.collect()
print("benchmarking nndsvdar-nmf: ")
@@ -111,7 +117,7 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
tend = time() - tstart
timeset['nndsvdar-nmf'].append(tend)
err['nndsvdar-nmf'].append(m.reconstruction_err_)
print(m.reconstruction_err_, tend)
report(m.reconstruction_err_, tend)

gc.collect()
print("benchmarking random-nmf")
@@ -121,7 +127,7 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
tend = time() - tstart
timeset['random-nmf'].append(tend)
err['random-nmf'].append(m.reconstruction_err_)
print(m.reconstruction_err_, tend)
report(m.reconstruction_err_, tend)

gc.collect()
print("benchmarking alt-random-nmf")
@@ -130,7 +136,7 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
tend = time() - tstart
timeset['alt-random-nmf'].append(tend)
err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
print(np.linalg.norm(X - np.dot(W, H)), tend)
report(norm(X - np.dot(W, H)), tend)

return timeset, err

@@ -142,10 +148,10 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):

samples_range = np.linspace(50, 500, 3).astype(np.int)
features_range = np.linspace(50, 500, 3).astype(np.int)
timeset, err = compute_bench(samples_range, features_range)
timeset, err = benchmark(samples_range, features_range)

for i, results in enumerate((timeset, err)):
fig = plt.figure('scikit-learn Non-Negative Matrix Factorization benchmkar results')
fig = plt.figure('scikit-learn Non-Negative Matrix Factorization benchmark results')
ax = fig.gca(projection='3d')
for c, (label, timings) in zip('rbgcm', sorted(results.iteritems())):
X, Y = np.meshgrid(samples_range, features_range)
5 changes: 5 additions & 0 deletions doc/conf.py
@@ -224,3 +224,8 @@
#latex_use_modindex = True

trim_doctests_flags = True

# Add the 'copybutton' javascript, to hide/show the prompt in code
# examples
def setup(app):
app.add_javascript('js/copybutton.js')
2 changes: 1 addition & 1 deletion doc/datasets/index.rst
@@ -35,7 +35,7 @@ fetched from mldata.org have more sophisticated structure.
These functions return a dictionary-like object holding at least two items:
an array of shape ``n_samples`` * ``n_features`` with key ``data``
(except for 20newsgroups)
and a NumPy array of length ``n_features``, containing the target values,
and a NumPy array of length ``n_samples``, containing the target values,
with key ``target``.
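
For instance, a quick sketch with one of the bundled toy loaders (here
``load_iris``, chosen purely for illustration)::

    from sklearn.datasets import load_iris

    iris = load_iris()
    print(iris.data.shape)    # (n_samples, n_features): (150, 4)
    print(iris.target.shape)  # (n_samples,): (150,)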

The datasets also contain a description in ``DESCR`` and some contain
Binary file removed doc/images/minBox.png
Binary file removed doc/images/minBoxHighlight.png
Binary file removed doc/images/noneBox.png
Binary file removed doc/images/plusBox.png
Binary file removed doc/images/plusBoxHighlight.png
5 changes: 4 additions & 1 deletion doc/modules/covariance.rst
@@ -36,7 +36,10 @@ The empirical covariance matrix of a sample can be computed using the
:class:`EmpiricalCovariance` object to the data sample with the
:meth:`EmpiricalCovariance.fit` method. Be careful that depending
whether the data are centered or not, the result will be different, so
one may want to use the `assume_centered` parameter accurately.
one may want to use the `assume_centered` parameter accurately. More precisely,
if one uses `assume_centered=False`, then the test set is supposed to have the
same mean vector as the training set. If this is not the case, both should be
centered by the user, and `assume_centered=True` should be used.
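
A minimal sketch of both cases, assuming synthetic data with a non-zero mean::

    import numpy as np
    from sklearn.covariance import EmpiricalCovariance

    rng = np.random.RandomState(0)
    X_train = rng.randn(500, 3) + 5.  # training data, mean vector ~(5, 5, 5)
    X_test = rng.randn(200, 3) + 5.   # test data with the same mean vector

    # assume_centered=False (default): the estimator centers the data itself,
    # so the test set is supposed to share the training mean vector.
    cov = EmpiricalCovariance(assume_centered=False).fit(X_train)

    # If the means may differ, center both sets yourself first.
    X_train_c = X_train - X_train.mean(axis=0)
    X_test_c = X_test - X_test.mean(axis=0)
    cov_c = EmpiricalCovariance(assume_centered=True).fit(X_train_c)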

.. topic:: Examples:

38 changes: 36 additions & 2 deletions doc/modules/cross_validation.rst
@@ -93,7 +93,7 @@ where the number of samples is very small.
Computing cross-validated metrics
=================================

The simplest way to use perform cross-validation in to call the
The simplest way to use cross-validation is to call the
:func:`cross_val_score` helper function on the estimator and the dataset.

The following example demonstrates how to estimate the accuracy of a linear
@@ -165,7 +165,7 @@ validation strategies.
K-fold
------

:class:`KFold` divides all the samples in math:`k` groups of samples,
:class:`KFold` divides all the samples in :math:`k` groups of samples,
called folds (if :math:`k = n`, this is equivalent to the *Leave One
Out* strategy), of equal sizes (if possible). The prediction function is
learned using :math:`k - 1` folds, and the fold left out is used for test.
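
A minimal sketch of the splitting behaviour, assuming the current
``model_selection`` interface (the module layout has changed since this
text was written)::

    from sklearn.model_selection import KFold

    X = ["a", "b", "c", "d"]
    for train_index, test_index in KFold(n_splits=2).split(X):
        # each fold is used exactly once as the test set
        print("train:", train_index, "test:", test_index)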
@@ -231,6 +231,40 @@ not waste much data as only one sample is removed from the learning set::
[0 1 2] [3]


Potential users of LOO for model selection should weigh a few known caveats.
When compared with *k*-fold cross validation, one builds *n* models from *n*
samples instead of *k* models, where *n > k*. Moreover, each model is trained
on *n - 1* samples rather than *(k-1)n / k*. For both reasons, assuming *k* is
not too large and *k < n*, LOO is more computationally expensive than *k*-fold
cross validation.

In terms of accuracy, LOO often results in high variance as an estimator for the
test error. Intuitively, since *n - 1* of
the *n* samples are used to build each model, models constructed from folds are
virtually identical to each other and to the model built from the entire training
set.

However, if the learning curve is steep for the training size in question,
then 5- or 10-fold cross validation can overestimate the generalization error.

As a general rule, most authors and empirical evidence suggest that 5- or
10-fold cross validation should be preferred to LOO.
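
A rough sketch of the cost difference described above, again assuming the
current ``model_selection`` interface::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score

    X, y = load_iris(return_X_y=True)
    clf = LogisticRegression(max_iter=1000)

    # 5-fold CV fits 5 models, each trained on 80% of the data.
    kfold_scores = cross_val_score(clf, X, y, cv=KFold(n_splits=5))

    # LOO fits n = 150 models, each trained on 149 samples; every per-split
    # score is a single 0/1 outcome, hence the high variance discussed above.
    loo_scores = cross_val_score(clf, X, y, cv=LeaveOneOut())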


.. topic:: References:

* http://www.faqs.org/faqs/ai-faq/neural-nets/part3/section-12.html
* T. Hastie, R. Tibshirani, J. Friedman, `The Elements of Statistical Learning
<http://www-stat.stanford.edu/~tibs/ElemStatLearn>`_, Springer 2009
* L. Breiman, P. Spector `Submodel selection and evaluation in regression: The X-random case
<http://digitalassets.lib.berkeley.edu/sdtr/ucb/text/197.pdf>`_, International Statistical Review 1992
* R. Kohavi, `A Study of Cross-Validation and Bootstrap for Accuracy Estimation and Model Selection
<http://www.cs.iastate.edu/~jtian/cs573/Papers/Kohavi-IJCAI-95.pdf>`_, Intl. Jnt. Conf. AI
* R. Bharat Rao, G. Fung, R. Rosales, `On the Dangers of Cross-Validation. An Experimental Evaluation
<http://www.siam.org/proceedings/datamining/2008/dm08_54_Rao.pdf>`_, SIAM 2008
  * G. James, D. Witten, T. Hastie, R. Tibshirani, `An Introduction to Statistical Learning
<http://www-bcf.usc.edu/~gareth/ISL>`_, Springer 2013


Leave-P-Out - LPO
-----------------

2 changes: 1 addition & 1 deletion doc/modules/feature_selection.rst
@@ -262,7 +262,7 @@ to use a :class:`sklearn.pipeline.Pipeline`::

In this snippet we make use of a :class:`sklearn.svm.LinearSVC`
to evaluate feature importances and select the most relevant features.
Then, a class:`sklearn.ensemble.GradientBoostingClassifier` is trained on the
Then, a :class:`sklearn.ensemble.RandomForestClassifier` is trained on the
transformed output, i.e. using only relevant features. You can, of course,
perform similar operations with the other feature selection methods and with
other classifiers that provide a way to evaluate feature importances.
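
One possible shape of such a pipeline, sketched here with the
``SelectFromModel`` meta-transformer (an assumption: the snippet above
predates it and used the selector's ``transform`` method directly)::

    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel
    from sklearn.pipeline import Pipeline
    from sklearn.svm import LinearSVC

    X, y = load_iris(return_X_y=True)
    clf = Pipeline([
        # the L1 penalty drives uninformative feature weights to zero
        ('feature_selection',
         SelectFromModel(LinearSVC(penalty="l1", dual=False))),
        # the classifier is then trained on the selected features only
        ('classification', RandomForestClassifier()),
    ])
    clf.fit(X, y)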
65 changes: 64 additions & 1 deletion doc/modules/linear_model.rst
@@ -270,7 +270,7 @@ Elastic Net
===========
:class:`ElasticNet` is a linear model trained with L1 and L2 prior as
regularizer. This combination allows for learning a sparse model where
few of the weights are non-zero like :class:`Lasso`, while still maintaining the
few of the weights are non-zero like :class:`Lasso`, while still maintaining
the regularization properties of :class:`Ridge`. We control this tradeoff
using the `l1_ratio` parameter.
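
For instance, a small sketch of the tradeoff on synthetic data where only a
few features carry signal (``l1_ratio=1.0`` corresponds to the Lasso penalty,
values near ``0.0`` approach the Ridge penalty)::

    import numpy as np
    from sklearn.linear_model import ElasticNet

    rng = np.random.RandomState(0)
    X = rng.randn(100, 20)
    y = X[:, 0] + 2 * X[:, 1] - X[:, 2] + 0.1 * rng.randn(100)

    for l1_ratio in (0.1, 0.5, 0.9):
        enet = ElasticNet(alpha=0.1, l1_ratio=l1_ratio).fit(X, y)
        # a higher l1_ratio yields a sparser coefficient vector
        print(l1_ratio, np.sum(enet.coef_ != 0))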

@@ -725,3 +725,66 @@ For classification, :class:`PassiveAggressiveClassifier` can be used with
<http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>`_
K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR 7 (2006)

Robustness to outliers: RANSAC
==============================

RANSAC (RANdom SAmple Consensus) is an iterative algorithm for the robust
estimation of the parameters of a mathematical model from a subset of inliers
in the complete data set.

RANSAC is a non-deterministic algorithm, producing a reasonable result only
with a certain probability that depends on the number of iterations (see the
`max_trials` parameter). It is typically used for linear and non-linear
regression problems and is especially popular in the field of photogrammetric
computer vision.

The algorithm splits the complete input sample data into a set of inliers,
which may be subject to noise, and outliers, which are e.g. caused by erroneous
measurements or invalid hypotheses about the data. The resulting model is then
estimated only from the determined inliers.

.. figure:: ../auto_examples/linear_model/images/plot_ransac_1.png
:target: ../auto_examples/linear_model/plot_ransac.html
:align: center
:scale: 50%

Each iteration performs the following steps:

1. Select `min_samples` random samples from the original data and check
whether the set of data is valid (see `is_data_valid`).
2. Fit a model to the random subset (`base_estimator.fit`) and check
whether the estimated model is valid (see `is_model_valid`).
3. Classify all data as inliers or outliers by calculating the residuals
to the estimated model (`base_estimator.predict(X) - y`) - all data
samples with absolute residuals smaller than the `residual_threshold`
are considered as inliers.
4. Save the fitted model as the best model if the number of inlier samples is
maximal. If the current estimated model has the same number of inliers as the
previous best model, it is accepted only if it has a better score.

These steps are performed either a maximum number of times (`max_trials`) or
until one of the special stop criteria is met (see `stop_n_inliers` and
`stop_score`). The final model is estimated using all inlier samples (consensus
set) of the previously determined best model.

The `is_data_valid` and `is_model_valid` functions allow one to identify and
reject degenerate combinations of random sub-samples. If the estimated model is
not needed for identifying degenerate cases, `is_data_valid` should be used, as
it is called prior to fitting the model and thus leads to better computational
performance.
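
A minimal usage sketch of the corresponding estimator (assuming the
:class:`RANSACRegressor` interface this section documents)::

    import numpy as np
    from sklearn.linear_model import LinearRegression, RANSACRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(0, 10, size=(200, 1))
    y = 3 * X.ravel() + 2 + rng.randn(200)
    y[:20] += 40 * rng.rand(20)  # corrupt a few samples with outliers

    ransac = RANSACRegressor(LinearRegression(), min_samples=2,
                             residual_threshold=5., max_trials=100,
                             random_state=0)
    ransac.fit(X, y)

    # mask of samples classified as inliers by the best model
    print(ransac.inlier_mask_.sum(), "inliers out of", len(y))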


.. topic:: Examples:

* :ref:`example_linear_model_plot_ransac.py`

.. topic:: References:

* http://en.wikipedia.org/wiki/RANSAC
* `"Random Sample Consensus: A Paradigm for Model Fitting with Applications to
Image Analysis and Automated Cartography"
<http://www.cs.columbia.edu/~belhumeur/courses/compPhoto/ransac.pdf>`_
Martin A. Fischler and Robert C. Bolles - SRI International (1981)
* `"Performance Evaluation of RANSAC Family"
<http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf>`_
Sunglok Choi, Taemin Kim and Wonpil Yu - BMVC (2009)
5 changes: 3 additions & 2 deletions doc/modules/naive_bayes.rst
@@ -93,8 +93,9 @@ are estimated using maximum likelihood.
>>> from sklearn.naive_bayes import GaussianNB
>>> gnb = GaussianNB()
>>> y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
>>> print("Number of mislabeled points : %d" % (iris.target != y_pred).sum())
Number of mislabeled points : 6
>>> print("Number of mislabeled points out of a total %d points : %d"
... % (iris.data.shape[0],(iris.target != y_pred).sum()))
Number of mislabeled points out of a total 150 points : 6

.. _multinomial_naive_bayes:

2 changes: 1 addition & 1 deletion doc/modules/sgd.rst
@@ -159,8 +159,8 @@ further information.

- :ref:`example_linear_model_plot_sgd_separating_hyperplane.py`,
- :ref:`example_linear_model_plot_sgd_iris.py`
- :ref:`example_linear_model_plot_sgd_weighted_classes.py`
- :ref:`example_linear_model_plot_sgd_weighted_samples.py`
- :ref:`example_svm_plot_separating_hyperplane_unbalanced.py` (See the `Note`)

Regression
==========
2 changes: 2 additions & 0 deletions doc/sphinxext/gen_rst.py
@@ -635,6 +635,8 @@ def generate_dir_rst(dir, fhindex, example_dir, root_dir, plot_gallery):
os.makedirs(target_dir)
sorted_listdir = line_count_sort(os.listdir(src_dir),
src_dir)
if not os.path.exists(os.path.join(dir, 'images', 'thumb')):
os.makedirs(os.path.join(dir, 'images', 'thumb'))
for fname in sorted_listdir:
if fname.endswith('py'):
generate_file_rst(fname, target_dir, src_dir, root_dir, plot_gallery)
Binary file added doc/testimonials/images/change-logo.png
