Commit 63f7dd1

Merge branch 'master' of github.com:scikit-learn/scikit-learn

mblondel committed Nov 6, 2013
2 parents d0cdcde + f719d09

Showing 82 changed files with 4,929 additions and 4,286 deletions.
7 changes: 5 additions & 2 deletions .travis.yml
@@ -14,8 +14,11 @@ install:
- if [ "${COVERAGE}" == "--with-coverage" ]; then sudo pip install coverage; fi
- if [ "${COVERAGE}" == "--with-coverage" ]; then sudo pip install coveralls; fi
script:
- make test
- if [ "${COVERAGE}" == "--with-coverage" ]; then make test-coverage; fi
- if [ "${COVERAGE}" == "--with-coverage" ]; then
- make test-coverage;
- else
- make test;
- fi
after_success:
- if [ "${COVERAGE}" == "--with-coverage" ]; then coveralls; fi

2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -3,7 +3,7 @@ Contributing code
=================

**Note: This document is just to get started, visit [**Contributing
page**](http://scikit-learn.org/stable/developers/index.html#coding-guidelines)
page**](http://scikit-learn.org/stable/developers/index.html)
for the full contributor's guide. Please be sure to read it carefully to make
the code review process go as smoothly as possible and maximize the
likelihood of your contribution being merged.**
13 changes: 12 additions & 1 deletion README.rst
@@ -71,13 +71,24 @@ GIT

You can check the latest sources with the command::

git clone git://github.com/scikit-learn/scikit-learn.git
git clone https://github.com/scikit-learn/scikit-learn.git

or if you have write privileges::

git clone [email protected]:scikit-learn/scikit-learn.git


Contributing
~~~~~~~~~~~~

Quick tutorial on how to go about setting up your environment to
contribute to scikit-learn: https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md

Before opening a Pull Request, have a look at the
full Contributing page to make sure your code complies
with our guidelines: http://scikit-learn.org/stable/developers/index.html


Testing
-------

36 changes: 21 additions & 15 deletions benchmarks/bench_plot_nmf.py
@@ -4,10 +4,12 @@

from __future__ import print_function

from collections import defaultdict
import gc
from time import time

import numpy as np
from collections import defaultdict
from scipy.linalg import norm

from sklearn.decomposition.nmf import NMF, _initialize_nmf
from sklearn.datasets.samples_generator import make_low_rank_matrix
@@ -27,7 +29,7 @@ def alt_nnmf(V, r, max_iter=1000, tol=1e-3, R=None):
r : integer
number of latent features
max_iter : integer, optional
maximum number of iterations (default: 10000)
maximum number of iterations (default: 1000)
tol : double
tolerance threshold for early exit (when the update factor is within
tol of 1., the function exits)
@@ -62,25 +64,29 @@ def alt_nnmf(V, r, max_iter=1000, tol=1e-3, R=None):
H *= updateH
updateW = np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps)
W *= updateW
if True or (i % 10) == 0:
if i % 10 == 0:
max_update = max(updateW.max(), updateH.max())
if abs(1. - max_update) < tol:
break
return W, H


def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
def report(error, time):
print("Frobenius loss: %.5f" % error)
print("Took: %.2fs" % time)
print()


def benchmark(samples_range, features_range, rank=50, tolerance=1e-5):
it = 0
timeset = defaultdict(lambda: [])
err = defaultdict(lambda: [])

max_it = len(samples_range) * len(features_range)
for n_samples in samples_range:
for n_features in features_range:
it += 1
print('====================')
print('Iteration %03d of %03d' % (it, max_it))
print('====================')
print("%2d samples, %2d features" % (n_samples, n_features))
print('=======================')
X = np.abs(make_low_rank_matrix(n_samples, n_features,
effective_rank=rank, tail_strength=0.2))

@@ -91,7 +97,7 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
tend = time() - tstart
timeset['nndsvd-nmf'].append(tend)
err['nndsvd-nmf'].append(m.reconstruction_err_)
print(m.reconstruction_err_, tend)
report(m.reconstruction_err_, tend)

gc.collect()
print("benchmarking nndsvda-nmf: ")
@@ -101,7 +107,7 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
tend = time() - tstart
timeset['nndsvda-nmf'].append(tend)
err['nndsvda-nmf'].append(m.reconstruction_err_)
print(m.reconstruction_err_, tend)
report(m.reconstruction_err_, tend)

gc.collect()
print("benchmarking nndsvdar-nmf: ")
@@ -111,7 +117,7 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
tend = time() - tstart
timeset['nndsvdar-nmf'].append(tend)
err['nndsvdar-nmf'].append(m.reconstruction_err_)
print(m.reconstruction_err_, tend)
report(m.reconstruction_err_, tend)

gc.collect()
print("benchmarking random-nmf")
@@ -121,7 +127,7 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
tend = time() - tstart
timeset['random-nmf'].append(tend)
err['random-nmf'].append(m.reconstruction_err_)
print(m.reconstruction_err_, tend)
report(m.reconstruction_err_, tend)

gc.collect()
print("benchmarking alt-random-nmf")
@@ -130,7 +136,7 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
tend = time() - tstart
timeset['alt-random-nmf'].append(tend)
err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
print(np.linalg.norm(X - np.dot(W, H)), tend)
report(norm(X - np.dot(W, H)), tend)

return timeset, err

@@ -142,10 +148,10 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):

samples_range = np.linspace(50, 500, 3).astype(np.int)
features_range = np.linspace(50, 500, 3).astype(np.int)
timeset, err = compute_bench(samples_range, features_range)
timeset, err = benchmark(samples_range, features_range)

for i, results in enumerate((timeset, err)):
fig = plt.figure('scikit-learn Non-Negative Matrix Factorization benchmkar results')
fig = plt.figure('scikit-learn Non-Negative Matrix Factorization benchmark results')
ax = fig.gca(projection='3d')
for c, (label, timings) in zip('rbgcm', sorted(results.iteritems())):
X, Y = np.meshgrid(samples_range, features_range)
5 changes: 5 additions & 0 deletions doc/conf.py
@@ -224,3 +224,8 @@
#latex_use_modindex = True

trim_doctests_flags = True

# Add the 'copybutton' javascript, to hide/show the prompt in code
# examples
def setup(app):
app.add_javascript('js/copybutton.js')
2 changes: 1 addition & 1 deletion doc/datasets/index.rst
@@ -35,7 +35,7 @@ fetched from mldata.org have more sophisticated structure.
These functions return a dictionary-like object holding at least two items:
an array of shape ``n_samples`` * ``n_features`` with key ``data``
(except for 20newsgroups)
and a NumPy array of length ``n_features``, containing the target values,
and a NumPy array of length ``n_samples``, containing the target values,
with key ``target``.
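
For instance, a quick sketch with one of the bundled toy loaders (here
``load_iris``, chosen purely for illustration)::

    from sklearn.datasets import load_iris

    iris = load_iris()
    print(iris.data.shape)    # (n_samples, n_features): (150, 4)
    print(iris.target.shape)  # (n_samples,): (150,)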

The datasets also contain a description in ``DESCR`` and some contain
Binary file removed doc/images/minBox.png
Binary file removed doc/images/minBoxHighlight.png
Binary file removed doc/images/noneBox.png
Binary file removed doc/images/plusBox.png
Binary file removed doc/images/plusBoxHighlight.png
5 changes: 4 additions & 1 deletion doc/modules/covariance.rst
@@ -36,7 +36,10 @@ The empirical covariance matrix of a sample can be computed using the
:class:`EmpiricalCovariance` object to the data sample with the
:meth:`EmpiricalCovariance.fit` method. Be careful that depending
whether the data are centered or not, the result will be different, so
one may want to use the `assume_centered` parameter accurately.
one may want to use the `assume_centered` parameter accurately. More precisely,
if one uses `assume_centered=False`, then the test set is supposed to have the
same mean vector as the training set. If this is not the case, both should be
centered by the user, and `assume_centered=True` should be used.
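
A minimal sketch of both cases, assuming synthetic data with a non-zero mean::

    import numpy as np
    from sklearn.covariance import EmpiricalCovariance

    rng = np.random.RandomState(0)
    X_train = rng.randn(500, 3) + 5.  # training data, mean vector ~(5, 5, 5)
    X_test = rng.randn(200, 3) + 5.   # test data with the same mean vector

    # assume_centered=False (default): the estimator centers the data itself,
    # so the test set is supposed to share the training mean vector.
    cov = EmpiricalCovariance(assume_centered=False).fit(X_train)

    # If the means may differ, center both sets yourself first.
    X_train_c = X_train - X_train.mean(axis=0)
    X_test_c = X_test - X_test.mean(axis=0)
    cov_c = EmpiricalCovariance(assume_centered=True).fit(X_train_c)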

.. topic:: Examples:

38 changes: 36 additions & 2 deletions doc/modules/cross_validation.rst
@@ -93,7 +93,7 @@ where the number of samples is very small.
Computing cross-validated metrics
=================================

The simplest way to use perform cross-validation in to call the
The simplest way to use cross-validation is to call the
:func:`cross_val_score` helper function on the estimator and the dataset.

The following example demonstrates how to estimate the accuracy of a linear
@@ -165,7 +165,7 @@ validation strategies.
K-fold
------

:class:`KFold` divides all the samples in math:`k` groups of samples,
:class:`KFold` divides all the samples in :math:`k` groups of samples,
called folds (if :math:`k = n`, this is equivalent to the *Leave One
Out* strategy), of equal sizes (if possible). The prediction function is
learned using :math:`k - 1` folds, and the fold left out is used for test.
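
A minimal sketch of the splitting behaviour, assuming the current
``model_selection`` interface (the module layout has changed since this
text was written)::

    from sklearn.model_selection import KFold

    X = ["a", "b", "c", "d"]
    for train_index, test_index in KFold(n_splits=2).split(X):
        # each fold is used exactly once as the test set
        print("train:", train_index, "test:", test_index)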
@@ -231,6 +231,40 @@ not waste much data as only one sample is removed from the learning set::
[0 1 2] [3]


Potential users of LOO for model selection should weigh a few known caveats.
When compared with *k*-fold cross validation, one builds *n* models from *n*
samples instead of *k* models, where *n > k*. Moreover, each model is trained
on *n - 1* samples rather than *(k-1)n / k*. For both reasons, assuming *k* is
not too large and *k < n*, LOO is more computationally expensive than *k*-fold
cross validation.

In terms of accuracy, LOO often results in high variance as an estimator for the
test error. Intuitively, since *n - 1* of
the *n* samples are used to build each model, models constructed from folds are
virtually identical to each other and to the model built from the entire training
set.

However, if the learning curve is steep for the training size in question,
then 5- or 10-fold cross validation can overestimate the generalization error.

As a general rule, most authors and empirical evidence suggest that 5- or
10-fold cross validation should be preferred to LOO.
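
A rough sketch of the cost difference described above, again assuming the
current ``model_selection`` interface::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score

    X, y = load_iris(return_X_y=True)
    clf = LogisticRegression(max_iter=1000)

    # 5-fold CV fits 5 models, each trained on 80% of the data.
    kfold_scores = cross_val_score(clf, X, y, cv=KFold(n_splits=5))

    # LOO fits n = 150 models, each trained on 149 samples; every per-split
    # score is a single 0/1 outcome, hence the high variance discussed above.
    loo_scores = cross_val_score(clf, X, y, cv=LeaveOneOut())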


.. topic:: References:

* http://www.faqs.org/faqs/ai-faq/neural-nets/part3/section-12.html
* T. Hastie, R. Tibshirani, J. Friedman, `The Elements of Statistical Learning
<http://www-stat.stanford.edu/~tibs/ElemStatLearn>`_, Springer 2009
* L. Breiman, P. Spector `Submodel selection and evaluation in regression: The X-random case
<http://digitalassets.lib.berkeley.edu/sdtr/ucb/text/197.pdf>`_, International Statistical Review 1992
* R. Kohavi, `A Study of Cross-Validation and Bootstrap for Accuracy Estimation and Model Selection
<http://www.cs.iastate.edu/~jtian/cs573/Papers/Kohavi-IJCAI-95.pdf>`_, Intl. Jnt. Conf. AI
* R. Bharat Rao, G. Fung, R. Rosales, `On the Dangers of Cross-Validation. An Experimental Evaluation
<http://www.siam.org/proceedings/datamining/2008/dm08_54_Rao.pdf>`_, SIAM 2008
  * G. James, D. Witten, T. Hastie, R. Tibshirani, `An Introduction to Statistical Learning
<http://www-bcf.usc.edu/~gareth/ISL>`_, Springer 2013


Leave-P-Out - LPO
-----------------

2 changes: 1 addition & 1 deletion doc/modules/feature_selection.rst
@@ -262,7 +262,7 @@ to use a :class:`sklearn.pipeline.Pipeline`::

In this snippet we make use of a :class:`sklearn.svm.LinearSVC`
to evaluate feature importances and select the most relevant features.
Then, a class:`sklearn.ensemble.GradientBoostingClassifier` is trained on the
Then, a :class:`sklearn.ensemble.RandomForestClassifier` is trained on the
transformed output, i.e. using only relevant features. You can, of course,
perform similar operations with the other feature selection methods and with
other classifiers that provide a way to evaluate feature importances.
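
One possible shape of such a pipeline, sketched here with the
``SelectFromModel`` meta-transformer (an assumption: the snippet above
predates it and used the selector's ``transform`` method directly)::

    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel
    from sklearn.pipeline import Pipeline
    from sklearn.svm import LinearSVC

    X, y = load_iris(return_X_y=True)
    clf = Pipeline([
        # the L1 penalty drives uninformative feature weights to zero
        ('feature_selection',
         SelectFromModel(LinearSVC(penalty="l1", dual=False))),
        # the classifier is then trained on the selected features only
        ('classification', RandomForestClassifier()),
    ])
    clf.fit(X, y)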
65 changes: 64 additions & 1 deletion doc/modules/linear_model.rst
@@ -270,7 +270,7 @@ Elastic Net
===========
:class:`ElasticNet` is a linear model trained with L1 and L2 prior as
regularizer. This combination allows for learning a sparse model where
few of the weights are non-zero like :class:`Lasso`, while still maintaining the
few of the weights are non-zero like :class:`Lasso`, while still maintaining
the regularization properties of :class:`Ridge`. We control this tradeoff
using the `l1_ratio` parameter.
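
For instance, a small sketch of the tradeoff on synthetic data where only a
few features carry signal (``l1_ratio=1.0`` corresponds to the Lasso penalty,
values near ``0.0`` approach the Ridge penalty)::

    import numpy as np
    from sklearn.linear_model import ElasticNet

    rng = np.random.RandomState(0)
    X = rng.randn(100, 20)
    y = X[:, 0] + 2 * X[:, 1] - X[:, 2] + 0.1 * rng.randn(100)

    for l1_ratio in (0.1, 0.5, 0.9):
        enet = ElasticNet(alpha=0.1, l1_ratio=l1_ratio).fit(X, y)
        # a higher l1_ratio yields a sparser coefficient vector
        print(l1_ratio, np.sum(enet.coef_ != 0))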

@@ -725,3 +725,66 @@ For classification, :class:`PassiveAggressiveClassifier` can be used with
<http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>`_
K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR 7 (2006)

Robustness to outliers: RANSAC
==============================

RANSAC (RANdom SAmple Consensus) is an iterative algorithm for the robust
estimation of the parameters of a mathematical model from a subset of inliers
in the complete data set.

RANSAC is a non-deterministic algorithm, producing a reasonable result only
with a certain probability that depends on the number of iterations (see the
`max_trials` parameter). It is typically used for linear and non-linear
regression problems and is especially popular in the field of photogrammetric
computer vision.

The algorithm splits the complete input sample data into a set of inliers,
which may be subject to noise, and outliers, which are e.g. caused by erroneous
measurements or invalid hypotheses about the data. The resulting model is then
estimated only from the determined inliers.

.. figure:: ../auto_examples/linear_model/images/plot_ransac_1.png
:target: ../auto_examples/linear_model/plot_ransac.html
:align: center
:scale: 50%

Each iteration performs the following steps:

1. Select `min_samples` random samples from the original data and check
whether the set of data is valid (see `is_data_valid`).
2. Fit a model to the random subset (`base_estimator.fit`) and check
whether the estimated model is valid (see `is_model_valid`).
3. Classify all data as inliers or outliers by calculating the residuals
to the estimated model (`base_estimator.predict(X) - y`) - all data
samples with absolute residuals smaller than the `residual_threshold`
are considered as inliers.
4. Save the fitted model as the best model if the number of inlier samples is
maximal. If the current estimated model has the same number of inliers as the
previous best model, it is accepted only if it has a better score.

These steps are performed either a maximum number of times (`max_trials`) or
until one of the special stop criteria is met (see `stop_n_inliers` and
`stop_score`). The final model is estimated using all inlier samples (consensus
set) of the previously determined best model.

The `is_data_valid` and `is_model_valid` functions allow one to identify and
reject degenerate combinations of random sub-samples. If the estimated model is
not needed for identifying degenerate cases, `is_data_valid` should be used, as
it is called prior to fitting the model and thus leads to better computational
performance.
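
A minimal usage sketch of the corresponding estimator (assuming the
:class:`RANSACRegressor` interface this section documents)::

    import numpy as np
    from sklearn.linear_model import LinearRegression, RANSACRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(0, 10, size=(200, 1))
    y = 3 * X.ravel() + 2 + rng.randn(200)
    y[:20] += 40 * rng.rand(20)  # corrupt a few samples with outliers

    ransac = RANSACRegressor(LinearRegression(), min_samples=2,
                             residual_threshold=5., max_trials=100,
                             random_state=0)
    ransac.fit(X, y)

    # mask of samples classified as inliers by the best model
    print(ransac.inlier_mask_.sum(), "inliers out of", len(y))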


.. topic:: Examples:

* :ref:`example_linear_model_plot_ransac.py`

.. topic:: References:

* http://en.wikipedia.org/wiki/RANSAC
* `"Random Sample Consensus: A Paradigm for Model Fitting with Applications to
Image Analysis and Automated Cartography"
<http://www.cs.columbia.edu/~belhumeur/courses/compPhoto/ransac.pdf>`_
Martin A. Fischler and Robert C. Bolles - SRI International (1981)
* `"Performance Evaluation of RANSAC Family"
<http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf>`_
Sunglok Choi, Taemin Kim and Wonpil Yu - BMVC (2009)
5 changes: 3 additions & 2 deletions doc/modules/naive_bayes.rst
@@ -93,8 +93,9 @@ are estimated using maximum likelihood.
>>> from sklearn.naive_bayes import GaussianNB
>>> gnb = GaussianNB()
>>> y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
>>> print("Number of mislabeled points : %d" % (iris.target != y_pred).sum())
Number of mislabeled points : 6
>>> print("Number of mislabeled points out of a total %d points : %d"
... % (iris.data.shape[0],(iris.target != y_pred).sum()))
Number of mislabeled points out of a total 150 points : 6

.. _multinomial_naive_bayes:

2 changes: 1 addition & 1 deletion doc/modules/sgd.rst
@@ -159,8 +159,8 @@ further information.

- :ref:`example_linear_model_plot_sgd_separating_hyperplane.py`,
- :ref:`example_linear_model_plot_sgd_iris.py`
- :ref:`example_linear_model_plot_sgd_weighted_classes.py`
- :ref:`example_linear_model_plot_sgd_weighted_samples.py`
- :ref:`example_svm_plot_separating_hyperplane_unbalanced.py` (See the `Note`)

Regression
==========
2 changes: 2 additions & 0 deletions doc/sphinxext/gen_rst.py
@@ -635,6 +635,8 @@ def generate_dir_rst(dir, fhindex, example_dir, root_dir, plot_gallery):
os.makedirs(target_dir)
sorted_listdir = line_count_sort(os.listdir(src_dir),
src_dir)
if not os.path.exists(os.path.join(dir, 'images', 'thumb')):
os.makedirs(os.path.join(dir, 'images', 'thumb'))
for fname in sorted_listdir:
if fname.endswith('py'):
generate_file_rst(fname, target_dir, src_dir, root_dir, plot_gallery)
Binary file added doc/testimonials/images/change-logo.png
