FIX: make the dataset doctest fixture modular
ogrisel committed Jun 14, 2011
1 parent b730d3a commit 159bcb1
Showing 10 changed files with 425 additions and 389 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -12,6 +12,7 @@ dist/
doc/_build/
doc/auto_examples/
doc/modules/generated/
doc/datasets/generated/
pip-log.txt
scikits.learn.egg-info/
.coverage
2 changes: 1 addition & 1 deletion doc/contents.rst
@@ -11,6 +11,6 @@
supervised_learning.rst
unsupervised_learning.rst
model_selection.rst
Dataset loading utilities <modules/datasets.rst>
Dataset loading utilities <datasets/index.rst>
Preprocessing data <modules/preprocessing.rst>
Class Reference <modules/classes.rst>
99 changes: 99 additions & 0 deletions doc/datasets/index.rst
@@ -0,0 +1,99 @@
..
For doctests:
>>> import numpy as np
>>> import os
>>> from scikits.learn import datasets
>>> datasets.mldata.urllib2 = mock_urllib2

.. _datasets:

=========================
Dataset loading utilities
=========================

.. currentmodule:: scikits.learn.datasets

The ``scikits.learn.datasets`` package embeds some small toy datasets
as introduced in the "Getting Started" section.

To evaluate the impact of the scale of the dataset (``n_samples`` and
``n_features``) while controlling the statistical properties of the data
(typically the correlation and informativeness of the features), it is
also possible to generate synthetic data.

This package also features helpers to fetch larger datasets commonly
used by the machine learning community to benchmark algorithms on data
that comes from the 'real world'.


Datasets shipped with scikit-learn
========================================

scikit-learn comes with a few small standard datasets that do not
require downloading any file from an external website.

.. autosummary::
   :toctree: generated/
   :template: function.rst

   load_iris
   load_diabetes
   load_digits
   load_linnerud

These datasets are useful to quickly illustrate the behavior of the
various algorithms implemented in the scikit. They are however often too
small to be representative of real world machine learning tasks.
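
For instance, the iris dataset can be loaded and inspected in a couple of
lines (a minimal sketch of the common access pattern)::

>>> from scikits.learn.datasets import load_iris
>>> iris = load_iris()
>>> iris.data.shape
(150, 4)
>>> iris.target.shape
(150,)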


Datasets in svmlight / libsvm format
====================================

scikit-learn includes a fast utility function, ``load_svmlight_format``, to load
datasets in the svmlight / libsvm format. In this format, each line
takes the form ``<label> <feature-id>:<feature-value>
<feature-id>:<feature-value> ...``. This format is especially suitable for sparse datasets.
Scipy sparse CSR matrices are used for ``X`` and numpy arrays are used for ``y``.

You may load a dataset like this::

>>> from scikits.learn.datasets import load_svmlight_format
>>> X_train, y_train = load_svmlight_format("/path/to/train_dataset.txt")
... # doctest: +SKIP

You may also load two datasets at once::

>>> X_train, y_train, X_test, y_test = load_svmlight_format(
... "/path/to/train_dataset.txt",
... "/path/to/test_dataset.txt") # doctest: +SKIP

In this case, ``X_train`` and ``X_test`` are guaranteed to have the same number
of features. Another way to achieve the same result is to fix the number of
features::

>>> X_test, y_test = load_svmlight_format(
... "/path/to/test_dataset.txt", n_features=X_train.shape[1])
... # doctest: +SKIP
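
As a quick way to experiment with the format, you can also write a couple
of lines of svmlight syntax to a temporary file by hand and load them back
(a minimal sketch; the labels and feature values below are arbitrary)::

>>> import tempfile
>>> tmp = tempfile.NamedTemporaryFile(suffix=".txt", delete=False)  # doctest: +SKIP
>>> tmp.write("1 1:0.5 3:1.2\n-1 2:0.1 3:2.0\n")  # doctest: +SKIP
>>> tmp.close()  # doctest: +SKIP
>>> X, y = load_svmlight_format(tmp.name)  # doctest: +SKIP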

.. topic:: Public datasets:

   `Public datasets in svmlight / libsvm format <http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/>`_


.. include:: mldata.rst

.. include:: twenty_newsgroups.rst

.. include:: labeled_faces.rst

.. todo::

Dataset generators
==================

Please write some narrative documentation on how to best use the most common
utility functions from the ``samples_generator`` module.
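
In the meantime, here is a minimal numpy-only sketch of generating a toy
classification problem with a few informative features and some pure noise
features (it does not use the ``samples_generator`` helpers themselves)::

>>> rng = np.random.RandomState(0)
>>> n_samples, n_informative, n_noise = 100, 2, 8
>>> informative = rng.randn(n_samples, n_informative)
>>> noise = rng.randn(n_samples, n_noise)  # uncorrelated with the target
>>> X = np.hstack([informative, noise])
>>> y = (informative.sum(axis=1) > 0).astype(int)  # depends only on the informative part
>>> X.shape, y.shape
((100, 10), (100,))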


113 changes: 113 additions & 0 deletions doc/datasets/labeled_faces.rst
@@ -0,0 +1,113 @@
.. _labeled_faces_in_the_wild:

The Labeled Faces in the Wild face recognition dataset
======================================================

This dataset is a collection of JPEG pictures of famous people collected
over the internet; all details are available on the official website:

http://vis-www.cs.umass.edu/lfw/

Each picture is centered on a single face. The typical task is called
Face Verification: given a pair of pictures, a binary classifier
must predict whether the two images are from the same person.

An alternative task, Face Recognition or Face Identification, is:
given the picture of the face of an unknown person, identify the name
of the person by referring to a gallery of previously seen pictures of
identified persons.

Both Face Verification and Face Recognition are tasks that are typically
performed on the output of a model trained to perform Face Detection. The
most popular model for Face Detection is called Viola-Jones and is
implemented in the OpenCV library. The LFW faces were extracted by this
face detector from various online websites.


Usage
-----

``scikit-learn`` provides two loaders that will automatically download,
cache, parse the metadata files, decode the JPEG files and convert the
interesting slices into memmapped numpy arrays. The dataset is more than
200 MB in size. The first load typically takes more than a couple of minutes
to fully decode the relevant part of the JPEG files into numpy arrays. Once
the dataset has been loaded, subsequent loads take less than 200ms by using
a memmapped version memoized on disk in the ``~/scikit_learn_data/lfw_home/``
folder using ``joblib``.

The first loader is used for the Face Identification task: a multi-class
classification task (hence supervised learning)::

>>> from scikits.learn.datasets import fetch_lfw_people
>>> lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

>>> for name in lfw_people.target_names:
... print name
...
Ariel Sharon
Colin Powell
Donald Rumsfeld
George W Bush
Gerhard Schroeder
Hugo Chavez
Tony Blair

The default slice is a rectangular shape around the face, removing
most of the background::

>>> lfw_people.data.dtype
dtype('float32')

>>> lfw_people.data.shape
(1288, 50, 37)

Each of the ``1288`` faces is assigned to a single person id in the ``target``
array::

>>> lfw_people.target.shape
(1288,)

>>> list(lfw_people.target[:10])
[5, 6, 3, 1, 0, 1, 3, 4, 3, 0]
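
These integer ids index into the ``target_names`` array shown above, so the
name of the person on a given picture can be recovered directly (a minimal
sketch, only runnable once the dataset has been downloaded and cached)::

>>> print lfw_people.target_names[lfw_people.target[0]]
Hugo Chavez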

The second loader is typically used for the face verification task: each sample
is a pair of two pictures that may or may not belong to the same person::

>>> from scikits.learn.datasets import fetch_lfw_pairs
>>> lfw_pairs_train = fetch_lfw_pairs(subset='train')

>>> list(lfw_pairs_train.target_names)
['Different persons', 'Same person']

>>> lfw_pairs_train.data.shape
(2200, 2, 62, 47)

>>> lfw_pairs_train.target.shape
(2200,)

For both the ``fetch_lfw_people`` and ``fetch_lfw_pairs`` functions it is
possible to get an additional dimension with the RGB color channels by
passing ``color=True``; in that case the shape will be
``(2200, 2, 62, 47, 3)``.

The ``fetch_lfw_pairs`` dataset is subdivided into 3 subsets: the development
``train`` set, the development ``test`` set and an evaluation ``10_folds``
set meant to compute performance metrics using a 10-fold cross-validation
scheme.
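
Each of these subsets can be selected with the same ``subset`` keyword used
above (a minimal sketch, assuming the ``test`` and ``10_folds`` names listed
in the previous paragraph are accepted values of that keyword)::

>>> lfw_pairs_test = fetch_lfw_pairs(subset='test')  # doctest: +SKIP
>>> lfw_pairs_10folds = fetch_lfw_pairs(subset='10_folds')  # doctest: +SKIP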

.. topic:: References:

* `Labeled Faces in the Wild: A Database for Studying Face Recognition
in Unconstrained Environments.
<http://vis-www.cs.umass.edu/lfw/lfw.pdf>`_
Gary B. Huang, Manu Ramesh, Tamara Berg, and Erik Learned-Miller.
University of Massachusetts, Amherst, Technical Report 07-49, October, 2007.


Examples
--------

:ref:`example_applications_face_recognition.py`


15 changes: 15 additions & 0 deletions doc/datasets/labeled_faces_fixture.py
@@ -0,0 +1,15 @@
"""Fixture module to skip the datasets loading when offline
Doctests are skipped if the datasets have not already been downloaded
and cached in the past.
"""
from os.path import exists
from os.path import join
from nose import SkipTest
from scikits.learn.datasets import get_data_home


def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")
64 changes: 64 additions & 0 deletions doc/datasets/mldata.rst
@@ -0,0 +1,64 @@
Downloading datasets from the mldata.org repository
===================================================

`mldata.org <http://mldata.org>`_ is a public repository for machine learning
data, supported by the `PASCAL network <http://www.pascal-network.org>`_ .

The ``scikits.learn.datasets`` package is able to directly download data
sets from the repository using the function ``fetch_mldata(dataname)``.

For example, to download the MNIST digit recognition database::

>>> from scikits.learn.datasets import fetch_mldata
>>> mnist = fetch_mldata('MNIST original', data_home=custom_data_home)

The MNIST database contains a total of 70000 examples of handwritten digits
of size 28x28 pixels, labeled from 0 to 9::

>>> mnist.data.shape
(70000, 784)
>>> mnist.target.shape
(70000,)
>>> np.unique(mnist.target)
array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

After the first download, the dataset is cached locally in the path
specified by the ``data_home`` keyword argument, which defaults to
``~/scikit_learn_data/``::

>>> os.listdir(os.path.join(custom_data_home, 'mldata'))
['mnist-original.mat']

Data sets in `mldata.org <http://mldata.org>`_ do not adhere to a strict
naming or formatting convention. ``fetch_mldata`` is able to make sense
of the most common cases, but allows tailoring the defaults to individual
datasets:

* The data arrays in `mldata.org <http://mldata.org>`_ are most often
shaped as ``(n_features, n_samples)``. This is the opposite of the
``scikits.learn`` convention, so ``fetch_mldata`` transposes the matrix
by default. The ``transpose_data`` keyword controls this behavior::

>>> iris = fetch_mldata('iris', data_home=custom_data_home)
>>> iris.data.shape
(150, 4)
>>> iris = fetch_mldata('iris', transpose_data=False,
... data_home=custom_data_home)
>>> iris.data.shape
(4, 150)

* For datasets with multiple columns, ``fetch_mldata`` tries to identify
the target and data columns and rename them to ``target`` and ``data``.
This is done by looking for arrays named ``label`` and ``data`` in the
dataset, and failing that by choosing the first array to be ``target``
and the second to be ``data``. This behavior can be changed with the
``target_name`` and ``data_name`` keywords, setting them to a specific
name or index number (the name and order of the columns in the dataset
can be found on its `mldata.org <http://mldata.org>`_ page under the tab "Data")::

>>> iris2 = fetch_mldata('datasets-UCI iris', target_name=1, data_name=0,
... data_home=custom_data_home)
>>> iris3 = fetch_mldata('datasets-UCI iris', target_name='class',
... data_name='double0', data_home=custom_data_home)


17 changes: 4 additions & 13 deletions doc/modules/datasets_fixture.py → doc/datasets/mldata_fixture.py
@@ -1,20 +1,18 @@
"""Fixture module to skip the datasets loading when offline
Doctests are skipped if the datasets have not already been dowloaded
and cached in the past.
Mock urllib2 access to mldata.org
"""
from os.path import exists
from os.path import join

from os import makedirs
from nose import SkipTest
from os.path import join
from scikits.learn import datasets
from scikits.learn.datasets import get_data_home
from scikits.learn.utils.testing import mock_urllib2
import tempfile
import scipy as sp
import shutil



def globs(globs):
    # setup mock urllib2 module to avoid downloading from mldata.org
    mock_datasets = {
@@ -43,13 +41,6 @@ def globs(globs):
    return globs


def setup_module(module):
    data_home = get_data_home()
    if (not exists(join(data_home, 'lfw_home'))
            or not exists(join(data_home, '20news_home'))):
        raise SkipTest("Skipping dataset loading doctests")


def teardown_module(module):
    datasets.mldata.urllib2 = _urllib2_ref
    shutil.rmtree(custom_data_home)
