Skip to content

Commit ad40483

Browse files
NelleVGaelVaroquaux
authored andcommitted
DOC/TEST improved doc and tests on the paired distances
1 parent eb54dc3 commit ad40483

File tree

2 files changed

+134
-10
lines changed

2 files changed

+134
-10
lines changed

sklearn/metrics/pairwise.py

+85-10
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# -*- coding: utf-8 -*-
22
"""
33
The :mod:`sklearn.metrics.pairwise` submodule implements utilities to evaluate
4-
pairwise distances or affinity of sets of samples.
4+
pairwise distances, paired distances or affinity of sets of samples.
55
66
This module contains both distance metrics and kernels. A brief summary is
77
given on the two here.
@@ -101,6 +101,44 @@ def check_pairwise_arrays(X, Y):
101101
return X, Y
102102

103103

104+
def check_paired_arrays(X, Y):
105+
""" Set X and Y appropriately and check inputs for paired distances
106+
107+
All paired distance metrics should use this function first to assert that
108+
the given parameters are correct and safe to use.
109+
110+
Specifically, this function first ensures that both X and Y are arrays,
111+
then checks that they are at least two dimensional while ensuring that
112+
their elements are floats. Finally, the function checks that the sizes
113+
of the dimensions of the two arrays are equal.
114+
115+
Parameters
116+
----------
117+
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
118+
119+
Y : {array-like, sparse matrix}, shape = [n_samples, n_features]
120+
121+
Returns
122+
-------
123+
safe_X : {array-like, sparse matrix}, shape = [n_samples, n_features]
124+
An array equal to X, guaranteed to be a numpy array.
125+
126+
safe_Y : {array-like, sparse matrix}, shape = [n_samples, n_features]
127+
An array equal to Y if Y was not None, guaranteed to be a numpy array.
128+
If Y was None, safe_Y will be a pointer to X.
129+
130+
"""
131+
X, Y = check_pairwise_arrays(X, Y)
132+
if X.shape != Y.shape:
133+
raise ValueError("X and Y should be of same shape. They were "
134+
"respectively (%d, %d) and (%d, %d) long." % (
135+
X.shape[0],
136+
X.shape[1],
137+
Y.shape[0],
138+
Y.shape[1]))
139+
return X, Y
140+
141+
104142
# Pairwise distances
105143
def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
106144
"""
@@ -146,6 +184,10 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
146184
>>> euclidean_distances(X, [[0, 0]])
147185
array([[ 1. ],
148186
[ 1.41421356]])
187+
188+
See also
189+
--------
190+
paired_distances : distances between pairs of elements of X and Y.
149191
"""
150192
# should not need X_norm_squared because if you could precompute that as
151193
# well as Y, then you should just pre-compute the output and not even
@@ -305,7 +347,7 @@ def cosine_distances(X, Y=None):
305347
# Paired distances
306348
def paired_euclidean_distances(X, Y):
307349
"""
308-
Computes the paired distances between X and Y
350+
Computes the paired euclidean distances between X and Y
309351
310352
Parameters
311353
----------
@@ -315,19 +357,27 @@ def paired_euclidean_distances(X, Y):
315357
316358
Returns
317359
-------
318-
Distances: ndarray (n_samples, )
360+
distances : ndarray (n_samples, )
319361
"""
320-
if len(X) != len(Y):
321-
raise ValueError("X and Y should be of same size. They were "
322-
"respectively %d and %d long." % (len(X), len(Y)))
323-
X, Y = check_pairwise_arrays(X, Y)
362+
X, Y = check_paired_arrays(X, Y)
363+
324364
return np.sqrt(((X - Y) ** 2).sum(axis=-1))
325365

326366

327367
def paired_manhattan_distances(X, Y):
328-
""" Compute the L1 distances between the vectors in X and Y.
368+
"""Compute the L1 distances between the vectors in X and Y.
369+
370+
Parameters
371+
----------
372+
X : array-like, shape = [n_samples, n_features]
373+
374+
Y : array-like, shape = [n_samples, n_features]
375+
376+
Returns
377+
-------
378+
distances : ndarray (n_samples, )
329379
"""
330-
X, Y = check_pairwise_arrays(X, Y)
380+
X, Y = check_paired_arrays(X, Y)
331381
return np.abs(X - Y).sum(axis=-1)
332382

333383

@@ -340,24 +390,49 @@ def paired_manhattan_distances(X, Y):
340390
}
341391

342392

343-
def paired_distances(X, Y, metric="euclidean"):
393+
def paired_distances(X, Y, metric="euclidean", **kwds):
344394
"""
395+
Computes the paired distances between X and Y.
396+
397+
Computes the distances between (X[0], Y[0]), (X[1], Y[1]), etc...
345398
346399
Parameters
347400
----------
348401
X, Y : ndarray (n_samples, n_features)
349402
350403
metric : string or callable
404+
XXX
405+
The metric to use when calculating distance between instances in a
406+
feature array. If metric is a string, it must be one of the options
407+
specified in PAIRED_DISTANCES.
408+
Alternatively, if metric is a callable function, it is called on each
409+
pair of instances (rows) and the resulting value recorded. The callable
410+
should take a row of X and a row of Y as input and return a value indicating
411+
the distance between them.
351412
352413
Returns
353414
-------
354415
distances : ndarray (n_samples, )
416+
417+
Examples
418+
--------
419+
>>> from sklearn.metrics.pairwise import paired_distances
420+
>>> X = [[0, 1], [1, 1]]
421+
>>> Y = [[0, 1], [2, 1]]
422+
>>> paired_distances(X, Y)
423+
array([ 0., 1.])
424+
425+
See also
426+
--------
427+
pairwise_distances : pairwise distances.
355428
"""
356429

357430
if metric in PAIRED_DISTANCES:
358431
func = PAIRED_DISTANCES[metric]
359432
return func(X, Y)
360433
elif callable(metric):
434+
# Check the matrix first (it is usually done by the metric)
435+
X, Y = check_paired_arrays(X, Y)
361436
distances = np.zeros(len(X))
362437
for i in range(len(X)):
363438
distances[i] = metric(X[i], Y[i])

sklearn/metrics/tests/test_pairwise.py

+49
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,13 @@
2424
from sklearn.metrics.pairwise import pairwise_distances
2525
from sklearn.metrics.pairwise import pairwise_kernels
2626
from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS
27+
from sklearn.metrics.pairwise import PAIRED_DISTANCES
2728
from sklearn.metrics.pairwise import check_pairwise_arrays
29+
from sklearn.metrics.pairwise import check_paired_arrays
2830
from sklearn.metrics.pairwise import _parallel_pairwise
31+
from sklearn.metrics.pairwise import paired_distances
32+
from sklearn.metrics.pairwise import paired_euclidean_distances
33+
from sklearn.metrics.pairwise import paired_manhattan_distances
2934
from sklearn.preprocessing import normalize
3035

3136

@@ -187,6 +192,24 @@ def test_pairwise_kernels_filter_param():
187192
assert_raises(TypeError, pairwise_kernels, X, Y, "rbf", **params)
188193

189194

195+
def test_paired_distances():
196+
""" Test the paired_distances helper function. """
197+
rng = np.random.RandomState(0)
198+
# Euclidean distance should be equivalent to calling the function.
199+
X = rng.random_sample((5, 4))
200+
# Euclidean distance, with Y != X.
201+
Y = rng.random_sample((5, 4))
202+
for metric, func in PAIRED_DISTANCES.iteritems():
203+
S = paired_distances(X, Y, metric=metric)
204+
S2 = func(X, Y)
205+
assert_array_almost_equal(S, S2)
206+
207+
# Test that a ValueError is raised when the lengths of X and Y
208+
# differ
209+
Y = rng.random_sample((3, 4))
210+
assert_raises(ValueError, paired_distances, X, Y)
211+
212+
190213
def test_euclidean_distances():
191214
""" Check the pairwise Euclidean distances computation"""
192215
X = [[0]]
@@ -200,6 +223,24 @@ def test_euclidean_distances():
200223
assert_array_almost_equal(D, [[1., 2.]])
201224

202225

226+
# Paired distances
227+
228+
def test_paired_euclidean_distances():
229+
""" Check the paired Euclidean distances computation"""
230+
X = [[0], [0]]
231+
Y = [[1], [2]]
232+
D = paired_euclidean_distances(X, Y)
233+
assert_array_almost_equal(D, [1., 2.])
234+
235+
236+
def test_paired_manhattan_distances():
237+
""" Check the paired manhattan distances computation"""
238+
X = [[0], [0]]
239+
Y = [[1], [2]]
240+
D = paired_manhattan_distances(X, Y)
241+
assert_array_almost_equal(D, [1., 2.])
242+
243+
203244
def test_chi_square_kernel():
204245
rng = np.random.RandomState(0)
205246
X = rng.random_sample((5, 4))
@@ -332,13 +373,21 @@ def test_check_XB_returned():
332373
assert_array_equal(XA, XA_checked)
333374
assert_array_equal(XB, XB_checked)
334375

376+
XB = np.resize(np.arange(40), (5, 8))
377+
XA_checked, XB_checked = check_paired_arrays(XA, XB)
378+
assert_array_equal(XA, XA_checked)
379+
assert_array_equal(XB, XB_checked)
380+
335381

336382
def test_check_different_dimensions():
337383
""" Ensure an error is raised if the dimensions are different. """
338384
XA = np.resize(np.arange(45), (5, 9))
339385
XB = np.resize(np.arange(32), (4, 8))
340386
assert_raises(ValueError, check_pairwise_arrays, XA, XB)
341387

388+
XB = np.resize(np.arange(4 * 9), (4, 9))
389+
assert_raises(ValueError, check_paired_arrays, XA, XB)
390+
342391

343392
def test_check_invalid_dimensions():
344393
""" Ensure an error is raised on 1D input arrays. """

0 commit comments

Comments
 (0)