From ad9551a1d7b980595d5b6ada57cba8f86de33a5a Mon Sep 17 00:00:00 2001
From: jeanpauphilet
Date: Wed, 5 Nov 2014 14:11:56 +0100
Subject: [PATCH 1/8] ENH: Add new init method for k-means

- implement kmeans++ init method (_kpp) proposed by D. Arthur and S. Vassilvitskii
- remove references to unsupported init method ('uniform')
---
 scipy/cluster/tests/test_vq.py | 11 +++++--
 scipy/cluster/vq.py            | 58 ++++++++++++++++++++++++++++++++--
 2 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/scipy/cluster/tests/test_vq.py b/scipy/cluster/tests/test_vq.py
index abffc55bc1f2..6e601d6e0d3a 100644
--- a/scipy/cluster/tests/test_vq.py
+++ b/scipy/cluster/tests/test_vq.py
@@ -250,8 +250,15 @@ def test_kmeans2_init(self):
         kmeans2(data, 3, minit='points')
         kmeans2(data[:, :1], 3, minit='points')  # special case (1-D)
 
-        kmeans2(data, 3, minit='random')
-        kmeans2(data[:, :1], 3, minit='random')  # special case (1-D)
+        kmeans2(data, 3, minit='++')
+        kmeans2(data[:, :1], 3, minit='++')  # special case (1-D)
+
+        # minit='random' can give warnings, filter those
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore',
+                        message="One of the clusters is empty. Re-run")
+            kmeans2(data, 3, minit='random')
+            kmeans2(data[:, :1], 3, minit='random')  # special case (1-D)
 
     @pytest.mark.skipif(sys.platform == 'win32', reason='Fails with MemoryError in Wine.')
     def test_krandinit(self):
diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py
index 27b1db7f26fc..2d02362e6ba5 100644
--- a/scipy/cluster/vq.py
+++ b/scipy/cluster/vq.py
@@ -28,10 +28,10 @@
 any other centroids. If v belongs to i, we say centroid i is the
 dominating centroid of v. The k-means algorithm tries to minimize
 distortion, which is defined as the sum of the squared distances
-between each observation vector and its dominating centroid. 
+between each observation vector and its dominating centroid.
 The minimization is achieved by iteratively reclassifying the
 observations into clusters and recalculating the centroids until
-a configuration is reached in which the centroids are stable. One can 
+a configuration is reached in which the centroids are stable. One can
 also define a maximum number of iterations.
 
 Since vector quantization is a natural application for k-means,
@@ -473,6 +473,11 @@ def _kpoints(data, k):
         row is one observation.
     k : int
         Number of samples to generate.
+
+    Returns
+    -------
+    x : ndarray
+        A 'k' by 'N' array containing the initial centroids
 
     """
     idx = np.random.choice(data.shape[0], size=k, replace=False)
@@ -493,6 +498,11 @@ def _krandinit(data, k):
         row is one observation.
     k : int
         Number of samples to generate.
+
+    Returns
+    -------
+    x : ndarray
+        A 'k' by 'N' array containing the initial centroids
 
     """
     mu = data.mean(axis=0)
@@ -519,7 +529,45 @@ def _krandinit(data, k):
     return x
 
 
-_valid_init_meth = {'random': _krandinit, 'points': _kpoints}
+def _kpp(data, k):
+    """ Picks k points in data based on the kmeans++ method
+
+    Parameters
+    ----------
+    data : ndarray
+        Expect a rank 1 or 2 array. Rank 1 are assumed to describe one
+        dimensional data, rank 2 multidimensional data, in which case one
+        row is one observation.
+    k : int
+        Number of samples to generate.
+
+    Returns
+    -------
+    init : ndarray
+        A 'k' by 'N' array containing the initial centroids
+
+    Reference
+    ---------
+    D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding"
+
+    """
+
+    init = np.ndarray((k, data.shape[1]))
+
+    for i in range(k):
+        if i == 0:
+            init[i] = data[randint(len(data))]
+
+        else:
+            D2 = np.array([min([np.inner(init[j]-x, init[j]-x) for j in range(i)]) for x in data])
+            probs = D2/sum(D2)
+            cumprobs = probs.cumsum()
+            r = np.random.rand()
+            init[i] = data[np.searchsorted(cumprobs, r)]
+
+    return init
+
+_valid_init_meth = {'random': _krandinit, 'points': _kpoints, '++': _kpp}
 
 
 def _missing_warn():
@@ -565,6 +613,7 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random',
     minit : str, optional
         Method for initialization. Available methods are 'random',
         'points', and 'matrix':
+        'points', '++' and 'matrix':
 
         'random': generate k centroids from a Gaussian with mean and
         variance estimated from the data.
@@ -572,6 +621,9 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random',
         'points': choose k observations (rows) at random from data for
         the initial centroids.
 
+        '++': choose k observations according to the kmeans++ method 
+        (careful seeding)
+
         'matrix': interpret the k parameter as a k by M (or length k
         array for one-dimensional data) array of initial centroids.
     missing : str, optional
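Once this first patch is applied, the new option can be exercised directly through kmeans2. A minimal usage sketch; the data and cluster count below are illustrative, not taken from the patch or the test suite:

    import numpy as np
    from scipy.cluster.vq import kmeans2

    pts = np.random.rand(100, 2)        # 100 illustrative 2-D observations
    # minit='++' seeds the initial centroids with the kmeans++ procedure
    centroid, label = kmeans2(pts, 3, minit='++')
    print(centroid.shape, label.shape)  # (3, 2) (100,)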
From a9c21d2793cc1fb9d93308ea92ab17df0a1d307f Mon Sep 17 00:00:00 2001
From: jeanpauphilet
Date: Tue, 17 Oct 2017 15:17:28 -0400
Subject: [PATCH 2/8] Update vq.py

---
 scipy/cluster/vq.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py
index 2d02362e6ba5..efa47a73c303 100644
--- a/scipy/cluster/vq.py
+++ b/scipy/cluster/vq.py
@@ -546,9 +546,10 @@ def _kpp(data, k):
     init : ndarray
         A 'k' by 'N' array containing the initial centroids
 
-    Reference
-    ---------
-    D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding"
+    References
+    ----------
+    D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding",
+    Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007.
 
     """
 
@@ -556,14 +557,14 @@ def _kpp(data, k):
 
     for i in range(k):
         if i == 0:
-            init[i] = data[randint(len(data))]
+            init[i, :] = data[randint(data.shape[1])]
 
         else:
             D2 = np.array([min([np.inner(init[j]-x, init[j]-x) for j in range(i)]) for x in data])
-            probs = D2/sum(D2)
+            probs = D2/D2.sum()
             cumprobs = probs.cumsum()
             r = np.random.rand()
-            init[i] = data[np.searchsorted(cumprobs, r)]
+            init[i, :] = data[np.searchsorted(cumprobs, r)]
 
     return init
 
@@ -621,7 +622,7 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random',
         'points': choose k observations (rows) at random from data for
         the initial centroids.
 
-        '++': choose k observations according to the kmeans++ method 
+        '++': choose k observations according to the kmeans++ method
         (careful seeding)
 
         'matrix': interpret the k parameter as a k by M (or length k
@@ -648,6 +649,11 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random',
         label[i] is the code or index of the centroid the
         i'th observation is closest to.
 
+    References
+    ----------
+    D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding",
+    Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007.
+
     """
     if int(iter) < 1:
         raise ValueError("Invalid iter (%s), "
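A note on the seeding loop this patch touches: the D2 line computes, for every observation, the squared distance to its nearest already-chosen centroid, using nested Python-level comprehensions (roughly n*i inner products per seeding step). Not part of this series, but an equivalent vectorized formulation is sketched below for comparison; d2_vectorized is a hypothetical helper name:

    import numpy as np
    from scipy.spatial.distance import cdist

    def d2_vectorized(init, i, data):
        # Squared Euclidean distance from each observation to the nearest
        # of the first i centroids; equivalent to the nested-comprehension D2.
        return cdist(init[:i], data, metric='sqeuclidean').min(axis=0)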
From 95d7d1959c779cd4d641d14980a6909617befd98 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Sun, 15 Jul 2018 12:15:29 -0500
Subject: [PATCH 3/8] Fix final comments

---
 scipy/cluster/vq.py | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py
index efa47a73c303..664616752a3a 100644
--- a/scipy/cluster/vq.py
+++ b/scipy/cluster/vq.py
@@ -323,10 +323,10 @@ def kmeans(obs, k_or_guess, iter=20, thresh=1e-5, check_finite=True):
 
     The k-means algorithm adjusts the classification of the observations
     into clusters and updates the cluster centroids until the position of
-    the centroids is stable over successive iterations. In this 
-    implementation of the algorithm, the stability of the centroids is 
-    determined by comparing the absolute value of the change in the average 
-    Euclidean distance between the observations and their corresponding 
+    the centroids is stable over successive iterations. In this
+    implementation of the algorithm, the stability of the centroids is
+    determined by comparing the absolute value of the change in the average
+    Euclidean distance between the observations and their corresponding
     centroids against a threshold. This yields a code book mapping
     centroids to codes and vice versa.
 
@@ -373,9 +373,9 @@ def kmeans(obs, k_or_guess, iter=20, thresh=1e-5, check_finite=True):
         not necessarily the globally minimal distortion.
 
     distortion : float
-        The mean (non-squared) Euclidean distance between the observations 
+        The mean (non-squared) Euclidean distance between the observations
         passed and the centroids generated. Note the difference to the standard
-        definition of distortion in the context of the K-means algorithm, which 
+        definition of distortion in the context of the K-means algorithm, which
         is the sum of the squared distances.
 
     See Also
@@ -473,7 +473,7 @@ def _kpoints(data, k):
         row is one observation.
     k : int
         Number of samples to generate.
-    
+
     Returns
     -------
     x : ndarray
@@ -498,7 +498,7 @@ def _krandinit(data, k):
         row is one observation.
     k : int
         Number of samples to generate.
-    
+
     Returns
     -------
     x : ndarray
@@ -531,7 +531,7 @@ def _krandinit(data, k):
 
 def _kpp(data, k):
     """ Picks k points in data based on the kmeans++ method
-    
+
     Parameters
     ----------
     data : ndarray
@@ -540,34 +540,37 @@ def _kpp(data, k):
         row is one observation.
     k : int
         Number of samples to generate.
-    
+
     Returns
     -------
     init : ndarray
         A 'k' by 'N' array containing the initial centroids
-    
+
     References
     ----------
     D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding",
     Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007.
-    
+
     """
 
-    init = np.ndarray((k, data.shape[1]))
-    
+    dims = data.shape[1] if len(data.shape) > 1 else 1
+    init = np.ndarray((k, dims))
+
     for i in range(k):
         if i == 0:
-            init[i, :] = data[randint(data.shape[1])]
-
+            init[i, :] = data[np.random.randint(data.shape[0])]
+
         else:
-            D2 = np.array([min([np.inner(init[j]-x, init[j]-x) for j in range(i)]) for x in data])
+            D2 = np.array([min(
+                [np.inner(init[j]-x, init[j]-x) for j in range(i)]
+                ) for x in data])
             probs = D2/D2.sum()
             cumprobs = probs.cumsum()
             r = np.random.rand()
             init[i, :] = data[np.searchsorted(cumprobs, r)]
 
     return init
-    
+
 _valid_init_meth = {'random': _krandinit, 'points': _kpoints, '++': _kpp}
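The cumprobs/searchsorted pair in _kpp is inverse-CDF sampling: index i is drawn with probability probs[i], which is what gives far-away points a proportionally higher chance of becoming the next centroid. A standalone sketch with made-up weights:

    import numpy as np

    probs = np.array([0.1, 0.4, 0.2, 0.3])  # stands in for D2/D2.sum()
    cumprobs = probs.cumsum()                # [0.1, 0.5, 0.7, 1.0]
    r = np.random.rand()                     # uniform draw in [0, 1)
    idx = np.searchsorted(cumprobs, r)       # P(idx == i) equals probs[i]
    # np.random.choice(len(probs), p=probs) draws from the same distribution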
From 816c2a0ebacf3947dc283546deb36928315ab6ee Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Sun, 15 Jul 2018 12:20:58 -0500
Subject: [PATCH 4/8] flake8

---
 scipy/cluster/tests/test_vq.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/scipy/cluster/tests/test_vq.py b/scipy/cluster/tests/test_vq.py
index 6e601d6e0d3a..536fbe6437f1 100644
--- a/scipy/cluster/tests/test_vq.py
+++ b/scipy/cluster/tests/test_vq.py
@@ -6,13 +6,13 @@
 
 import numpy as np
 from numpy.testing import (assert_array_equal, assert_array_almost_equal,
-        assert_allclose, assert_equal, assert_)
+                           assert_allclose, assert_equal, assert_)
 from scipy._lib._numpy_compat import suppress_warnings
 import pytest
 from pytest import raises as assert_raises
 
 from scipy.cluster.vq import (kmeans, kmeans2, py_vq, vq, whiten,
-        ClusterError, _krandinit)
+                              ClusterError, _krandinit)
 from scipy.cluster import _vq
 
@@ -58,8 +58,8 @@
 
 # Global data
 X = np.array([[3.0, 3], [4, 3], [4, 2],
-        [9, 2], [5, 1], [6, 2], [9, 4],
-        [5, 2], [5, 4], [7, 4], [6, 5]])
+              [9, 2], [5, 1], [6, 2], [9, 4],
+              [5, 2], [5, 4], [7, 4], [6, 5]])
 
 CODET1 = np.array([[3.0000, 3.0000],
                    [6.2000, 4.0000],
@@ -201,7 +201,7 @@ def test_kmeans_lost_cluster(self):
         data = TESTDATA_2D
         initk = np.array([[-1.8127404, -0.67128041],
                           [2.04621601, 0.07401111],
-                          [-2.31149087,-0.05160469]])
+                          [-2.31149087, -0.05160469]])
 
         kmeans(data, initk)
         with suppress_warnings() as sup:
@@ -255,12 +255,14 @@ def test_kmeans2_init(self):
 
         # minit='random' can give warnings, filter those
         with warnings.catch_warnings():
-            warnings.filterwarnings('ignore',
-                        message="One of the clusters is empty. Re-run")
+            warnings.filterwarnings(
+                'ignore',
+                message="One of the clusters is empty. Re-run")
             kmeans2(data, 3, minit='random')
             kmeans2(data[:, :1], 3, minit='random')  # special case (1-D)
 
-    @pytest.mark.skipif(sys.platform == 'win32', reason='Fails with MemoryError in Wine.')
+    @pytest.mark.skipif(sys.platform == 'win32',
+                        reason='Fails with MemoryError in Wine.')
     def test_krandinit(self):
         data = TESTDATA_2D
         datas = [data.reshape((200, 2)), data.reshape((20, 20))[:10]]
@@ -284,8 +286,7 @@ def test_kmeans_0k(self):
 
     def test_kmeans_large_thres(self):
         # Regression test for gh-1774
-        x = np.array([1,2,3,4,10], dtype=float)
+        x = np.array([1, 2, 3, 4, 10], dtype=float)
         res = kmeans(x, 1, thresh=1e16)
         assert_allclose(res[0], np.array([4.]))
         assert_allclose(res[1], 2.3999999999999999)
-
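Patch 5 below swaps the stdlib warning filter reformatted above for the test suite's suppress_warnings helper (imported in these tests from scipy._lib._numpy_compat, a vendored copy of the numpy.testing utility). For context, a sketch of the two idioms side by side; the warnings.warn calls are illustrative only:

    import warnings
    from numpy.testing import suppress_warnings

    # stdlib idiom (patch 4): install a module-wide filter inside the context
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message="One of the clusters is empty")
        warnings.warn("One of the clusters is empty. Re-run.")  # suppressed

    # numpy/scipy testing idiom (patch 5): filters are scoped to the block
    with suppress_warnings() as sup:
        sup.filter(message="One of the clusters is empty")
        warnings.warn("One of the clusters is empty. Re-run.")  # suppressed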
From 370eb62df8cbf315257f5b3fed3c6d6120fe2068 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Sun, 15 Jul 2018 14:52:48 -0500
Subject: [PATCH 5/8] Minor fixes

---
 scipy/cluster/tests/test_vq.py | 6 ++----
 scipy/cluster/vq.py            | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/scipy/cluster/tests/test_vq.py b/scipy/cluster/tests/test_vq.py
index 536fbe6437f1..9d4450b5083e 100644
--- a/scipy/cluster/tests/test_vq.py
+++ b/scipy/cluster/tests/test_vq.py
@@ -254,10 +254,8 @@ def test_kmeans2_init(self):
         kmeans2(data[:, :1], 3, minit='++')  # special case (1-D)
 
         # minit='random' can give warnings, filter those
-        with warnings.catch_warnings():
-            warnings.filterwarnings(
-                'ignore',
-                message="One of the clusters is empty. Re-run")
+        with suppress_warnings() as sup:
+            sup.filter(message="One of the clusters is empty. Re-run")
             kmeans2(data, 3, minit='random')
             kmeans2(data[:, :1], 3, minit='random')  # special case (1-D)
 
diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py
index 664616752a3a..a220e52c5f90 100644
--- a/scipy/cluster/vq.py
+++ b/scipy/cluster/vq.py
@@ -550,7 +550,6 @@ def _kpp(data, k):
     ----------
     D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding",
     Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007.
-
     """
 
     dims = data.shape[1] if len(data.shape) > 1 else 1
@@ -571,6 +570,7 @@ def _kpp(data, k):
 
     return init
 
+
 _valid_init_meth = {'random': _krandinit, 'points': _kpoints, '++': _kpp}

From 05e2a72968610d85e23ed3f7ea1159d6dd507f7c Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Sun, 15 Jul 2018 15:43:22 -0500
Subject: [PATCH 6/8] Add numbering for references

---
 scipy/cluster/vq.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py
index a220e52c5f90..dae0b3c8234d 100644
--- a/scipy/cluster/vq.py
+++ b/scipy/cluster/vq.py
@@ -548,8 +548,9 @@ def _kpp(data, k):
 
     References
     ----------
-    D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding",
-    Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007.
+    .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
+    careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
+    on Discrete Algorithms, 2007.
""" dims = data.shape[1] if len(data.shape) > 1 else 1 From 4cf6fb09224de3d1f5218b9324bb7cb28e68cc2c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 15 Jul 2018 16:35:44 -0500 Subject: [PATCH 7/8] Fix reference in visible method --- scipy/cluster/vq.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py index dae0b3c8234d..0b02cf88f880 100644 --- a/scipy/cluster/vq.py +++ b/scipy/cluster/vq.py @@ -655,9 +655,9 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random', References ---------- - D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding", - Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007. - + .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of + careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium + on Discrete Algorithms, 2007. """ if int(iter) < 1: raise ValueError("Invalid iter (%s), " From f56452b39517536a313251d1ea05c29511461e04 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 15 Jul 2018 16:59:54 -0500 Subject: [PATCH 8/8] Delete extraneous line --- scipy/cluster/vq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py index 0b02cf88f880..dfa156e08e1b 100644 --- a/scipy/cluster/vq.py +++ b/scipy/cluster/vq.py @@ -617,7 +617,6 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random', (not used yet) minit : str, optional Method for initialization. Available methods are 'random', - 'points', and 'matrix': 'points', '++' and 'matrix': 'random': generate k centroids from a Gaussian with mean and