From ad9551a1d7b980595d5b6ada57cba8f86de33a5a Mon Sep 17 00:00:00 2001
From: jeanpauphilet
Date: Wed, 5 Nov 2014 14:11:56 +0100
Subject: [PATCH 1/8] ENH: Add new init method for k-means

- implement kmeans++ init method (_kpp) proposed by D. Arthur and S. Vassilvitskii
- remove references to unsupported init method ('uniform')
---
 scipy/cluster/tests/test_vq.py | 11 +++++--
 scipy/cluster/vq.py            | 58 ++++++++++++++++++++++++++++++++--
 2 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/scipy/cluster/tests/test_vq.py b/scipy/cluster/tests/test_vq.py
index abffc55bc1f2..6e601d6e0d3a 100644
--- a/scipy/cluster/tests/test_vq.py
+++ b/scipy/cluster/tests/test_vq.py
@@ -250,8 +250,15 @@ def test_kmeans2_init(self):
         kmeans2(data, 3, minit='points')
         kmeans2(data[:, :1], 3, minit='points')  # special case (1-D)
 
-        kmeans2(data, 3, minit='random')
-        kmeans2(data[:, :1], 3, minit='random')  # special case (1-D)
+        kmeans2(data, 3, minit='++')
+        kmeans2(data[:, :1], 3, minit='++')  # special case (1-D)
+
+        # minit='random' can give warnings, filter those
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore',
+                        message="One of the clusters is empty. Re-run")
+            kmeans2(data, 3, minit='random')
+            kmeans2(data[:, :1], 3, minit='random')  # special case (1-D)
 
     @pytest.mark.skipif(sys.platform == 'win32', reason='Fails with MemoryError in Wine.')
     def test_krandinit(self):
diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py
index 27b1db7f26fc..2d02362e6ba5 100644
--- a/scipy/cluster/vq.py
+++ b/scipy/cluster/vq.py
@@ -28,10 +28,10 @@
 any other centroids. If v belongs to i, we say centroid i is the
 dominating centroid of v. The k-means algorithm tries to minimize
 distortion, which is defined as the sum of the squared distances
-between each observation vector and its dominating centroid. 
+between each observation vector and its dominating centroid.
 The minimization is achieved by iteratively reclassifying the
 observations into clusters and recalculating the centroids until
-a configuration is reached in which the centroids are stable. One can 
+a configuration is reached in which the centroids are stable. One can
 also define a maximum number of iterations.
 
 Since vector quantization is a natural application for k-means,
@@ -473,6 +473,11 @@ def _kpoints(data, k):
         row is one observation.
     k : int
         Number of samples to generate.
+
+    Returns
+    -------
+    x : ndarray
+        A 'k' by 'N' array containing the initial centroids
 
     """
     idx = np.random.choice(data.shape[0], size=k, replace=False)
@@ -493,6 +498,11 @@ def _krandinit(data, k):
         row is one observation.
     k : int
         Number of samples to generate.
+
+    Returns
+    -------
+    x : ndarray
+        A 'k' by 'N' array containing the initial centroids
 
     """
     mu = data.mean(axis=0)
@@ -519,7 +529,45 @@ def _krandinit(data, k):
     return x
 
 
-_valid_init_meth = {'random': _krandinit, 'points': _kpoints}
+def _kpp(data, k):
+    """ Picks k points in data based on the kmeans++ method
+
+    Parameters
+    ----------
+    data : ndarray
+        Expect a rank 1 or 2 array. Rank 1 are assumed to describe one
+        dimensional data, rank 2 multidimensional data, in which case one
+        row is one observation.
+    k : int
+        Number of samples to generate.
+
+    Returns
+    -------
+    init : ndarray
+        A 'k' by 'N' array containing the initial centroids
+
+    Reference
+    ---------
+    D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding"
+
+    """
+
+    init = np.ndarray((k, data.shape[1]))
+
+    for i in range(k):
+        if i == 0:
+            init[i] = data[randint(len(data))]
+
+        else:
+            D2 = np.array([min([np.inner(init[j]-x, init[j]-x) for j in range(i)]) for x in data])
+            probs = D2/sum(D2)
+            cumprobs = probs.cumsum()
+            r = np.random.rand()
+            init[i] = data[np.searchsorted(cumprobs, r)]
+
+    return init
+
+_valid_init_meth = {'random': _krandinit, 'points': _kpoints, '++': _kpp}
 
 
 def _missing_warn():
@@ -565,6 +613,7 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random',
     minit : str, optional
         Method for initialization. Available methods are 'random',
         'points', and 'matrix':
+        'points', '++' and 'matrix':
 
         'random': generate k centroids from a Gaussian with mean and
         variance estimated from the data.
@@ -572,6 +621,9 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random',
         'points': choose k observations (rows) at random from data for
         the initial centroids.
 
+        '++': choose k observations according to the kmeans++ method 
+        (careful seeding)
+
         'matrix': interpret the k parameter as a k by M (or length k
         array for one-dimensional data) array of initial centroids.
     missing : str, optional
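Once this first patch is applied, the new option can be exercised directly through kmeans2. A minimal usage sketch; the data and cluster count below are illustrative, not taken from the patch or the test suite:

    import numpy as np
    from scipy.cluster.vq import kmeans2

    pts = np.random.rand(100, 2)        # 100 illustrative 2-D observations
    # minit='++' seeds the initial centroids with the kmeans++ procedure
    centroid, label = kmeans2(pts, 3, minit='++')
    print(centroid.shape, label.shape)  # (3, 2) (100,)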
From a9c21d2793cc1fb9d93308ea92ab17df0a1d307f Mon Sep 17 00:00:00 2001
From: jeanpauphilet
Date: Tue, 17 Oct 2017 15:17:28 -0400
Subject: [PATCH 2/8] Update vq.py

---
 scipy/cluster/vq.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py
index 2d02362e6ba5..efa47a73c303 100644
--- a/scipy/cluster/vq.py
+++ b/scipy/cluster/vq.py
@@ -546,9 +546,10 @@ def _kpp(data, k):
     init : ndarray
         A 'k' by 'N' array containing the initial centroids
 
-    Reference
-    ---------
-    D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding"
+    References
+    ----------
+    D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding",
+    Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007.
 
     """
 
@@ -556,14 +557,14 @@ def _kpp(data, k):
 
     for i in range(k):
         if i == 0:
-            init[i] = data[randint(len(data))]
+            init[i, :] = data[randint(data.shape[1])]
 
         else:
             D2 = np.array([min([np.inner(init[j]-x, init[j]-x) for j in range(i)]) for x in data])
-            probs = D2/sum(D2)
+            probs = D2/D2.sum()
             cumprobs = probs.cumsum()
             r = np.random.rand()
-            init[i] = data[np.searchsorted(cumprobs, r)]
+            init[i, :] = data[np.searchsorted(cumprobs, r)]
 
     return init
 
@@ -621,7 +622,7 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random',
         'points': choose k observations (rows) at random from data for
         the initial centroids.
 
-        '++': choose k observations according to the kmeans++ method 
+        '++': choose k observations according to the kmeans++ method
         (careful seeding)
 
         'matrix': interpret the k parameter as a k by M (or length k
@@ -648,6 +649,11 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random',
         label[i] is the code or index of the centroid the
         i'th observation is closest to.
 
+    References
+    ----------
+    D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding",
+    Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007.
+
     """
     if int(iter) < 1:
         raise ValueError("Invalid iter (%s), "
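A note on the seeding loop this patch touches: the D2 line computes, for every observation, the squared distance to its nearest already-chosen centroid, using nested Python-level comprehensions (roughly n*i inner products per seeding step). Not part of this series, but an equivalent vectorized formulation is sketched below for comparison; d2_vectorized is a hypothetical helper name:

    import numpy as np
    from scipy.spatial.distance import cdist

    def d2_vectorized(init, i, data):
        # Squared Euclidean distance from each observation to the nearest
        # of the first i centroids; equivalent to the nested-comprehension D2.
        return cdist(init[:i], data, metric='sqeuclidean').min(axis=0)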
From 95d7d1959c779cd4d641d14980a6909617befd98 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Sun, 15 Jul 2018 12:15:29 -0500
Subject: [PATCH 3/8] Fix final comments

---
 scipy/cluster/vq.py | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py
index efa47a73c303..664616752a3a 100644
--- a/scipy/cluster/vq.py
+++ b/scipy/cluster/vq.py
@@ -323,10 +323,10 @@ def kmeans(obs, k_or_guess, iter=20, thresh=1e-5, check_finite=True):
 
     The k-means algorithm adjusts the classification of the observations
     into clusters and updates the cluster centroids until the position of
-    the centroids is stable over successive iterations. In this 
-    implementation of the algorithm, the stability of the centroids is 
-    determined by comparing the absolute value of the change in the average 
-    Euclidean distance between the observations and their corresponding 
+    the centroids is stable over successive iterations. In this
+    implementation of the algorithm, the stability of the centroids is
+    determined by comparing the absolute value of the change in the average
+    Euclidean distance between the observations and their corresponding
     centroids against a threshold. This yields a code book mapping
     centroids to codes and vice versa.
 
@@ -373,9 +373,9 @@ def kmeans(obs, k_or_guess, iter=20, thresh=1e-5, check_finite=True):
         not necessarily the globally minimal distortion.
 
     distortion : float
-        The mean (non-squared) Euclidean distance between the observations 
+        The mean (non-squared) Euclidean distance between the observations
         passed and the centroids generated. Note the difference to the standard
-        definition of distortion in the context of the K-means algorithm, which 
+        definition of distortion in the context of the K-means algorithm, which
         is the sum of the squared distances.
 
     See Also
@@ -473,7 +473,7 @@ def _kpoints(data, k):
         row is one observation.
     k : int
         Number of samples to generate.
-    
+
     Returns
     -------
     x : ndarray
@@ -498,7 +498,7 @@ def _krandinit(data, k):
         row is one observation.
     k : int
         Number of samples to generate.
-    
+
     Returns
     -------
     x : ndarray
@@ -531,7 +531,7 @@ def _krandinit(data, k):
 
 def _kpp(data, k):
     """ Picks k points in data based on the kmeans++ method
-    
+
     Parameters
     ----------
     data : ndarray
@@ -540,34 +540,37 @@ def _kpp(data, k):
         row is one observation.
     k : int
         Number of samples to generate.
-    
+
     Returns
     -------
     init : ndarray
         A 'k' by 'N' array containing the initial centroids
-    
+
     References
     ----------
     D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding",
     Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007.
-    
+
     """
 
-    init = np.ndarray((k, data.shape[1]))
-    
+    dims = data.shape[1] if len(data.shape) > 1 else 1
+    init = np.ndarray((k, dims))
+
     for i in range(k):
         if i == 0:
-            init[i, :] = data[randint(data.shape[1])]
-
+            init[i, :] = data[np.random.randint(data.shape[0])]
+
         else:
-            D2 = np.array([min([np.inner(init[j]-x, init[j]-x) for j in range(i)]) for x in data])
+            D2 = np.array([min(
+                [np.inner(init[j]-x, init[j]-x) for j in range(i)]
+                ) for x in data])
             probs = D2/D2.sum()
             cumprobs = probs.cumsum()
             r = np.random.rand()
             init[i, :] = data[np.searchsorted(cumprobs, r)]
 
     return init
-    
+
 _valid_init_meth = {'random': _krandinit, 'points': _kpoints, '++': _kpp}
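The cumprobs/searchsorted pair in _kpp is inverse-CDF sampling: index i is drawn with probability probs[i], which is what gives far-away points a proportionally higher chance of becoming the next centroid. A standalone sketch with made-up weights:

    import numpy as np

    probs = np.array([0.1, 0.4, 0.2, 0.3])  # stands in for D2/D2.sum()
    cumprobs = probs.cumsum()                # [0.1, 0.5, 0.7, 1.0]
    r = np.random.rand()                     # uniform draw in [0, 1)
    idx = np.searchsorted(cumprobs, r)       # P(idx == i) equals probs[i]
    # np.random.choice(len(probs), p=probs) draws from the same distribution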
From 816c2a0ebacf3947dc283546deb36928315ab6ee Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Sun, 15 Jul 2018 12:20:58 -0500
Subject: [PATCH 4/8] flake8

---
 scipy/cluster/tests/test_vq.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/scipy/cluster/tests/test_vq.py b/scipy/cluster/tests/test_vq.py
index 6e601d6e0d3a..536fbe6437f1 100644
--- a/scipy/cluster/tests/test_vq.py
+++ b/scipy/cluster/tests/test_vq.py
@@ -6,13 +6,13 @@
 
 import numpy as np
 from numpy.testing import (assert_array_equal, assert_array_almost_equal,
-        assert_allclose, assert_equal, assert_)
+                           assert_allclose, assert_equal, assert_)
 from scipy._lib._numpy_compat import suppress_warnings
 import pytest
 from pytest import raises as assert_raises
 
 from scipy.cluster.vq import (kmeans, kmeans2, py_vq, vq, whiten,
-        ClusterError, _krandinit)
+                              ClusterError, _krandinit)
 from scipy.cluster import _vq
 
@@ -58,8 +58,8 @@
 
 # Global data
 X = np.array([[3.0, 3], [4, 3], [4, 2],
-        [9, 2], [5, 1], [6, 2], [9, 4],
-        [5, 2], [5, 4], [7, 4], [6, 5]])
+              [9, 2], [5, 1], [6, 2], [9, 4],
+              [5, 2], [5, 4], [7, 4], [6, 5]])
 
 CODET1 = np.array([[3.0000, 3.0000],
                    [6.2000, 4.0000],
@@ -201,7 +201,7 @@ def test_kmeans_lost_cluster(self):
         data = TESTDATA_2D
         initk = np.array([[-1.8127404, -0.67128041],
                           [2.04621601, 0.07401111],
-                          [-2.31149087,-0.05160469]])
+                          [-2.31149087, -0.05160469]])
 
         kmeans(data, initk)
         with suppress_warnings() as sup:
@@ -255,12 +255,14 @@ def test_kmeans2_init(self):
 
         # minit='random' can give warnings, filter those
         with warnings.catch_warnings():
-            warnings.filterwarnings('ignore',
-                        message="One of the clusters is empty. Re-run")
+            warnings.filterwarnings(
+                'ignore',
+                message="One of the clusters is empty. Re-run")
             kmeans2(data, 3, minit='random')
             kmeans2(data[:, :1], 3, minit='random')  # special case (1-D)
 
-    @pytest.mark.skipif(sys.platform == 'win32', reason='Fails with MemoryError in Wine.')
+    @pytest.mark.skipif(sys.platform == 'win32',
+                        reason='Fails with MemoryError in Wine.')
     def test_krandinit(self):
         data = TESTDATA_2D
         datas = [data.reshape((200, 2)), data.reshape((20, 20))[:10]]
@@ -284,8 +286,7 @@ def test_kmeans_0k(self):
 
     def test_kmeans_large_thres(self):
         # Regression test for gh-1774
-        x = np.array([1,2,3,4,10], dtype=float)
+        x = np.array([1, 2, 3, 4, 10], dtype=float)
         res = kmeans(x, 1, thresh=1e16)
         assert_allclose(res[0], np.array([4.]))
         assert_allclose(res[1], 2.3999999999999999)
-
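Patch 5 below swaps the stdlib warning filter reformatted above for the test suite's suppress_warnings helper (imported in these tests from scipy._lib._numpy_compat, a vendored copy of the numpy.testing utility). For context, a sketch of the two idioms side by side; the warnings.warn calls are illustrative only:

    import warnings
    from numpy.testing import suppress_warnings

    # stdlib idiom (patch 4): install a module-wide filter inside the context
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message="One of the clusters is empty")
        warnings.warn("One of the clusters is empty. Re-run.")  # suppressed

    # numpy/scipy testing idiom (patch 5): filters are scoped to the block
    with suppress_warnings() as sup:
        sup.filter(message="One of the clusters is empty")
        warnings.warn("One of the clusters is empty. Re-run.")  # suppressed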
From 370eb62df8cbf315257f5b3fed3c6d6120fe2068 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Sun, 15 Jul 2018 14:52:48 -0500
Subject: [PATCH 5/8] Minor fixes

---
 scipy/cluster/tests/test_vq.py | 6 ++----
 scipy/cluster/vq.py            | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/scipy/cluster/tests/test_vq.py b/scipy/cluster/tests/test_vq.py
index 536fbe6437f1..9d4450b5083e 100644
--- a/scipy/cluster/tests/test_vq.py
+++ b/scipy/cluster/tests/test_vq.py
@@ -254,10 +254,8 @@ def test_kmeans2_init(self):
         kmeans2(data[:, :1], 3, minit='++')  # special case (1-D)
 
         # minit='random' can give warnings, filter those
-        with warnings.catch_warnings():
-            warnings.filterwarnings(
-                'ignore',
-                message="One of the clusters is empty. Re-run")
+        with suppress_warnings() as sup:
+            sup.filter(message="One of the clusters is empty. Re-run")
             kmeans2(data, 3, minit='random')
             kmeans2(data[:, :1], 3, minit='random')  # special case (1-D)
 
diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py
index 664616752a3a..a220e52c5f90 100644
--- a/scipy/cluster/vq.py
+++ b/scipy/cluster/vq.py
@@ -550,7 +550,6 @@ def _kpp(data, k):
     ----------
     D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding",
     Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007.
-
     """
 
     dims = data.shape[1] if len(data.shape) > 1 else 1
@@ -571,6 +570,7 @@ def _kpp(data, k):
 
     return init
 
+
 _valid_init_meth = {'random': _krandinit, 'points': _kpoints, '++': _kpp}

From 05e2a72968610d85e23ed3f7ea1159d6dd507f7c Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Sun, 15 Jul 2018 15:43:22 -0500
Subject: [PATCH 6/8] Add numbering for references

---
 scipy/cluster/vq.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py
index a220e52c5f90..dae0b3c8234d 100644
--- a/scipy/cluster/vq.py
+++ b/scipy/cluster/vq.py
@@ -548,8 +548,9 @@ def _kpp(data, k):
 
     References
     ----------
-    D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding",
-    Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007.
+    .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
+    careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
+    on Discrete Algorithms, 2007.
""" dims = data.shape[1] if len(data.shape) > 1 else 1 From 4cf6fb09224de3d1f5218b9324bb7cb28e68cc2c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 15 Jul 2018 16:35:44 -0500 Subject: [PATCH 7/8] Fix reference in visible method --- scipy/cluster/vq.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py index dae0b3c8234d..0b02cf88f880 100644 --- a/scipy/cluster/vq.py +++ b/scipy/cluster/vq.py @@ -655,9 +655,9 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random', References ---------- - D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding", - Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, 2007. - + .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of + careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium + on Discrete Algorithms, 2007. """ if int(iter) < 1: raise ValueError("Invalid iter (%s), " From f56452b39517536a313251d1ea05c29511461e04 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 15 Jul 2018 16:59:54 -0500 Subject: [PATCH 8/8] Delete extraneous line --- scipy/cluster/vq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py index 0b02cf88f880..dfa156e08e1b 100644 --- a/scipy/cluster/vq.py +++ b/scipy/cluster/vq.py @@ -617,7 +617,6 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random', (not used yet) minit : str, optional Method for initialization. Available methods are 'random', - 'points', and 'matrix': 'points', '++' and 'matrix': 'random': generate k centroids from a Gaussian with mean and