forked from gustavowillam/SmartMapPlugin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path_cmeans.py
307 lines (251 loc) · 9.28 KB
/
_cmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
"""
cmeans.py : Fuzzy C-means clustering algorithm.
"""
import numpy as np
from scipy.spatial.distance import cdist
from .normalize_columns import normalize_columns, normalize_power_columns
def _cmeans0(data, u_old, c, m, metric):
"""
Single step in generic fuzzy c-means clustering algorithm.
Modified from Ross, Fuzzy Logic w/Engineering Applications (2010),
pages 352-353, equations 10.28 - 10.35.
Parameters inherited from cmeans()
"""
# Normalizing, then eliminating any potential zero values.
u_old = normalize_columns(u_old)
u_old = np.fmax(u_old, np.finfo(np.float64).eps)
um = u_old ** m
# Calculate cluster centers
data = data.T
cntr = um.dot(data) / np.atleast_2d(um.sum(axis=1)).T
d = _distance(data, cntr, metric)
d = np.fmax(d, np.finfo(np.float64).eps)
jm = (um * d ** 2).sum()
u = normalize_power_columns(d, - 2. / (m - 1))
return cntr, u, jm, d
def _distance(data, centers, metric='euclidean'):
"""
Euclidean distance from each point to each cluster center.
Parameters
----------
data : 2d array (N x Q)
Data to be analyzed. There are N data points.
centers : 2d array (C x Q)
Cluster centers. There are C clusters, with Q features.
metric: string
By default is set to euclidean. Passes any option accepted by
``scipy.spatial.distance.cdist``.
Returns
-------
dist : 2d array (C x N)
Euclidean distance from each point, to each cluster center.
See Also
--------
scipy.spatial.distance.cdist
"""
return cdist(data, centers, metric=metric).T
def _fp_coeff(u):
"""
Fuzzy partition coefficient `fpc` relative to fuzzy c-partitioned
matrix `u`. Measures 'fuzziness' in partitioned clustering.
Parameters
----------
u : 2d array (C, N)
Fuzzy c-partitioned matrix; N = number of data points and C = number
of clusters.
Returns
-------
fpc : float
Fuzzy partition coefficient.
"""
n = u.shape[1]
return np.trace(u.dot(u.T)) / float(n)
def cmeans(data, c, m, error, maxiter, metric='euclidean', init=None, seed=None):
"""
Fuzzy c-means clustering algorithm [1].
Parameters
----------
data : 2d array, size (S, N)
Data to be clustered. N is the number of data sets; S is the number
of features within each sample vector.
c : int
Desired number of clusters or classes.
m : float
Array exponentiation applied to the membership function u_old at each
iteration, where U_new = u_old ** m.
error : float
Stopping criterion; stop early if the norm of (u[p] - u[p-1]) < error.
maxiter : int
Maximum number of iterations allowed.
metric: string
By default is set to euclidean. Passes any option accepted by
``scipy.spatial.distance.cdist``.
init : 2d array, size (S, N)
Initial fuzzy c-partitioned matrix. If none provided, algorithm is
randomly initialized.
seed : int
If provided, sets random seed of init. No effect if init is
provided. Mainly for debug/testing purposes.
Returns
-------
cntr : 2d array, size (S, c)
Cluster centers. Data for each center along each feature provided
for every cluster (of the `c` requested clusters).
u : 2d array, (S, N)
Final fuzzy c-partitioned matrix.
u0 : 2d array, (S, N)
Initial guess at fuzzy c-partitioned matrix (either provided init or
random guess used if init was not provided).
d : 2d array, (S, N)
Final Euclidian distance matrix.
jm : 1d array, length P
Objective function history.
p : int
Number of iterations run.
fpc : float
Final fuzzy partition coefficient.
Notes
-----
The algorithm implemented is from Ross et al. [1]_.
Fuzzy C-Means has a known problem with high dimensionality datasets, where
the majority of cluster centers are pulled into the overall center of
gravity. If you are clustering data with very high dimensionality and
encounter this issue, another clustering method may be required. For more
information and the theory behind this, see Winkler et al. [2]_.
References
----------
.. [1] Ross, Timothy J. Fuzzy Logic With Engineering Applications, 3rd ed.
Wiley. 2010. ISBN 978-0-470-74376-8 pp 352-353, eq 10.28 - 10.35.
.. [2] Winkler, R., Klawonn, F., & Kruse, R. Fuzzy c-means in high
dimensional spaces. 2012. Contemporary Theory and Pragmatic
Approaches in Fuzzy Computing Utilization, 1.
"""
# Setup u0
if init is None:
if seed is not None:
np.random.seed(seed=seed)
n = data.shape[1]
u0 = np.random.rand(c, n)
u0 = normalize_columns(u0)
init = u0.copy()
u0 = init
u = np.fmax(u0, np.finfo(np.float64).eps)
# Initialize loop parameters
jm = np.zeros(0)
p = 0
# Main cmeans loop
while p < maxiter - 1:
u2 = u.copy()
[cntr, u, Jjm, d] = _cmeans0(data, u2, c, m, metric)
jm = np.hstack((jm, Jjm))
p += 1
# Stopping rule
if np.linalg.norm(u - u2) < error:
break
# Final calculations
error = np.linalg.norm(u - u2)
fpc = _fp_coeff(u)
return cntr, u, u0, d, jm, p, fpc
def cmeans_predict(test_data, cntr_trained, m, error, maxiter, metric='euclidean', init=None,
seed=None):
"""
Prediction of new data in given a trained fuzzy c-means framework [1].
Parameters
----------
test_data : 2d array, size (S, N)
New, independent data set to be predicted based on trained c-means
from ``cmeans``. N is the number of data sets; S is the number of
features within each sample vector.
cntr_trained : 2d array, size (S, c)
Location of trained centers from prior training c-means.
m : float
Array exponentiation applied to the membership function u_old at each
iteration, where U_new = u_old ** m.
error : float
Stopping criterion; stop early if the norm of (u[p] - u[p-1]) < error.
maxiter : int
Maximum number of iterations allowed.
metric: string
By default is set to euclidean. Passes any option accepted by
``scipy.spatial.distance.cdist``.
init : 2d array, size (S, N)
Initial fuzzy c-partitioned matrix. If none provided, algorithm is
randomly initialized.
seed : int
If provided, sets random seed of init. No effect if init is
provided. Mainly for debug/testing purposes.
Returns
-------
u : 2d array, (S, N)
Final fuzzy c-partitioned matrix.
u0 : 2d array, (S, N)
Initial guess at fuzzy c-partitioned matrix (either provided init or
random guess used if init was not provided).
d : 2d array, (S, N)
Final Euclidian distance matrix.
jm : 1d array, length P
Objective function history.
p : int
Number of iterations run.
fpc : float
Final fuzzy partition coefficient.
Notes
-----
Ross et al. [1]_ did not include a prediction algorithm to go along with
fuzzy c-means. This prediction algorithm works by repeating the clustering
with fixed centers, then efficiently finds the fuzzy membership at all
points.
References
----------
.. [1] Ross, Timothy J. Fuzzy Logic With Engineering Applications, 3rd ed.
Wiley. 2010. ISBN 978-0-470-74376-8 pp 352-353, eq 10.28 - 10.35.
"""
c = cntr_trained.shape[0]
# Setup u0
if init is None:
if seed is not None:
np.random.seed(seed=seed)
n = test_data.shape[1]
u0 = np.random.rand(c, n)
u0 = normalize_columns(u0)
init = u0.copy()
u0 = init
u = np.fmax(u0, np.finfo(np.float64).eps)
# Initialize loop parameters
jm = np.zeros(0)
p = 0
# Main cmeans loop
while p < maxiter - 1:
u2 = u.copy()
[u, Jjm, d] = _cmeans_predict0(test_data, cntr_trained, u2, c, m, metric)
jm = np.hstack((jm, Jjm))
p += 1
# Stopping rule
if np.linalg.norm(u - u2) < error:
break
# Final calculations
error = np.linalg.norm(u - u2)
fpc = _fp_coeff(u)
return u, u0, d, jm, p, fpc
def _cmeans_predict0(test_data, cntr, u_old, c, m, metric):
"""
Single step in fuzzy c-means prediction algorithm. Clustering algorithm
modified from Ross, Fuzzy Logic w/Engineering Applications (2010)
p.352-353, equations 10.28 - 10.35, but this method to generate fuzzy
predictions was independently derived by Josh Warner.
Parameters inherited from cmeans()
Very similar to initial clustering, except `cntr` is not updated, thus
the new test data are forced into known (trained) clusters.
"""
# Normalizing, then eliminating any potential zero values.
u_old = normalize_columns(u_old)
u_old = np.fmax(u_old, np.finfo(np.float64).eps)
um = u_old ** m
test_data = test_data.T
# For prediction, we do not recalculate cluster centers. The test_data is
# forced to conform to the prior clustering.
d = _distance(test_data, cntr, metric)
d = np.fmax(d, np.finfo(np.float64).eps)
jm = (um * d ** 2).sum()
u = normalize_power_columns(d, - 2. / (m - 1))
return u, jm, d