-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans.py
159 lines (124 loc) · 4.01 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#encoding utf-8
import math
import sys
import random
import copy
from collections import defaultdict
def distance(a, b, gauss=True):
"""
Euclidean distance or gauss kernel
"""
dim = len(a)
_sum = 0.0
for dimension in xrange(dim):
difference_sq = (a[dimension] - b[dimension]) ** 2
_sum += difference_sq
if gauss :
dis = 1 - math.exp(_sum * -1/2)
else :
dis = math.sqrt(_sum)
return dis
def load_file(file_name) :
"""
Load dataset, for iris dataset
@return data, count
count : the records num
data : {'f' : feature list , 'c' : cluster id, 'l' : origin cluster label}
"""
data = {}
count = 0
with open(file_name) as fp :
for line in fp :
line = line.strip()
if len(line) < 1 :
continue
tokens = line.split(',')
if len(tokens) < 2 :
continue
#this last column is label
data[count] = {'f' : [float(x) for x in tokens[:-1]], 'c' : -1, 'l' : tokens[-1]}
count += 1
return data, count
def init_centers(data, count, center_num) :
"""
Random init centers
"""
centers = set()
if center_num > count / 2 :
raise Exception("too many centers!")
while len(centers) < center_num :
c = random.randint(0, count - 1)
centers.add(c)
center_feature = {}
for i in centers :
center_feature[i] = data[i]['f']
return center_feature
def update_centers(data) :
"""
Update centers
"""
center_feature = {}
center_nums = defaultdict(lambda : 0)
for k, v in data.iteritems() :
center_nums[v['c']] += 1
if not v['c'] in center_feature :
center_feature[v['c']] = copy.deepcopy(v['f'])
else :
center_feature[v['c']] = [center_feature[v['c']][i] + j for i, j in enumerate(v['f'])]
for k, v in center_feature.iteritems() :
center_feature[k] = [float(i) / center_nums[k] for i in v]
return center_feature
def kmeans(file_name, center_num, gauss, iterate_num, decrease_threshold) :
"""
kmeans
"""
data, count = load_file(file_name)
center_feature = init_centers(data, count, center_num)
total_distance, last_distance = 0.0, 0.0
count = 1
while True :
last_distance = total_distance
total_distance = 0.0
for k, v in data.iteritems() :
min_dis = sys.float_info.max
step_cluster = -1
for i, j in center_feature.iteritems() :
dis = distance(v['f'], j, gauss)
if dis < min_dis :
min_dis = dis
step_cluster = i
total_distance += min_dis
data[k]['c'] = step_cluster
center_feature = update_centers(data)
decrease = last_distance - total_distance if count != 1 else total_distance
print "STEP: %d, TOTAL_DISTANCE: %.4f, DESCREASE: %.4f" % (count, total_distance, decrease)
if decrease < decrease_threshold :
break
count += 1
if count > iterate_num :
break
evaluate(data)
def evaluate(data) :
"""
evaluate precision
"""
stat = defaultdict(lambda : defaultdict(lambda : 0))
for k, v in data.iteritems() :
stat[v['c']][v['l']] += 1
wholecount = 0
wholecorrect = 0
for k, v in stat.iteritems() :
allcount = 0
maxj = 0
maxi = ''
for i, j in v.iteritems() :
allcount += j
if j > maxj :
maxj = j
maxi = i
wholecorrect += maxj
wholecount += allcount
print "CLUSTER: %d ALLNUM: %d CORRECT: %d PRECISION: %.4f LABEL: %s" % (k, allcount, maxj, float(maxj) / allcount, maxi)
print "ALLNUM: %d CORRECT: %d PRECISION: %.4f" % (wholecount, wholecorrect, float(wholecorrect) / wholecount)
if __name__ == '__main__':
kmeans('data/iris.data', 3, False, 100, 0.01)