-
Notifications
You must be signed in to change notification settings - Fork 0
/
sklean_k_means.py
129 lines (104 loc) · 4.71 KB
/
sklean_k_means.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from sklearn.cluster import KMeans
import pack_tra
import numpy as np
from matplotlib import pyplot
import os
from sklearn import metrics
# Data directory (relative to the current working directory) and the user to analyse.
path = os.getcwd() + "//data"
user = "000"

# loadData yields every trajectory of the user and every location point.
tra, loc = pack_tra.loadData(path, user)
print("用户{}共有{}条轨迹数据".format(user, len(tra)))

# The per-trajectory processing further down works on one trajectory only: the first.
tra_100 = tra[0]

# Feature matrix built from all trajectories
# (pack_tra.traToKMeans is the single-trajectory counterpart).
array = pack_tra.allTraToKmeans(tra)

# Min-max normalisation along axis 0: subtract the minimum, divide by (max - min).
array = np.divide(
    np.subtract(array, np.min(array, axis=0)),
    np.ptp(array, axis=0),
)
print("轨迹包含{}个轨迹点".format(len(array)))
# Unsupervised model selection: sweep the cluster count and plot the fit quality.
def testK(k_min=2, k_max=50):
    """Plot the K-Means SSE (inertia) for each cluster count in ``range(k_min, k_max)``.

    A new ``KMeans`` model is fitted on the module-level ``array`` for every
    candidate cluster count; its ``inertia_`` is recorded and the resulting
    curve is shown with pyplot.

    Args:
        k_min: First cluster count to try (inclusive). Defaults to 2.
        k_max: End of the sweep (exclusive). Defaults to 50.
    """
    # One range object drives both the loop and the x-axis so they cannot drift apart.
    candidates = range(k_min, k_max)
    scores = []
    for i in candidates:
        print("第{}次循环".format(i))
        km = KMeans(n_clusters=i, max_iter=300)
        km.fit(array)
        # SSE: sum of squared distances from each sample to its nearest
        # cluster centre. The silhouette coefficient would be an alternative:
        # metrics.silhouette_score(array, km.labels_, metric='euclidean')
        scores.append(km.inertia_)
    pyplot.plot(candidates, scores, marker='o')
    pyplot.xlabel('Number of clusters')
    # The plotted quantity is inertia, so label the axis accordingly
    # (it was previously labelled 'silhouette_score').
    pyplot.ylabel('SSE (inertia)')
    pyplot.show()
# Cluster all points, then score the points of one trajectory by cluster frequency.
def julei(k=30):
    """Run K-Means on ``array``, export per-point frequencies for ``tra_100`` and plot.

    Steps:
      1. Fit ``KMeans`` with ``k`` clusters on the module-level ``array`` and
         print the centres and labels.
      2. Compute, for every cluster, the fraction of all points assigned to it.
      3. Predict the cluster of every point of ``tra_100`` and pass a list of
         ``{'id', 'lat', 'lng', 'frequency'}`` dicts to ``pack_tra.save_json``
         with the target ``jsonData/pointFrequency.json``.
      4. Scatter-plot every 100th point coloured by cluster, plus the centres.

    Args:
        k: Number of clusters. Defaults to 30.
    """
    clf = KMeans(n_clusters=k)
    clf.fit(array)
    centers = clf.cluster_centers_  # cluster centre coordinates
    labels = clf.labels_  # cluster index of every training point
    print("分类点的中心坐标")
    print(centers)
    print("所有点的分类标签")
    print(labels)
    colors = ['aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond',
              'blue',
              'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue',
              'cornsilk', 'crimson',
              'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkkhaki', 'darkmagenta',
              'darkolivegreen', 'darkorange',
              'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkturquoise',
              'darkviolet', 'deeppink', 'deepskyblue',
              'dimgray', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite',
              'gold', 'goldenrod',
              'gray', 'green', 'greenyellow', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki',
              'lavender',
              'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan',
              'lightgoldenrodyellow', 'lightgreen', 'lightgray', 'lightpink',
              'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightsteelblue', 'lightyellow', 'lime',
              'limegreen', 'linen', 'magenta',
              'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen',
              'mediumslateblue', 'mediumspringgreen', 'mediumturquoise']

    # Fraction of all points that fall into each cluster.
    sumPoint = len(array)
    proPoint = np.bincount(labels, minlength=k)
    print(proPoint.shape)
    probabilityPoint = proPoint / sumPoint

    # The model was fitted on data scaled with the min/range of ALL points, so
    # the single trajectory must be scaled with those same values. Scaling it
    # by its own extent would place it in a different coordinate frame from the
    # fitted centres (and divides by zero for a trajectory with no extent).
    # NOTE(review): the module only keeps the already-normalised `array`, so
    # the raw matrix is rebuilt here; this assumes pack_tra.allTraToKmeans(tra)
    # returns the same values it did when `array` was created -- confirm, and
    # consider caching the min/range where `array` is normalised.
    rawAll = np.asarray(pack_tra.allTraToKmeans(tra))
    globalMin = rawAll.min(axis=0)
    globalRange = rawAll.max(axis=0) - globalMin
    arrayTra100 = np.divide(np.subtract(pack_tra.traToKMeans(tra_100), globalMin), globalRange)
    label = clf.predict(arrayTra100)

    # One record per trajectory point; element 0 is stored as 'lat', element 1 as 'lng'.
    pointAttribute = [
        {
            'id': count,
            'lat': tra_[0],
            'lng': tra_[1],
            'frequency': probabilityPoint[label[count]],
        }
        for count, tra_ in enumerate(tra_100)
    ]
    pack_tra.save_json("jsonData/pointFrequency.json", pointAttribute)
    print(probabilityPoint)

    # Plot every 100th point in a single scatter call. Colour lookup wraps
    # around so a k larger than the palette cannot index past its end.
    sampledColors = [colors[lbl % len(colors)] for lbl in labels[::100]]
    pyplot.scatter(array[::100, 0], array[::100, 1], c=sampledColors)
    pyplot.scatter(centers[:, 0], centers[:, 1], marker='*', s=100)
    pyplot.show()
# Module-level call with no __main__ guard: the clustering, JSON export and
# plotting in julei() run whenever this file is executed or imported.
julei()