-
Notifications
You must be signed in to change notification settings - Fork 44
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
618 additions
and
1 deletion.
There are no files selected for viewing
96 changes: 96 additions & 0 deletions
96
Artificial Intelligence/zmh/Project2/code/algorithms/KMeans.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
from sklearn.cluster import KMeans as KM | ||
|
||
|
||
class KMeans(object):
    """Plain k-means clustering plus helpers to score and plot the result.

    Centroids are seeded with the first ``k`` samples, then samples are
    assigned to the nearest centroid and centroids are recomputed until they
    stop moving (within ``tolerance``) or ``max_iterations`` is reached.
    """

    def __init__(self, k=3, tolerance=0.0001, max_iterations=500, show_type='bar'):
        """Store the clustering hyper-parameters.

        Args:
            k: number of clusters.
            tolerance: convergence threshold on the summed relative centroid
                shift, expressed in percent.
            max_iterations: upper bound on assignment/update rounds.
            show_type: ``'print'`` or ``'bar'``; selects what ``show`` does.
        """
        self.k = k
        self.tolerance = tolerance
        self.max_iterations = max_iterations
        self.centroids = dict()
        self._show_type = show_type
        # Display names, in the order scores are stored by ``evaluation``.
        self.names = ['my kmeans', 'sklearn kmeans']

    def fit(self, data):
        """Cluster ``data`` and return the cluster index of every sample.

        Args:
            data: sequence of samples (anything ``np.array`` accepts) with at
                least ``k`` entries; the first ``k`` seed the centroids.

        Returns:
            Integer numpy array with one cluster index per sample.
        """
        data = np.array(data)
        labels_ = np.zeros(len(data))
        # Start from a clean dict so centroids of an earlier fit cannot leak in.
        self.centroids = {i: data[i] for i in range(self.k)}

        for _ in range(self.max_iterations):
            self.classes = {j: [] for j in range(self.k)}

            # Assign every sample to its nearest centroid (first one on ties).
            for idx, features in enumerate(data):
                distances = [np.linalg.norm(features - self.centroids[c])
                             for c in self.centroids]
                classification = distances.index(min(distances))
                self.classes[classification].append(features)
                labels_[idx] = classification

            previous = dict(self.centroids)

            # Move each centroid to the mean of its members.  A cluster that
            # attracted no sample keeps its old centroid; averaging an empty
            # list would turn it into NaN.
            for classification, members in self.classes.items():
                if members:
                    self.centroids[classification] = np.average(members, axis=0)

            # Converged once no centroid moved by more than the tolerance.
            # Magnitudes are used so that shifts in opposite directions (or a
            # purely downward shift) cannot cancel out or look like "no move".
            converged = True
            for centroid, curr in self.centroids.items():
                original_centroid = previous[centroid]
                shift = np.sum(np.abs(curr - original_centroid)
                               / (np.abs(original_centroid) + 1e-8) * 100.0)
                if shift > self.tolerance:
                    converged = False
                    break
            if converged:
                break

        return labels_.astype(int)

    def evaluation(self, train_data, train_label, centroids, cmp=False):
        """Score a clustering with four sklearn metrics.

        Args:
            train_data: samples that were clustered.
            train_label: reference labels, used by the completeness score.
            centroids: despite the name, this is forwarded to sklearn as the
                per-sample cluster labels (e.g. the output of ``fit``).
            cmp: when true, also fit sklearn's KMeans on ``train_data`` and
                append its scores after this model's scores.

        Returns:
            Dict mapping ``'db_scores'``, ``'ch_scores'``,
            ``'silhouette_scores'`` and ``'completeness_scores'`` to lists
            holding one score (or two when ``cmp`` is true).
        """
        from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score, completeness_score

        db_scores = [davies_bouldin_score(train_data, centroids)]
        ch_scores = [calinski_harabasz_score(train_data, centroids)]
        silhouette_scores = [silhouette_score(train_data, centroids)]
        completeness_scores = [completeness_score(train_label, centroids)]

        if cmp:
            # NOTE(review): the reference model is fixed at 4 clusters
            # regardless of self.k - confirm this is intended.
            km = KM(n_clusters=4, random_state=1)
            km_centroids = km.fit(train_data).labels_
            db_scores.append(davies_bouldin_score(train_data, km_centroids))
            ch_scores.append(calinski_harabasz_score(train_data, km_centroids))
            silhouette_scores.append(silhouette_score(train_data, km_centroids))
            completeness_scores.append(completeness_score(train_label, km_centroids))

        return {'db_scores': db_scores, 'ch_scores': ch_scores, 'silhouette_scores': silhouette_scores,
                'completeness_scores': completeness_scores}

    def show(self, scores, train_data, centroids):
        """Print the scores or save comparison figures under ``fig/``.

        Args:
            scores: dict as returned by ``evaluation``.
            train_data: samples, embedded with t-SNE for the scatter plots.
            centroids: per-sample cluster labels used to colour the first
                scatter plot.
        """
        if self._show_type == 'print':
            for score_name, score in scores.items():
                for name, s in zip(self.names, score):
                    print('clustering algorithm: ', name)
                    print(score_name + ': ', s)
                    print('=' * 50)
        elif self._show_type == 'bar':
            for score_name, score in scores.items():
                # Only label as many bars as there are scores; with a single
                # score the old call drew it twice, once per algorithm name.
                names = self.names[:len(score)]
                # A list of colours: the 'rgb' shorthand string is rejected by
                # current matplotlib releases.
                plt.bar(range(len(names)), score[:len(names)], color=['r', 'g', 'b'], tick_label=names)
                plt.savefig('fig/cmp_clustering_' + score_name + '.png')
                plt.clf()

            # NOTE(review): the source indentation was lost; the t-SNE plots
            # are assumed to belong to the 'bar' branch - confirm.
            # NOTE(review): reference model fixed at 4 clusters, see evaluation.
            km = KM(n_clusters=4, random_state=1)
            km_centroid = km.fit(train_data).labels_
            from sklearn.manifold import TSNE
            tsne = TSNE(n_components=2, learning_rate=100)
            tsne_data = tsne.fit_transform(train_data)
            plt.scatter(tsne_data[:, 0], tsne_data[:, 1], c=centroids)
            plt.savefig('fig/tsne.png')
            plt.clf()
            plt.scatter(tsne_data[:, 0], tsne_data[:, 1], c=km_centroid)
            plt.savefig('fig/tsne_cmp.png')
            plt.clf()
|
||
|
||
if __name__ == '__main__':
    # Smoke test: cluster six 5-dimensional samples with the default settings
    # and print the resulting cluster index of each sample.
    train_x = [
        [1., 3., 4., 5., 9.],
        [3., 7., 5., 5., 9.],
        [0., 8., 6., 5., 9.],
        [2., 3., 4., 6., 6.],
        [0., 10., 3., 4., 1.],
        [4., 2., 2., 6., 3.],
    ]
    kmeans = KMeans()
    labels = kmeans.fit(train_x)
    print(labels)
131 changes: 131 additions & 0 deletions
131
Artificial Intelligence/zmh/Project2/code/algorithms/KNN.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
import numpy as np | ||
import operator | ||
from sklearn.metrics import confusion_matrix | ||
import matplotlib.pyplot as plt | ||
|
||
|
||
def euclidean_distance(vector1, vector2):
    """Return the L2 (Euclidean) distance between two vectors.

    Both arguments may be numpy arrays or plain sequences; they are coerced
    with ``np.asarray`` so that lists and tuples subtract element-wise.
    """
    diff = np.asarray(vector1) - np.asarray(vector2)
    return np.sqrt(np.sum(np.power(diff, 2)))
|
||
|
||
def absolute_distance(vector1, vector2):
    """Return the L1 (sum of absolute differences) distance of two vectors.

    Both arguments may be numpy arrays or plain sequences; they are coerced
    with ``np.asarray`` so that lists and tuples subtract element-wise.
    """
    diff = np.asarray(vector1) - np.asarray(vector2)
    return np.sum(np.absolute(diff))
|
||
|
||
class KNN(object):
    """k-nearest-neighbour classifier using an unweighted majority vote."""

    def __init__(self, k=5, dist_type='l2'):
        """Store ``k`` and select the distance function.

        Args:
            k: number of neighbours consulted per prediction.
            dist_type: ``'l2'`` for ``euclidean_distance`` or ``'l1'`` for
                ``absolute_distance``.

        Raises:
            NotImplementedError: for any other ``dist_type``.
        """
        self.k = k
        if dist_type == 'l2':
            self.distance = euclidean_distance
        elif dist_type == 'l1':
            self.distance = absolute_distance
        else:
            raise NotImplementedError('unsupported dist_type: {!r}'.format(dist_type))

    def neighbours(self, train_x, X_test_instance, k):
        """Return the indices of the ``k`` training rows closest to a sample.

        Indices are ordered by increasing distance; ties keep training order
        because the sort is stable.  When ``k`` exceeds the number of training
        rows, all rows are returned instead of raising ``IndexError``.
        """
        train_x = np.asarray(train_x)
        distances = [(i, self.distance(row, X_test_instance))
                     for i, row in enumerate(train_x)]
        distances.sort(key=operator.itemgetter(1))
        return [i for i, _ in distances[:max(k, 0)]]

    def _predict(self, output, train_y, weights=None):
        """Return the most frequent label among the neighbour indices.

        Args:
            output: neighbour indices as returned by ``neighbours``.
            train_y: training labels, indexed by those indices.
            weights: unused here; kept so subclasses can share the signature.
        """
        class_votes = {}
        for idx in output:
            label = train_y[idx]
            class_votes[label] = class_votes.get(label, 0) + 1
        # Stable descending sort: on a tie the label seen first wins.
        sorted_votes = sorted(class_votes.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_votes[0][0]

    def predict(self, train_x, test_x, train_y):
        """Classify every row of ``test_x`` and return the labels as a list."""
        train_x = np.asarray(train_x)
        test_x = np.asarray(test_x)
        return [self._predict(self.neighbours(train_x, sample, self.k), train_y)
                for sample in test_x]

    def confusion_matrix(self, test_y, pred_y, cmp=False):
        """Save a row-normalised confusion-matrix image for labels 0 and 1.

        The figure goes to ``fig/confusion_matrix_knn.png`` when ``cmp`` is
        true and to ``fig/confusion_matrix.png`` otherwise.
        """
        cm = confusion_matrix(test_y, pred_y, labels=[0, 1])
        # Normalise each row by its number of true samples; a class without
        # any true sample stays all-zero instead of becoming NaN.
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
        plt.imshow(cm, interpolation='nearest')
        plt.title('confusion matrix')
        plt.colorbar()
        labels_name = ['no', 'yes']
        num_local = np.array(range(2))
        plt.xticks(num_local, labels_name, rotation=90)
        plt.yticks(num_local, labels_name)
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        if cmp:
            plt.savefig('fig/confusion_matrix_knn.png')
        else:
            plt.savefig('fig/confusion_matrix.png')
        # Reset the figure so a later plot does not stack a second image and
        # colourbar on top of this one.
        plt.clf()

    def confusion_matrix_cmp(self, train_x, test_x, train_y, test_y):
        """Plot the confusion matrix of sklearn's KNeighborsClassifier."""
        from sklearn.neighbors import KNeighborsClassifier
        # NOTE(review): the reference model always uses 5 neighbours, not
        # self.k - confirm this is intended.
        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(train_x, train_y)
        pred_y = knn.predict(test_x)
        self.confusion_matrix(test_y, pred_y, cmp=True)
|
||
|
||
class WKNN(KNN):
    """Distance-weighted k-nearest-neighbour classifier.

    Each neighbour votes with a weight that decreases with its distance to
    the query sample instead of casting one vote per neighbour.
    """

    def __init__(self, k=5, dist_type='l2'):
        """Forward ``k`` and ``dist_type`` to ``KNN.__init__``."""
        super(WKNN, self).__init__(k=k, dist_type=dist_type)

    def neighbours(self, train_x, X_test_instance, k):
        """Return the ``k`` nearest training indices and their vote weights.

        Returns:
            ``(neighbors, weights)`` where ``neighbors`` is a list of indices
            ordered by increasing distance and ``weights`` is a numpy array of
            the same length.  Raw weights are ``1 / (distance + 1e-3)``; they
            are normalised to sum to one and scaled by ``self.k * 0.1``.
        """
        train_x = np.asarray(train_x)
        distances = [(i, self.distance(row, X_test_instance))
                     for i, row in enumerate(train_x)]
        distances.sort(key=operator.itemgetter(1))
        nearest = distances[:max(k, 0)]
        neighbors = [i for i, _ in nearest]
        # The epsilon belongs in the denominator: it is what keeps an exact
        # match (distance 0) from producing an infinite weight, which would
        # turn every normalised weight into NaN or 0.
        weights = np.array([1 / (dist + 1e-3) for _, dist in nearest])
        weights = (weights / weights.sum()) * self.k * 0.1
        return neighbors, weights

    def _predict(self, output, train_y, weights=None):
        """Return the label with the largest summed neighbour weight.

        Args:
            output: neighbour indices as returned by ``neighbours``.
            train_y: training labels, indexed by those indices.
            weights: one weight per neighbour; when omitted every neighbour
                counts equally.
        """
        if weights is None:
            weights = np.ones(len(output))
        class_votes = {}
        for idx, weight in zip(output, weights):
            label = train_y[idx]
            class_votes[label] = class_votes.get(label, 0) + weight
        # Stable descending sort: on a tie the label seen first wins.
        sorted_votes = sorted(class_votes.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_votes[0][0]

    def predict(self, train_x, test_x, train_y):
        """Classify every row of ``test_x`` and return the labels as a list."""
        train_x = np.asarray(train_x)
        test_x = np.asarray(test_x)
        output_classes = []
        for sample in test_x:
            output, weights = self.neighbours(train_x, sample, self.k)
            output_classes.append(self._predict(output, train_y, weights))
        return output_classes

    def confusion_matrix(self, test_y, pred_y, cmp=False):
        """Save a row-normalised confusion-matrix image for labels 0 and 1.

        The figure goes to ``fig/confusion_matrix_knn.png`` when ``cmp`` is
        true and to ``fig/confusion_matrix_w.png`` otherwise.
        """
        cm = confusion_matrix(test_y, pred_y, labels=[0, 1])
        # Normalise each row by its number of true samples; a class without
        # any true sample stays all-zero instead of becoming NaN.
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
        plt.imshow(cm, interpolation='nearest')
        plt.title('confusion matrix')
        plt.colorbar()
        labels_name = ['no', 'yes']
        num_local = np.array(range(2))
        plt.xticks(num_local, labels_name, rotation=90)
        plt.yticks(num_local, labels_name)
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        if cmp:
            plt.savefig('fig/confusion_matrix_knn.png')
        else:
            plt.savefig('fig/confusion_matrix_w.png')
        # Reset the figure so a later plot does not stack a second image and
        # colourbar on top of this one.
        plt.clf()
89 changes: 89 additions & 0 deletions
89
Artificial Intelligence/zmh/Project2/code/classification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
from sklearn.model_selection import cross_val_score, cross_validate | ||
from sklearn.metrics import classification_report | ||
|
||
|
||
class ClassifyMethods(object):
    """Train and compare a fixed set of scikit-learn classifiers."""

    def __init__(self, k_fold_num=5, show=True, show_type='print'):
        """Build the classifier set.

        Args:
            k_fold_num: number of cross-validation folds.
            show: whether ``train_all`` reports the accuracies via ``show``.
            show_type: ``'print'`` or ``'bar'``; selects what ``show`` does.
        """
        self._show = show
        self._show_type = show_type
        self.k_fold_num = k_fold_num
        self.classifiers = {'KNN': self.knn_classifier(),
                            'LR': self.logistic_regression_classifier(),
                            'RF': self.random_forest_classifier(),
                            'DT': self.decision_tree_classifier(),
                            'SVM': self.svm_classifier(),
                            'GBDT': self.gradient_boosting_classifier()
                            }
        # Derived from the dict so the display order can never drift from the
        # order in which train_all collects the scores.
        self.names = list(self.classifiers)

    # KNN Classifier
    def knn_classifier(self):
        """Return an unfitted KNeighborsClassifier with 5 neighbours."""
        from sklearn.neighbors import KNeighborsClassifier
        clf = KNeighborsClassifier(n_neighbors=5)
        return clf

    # Logistic Regression Classifier
    def logistic_regression_classifier(self):
        """Return an unfitted L2-penalised LogisticRegression."""
        from sklearn.linear_model import LogisticRegression
        clf = LogisticRegression(penalty='l2', max_iter=1000)
        return clf

    # Random Forest Classifier
    def random_forest_classifier(self):
        """Return an unfitted RandomForestClassifier with 8 trees."""
        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier(n_estimators=8)
        return clf

    # Decision Tree Classifier
    def decision_tree_classifier(self):
        """Return an unfitted DecisionTreeClassifier with default settings."""
        from sklearn import tree
        clf = tree.DecisionTreeClassifier()
        return clf

    # GBDT(Gradient Boosting Decision Tree) Classifier
    def gradient_boosting_classifier(self):
        """Return an unfitted GradientBoostingClassifier with 200 stages."""
        from sklearn.ensemble import GradientBoostingClassifier
        clf = GradientBoostingClassifier(n_estimators=200)
        return clf

    # SVM Classifier
    def svm_classifier(self):
        """Return an unfitted RBF-kernel SVC with probability estimates on."""
        from sklearn.svm import SVC
        clf = SVC(kernel='rbf', probability=True)
        return clf

    def train_all(self, train_x, test_x, train_y, test_y):
        """Cross-validate, fit and report every classifier.

        For each classifier the mean cross-validated accuracy on the training
        data is collected, the model is fitted on the full training data and a
        classification report for the test data is printed.  When ``show`` was
        enabled the collected accuracies are passed to ``show`` afterwards.
        """
        scores = []
        for name, model in self.classifiers.items():
            scores.append(cross_val_score(model, train_x, train_y, cv=self.k_fold_num, scoring='accuracy').mean())
            print('classification algorithm: ', name)
            model.fit(train_x, train_y)
            pred_y = model.predict(test_x)
            print(classification_report(y_true=test_y, y_pred=pred_y))
        if self._show:
            self.show(scores)

    def cv_all(self, train_x, train_y):
        """Print cross-validation results of every classifier.

        The metrics requested are accuracy, precision, recall, f1 and roc_auc.
        """
        scoring = {'accuracy': 'accuracy',
                   'precision': 'precision',
                   'recall': 'recall',
                   'f1': 'f1',
                   'roc_auc': 'roc_auc'}
        for name, model in self.classifiers.items():
            print('classification algorithm: ', name)
            print(cross_validate(model, train_x, train_y, cv=self.k_fold_num, scoring=scoring))

    def show(self, scores):
        """Print the accuracies or save them as ``fig/classification.png``.

        Args:
            scores: one accuracy per classifier, in the order of ``names``.

        Raises:
            NotImplementedError: when ``show_type`` is neither ``'print'`` nor
                ``'bar'``.
        """
        if self._show_type == 'print':
            for name, score in zip(self.names, scores):
                print('classification algorithm: ', name)
                print('accuracy: {:.3f}'.format(score))
                print('=' * 50)
        elif self._show_type == 'bar':
            import matplotlib.pyplot as plt
            # A list of colours (cycled over the bars): the 'rgb' shorthand
            # string is rejected by current matplotlib releases.
            plt.bar(range(len(self.names)), scores, color=['r', 'g', 'b'], tick_label=self.names)
            plt.savefig('fig/classification.png')
            # Reset the figure so later plots start from an empty canvas.
            plt.clf()
        else:
            raise NotImplementedError
|
Oops, something went wrong.