Commit 735b0d8
ai project2
Mehooz committed Jun 11, 2020
1 parent 3a318d3 commit 735b0d8
Showing 8 changed files with 618 additions and 1 deletion.
96 changes: 96 additions & 0 deletions Artificial Intelligence/zmh/Project2/code/algorithms/KMeans.py
@@ -0,0 +1,96 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as KM


class KMeans(object):
    def __init__(self, k=3, tolerance=0.0001, max_iterations=500, show_type='bar'):
        self.k = k
        self.tolerance = tolerance
        self.max_iterations = max_iterations
        self.centroids = dict()
        self._show_type = show_type
        self.names = ['my kmeans', 'sklearn kmeans']

    def fit(self, data):
        data = np.array(data)
        labels_ = np.zeros(len(data))
        # initialise the centroids with the first k data points
        for i in range(self.k):
            self.centroids[i] = data[i]
        for i in range(self.max_iterations):
            self.classes = {}
            for j in range(self.k):
                self.classes[j] = []
            # find the distance between the point and each centroid; assign the point to the nearest one
            for k, features in enumerate(data):
                distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classes[classification].append(features)
                labels_[k] = classification

            previous = dict(self.centroids)
            flag = True
            # average the cluster data points to re-calculate the centroids
            for classification in self.classes:
                self.centroids[classification] = np.average(self.classes[classification], axis=0)
            for centroid in self.centroids:
                original_centroid = previous[centroid]
                curr = self.centroids[centroid]
                # use the absolute percentage shift so positive and negative changes cannot cancel out
                if np.sum(np.abs((curr - original_centroid) / (original_centroid + 1e-8)) * 100.0) > self.tolerance:
                    flag = False
            # converged: every centroid moved less than the tolerance
            if flag:
                break
        return labels_.astype(int)

    def evaluation(self, train_data, train_label, centroids, cmp=False):
        # `centroids` here is the array of predicted cluster labels returned by fit()
        from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score, completeness_score

        db_scores = [davies_bouldin_score(train_data, centroids)]
        ch_scores = [calinski_harabasz_score(train_data, centroids)]
        silhouette_scores = [silhouette_score(train_data, centroids)]
        completeness_scores = [completeness_score(train_label, centroids)]

        if cmp:
            # compare against sklearn's KMeans on the same data
            km = KM(n_clusters=4, random_state=1)
            km_centroids = km.fit(train_data).labels_
            db_scores.append(davies_bouldin_score(train_data, km_centroids))
            ch_scores.append(calinski_harabasz_score(train_data, km_centroids))
            silhouette_scores.append(silhouette_score(train_data, km_centroids))
            completeness_scores.append(completeness_score(train_label, km_centroids))

        return {'db_scores': db_scores, 'ch_scores': ch_scores, 'silhouette_scores': silhouette_scores,
                'completeness_scores': completeness_scores}

    def show(self, scores, train_data, centroids):
        if self._show_type == 'print':
            for score_name, score in scores.items():
                for name, s in zip(self.names, score):
                    print('clustering algorithm: ', name)
                    print(score_name + ': ', s)
                print('=' * 50)
        elif self._show_type == 'bar':
            for score_name, score in scores.items():
                plt.bar(range(len(self.names)), score, color='rgb', tick_label=self.names)
                plt.savefig('fig/cmp_clustering_' + score_name + '.png')
                plt.clf()

            # project the data to 2-D with t-SNE and plot both clusterings
            km = KM(n_clusters=4, random_state=1)
            km_centroid = km.fit(train_data).labels_
            from sklearn.manifold import TSNE
            tsne = TSNE(n_components=2, learning_rate=100)
            tsne_data = tsne.fit_transform(train_data)
            plt.scatter(tsne_data[:, 0], tsne_data[:, 1], c=centroids)
            plt.savefig('fig/tsne.png')
            plt.clf()
            plt.scatter(tsne_data[:, 0], tsne_data[:, 1], c=km_centroid)
            plt.savefig('fig/tsne_cmp.png')
            plt.clf()


if __name__ == '__main__':
    kmeans = KMeans()
    train_x = [[1., 3., 4., 5., 9.], [3., 7., 5., 5., 9.], [0., 8., 6., 5., 9.],
               [2., 3., 4., 6., 6.], [0., 10., 3., 4., 1.], [4., 2., 2., 6., 3.]]
    labels = kmeans.fit(train_x)
    print(labels)
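
A minimal usage sketch for the evaluation and plotting paths (not part of this commit): it assumes a labelled dataset is loaded elsewhere by a hypothetical load_data() helper and that a fig/ directory exists for the saved plots.

# hypothetical usage sketch -- load_data() and the fig/ directory are assumptions, not part of this commit
train_x, train_y = load_data()                      # feature matrix and ground-truth labels
kmeans = KMeans(k=4, show_type='bar')
pred = kmeans.fit(train_x)                          # one predicted cluster label per sample
scores = kmeans.evaluation(train_x, train_y, pred, cmp=True)
kmeans.show(scores, train_x, pred)                  # bar charts + t-SNE scatter plots under fig/
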
131 changes: 131 additions & 0 deletions Artificial Intelligence/zmh/Project2/code/algorithms/KNN.py
@@ -0,0 +1,131 @@
import numpy as np
import operator
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt


def euclidean_distance(vector1, vector2):
    return np.sqrt(np.sum(np.power(vector1 - vector2, 2)))


def absolute_distance(vector1, vector2):
    return np.sum(np.absolute(vector1 - vector2))


class KNN(object):
    def __init__(self, k=5, dist_type='l2'):
        self.k = k
        if dist_type == 'l2':
            self.distance = euclidean_distance
        elif dist_type == 'l1':
            self.distance = absolute_distance
        else:
            raise NotImplementedError

    def neighbours(self, train_x, X_test_instance, k):
        distances = []
        neighbors = []
        for i in range(0, train_x.shape[0]):
            dist = self.distance(train_x[i], X_test_instance)
            distances.append((i, dist))
        distances.sort(key=operator.itemgetter(1))
        for x in range(k):
            neighbors.append(distances[x][0])
        return neighbors

    def _predict(self, output, train_y, weights=None):
        # unweighted majority vote over the neighbours' labels
        class_votes = {}
        for i in range(len(output)):
            if train_y[output[i]] in class_votes:
                class_votes[train_y[output[i]]] += 1
            else:
                class_votes[train_y[output[i]]] = 1
        sorted_votes = sorted(class_votes.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_votes[0][0]

    def predict(self, train_x, test_x, train_y):
        output_classes = []
        for i in range(0, test_x.shape[0]):
            output = self.neighbours(train_x, test_x[i], self.k)
            p_class = self._predict(output, train_y)
            output_classes.append(p_class)
        return output_classes

    def confusion_matrix(self, test_y, pred_y, cmp=False):
        # row-normalised confusion matrix for the two classes
        cm = confusion_matrix(test_y, pred_y, labels=range(2))
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        plt.imshow(cm, interpolation='nearest')
        plt.title('confusion matrix')
        plt.colorbar()
        labels_name = ['no', 'yes']
        num_local = np.array(range(2))
        plt.xticks(num_local, labels_name, rotation=90)
        plt.yticks(num_local, labels_name)
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        if cmp:
            plt.savefig('fig/confusion_matrix_knn.png')
        else:
            plt.savefig('fig/confusion_matrix.png')

    def confusion_matrix_cmp(self, train_x, test_x, train_y, test_y):
        # same plot for sklearn's KNeighborsClassifier, for comparison
        from sklearn.neighbors import KNeighborsClassifier
        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(train_x, train_y)
        pred_y = knn.predict(test_x)
        self.confusion_matrix(test_y, pred_y, cmp=True)


class WKNN(KNN):
    def __init__(self, k=5, dist_type='l2'):
        super(WKNN, self).__init__(k=k, dist_type=dist_type)

    def neighbours(self, train_x, X_test_instance, k):
        distances = []
        neighbors = []
        weights = []
        for i in range(0, train_x.shape[0]):
            dist = self.distance(train_x[i], X_test_instance)
            distances.append((i, dist))
        distances.sort(key=operator.itemgetter(1))
        for x in range(k):
            neighbors.append(distances[x][0])
            # inverse-distance weight; the epsilon keeps the division finite for zero distances
            weights.append(1 / (distances[x][1] + 1e-3))
        weights = np.array(weights)
        weights = (weights / weights.sum()) * self.k * 0.1
        return neighbors, weights

    def _predict(self, output, train_y, weights=None):
        # distance-weighted vote over the neighbours' labels
        class_votes = {}
        for i in range(len(output)):
            if train_y[output[i]] in class_votes:
                class_votes[train_y[output[i]]] += weights[i]
            else:
                class_votes[train_y[output[i]]] = weights[i]
        sorted_votes = sorted(class_votes.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_votes[0][0]

    def predict(self, train_x, test_x, train_y):
        output_classes = []
        for i in range(0, test_x.shape[0]):
            output, weights = self.neighbours(train_x, test_x[i], self.k)
            p_class = self._predict(output, train_y, weights)
            output_classes.append(p_class)
        return output_classes

    def confusion_matrix(self, test_y, pred_y, cmp=False):
        cm = confusion_matrix(test_y, pred_y, labels=range(2))
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        plt.imshow(cm, interpolation='nearest')
        plt.title('confusion matrix')
        plt.colorbar()
        labels_name = ['no', 'yes']
        num_local = np.array(range(2))
        plt.xticks(num_local, labels_name, rotation=90)
        plt.yticks(num_local, labels_name)
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        if cmp:
            plt.savefig('fig/confusion_matrix_knn.png')
        else:
            plt.savefig('fig/confusion_matrix_w.png')
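
A minimal usage sketch for the two classifiers on toy data (not part of this commit); the arrays below are illustrative only, not the project's preprocessed dataset.

# hypothetical usage sketch -- toy data, not the project's real dataset
import numpy as np

train_x = np.array([[0., 1.], [1., 0.], [5., 5.], [6., 5.]])
train_y = np.array([0, 0, 1, 1])
test_x = np.array([[0.5, 0.5], [5.5, 5.]])

knn = KNN(k=3, dist_type='l2')
print(knn.predict(train_x, test_x, train_y))    # plain majority vote

wknn = WKNN(k=3, dist_type='l1')
print(wknn.predict(train_x, test_x, train_y))   # inverse-distance-weighted vote
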
89 changes: 89 additions & 0 deletions Artificial Intelligence/zmh/Project2/code/classification.py
@@ -0,0 +1,89 @@
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import classification_report


class ClassifyMethods(object):

    def __init__(self, k_fold_num=5, show=True, show_type='print'):
        self._show = show
        self._show_type = show_type
        self.k_fold_num = k_fold_num
        self.names = ['KNN', 'LR', 'RF', 'DT', 'SVM', 'GBDT']
        self.classifiers = {'KNN': self.knn_classifier(),
                            'LR': self.logistic_regression_classifier(),
                            'RF': self.random_forest_classifier(),
                            'DT': self.decision_tree_classifier(),
                            'SVM': self.svm_classifier(),
                            'GBDT': self.gradient_boosting_classifier()
                            }

    # KNN Classifier
    def knn_classifier(self):
        from sklearn.neighbors import KNeighborsClassifier
        clf = KNeighborsClassifier(n_neighbors=5)
        return clf

    # Logistic Regression Classifier
    def logistic_regression_classifier(self):
        from sklearn.linear_model import LogisticRegression
        clf = LogisticRegression(penalty='l2', max_iter=1000)
        return clf

    # Random Forest Classifier
    def random_forest_classifier(self):
        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier(n_estimators=8)
        return clf

    # Decision Tree Classifier
    def decision_tree_classifier(self):
        from sklearn import tree
        clf = tree.DecisionTreeClassifier()
        return clf

    # GBDT (Gradient Boosting Decision Tree) Classifier
    def gradient_boosting_classifier(self):
        from sklearn.ensemble import GradientBoostingClassifier
        clf = GradientBoostingClassifier(n_estimators=200)
        return clf

    # SVM Classifier
    def svm_classifier(self):
        from sklearn.svm import SVC
        clf = SVC(kernel='rbf', probability=True)
        return clf

    def train_all(self, train_x, test_x, train_y, test_y):
        # cross-validated accuracy on the training split plus a classification report on the test split
        scores = []
        for name, model in self.classifiers.items():
            scores.append(cross_val_score(model, train_x, train_y, cv=self.k_fold_num, scoring='accuracy').mean())
            print('classification algorithm: ', name)
            model.fit(train_x, train_y)
            pred_y = model.predict(test_x)
            print(classification_report(y_true=test_y, y_pred=pred_y))
        if self._show:
            self.show(scores)

    def cv_all(self, train_x, train_y):
        scoring = {'accuracy': 'accuracy',
                   'precision': 'precision',
                   'recall': 'recall',
                   'f1': 'f1',
                   'roc_auc': 'roc_auc'}
        for name, model in self.classifiers.items():
            print('classification algorithm: ', name)
            print(cross_validate(model, train_x, train_y, cv=self.k_fold_num, scoring=scoring))

    def show(self, scores):
        if self._show_type == 'print':
            for name, score in zip(self.names, scores):
                print('classification algorithm: ', name)
                print('accuracy: {:.3f}'.format(score))
                print('=' * 50)
        elif self._show_type == 'bar':
            import matplotlib.pyplot as plt
            plt.bar(range(len(self.names)), scores, color='rgb', tick_label=self.names)
            plt.savefig('fig/classification.png')
        else:
            raise NotImplementedError
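
A minimal usage sketch (not part of this commit): the synthetic dataset below stands in for the project's preprocessed data, which is assumed to be loaded elsewhere.

# hypothetical usage sketch -- synthetic data via sklearn, not the project's real dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=0)

cm = ClassifyMethods(k_fold_num=5, show=True, show_type='print')
cm.train_all(train_x, test_x, train_y, test_y)   # per-model CV accuracy + held-out classification report
cm.cv_all(train_x, train_y)                      # full cross_validate metrics per model
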

