ai project2

Starrah · Jun 11, 2020 · 735b0d8 · 735b0d8
1 parent 3a318d3
commit 735b0d8
Show file tree

Hide file tree

Showing 8 changed files with 618 additions and 1 deletion.
diff --git a/Artificial Intelligence/zmh/Project2/code/algorithms/KMeans.py b/Artificial Intelligence/zmh/Project2/code/algorithms/KMeans.py
@@ -0,0 +1,96 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans as KM
+
+
+class KMeans(object):
+    """Minimal k-means clustering plus helpers to score and plot the result.
+
+    The sklearn implementation (imported as ``KM``) is used only as a
+    baseline in ``evaluation`` and ``show``.
+    """
+
+    def __init__(self, k=3, tolerance=0.0001, max_iterations=500, show_type='bar'):
+        """Store the clustering hyper-parameters.
+
+        :param k: number of clusters.
+        :param tolerance: iteration stops once every centroid's summed relative
+            movement (in percent) is at or below this value.
+        :param max_iterations: upper bound on assignment/update rounds.
+        :param show_type: ``'print'`` or ``'bar'``; selects how ``show`` reports scores.
+        """
+        self.k = k
+        self.tolerance = tolerance
+        self.max_iterations = max_iterations
+        self.centroids = dict()
+        self._show_type = show_type
+        self.names = ['my kmeans', 'sklearn kmeans']
+
+    def fit(self, data):
+        """Cluster ``data`` and return one integer cluster index per row.
+
+        The first ``k`` rows of ``data`` are used as the initial centroids.
+        After the call ``self.centroids`` maps cluster index to centroid and
+        ``self.classes`` maps cluster index to the list of member rows.
+
+        :param data: sequence of equally sized feature vectors.
+        :return: integer numpy array with the cluster index of every row.
+        """
+        data = np.array(data)
+        labels_ = np.zeros(len(data))
+        # Rebuild the centroid table so an earlier fit (possibly with a
+        # different k) cannot leave stale entries behind.
+        self.centroids = {i: data[i] for i in range(self.k)}
+        for _ in range(self.max_iterations):
+            self.classes = {j: [] for j in range(self.k)}
+            # find the distance between the point and cluster; choose the nearest centroid
+            for row, features in enumerate(data):
+                distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
+                classification = distances.index(min(distances))
+                self.classes[classification].append(features)
+                labels_[row] = classification
+
+            previous = dict(self.centroids)
+            # average the cluster data points to re-calculate the centroids
+            for classification, members in self.classes.items():
+                # An empty cluster keeps its old centroid; averaging an empty
+                # list would yield NaN and corrupt every later distance.
+                if members:
+                    self.centroids[classification] = np.average(members, axis=0)
+
+            converged = True
+            for centroid in self.centroids:
+                original_centroid = previous[centroid]
+                curr = self.centroids[centroid]
+                # Use magnitudes: signed shifts can cancel out or be negative,
+                # which would end the iteration before the centroids settle.
+                shift = np.sum(np.abs(curr - original_centroid) / (np.abs(original_centroid) + 1e-8) * 100.0)
+                if shift > self.tolerance:
+                    converged = False
+                    break
+            if converged:
+                break
+        return labels_.astype(int)
+
+    def evaluation(self, train_data, train_label, centroids, cmp=False):
+        """Score a clustering with four sklearn metrics.
+
+        :param train_data: the samples that were clustered.
+        :param train_label: reference labels, used for the completeness score.
+        :param centroids: despite the name, the predicted cluster label of each
+            sample (it is passed as the ``labels`` argument of the metrics).
+        :param cmp: when true, also fit sklearn's KMeans with the same number of
+            clusters and append its scores after this model's.
+        :return: dict mapping score name to a list of one or two scores.
+        """
+        from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score, completeness_score
+
+        db_scores = [davies_bouldin_score(train_data, centroids)]
+        ch_scores = [calinski_harabasz_score(train_data, centroids)]
+        silhouette_scores = [silhouette_score(train_data, centroids)]
+        completeness_scores = [completeness_score(train_label, centroids)]
+
+        if cmp:
+            # Compare against the same number of clusters as this model.
+            km = KM(n_clusters=self.k, random_state=1)
+            km_centroids = km.fit(train_data).labels_
+            db_scores.append(davies_bouldin_score(train_data, km_centroids))
+            ch_scores.append(calinski_harabasz_score(train_data, km_centroids))
+            silhouette_scores.append(silhouette_score(train_data, km_centroids))
+            completeness_scores.append(completeness_score(train_label, km_centroids))
+
+        return {'db_scores': db_scores, 'ch_scores': ch_scores, 'silhouette_scores': silhouette_scores,
+                'completeness_scores': completeness_scores}
+
+    def show(self, scores, train_data, centroids):
+        """Report ``scores`` and save t-SNE scatter plots of both clusterings.
+
+        :param scores: dict as returned by ``evaluation``.
+        :param train_data: the samples that were clustered.
+        :param centroids: predicted cluster label of each sample (used as colours).
+        """
+        import os
+        # All figures are written below 'fig/'; create it so savefig cannot fail.
+        os.makedirs('fig', exist_ok=True)
+
+        if self._show_type == 'print':
+            for score_name, score in scores.items():
+                for name, s in zip(self.names, score):
+                    print('clustering algorithm: ', name)
+                    print(score_name + ': ', s)
+                    print('=' * 50)
+        elif self._show_type == 'bar':
+            for score_name, score in scores.items():
+                # Only label the bars that exist (one when no comparison was run).
+                names = self.names[:len(score)]
+                # An explicit colour list: a 'rgb' string is no longer accepted
+                # by matplotlib as a sequence of single-letter colours.
+                plt.bar(range(len(names)), score, color=['r', 'g', 'b'], tick_label=names)
+                plt.savefig('fig/cmp_clustering_' + score_name + '.png')
+                plt.clf()
+
+        km = KM(n_clusters=self.k, random_state=1)
+        km_centroid = km.fit(train_data).labels_
+        from sklearn.manifold import TSNE
+        tsne = TSNE(n_components=2, learning_rate=100)
+        tsne_data = tsne.fit_transform(train_data)
+        plt.scatter(tsne_data[:, 0], tsne_data[:, 1], c=centroids)
+        plt.savefig('fig/tsne.png')
+        plt.clf()
+        plt.scatter(tsne_data[:, 0], tsne_data[:, 1], c=km_centroid)
+        plt.savefig('fig/tsne_cmp.png')
+        plt.clf()
+
+
+if __name__ == '__main__':
+    # Smoke run: cluster six 5-dimensional samples with the default settings
+    # and print the resulting cluster index of every sample.
+    samples = [
+        [1., 3., 4., 5., 9.],
+        [3., 7., 5., 5., 9.],
+        [0., 8., 6., 5., 9.],
+        [2., 3., 4., 6., 6.],
+        [0., 10., 3., 4., 1.],
+        [4., 2., 2., 6., 3.],
+    ]
+    model = KMeans()
+    assignments = model.fit(samples)
+    print(assignments)
diff --git a/Artificial Intelligence/zmh/Project2/code/algorithms/KNN.py b/Artificial Intelligence/zmh/Project2/code/algorithms/KNN.py
@@ -0,0 +1,131 @@
+import numpy as np
+import operator
+from sklearn.metrics import confusion_matrix
+import matplotlib.pyplot as plt
+
+
+def euclidean_distance(vector1, vector2):
+    """Return the square root of the summed squared element-wise differences."""
+    squared_gap = np.power(vector1 - vector2, 2)
+    total = np.sum(squared_gap)
+    return np.sqrt(total)
+
+
+def absolute_distance(vector1, vector2):
+    """Return the sum of the absolute element-wise differences (L1 distance)."""
+    gap = np.absolute(vector1 - vector2)
+    return np.sum(gap)
+
+
+class KNN(object):
+    """k-nearest-neighbours classifier using an unweighted majority vote."""
+
+    def __init__(self, k=5, dist_type='l2'):
+        """Choose the neighbour count and the distance function.
+
+        :param k: number of neighbours that vote.
+        :param dist_type: ``'l2'`` (Euclidean) or ``'l1'`` (absolute) distance.
+        :raises NotImplementedError: for any other ``dist_type``.
+        """
+        self.k = k
+        if dist_type == 'l2':
+            self.distance = euclidean_distance
+        elif dist_type == 'l1':
+            self.distance = absolute_distance
+        else:
+            raise NotImplementedError('unsupported dist_type: {!r}'.format(dist_type))
+
+    def neighbours(self, train_x, X_test_instance, k):
+        """Return the row indices of the ``k`` training samples nearest to one test sample.
+
+        Indices are ordered from nearest to farthest; ties keep training order.
+        When fewer than ``k`` training rows exist, all of them are returned.
+        """
+        distances = [(i, self.distance(train_x[i], X_test_instance)) for i in range(train_x.shape[0])]
+        distances.sort(key=operator.itemgetter(1))
+        # Slicing (rather than indexing 0..k-1) tolerates k > number of rows.
+        return [index for index, _ in distances[:k]]
+
+    def _predict(self, output, train_y, weights=None):
+        """Return the most frequent label among the neighbours in ``output``.
+
+        ``weights`` is accepted for signature compatibility and ignored here.
+        On a tie the label that was encountered first wins.
+        """
+        class_votes = {}
+        for index in output:
+            label = train_y[index]
+            class_votes[label] = class_votes.get(label, 0) + 1
+        sorted_votes = sorted(class_votes.items(), key=operator.itemgetter(1), reverse=True)
+        return sorted_votes[0][0]
+
+    def predict(self, train_x, test_x, train_y):
+        """Predict a label for every row of ``test_x``.
+
+        :return: list with one predicted label per test row.
+        """
+        output_classes = []
+        for i in range(test_x.shape[0]):
+            output = self.neighbours(train_x, test_x[i], self.k)
+            output_classes.append(self._predict(output, train_y))
+        return output_classes
+
+    def confusion_matrix(self, test_y, pred_y, cmp=False):
+        """Plot the row-normalised 2x2 confusion matrix and save it below ``fig/``.
+
+        :param test_y: true labels.
+        :param pred_y: predicted labels.
+        :param cmp: when true the figure is saved as the sklearn comparison
+            plot, otherwise as this model's own plot.
+        """
+        import os
+        # The bare name below is the sklearn function imported at module level;
+        # this method does not shadow it inside its own body.
+        cm = confusion_matrix(test_y, pred_y, labels=range(2))
+        row_totals = cm.sum(axis=1)[:, np.newaxis]
+        # A class that never occurs in test_y has a zero row total; leave that
+        # row at 0 instead of dividing by zero.
+        cm = np.divide(cm.astype('float'), row_totals,
+                       out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
+        # Start from an empty canvas so repeated calls do not stack colorbars.
+        plt.clf()
+        plt.imshow(cm, interpolation='nearest')
+        plt.title('confusion matrix')
+        plt.colorbar()
+        labels_name = ['no', 'yes']
+        num_local = np.array(range(2))
+        plt.xticks(num_local, labels_name, rotation=90)
+        plt.yticks(num_local, labels_name)
+        plt.ylabel('True label')
+        plt.xlabel('Predicted label')
+        os.makedirs('fig', exist_ok=True)
+        if cmp:
+            plt.savefig('fig/confusion_matrix_knn.png')
+        else:
+            plt.savefig('fig/confusion_matrix.png')
+
+    def confusion_matrix_cmp(self, train_x, test_x, train_y, test_y):
+        """Plot the confusion matrix of sklearn's KNeighborsClassifier for comparison.
+
+        The baseline uses the same neighbour count as this model.
+        """
+        from sklearn.neighbors import KNeighborsClassifier
+        knn = KNeighborsClassifier(n_neighbors=self.k)
+        knn.fit(train_x, train_y)
+        pred_y = knn.predict(test_x)
+        self.confusion_matrix(test_y, pred_y, cmp=True)
+
+
+class WKNN(KNN):
+    """Distance-weighted k-nearest-neighbours: closer neighbours get larger votes."""
+
+    def __init__(self, k=5, dist_type='l2'):
+        """Forward ``k`` and ``dist_type`` to the ``KNN`` constructor."""
+        super(WKNN, self).__init__(k=k, dist_type=dist_type)
+
+    def neighbours(self, train_x, X_test_instance, k):
+        """Return the nearest training rows together with their vote weights.
+
+        :return: ``(neighbors, weights)`` where ``neighbors`` lists up to ``k``
+            row indices from nearest to farthest and ``weights`` is a numpy
+            array of normalised inverse-distance weights scaled by
+            ``self.k * 0.1``.
+        """
+        distances = [(i, self.distance(train_x[i], X_test_instance)) for i in range(train_x.shape[0])]
+        distances.sort(key=operator.itemgetter(1))
+        nearest = distances[:k]
+        neighbors = [index for index, _ in nearest]
+        # The small constant belongs inside the denominator: it keeps the
+        # weight finite when the test sample coincides with a training sample.
+        weights = np.array([1.0 / (dist + 1e-3) for _, dist in nearest])
+        weights = (weights / weights.sum()) * self.k * 0.1
+        return neighbors, weights
+
+    def _predict(self, output, train_y, weights=None):
+        """Return the label with the largest summed weight among the neighbours.
+
+        :param output: neighbour row indices.
+        :param train_y: training labels, indexed by those row indices.
+        :param weights: one weight per entry of ``output``.
+        """
+        class_votes = {}
+        for index, weight in zip(output, weights):
+            label = train_y[index]
+            class_votes[label] = class_votes.get(label, 0) + weight
+        sorted_votes = sorted(class_votes.items(), key=operator.itemgetter(1), reverse=True)
+        return sorted_votes[0][0]
+
+    def predict(self, train_x, test_x, train_y):
+        """Predict a label for every row of ``test_x`` using weighted votes.
+
+        :return: list with one predicted label per test row.
+        """
+        output_classes = []
+        for i in range(test_x.shape[0]):
+            output, weights = self.neighbours(train_x, test_x[i], self.k)
+            output_classes.append(self._predict(output, train_y, weights))
+        return output_classes
+
+    def confusion_matrix(self, test_y, pred_y, cmp=False):
+        """Plot the row-normalised 2x2 confusion matrix and save it below ``fig/``.
+
+        :param test_y: true labels.
+        :param pred_y: predicted labels.
+        :param cmp: when true the figure is saved as the sklearn comparison
+            plot, otherwise as the weighted model's own plot.
+        """
+        import os
+        # The bare name below is the sklearn function imported at module level.
+        cm = confusion_matrix(test_y, pred_y, labels=range(2))
+        row_totals = cm.sum(axis=1)[:, np.newaxis]
+        # A class that never occurs in test_y has a zero row total; leave that
+        # row at 0 instead of dividing by zero.
+        cm = np.divide(cm.astype('float'), row_totals,
+                       out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
+        # Start from an empty canvas so repeated calls do not stack colorbars.
+        plt.clf()
+        plt.imshow(cm, interpolation='nearest')
+        plt.title('confusion matrix')
+        plt.colorbar()
+        labels_name = ['no', 'yes']
+        num_local = np.array(range(2))
+        plt.xticks(num_local, labels_name, rotation=90)
+        plt.yticks(num_local, labels_name)
+        plt.ylabel('True label')
+        plt.xlabel('Predicted label')
+        os.makedirs('fig', exist_ok=True)
+        if cmp:
+            plt.savefig('fig/confusion_matrix_knn.png')
+        else:
+            plt.savefig('fig/confusion_matrix_w.png')
diff --git a/Artificial Intelligence/zmh/Project2/code/classification.py b/Artificial Intelligence/zmh/Project2/code/classification.py
@@ -0,0 +1,89 @@
+from sklearn.model_selection import cross_val_score, cross_validate
+from sklearn.metrics import classification_report
+
+
+class ClassifyMethods(object):
+    """Train, cross-validate and compare six sklearn classifiers."""
+
+    def __init__(self, k_fold_num=5, show=True, show_type='print'):
+        """Build one instance of every classifier.
+
+        :param k_fold_num: number of cross-validation folds.
+        :param show: whether ``train_all`` reports the accuracies via ``show``.
+        :param show_type: ``'print'`` or ``'bar'``; selects the report format.
+        """
+        self._show = show
+        self._show_type = show_type
+        self.k_fold_num = k_fold_num
+        self.names = ['KNN', 'LR', 'RF', 'DT', 'SVM', 'GBDT']
+        self.classifiers = {'KNN': self.knn_classifier(),
+                            'LR': self.logistic_regression_classifier(),
+                            'RF': self.random_forest_classifier(),
+                            'DT': self.decision_tree_classifier(),
+                            'SVM': self.svm_classifier(),
+                            'GBDT': self.gradient_boosting_classifier()
+                            }
+
+    # KNN Classifier
+    def knn_classifier(self):
+        """Return a 5-neighbour KNeighborsClassifier."""
+        from sklearn.neighbors import KNeighborsClassifier
+        return KNeighborsClassifier(n_neighbors=5)
+
+    # Logistic Regression Classifier
+    def logistic_regression_classifier(self):
+        """Return an L2-penalised LogisticRegression with up to 1000 iterations."""
+        from sklearn.linear_model import LogisticRegression
+        return LogisticRegression(penalty='l2', max_iter=1000)
+
+    # Random Forest Classifier
+    def random_forest_classifier(self):
+        """Return a RandomForestClassifier with 8 trees."""
+        from sklearn.ensemble import RandomForestClassifier
+        return RandomForestClassifier(n_estimators=8)
+
+    # Decision Tree Classifier
+    def decision_tree_classifier(self):
+        """Return a DecisionTreeClassifier with default settings."""
+        from sklearn import tree
+        return tree.DecisionTreeClassifier()
+
+    # GBDT(Gradient Boosting Decision Tree) Classifier
+    def gradient_boosting_classifier(self):
+        """Return a GradientBoostingClassifier with 200 estimators."""
+        from sklearn.ensemble import GradientBoostingClassifier
+        return GradientBoostingClassifier(n_estimators=200)
+
+    # SVM Classifier
+    def svm_classifier(self):
+        """Return an RBF-kernel SVC with probability estimates enabled."""
+        from sklearn.svm import SVC
+        return SVC(kernel='rbf', probability=True)
+
+    def train_all(self, train_x, test_x, train_y, test_y):
+        """Cross-validate, fit and report every classifier.
+
+        For each classifier the mean cross-validated accuracy on the training
+        data is collected, the model is fitted on the full training data and a
+        classification report for the test data is printed. When ``show`` was
+        enabled, the collected accuracies are passed to ``show``.
+        """
+        scores = []
+        # Walk the classifiers in self.names order so that scores[i] always
+        # belongs to self.names[i], which is how show() pairs them up.
+        for name in self.names:
+            model = self.classifiers[name]
+            scores.append(cross_val_score(model, train_x, train_y, cv=self.k_fold_num, scoring='accuracy').mean())
+            print('classification algorithm: ', name)
+            model.fit(train_x, train_y)
+            pred_y = model.predict(test_x)
+            print(classification_report(y_true=test_y, y_pred=pred_y))
+        if self._show:
+            self.show(scores)
+
+    def cv_all(self, train_x, train_y):
+        """Print accuracy, precision, recall, F1 and ROC-AUC cross-validation results per classifier."""
+        scoring = {'accuracy': 'accuracy',
+                   'precision': 'precision',
+                   'recall': 'recall',
+                   'f1': 'f1',
+                   'roc_auc': 'roc_auc'}
+        for name, model in self.classifiers.items():
+            print('classification algorithm: ', name)
+            print(cross_validate(model, train_x, train_y, cv=self.k_fold_num, scoring=scoring))
+
+    def show(self, scores):
+        """Report one accuracy per classifier, either as text or as a saved bar chart.
+
+        :param scores: accuracies in the same order as ``self.names``.
+        :raises NotImplementedError: when the configured show type is neither
+            ``'print'`` nor ``'bar'``.
+        """
+        if self._show_type == 'print':
+            for name, score in zip(self.names, scores):
+                print('classification algorithm: ', name)
+                print('accuracy: {:.3f}'.format(score))
+                print('=' * 50)
+        elif self._show_type == 'bar':
+            import os
+            import matplotlib.pyplot as plt
+            # Start from an empty canvas so an earlier plot is not drawn under the bars.
+            plt.clf()
+            # An explicit colour list: a 'rgb' string is no longer accepted
+            # by matplotlib as a sequence of single-letter colours.
+            plt.bar(range(len(self.names)), scores, color=['r', 'g', 'b'], tick_label=self.names)
+            os.makedirs('fig', exist_ok=True)
+            plt.savefig('fig/classification.png')
+        else:
+            raise NotImplementedError
+