Commit

add file->linear classification

dailiang committed May 12, 2013
0 parents commit cab9e68
Showing 5 changed files with 205 additions and 0 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -0,0 +1,14 @@
Machine Learning Visualization
==============
Overview
-------------
A collection of common machine learning algorithms implemented in Python, with plots that visualize how each one behaves. The following algorithms are included:

* Linear classification (MSE, Perceptron Algorithm, LMS Algorithm, etc.)
* Clustering (K-means, ...)

Requirements
---------------

* Python 2.7
* numpy, scipy, matplotlib
53 changes: 53 additions & 0 deletions linear-classification/LMS.py
@@ -0,0 +1,53 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import copy as cp

from data import *

def lms_alg(inputs, targets, eta=0.05, weights=None, nIteration=1):
    ''' Batch LMS training; eta is the learning rate. '''
    if weights is None:
        # default arguments are evaluated only once at definition time,
        # so draw the random initial weights inside the function
        weights = np.random.rand(3, 1)*0.1 - 0.05
    weights_start = cp.deepcopy(weights)
    # n iterations of batch gradient descent on the squared error
    for n in range(nIteration):
        outputs = np.dot(inputs, weights)
        # LMS update: step along the negative gradient of the error
        weights += eta/(nData1 + nData2)*np.dot(inputs.T, targets - outputs)
    outputs = np.dot(inputs, weights)
    # Threshold the outputs
    outputs = np.where(outputs > 0, 1, -1)
    return (outputs, weights_start, weights)

if __name__ == "__main__":
    final = lms_alg(inputs, targets, nIteration=8)

    # count the number of misclassified points
    outputs_final = final[0] - targets
    outputs_final = np.where(outputs_final == 0, 0, 1)
    nMis = sum(outputs_final)
    print 'Num of misclassified:'
    print nMis
    weights_start = final[1]
    weights = final[2]
    # compute the loss
    outputs = np.dot(inputs, weights)
    Loss = sum((targets - outputs)**2)
    print 'Init Weight:'
    print weights_start
    print 'Final Weight:'
    print weights
    print 'Loss:'
    print Loss

    # plot the decision boundaries: w0*x + w1*y - w2 = 0  =>  y = (w2 - w0*x)/w1
    classifier_x = np.linspace(-8, 8, 100)
    classifier_y = (weights[2] - weights[0]*classifier_x)/weights[1]
    classifier_y_start = (weights_start[2] - weights_start[0]*classifier_x)/weights_start[1]
    plt.axis([-8, 8, -6, 6])
    plt.plot(x1, y1, 'ro', x2, y2, 'bo')
    plt.plot(classifier_x, classifier_y_start, 'm--', classifier_x, classifier_y, 'g-')
    plt.legend(["+1", "-1", "Init", "Final"])
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()
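
A quick way to see the batch LMS rule converging is to sweep the iteration count and watch the squared-error loss shrink. The sketch below is not part of the commit; it assumes it is run from inside linear-classification/ so that data.py and LMS.py are importable.

# Hedged sketch: the loss should shrink as nIteration grows (up to the
# noise of the random initial weights drawn inside lms_alg).
import numpy as np
from data import inputs, targets
from LMS import lms_alg

for n in [1, 2, 4, 8, 16]:
    _, _, w = lms_alg(inputs, targets, nIteration=n)
    loss = np.sum((targets - np.dot(inputs, w))**2)
    print 'nIteration=%d  Loss=%.4f' % (n, loss)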
40 changes: 40 additions & 0 deletions linear-classification/MSE.py
@@ -0,0 +1,40 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

from data import *

# Normal equations: W = (X^T X)^{-1} X^T T

# targets are also built in data.py; they are recomputed here for clarity
t1 = np.ones((1, nData1))
t2 = -np.ones((1, nData2))
targets = np.concatenate((t1, t2), axis=1).T
X = inputs
weights = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, targets))

if __name__ == "__main__":
    outputs = np.dot(X, weights)
    # compute the loss
    Loss = sum((targets - outputs)**2)
    # Threshold the outputs
    outputs = np.where(outputs > 0, 1, -1)
    # count the misclassified points
    outputs = outputs - targets
    outputs = np.where(outputs == 0, 0, 1)
    nMis = sum(outputs)
    print 'Num of misclassified datapoints:'
    print nMis
    print 'Final weights:'
    print weights
    print 'Loss:'
    print Loss


    # plot the decision boundary: y = (w2 - w0*x)/w1
    classifier_x = np.linspace(-8, 8, 100)
    classifier_y = (weights[2] - weights[0]*classifier_x)/weights[1]
    plt.axis([-8, 8, -6, 6])
    plt.plot(x1, y1, 'ro', x2, y2, 'bo')
    plt.plot(classifier_x, classifier_y, 'g-')
    plt.legend(['+1', '-1'])
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()
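
As a sanity check on the normal-equation solution, it can be compared with numpy's built-in least-squares solver. This sketch is not part of the commit; it assumes the inputs and targets arrays from data.py.

# Hedged sketch: both solvers should agree up to numerical precision.
import numpy as np
from data import inputs, targets

w_normal = np.linalg.solve(np.dot(inputs.T, inputs),
                           np.dot(inputs.T, targets))
w_lstsq = np.linalg.lstsq(inputs, targets)[0]
print 'max abs difference:', np.max(np.abs(w_normal - w_lstsq))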
45 changes: 45 additions & 0 deletions linear-classification/data.py
@@ -0,0 +1,45 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import numpy as np

# Draw two groups of data, (x1, y1) and (x2, y2), each with 200
# datapoints, from 2-D normal distributions.

# define the mean values and covariance matrices
m1 = [-5, 0]
m2 = [5, 0]
cov1 = [[1, 0.5], [0.5, 1]]
cov2 = cov1

# Generate random data
np.random.seed(0)
x1,y1 = np.random.multivariate_normal(m1, cov1, 200).T
np.random.seed(1)
x2,y2 = np.random.multivariate_normal(m2, cov2, 200).T

# arrange the data: append a -1 bias column and stack the two groups
nData1 = np.shape(x1)[0]
nData2 = np.shape(x2)[0]
X1 = np.concatenate(([x1], [y1], -np.ones((1,nData1))), axis=0).T
X2 = np.concatenate(([x2], [y2], -np.ones((1,nData2))), axis=0).T
inputs = np.concatenate((X1, X2), axis=0)
# Targets
t1 = np.ones((1, nData1))
t2 = -np.ones((1, nData2))
targets = np.concatenate((t1, t2), axis=1).T


if __name__ == "__main__":
    # what is the relationship between the covariance matrix and the
    # shape of the distribution?
    eigenval, eigenvec = np.linalg.eig(cov1)
    print "The eigenvalues of the cov matrix are:\n", eigenval
    print "The eigenvectors of the cov matrix are:\n", eigenvec

    # plot
    plt.axis([-8, 8, -6, 6])
    plt.plot(x1, y1, 'ro', x2, y2, 'bo')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()
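
To answer the question raised in the __main__ block above: the eigenvectors of the covariance matrix point along the principal axes of the sampled ellipse, and the square roots of the eigenvalues scale the spread along those axes. A sketch (using the same cov1, not part of the commit) that overlays the scaled eigenvectors on a fresh sample:

# Hedged sketch: visualize covariance eigenvectors as principal axes.
import numpy as np
import matplotlib.pyplot as plt

cov1 = [[1, 0.5], [0.5, 1]]
eigenval, eigenvec = np.linalg.eig(cov1)
pts = np.random.multivariate_normal([0, 0], cov1, 500)
plt.plot(pts[:, 0], pts[:, 1], 'b.')
for val, vec in zip(eigenval, eigenvec.T):
    # draw each axis with length 2*sqrt(eigenvalue)
    plt.plot([0, 2*np.sqrt(val)*vec[0]], [0, 2*np.sqrt(val)*vec[1]], 'r-')
plt.axis('equal')
plt.show()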
53 changes: 53 additions & 0 deletions linear-classification/perceptron.py
@@ -0,0 +1,53 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import copy as cp

from data import *

# The loss, summed over the misclassified datapoints X, is:
#   Loss(w) = sum( sign(Xw) * (Xw - target) )
# and gradient descent on it uses the gradient:
#   dLoss/dw = sign(Xw) * X

def pcn_train(inputs, targets, eta=0.25, weights=None, nIteration=1):
    ''' Perceptron training phase; eta is the learning rate. '''
    if weights is None:
        # default arguments are evaluated only once at definition time,
        # so draw the random initial weights inside the function
        weights = np.random.rand(3, 1)*0.1 - 0.05
    weights_start = cp.deepcopy(weights)
    for n in range(nIteration):
        # Run the network forward
        outputs = np.dot(inputs, weights)
        # Threshold the outputs
        outputs = np.where(outputs > 0, 1, -1)
        # Update the weights on the misclassified points
        weights += eta*np.dot(inputs.T, targets - outputs)
    # final outputs
    outputs = np.dot(inputs, weights)
    # Threshold the outputs
    outputs = np.where(outputs > 0, 1, -1)
    return (outputs, weights_start, weights)

if __name__ == "__main__":
    # train
    final = pcn_train(inputs, targets, nIteration=4)
    # count the misclassified points
    outputs_final = final[0] - targets
    outputs_final = np.where(outputs_final == 0, 0, 1)
    nMis = sum(outputs_final)
    print 'Num of misclassified datapoints:'
    print nMis
    weights_start = final[1]
    weights = final[2]
    print 'The init weights:'
    print weights_start
    print 'The final weights:'
    print weights

    # plot the decision boundaries: y = (w2 - w0*x)/w1
    classifier_x = np.linspace(-8, 8, 100)
    classifier_y = (weights[2] - weights[0]*classifier_x)/weights[1]
    classifier_y_start = (weights_start[2] - weights_start[0]*classifier_x)/weights_start[1]
    plt.axis([-8, 8, -6, 6])
    plt.plot(x1, y1, 'ro', x2, y2, 'bo')
    plt.plot(classifier_x, classifier_y_start, 'm--', classifier_x, classifier_y, 'g-')
    plt.legend(['+1', '-1', 'init', 'final'])
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()
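
Because the two clusters are well separated (means at -5 and +5), the perceptron should reach zero misclassifications within a few batch updates. A sketch (run from inside linear-classification/, not part of the commit) that checks this:

# Hedged sketch: misclassifications should drop to 0 quickly on
# linearly separable data.
import numpy as np
from data import inputs, targets
from perceptron import pcn_train

for n in range(1, 6):
    outputs, _, _ = pcn_train(inputs, targets, nIteration=n)
    print 'nIteration=%d  misclassified=%d' % (n, np.sum(outputs != targets))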
