
Commit

algorithm analysis
surejinwu(吴晓军) committed Mar 24, 2017
1 parent 8f0d07d commit f29887d
Showing 5 changed files with 267 additions and 1 deletion.
4 changes: 4 additions & 0 deletions README.md
@@ -0,0 +1,4 @@
Stanford Machine Learning code implementations
==============================================

GitBook: [Stanford Machine Learning Notes](https://github.com/yoyoyohamapi/mit-ml)
Binary file added algorithm_analysis/data/water.mat
Binary file not shown.
148 changes: 148 additions & 0 deletions algorithm_analysis/linear_regression.py
@@ -0,0 +1,148 @@
# coding: utf-8
# algorithm_analysis/linear_regression.py
import numpy as np
import matplotlib.pyplot as plt
import time


def exeTime(func):
    """Decorator that measures execution time.
    Returns:
        (original return value, elapsed time in seconds)
    """
    def newFunc(*args, **kwargs):
        t0 = time.time()
        back = func(*args, **kwargs)
        return back, time.time() - t0
    return newFunc
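
# Usage sketch (editor's illustration; `slowSquare` is hypothetical, not part
# of this commit). A function wrapped with @exeTime returns a two-tuple of
# (original return value, elapsed seconds), so callers unpack two values:
#
#   @exeTime
#   def slowSquare(x):
#       return x * x
#
#   result, seconds = slowSquare(3)   # result == 9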


def h(theta, x):
    """Hypothesis (prediction) function.
    Args:
        theta: coefficient matrix
        x: feature vector
    Returns:
        predicted value
    """
    return (theta.T * x)[0, 0]


def J(theta, X, y, theLambda=0):
    """Cost function (regularized squared error).
    Args:
        theta: coefficient matrix
        X: sample matrix
        y: label matrix
        theLambda: regularization parameter
    Returns:
        prediction error (cost)
    """
    m = len(X)
    # The bias term theta[0] is left out of the penalty, per convention
    return (X * theta - y).T * (X * theta - y) / (2 * m) + theLambda * np.sum(np.square(theta[1:])) / (2 * m)


@exeTime
def gradient(X, y, rate=1, maxLoop=50, epsilon=1e-1, theLambda=0, initTheta=None):
    """Batch gradient descent.
    Args:
        X: sample matrix
        y: label matrix
        rate: learning rate
        maxLoop: maximum number of iterations
        epsilon: convergence tolerance
        theLambda: regularization parameter
        initTheta: initial theta (defaults to zeros)
    Returns:
        (theta, errors), timeConsumed
    """
    m, n = X.shape
    # Initialize theta
    if initTheta is None:
        theta = np.zeros((n, 1))
    else:
        theta = initTheta
    errors = []
    for i in range(maxLoop):
        # Vectorized update: theta := theta + (rate/m) * (X'(y - X*theta) - lambda*theta),
        # leaving the bias theta[0] unpenalized
        reg = (theLambda / float(m)) * theta
        reg[0] = 0
        theta = theta + rate * ((1.0 / m) * ((y - X * theta).T * X).T - reg)
        error = J(theta, X, y, theLambda)[0, 0]
        if np.isnan(error):
            error = np.inf
        errors.append(error)
        # Converged once the cost falls below the tolerance
        if error < epsilon:
            break
    return theta, errors

def standardize(X):
    """Feature standardization (zero mean, unit variance).
    Args:
        X: sample set
    Returns:
        standardized sample set
    """
    m, n = X.shape
    # Standardize each feature column
    for j in range(n):
        features = X[:, j]
        meanVal = features.mean(axis=0)
        std = features.std(axis=0)
        if std != 0:
            X[:, j] = (features - meanVal) / std
        else:
            X[:, j] = 0
    return X

def normalize(X):
    """Feature normalization (min-max scaling to [0, 1]).
    Args:
        X: sample set
    Returns:
        normalized sample set
    """
    m, n = X.shape
    # Normalize each feature column
    for j in range(n):
        features = X[:, j]
        minVal = features.min(axis=0)
        maxVal = features.max(axis=0)
        diff = maxVal - minVal
        if diff != 0:
            X[:, j] = (features - minVal) / diff
        else:
            X[:, j] = 0
    return X

def getLearningCurves(X, y, Xval, yval, rate=1, maxLoop=50, epsilon=0.1, theLambda=0):
    """Compute learning curves.
    Args:
        X: training sample set
        y: training labels
        Xval: cross-validation set
        yval: cross-validation labels
    Returns:
        trainErrors: training error as a function of training-set size
        valErrors: validation error as a function of training-set size
    """
    m, n = X.shape
    trainErrors = np.zeros((1, m))
    valErrors = np.zeros((1, m))
    for i in range(m):
        # Train on the first i+1 examples only
        Xtrain = X[0:i + 1]
        ytrain = y[0:i + 1]
        res, timeConsumed = gradient(
            Xtrain, ytrain, rate=rate, maxLoop=maxLoop, epsilon=epsilon, theLambda=theLambda)
        theta, errors = res
        trainErrors[0, i] = errors[-1]
        valErrors[0, i] = J(theta, Xval, yval, theLambda=theLambda)[0, 0]
    return trainErrors, valErrors
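
A minimal usage sketch for this module (editor's illustration with made-up toy data, not part of the commit): standardize the feature, prepend a bias column, and run batch gradient descent.

# coding: utf-8
# Editor's sketch: exercising algorithm_analysis/linear_regression.py on toy data
import numpy as np
import linear_regression

X = np.mat([[1.0], [2.0], [3.0], [4.0]])   # a single feature
y = np.mat([[2.1], [3.9], [6.2], [8.1]])   # roughly y = 2x
X = linear_regression.standardize(X)
X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)  # bias column
# gradient is wrapped by @exeTime, so it returns ((theta, errors), seconds)
res, seconds = linear_regression.gradient(X, y, rate=0.1, maxLoop=200, epsilon=1e-3)
theta, errors = res
print(theta.T, errors[-1], seconds)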
114 changes: 114 additions & 0 deletions algorithm_analysis/test_datasets_divide.py
@@ -0,0 +1,114 @@
# coding: utf-8
# algorithm_analysis/test_datasets_divide.py
"""数据集划分
"""
import linear_regression
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

data = loadmat('data/water.mat')
# Training set
X = np.mat(data['X'])
# Add a bias column to X
X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
y = np.mat(data['y'])
# Cross-validation set
Xval = np.mat(data['Xval'])
Xval = np.concatenate((np.ones((Xval.shape[0], 1)), Xval), axis=1)
yval = np.mat(data['yval'])
# Test set
Xtest = np.mat(data['Xtest'])
Xtest = np.concatenate((np.ones((Xtest.shape[0], 1)), Xtest), axis=1)
ytest = np.mat(data['ytest'])

rate = 0.001
maxLoop = 5000
epsilon = 0.1
# initTheta = np.mat(np.ones((X.shape[1], 1)))
# result, timeConsumed = linear_regression.gradient(
# X, y, rate=rate, maxLoop=maxLoop, epsilon=epsilon, initTheta=initTheta)
# theta, errors = result
#
# # Plot the fit
# title = 'bgd: rate=%.3f, maxLoop=%d, epsilon=%.3f \n time: %.2fms, error=%.3f' % (
#     rate, maxLoop, epsilon, timeConsumed * 1000.0, errors[-1])
# Xmin = X[:, 1].min()
# Xmax = X[:, 1].max()
# ymax = y[:, 0].max()
# ymin = y[:, 0].min()
# fitX = np.mat(np.linspace(Xmin, Xmax, 20).reshape(-1, 1))
# fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1)
# h = fitX * theta
# plt.xlim(Xmin, Xmax)
# plt.ylim(ymin, ymax)
# plt.title(title)
# # Plot the training samples
# plt.scatter(X[:, 1].flatten().A[0], y[:, 0].flatten().A[0])
# # Plot the fitted line
# plt.plot(fitX[:, 1], h, color='g')
# plt.xlabel('Change in water level(x)')
# plt.ylabel('Water flowing out of the dam(y)')
# plt.show()

# Plot the learning curve over training-set size
# m, n = X.shape
# trainErrors = np.zeros((1,m))
# valErrors = np.zeros((1,m))
# for i in range(m):
# Xtrain = X[0:i+1]
# ytrain = y[0:i+1]
# res, timeConsumed = linear_regression.gradient(
# Xtrain, ytrain, rate=rate, maxLoop=maxLoop, epsilon=epsilon)
# theta, errors = res
# trainErrors[0,i] = errors[-1]
# valErrors[0,i] = linear_regression.J(theta, Xval, yval)
#
# plt.plot(np.arange(1,m+1).ravel(), trainErrors.ravel(), color='b', label='Training Error')
# plt.plot(np.arange(1,m+1).ravel(), valErrors.ravel(), color='g', label='Validation Error')
#
# plt.title('Learning curve for linear regression')
# plt.xlabel('Number of training examples')
# plt.ylabel('Error')
# plt.legend()
# plt.show()

# Polynomial regression
poly = PolynomialFeatures(degree=8)
# The loop variable is renamed so it does not shadow the loadmat dict `data`
XX, XXval, XXtest = [linear_regression.normalize(np.mat(poly.fit_transform(part[:, 1:])))
                     for part in [X, Xval, Xtest]]
initTheta = np.mat(np.ones((XX.shape[1], 1)))
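# Editor's note (not in the original commit): for a single input column x,
# PolynomialFeatures(degree=8) emits [1, x, x^2, ..., x^8], so the slice
# [:, 1:] above first strips the manually added bias and fit_transform
# re-adds a constant column of its own. Beware that normalize() then maps
# that constant column to all zeros (max == min), so the intercept is
# effectively dropped from XX unless it is restored afterwards.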
# res, timeConsumed = linear_regression.gradient(XX, y, rate=1, maxLoop=5000, epsilon=0.01, theLambda=100)
# theta, errors = res
# print(errors[-1])
#
#
# # Plot the fitted curve
# fitX = np.mat(np.linspace(-60,45).reshape(-1, 1))
# fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1)
# fitXX = linear_regression.normalize(np.mat(poly.fit_transform(fitX[:, 1:])))
# h = fitXX * theta
# plt.scatter(X[:, 1].ravel(), y[:, 0].flatten().A[0])
# plt.plot(fitX[:, 1], h, color='g')
# plt.show()

theLambdas = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 100]
numTheLambdas = len(theLambdas)
trainErrors = np.zeros((1, numTheLambdas))
valErrors = np.zeros((1, numTheLambdas))
for idx, theLambda in enumerate(theLambdas):
    res, timeConsumed = linear_regression.gradient(XX, y, rate=0.3, maxLoop=500, epsilon=0.01, theLambda=theLambda)
    theta, errors = res
    trainErrors[0, idx] = errors[-1]
    valErrors[0, idx] = linear_regression.J(theta, XXval, yval, theLambda=theLambda)[0, 0]
print(valErrors)
# # Plot the learning curve over training-set size
# trainErrors, valErrors = linear_regression.getLearningCurves(XX, y, XXval, yval, rate=0.1, maxLoop=5000, epsilon=0.01, theLambda=100)
# m,n = XX.shape
plt.plot(np.arange(1, numTheLambdas + 1).ravel(), trainErrors.ravel(), color='b', label='Training Error')
plt.plot(np.arange(1, numTheLambdas + 1).ravel(), valErrors.ravel(), color='g', label='Validation Error')
plt.title('Validation curve for regularized polynomial regression')
plt.xlabel('lambda index (into theLambdas)')
plt.ylabel('Error')
plt.legend()
plt.show()
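
A natural follow-up (editor's sketch, not part of the commit): pick the lambda with the lowest validation error, retrain, and report the error on the held-out test set, which the script loads but never uses.

# Editor's sketch: model selection over the lambda sweep above
bestIdx = int(np.argmin(valErrors[0]))
bestLambda = theLambdas[bestIdx]
res, timeConsumed = linear_regression.gradient(
    XX, y, rate=0.3, maxLoop=500, epsilon=0.01, theLambda=bestLambda)
theta, errors = res
# Test error is conventionally reported without the regularization penalty
testError = linear_regression.J(theta, XXtest, ytest, theLambda=0)[0, 0]
print('best lambda = %s, test error = %.4f' % (bestLambda, testError))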
2 changes: 1 addition & 1 deletion linear_regression/regression.py
@@ -205,7 +205,7 @@ def lwr(rate, maxLoop, epsilon, X, y, x, c=1):
converged = True
return theta,errors,thetas

- def standardize(X):
+ def standarize(X):
"""特征标准化处理
Args:
