
Commit

algorithm analysis
surejinwu(吴晓军) committed Mar 24, 2017
1 parent 8f0d07d commit f29887d
Showing 5 changed files with 267 additions and 1 deletion.
4 changes: 4 additions & 0 deletions README.md
@@ -0,0 +1,4 @@
Stanford Machine Learning code implementations
==============================================

GitBook: [Stanford Machine Learning Notes](https://github.com/yoyoyohamapi/mit-ml)
Binary file added algorithm_analysis/data/water.mat
Binary file not shown.
148 changes: 148 additions & 0 deletions algorithm_analysis/linear_regression.py
@@ -0,0 +1,148 @@
# coding: utf-8
# algorithm_analysis/linear_regression.py
import numpy as np
import matplotlib.pyplot as plt
import time


def exeTime(func):
    """Decorator that measures execution time.
    Returns:
        (original return value, elapsed time in seconds)
    """
    def newFunc(*args, **kwargs):
        t0 = time.time()
        back = func(*args, **kwargs)
        return back, time.time() - t0
    return newFunc
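
# Usage sketch (editor's illustration; `slowSquare` is hypothetical, not part
# of this commit). A function wrapped with @exeTime returns a two-tuple of
# (original return value, elapsed seconds), so callers unpack two values:
#
#   @exeTime
#   def slowSquare(x):
#       return x * x
#
#   result, seconds = slowSquare(3)   # result == 9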


def h(theta, x):
    """Hypothesis (prediction) function.
    Args:
        theta: coefficient matrix
        x: feature vector
    Returns:
        predicted value
    """
    return (theta.T * x)[0, 0]


def J(theta, X, y, theLambda=0):
    """Cost function (regularized squared error).
    Args:
        theta: coefficient matrix
        X: sample matrix
        y: label matrix
        theLambda: regularization parameter
    Returns:
        prediction error (cost)
    """
    m = len(X)
    # The bias term theta[0] is left out of the penalty, per convention
    return (X * theta - y).T * (X * theta - y) / (2 * m) + theLambda * np.sum(np.square(theta[1:])) / (2 * m)


@exeTime
def gradient(X, y, rate=1, maxLoop=50, epsilon=1e-1, theLambda=0, initTheta=None):
    """Batch gradient descent.
    Args:
        X: sample matrix
        y: label matrix
        rate: learning rate
        maxLoop: maximum number of iterations
        epsilon: convergence tolerance
        theLambda: regularization parameter
        initTheta: initial theta (defaults to zeros)
    Returns:
        (theta, errors), timeConsumed
    """
    m, n = X.shape
    # Initialize theta
    if initTheta is None:
        theta = np.zeros((n, 1))
    else:
        theta = initTheta
    errors = []
    for i in range(maxLoop):
        # Vectorized update: theta := theta + (rate/m) * (X'(y - X*theta) - lambda*theta),
        # leaving the bias theta[0] unpenalized
        reg = (theLambda / float(m)) * theta
        reg[0] = 0
        theta = theta + rate * ((1.0 / m) * ((y - X * theta).T * X).T - reg)
        error = J(theta, X, y, theLambda)[0, 0]
        if np.isnan(error):
            error = np.inf
        errors.append(error)
        # Converged once the cost falls below the tolerance
        if error < epsilon:
            break
    return theta, errors

def standardize(X):
    """Feature standardization (zero mean, unit variance).
    Args:
        X: sample set
    Returns:
        standardized sample set
    """
    m, n = X.shape
    # Standardize each feature column
    for j in range(n):
        features = X[:, j]
        meanVal = features.mean(axis=0)
        std = features.std(axis=0)
        if std != 0:
            X[:, j] = (features - meanVal) / std
        else:
            X[:, j] = 0
    return X

def normalize(X):
    """Feature normalization (min-max scaling to [0, 1]).
    Args:
        X: sample set
    Returns:
        normalized sample set
    """
    m, n = X.shape
    # Normalize each feature column
    for j in range(n):
        features = X[:, j]
        minVal = features.min(axis=0)
        maxVal = features.max(axis=0)
        diff = maxVal - minVal
        if diff != 0:
            X[:, j] = (features - minVal) / diff
        else:
            X[:, j] = 0
    return X

def getLearningCurves(X, y, Xval, yval, rate=1, maxLoop=50, epsilon=0.1, theLambda=0):
    """Compute learning curves.
    Args:
        X: training sample set
        y: training labels
        Xval: cross-validation set
        yval: cross-validation labels
    Returns:
        trainErrors: training error as a function of training-set size
        valErrors: validation error as a function of training-set size
    """
    m, n = X.shape
    trainErrors = np.zeros((1, m))
    valErrors = np.zeros((1, m))
    for i in range(m):
        # Train on the first i+1 examples only
        Xtrain = X[0:i + 1]
        ytrain = y[0:i + 1]
        res, timeConsumed = gradient(
            Xtrain, ytrain, rate=rate, maxLoop=maxLoop, epsilon=epsilon, theLambda=theLambda)
        theta, errors = res
        trainErrors[0, i] = errors[-1]
        valErrors[0, i] = J(theta, Xval, yval, theLambda=theLambda)[0, 0]
    return trainErrors, valErrors
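
A minimal usage sketch for this module (editor's illustration with made-up toy data, not part of the commit): standardize the feature, prepend a bias column, and run batch gradient descent.

# coding: utf-8
# Editor's sketch: exercising algorithm_analysis/linear_regression.py on toy data
import numpy as np
import linear_regression

X = np.mat([[1.0], [2.0], [3.0], [4.0]])   # a single feature
y = np.mat([[2.1], [3.9], [6.2], [8.1]])   # roughly y = 2x
X = linear_regression.standardize(X)
X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)  # bias column
# gradient is wrapped by @exeTime, so it returns ((theta, errors), seconds)
res, seconds = linear_regression.gradient(X, y, rate=0.1, maxLoop=200, epsilon=1e-3)
theta, errors = res
print(theta.T, errors[-1], seconds)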
114 changes: 114 additions & 0 deletions algorithm_analysis/test_datasets_divide.py
@@ -0,0 +1,114 @@
# coding: utf-8
# algorithm_analysis/test_datasets_divide.py
"""数据集划分
"""
import linear_regression
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

data = loadmat('data/water.mat')
# Training set
X = np.mat(data['X'])
# Add a bias column to X
X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
y = np.mat(data['y'])
# Cross-validation set
Xval = np.mat(data['Xval'])
Xval = np.concatenate((np.ones((Xval.shape[0], 1)), Xval), axis=1)
yval = np.mat(data['yval'])
# Test set
Xtest = np.mat(data['Xtest'])
Xtest = np.concatenate((np.ones((Xtest.shape[0], 1)), Xtest), axis=1)
ytest = np.mat(data['ytest'])

rate = 0.001
maxLoop = 5000
epsilon = 0.1
# initTheta = np.mat(np.ones((X.shape[1], 1)))
# result, timeConsumed = linear_regression.gradient(
# X, y, rate=rate, maxLoop=maxLoop, epsilon=epsilon, initTheta=initTheta)
# theta, errors = result
#
# # Plot the fit
# title = 'bgd: rate=%.3f, maxLoop=%d, epsilon=%.3f \n time: %.2fms, error=%.3f' % (
#     rate, maxLoop, epsilon, timeConsumed * 1000.0, errors[-1])
# Xmin = X[:, 1].min()
# Xmax = X[:, 1].max()
# ymax = y[:, 0].max()
# ymin = y[:, 0].min()
# fitX = np.mat(np.linspace(Xmin, Xmax, 20).reshape(-1, 1))
# fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1)
# h = fitX * theta
# plt.xlim(Xmin, Xmax)
# plt.ylim(ymin, ymax)
# plt.title(title)
# # Plot the training samples
# plt.scatter(X[:, 1].flatten().A[0], y[:, 0].flatten().A[0])
# # Plot the fitted line
# plt.plot(fitX[:, 1], h, color='g')
# plt.xlabel('Change in water level(x)')
# plt.ylabel('Water flowing out of the dam(y)')
# plt.show()

# Plot the learning curve over training-set size
# m, n = X.shape
# trainErrors = np.zeros((1,m))
# valErrors = np.zeros((1,m))
# for i in range(m):
# Xtrain = X[0:i+1]
# ytrain = y[0:i+1]
# res, timeConsumed = linear_regression.gradient(
# Xtrain, ytrain, rate=rate, maxLoop=maxLoop, epsilon=epsilon)
# theta, errors = res
# trainErrors[0,i] = errors[-1]
# valErrors[0,i] = linear_regression.J(theta, Xval, yval)
#
# plt.plot(np.arange(1,m+1).ravel(), trainErrors.ravel(), color='b', label='Training Error')
# plt.plot(np.arange(1,m+1).ravel(), valErrors.ravel(), color='g', label='Validation Error')
#
# plt.title('Learning curve for linear regression')
# plt.xlabel('Number of training examples')
# plt.ylabel('Error')
# plt.legend()
# plt.show()

# Polynomial regression
poly = PolynomialFeatures(degree=8)
# The loop variable is renamed so it does not shadow the loadmat dict `data`
XX, XXval, XXtest = [linear_regression.normalize(np.mat(poly.fit_transform(part[:, 1:])))
                     for part in [X, Xval, Xtest]]
initTheta = np.mat(np.ones((XX.shape[1], 1)))
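# Editor's note (not in the original commit): for a single input column x,
# PolynomialFeatures(degree=8) emits [1, x, x^2, ..., x^8], so the slice
# [:, 1:] above first strips the manually added bias and fit_transform
# re-adds a constant column of its own. Beware that normalize() then maps
# that constant column to all zeros (max == min), so the intercept is
# effectively dropped from XX unless it is restored afterwards.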
# res, timeConsumed = linear_regression.gradient(XX, y, rate=1, maxLoop=5000, epsilon=0.01, theLambda=100)
# theta, errors = res
# print(errors[-1])
#
#
# # Plot the fitted curve
# fitX = np.mat(np.linspace(-60,45).reshape(-1, 1))
# fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1)
# fitXX = linear_regression.normalize(np.mat(poly.fit_transform(fitX[:, 1:])))
# h = fitXX * theta
# plt.scatter(X[:, 1].ravel(), y[:, 0].flatten().A[0])
# plt.plot(fitX[:, 1], h, color='g')
# plt.show()

theLambdas = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 100]
numTheLambdas = len(theLambdas)
trainErrors = np.zeros((1, numTheLambdas))
valErrors = np.zeros((1, numTheLambdas))
for idx, theLambda in enumerate(theLambdas):
    res, timeConsumed = linear_regression.gradient(XX, y, rate=0.3, maxLoop=500, epsilon=0.01, theLambda=theLambda)
    theta, errors = res
    trainErrors[0, idx] = errors[-1]
    valErrors[0, idx] = linear_regression.J(theta, XXval, yval, theLambda=theLambda)[0, 0]
print(valErrors)
# # Plot the learning curve over training-set size
# trainErrors, valErrors = linear_regression.getLearningCurves(XX, y, XXval, yval, rate=0.1, maxLoop=5000, epsilon=0.01, theLambda=100)
# m,n = XX.shape
plt.plot(np.arange(1, numTheLambdas + 1).ravel(), trainErrors.ravel(), color='b', label='Training Error')
plt.plot(np.arange(1, numTheLambdas + 1).ravel(), valErrors.ravel(), color='g', label='Validation Error')
plt.title('Validation curve for regularized polynomial regression')
plt.xlabel('lambda index (into theLambdas)')
plt.ylabel('Error')
plt.legend()
plt.show()
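
A natural follow-up (editor's sketch, not part of the commit): pick the lambda with the lowest validation error, retrain, and report the error on the held-out test set, which the script loads but never uses.

# Editor's sketch: model selection over the lambda sweep above
bestIdx = int(np.argmin(valErrors[0]))
bestLambda = theLambdas[bestIdx]
res, timeConsumed = linear_regression.gradient(
    XX, y, rate=0.3, maxLoop=500, epsilon=0.01, theLambda=bestLambda)
theta, errors = res
# Test error is conventionally reported without the regularization penalty
testError = linear_regression.J(theta, XXtest, ytest, theLambda=0)[0, 0]
print('best lambda = %s, test error = %.4f' % (bestLambda, testError))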
2 changes: 1 addition & 1 deletion linear_regression/regression.py
@@ -205,7 +205,7 @@ def lwr(rate, maxLoop, epsilon, X, y, x, c=1):
converged = True
return theta,errors,thetas

- def standardize(X):
+ def standarize(X):
"""特征标准化处理
Args:
