forked from WuLC/MachineLearningAlgorithm
LinearRegression.py
# -*- coding: utf-8 -*-
# Created on Fri Apr 20 2016 17:00:36
# Author: WuLC
# EMail: [email protected]
########################################################################
# Linear regression with batch gradient descent or stochastic gradient
# descent; compare with SGDClassifier in sklearn.
########################################################################
import random

import numpy as np
from sklearn.linear_model import SGDClassifier

def batchGradientDescent(x, y, theta, alpha):
    """Batch gradient descent for linear regression.

    @param x: matrix of input examples (one row per example)
    @param y: vector of target values
    @param theta: weights parameterizing the hypothesis
    @param alpha: learning rate
    """
    m, n = np.shape(x)
    xTran = x.transpose()
    threshold = 1e-9
    lastCost = 0
    cost = -1
    recurseCount = 0
    while abs(lastCost - cost) > threshold:  # iterate until the cost converges
        lastCost = cost
        hypothesis = np.dot(x, theta)
        loss = hypothesis - y
        cost = np.sum(loss ** 2) / (2 * m)
        gradient = np.dot(xTran, loss) / m  # average gradient over all m examples
        theta = theta - alpha * gradient
        recurseCount += 1
    return recurseCount, theta
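
# The loop above applies the standard batch update for the squared-error cost
# J(theta) = (1/(2m)) * sum_i (x_i . theta - y_i)^2, namely
#   theta <- theta - alpha * (1/m) * X^T (X.theta - y),
# so every example contributes to each single update of theta.
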
def stochasticGradientDescent(x, y, theta, alpha):
    """Stochastic gradient descent for linear regression.

    @param x: matrix of input examples (one row per example)
    @param y: vector of target values
    @param theta: weights parameterizing the hypothesis
    @param alpha: learning rate
    """
    m, n = np.shape(x)
    threshold = 1e-9
    lastCost = 0
    cost = -1
    recurseCount = 0
    while abs(lastCost - cost) > threshold:  # iterate until the cost converges
        lastCost = cost
        for i in range(m):
            # alpha = 4.0 / (1.0 + i) + 0.01  # optional decaying learning rate
            # use the current theta for each example; the original computed the
            # hypothesis once per pass, which is not a true stochastic update
            loss = np.dot(x[i], theta) - y[i]
            gradient = x[i, :].transpose() * loss
            theta = theta - alpha * gradient
        cost = np.sum((np.dot(x, theta) - y) ** 2) / (2 * m)
        recurseCount += 1
    return recurseCount, theta
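
# Unlike the batch version, each inner step above uses a single example i:
#   theta <- theta - alpha * x_i * (x_i . theta - y_i)
# so theta is updated m times per pass over the data instead of once.
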
def getData(m, bias, variance):
    """Generate sample data for the test.

    @param m: number of input examples
    @param bias: offset for the target values (currently unused, kept from the original)
    @param variance: noise scale for the target values (currently unused, kept from the original)
    """
    x = np.zeros(shape=(m, 2))
    y = np.zeros(m)
    for i in range(m):
        x[i][0] = 1  # intercept term
        x[i][1] = i
        # the original `i^2 + i` used `^`, which is bitwise XOR in Python;
        # exponentiation is presumably what was intended
        y[i] = i ** 2 + i
    return x, y
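
# Optional sanity check, not part of the original script: for ordinary least
# squares the optimal weights are also available in closed form, so the theta
# returned by either gradient-descent routine can be compared against numpy's
# solver. The helper name below is illustrative only.
def leastSquaresBaseline(x, y):
    """Return the closed-form least-squares weights via np.linalg.lstsq."""
    theta, _, _, _ = np.linalg.lstsq(x, y, rcond=None)
    return theta
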

if __name__ == '__main__':
    x, y = getData(100, 25, 10)
    m, n = np.shape(x)
    alpha = 0.0005
    theta = np.ones(n)
    # recurseNum, theta = batchGradientDescent(x, y, theta, alpha)
    recurseNum, theta = stochasticGradientDescent(x, y, theta, alpha)
    print(recurseNum, theta)

    X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    # SGDClassifier raises an error when the training labels are continuous floats,
    # e.g. the Y below, so it is fitted only on the integer-valued y from getData
    Y = np.array([0.1, 1.1, 2.1, 2])
    clf = SGDClassifier(loss="log")  # spelled "log_loss" in scikit-learn >= 1.3
    clf.fit(x, y)
    # compare the manually implemented SGD with SGDClassifier in sklearn
    for i in X:
        print(i)
        print('manual method:', np.dot(theta, i))
        print('sklearn method:', clf.predict(i.reshape(1, -1)))  # sklearn expects a 2D array