-
Notifications
You must be signed in to change notification settings - Fork 6.4k
/
Copy pathadaboost.py
98 lines (78 loc) · 2.59 KB
/
adaboost.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# https://deeplearningcourses.com/c/machine-learning-in-python-random-forest-adaboost
# https://www.udemy.com/machine-learning-in-python-random-forest-adaboost
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from rf_classification import get_data
class AdaBoost:
    """AdaBoost ensemble of depth-1 decision stumps.

    Targets must be encoded as -1/+1 (the caller remaps 0 -> -1).
    NOT sklearn-API compatible: predict() returns both the hard labels
    and the raw additive score FX, and score() returns both accuracy
    and the mean exponential loss, for plotting purposes.
    """

    def __init__(self, M):
        # M: number of boosting rounds (weak learners to fit).
        self.M = M

    def fit(self, X, Y):
        """Fit M weighted stumps on (X, Y); Y values must be in {-1, +1}."""
        self.models = []
        self.alphas = []

        N, _ = X.shape
        W = np.ones(N) / N  # start with uniform sample weights

        for m in range(self.M):
            tree = DecisionTreeClassifier(max_depth=1)
            tree.fit(X, Y, sample_weight=W)
            P = tree.predict(X)

            # weighted training error of this stump
            err = W.dot(P != Y)
            # Clamp into (0, 1): a perfect stump (err == 0) or a maximally
            # wrong one (err == 1) would otherwise give log(0) -> inf alpha
            # and NaN weights on the next round.
            err = np.clip(err, 1e-10, 1 - 1e-10)
            alpha = 0.5 * (np.log(1 - err) - np.log(err))

            # reweight samples: up-weight mistakes, down-weight correct ones
            W = W * np.exp(-alpha * Y * P)  # vectorized form
            W = W / W.sum()  # normalize so it sums to 1

            self.models.append(tree)
            self.alphas.append(alpha)

    def predict(self, X):
        # NOT like SKLearn API
        # we want accuracy and exponential loss for plotting purposes
        N, _ = X.shape
        FX = np.zeros(N)
        for alpha, tree in zip(self.alphas, self.models):
            FX += alpha * tree.predict(X)
        return np.sign(FX), FX

    def score(self, X, Y):
        # NOT like SKLearn API
        # we want accuracy and exponential loss for plotting purposes
        P, FX = self.predict(X)
        L = np.exp(-Y * FX).mean()  # mean exponential (AdaBoost) loss
        return np.mean(P == Y), L
if __name__ == '__main__':
    # Load data and remap the 0 class to -1 so targets are -1/+1.
    X, Y = get_data()
    Y[Y == 0] = -1  # make the targets -1,+1

    # 80/20 train/test split (no shuffle; data order as loaded).
    Ntrain = int(0.8 * len(X))
    Xtrain, Ytrain = X[:Ntrain], Y[:Ntrain]
    Xtest, Ytest = X[Ntrain:], Y[Ntrain:]

    # Track error/loss as a function of ensemble size.
    T = 200
    train_errors = np.empty(T)
    test_losses = np.empty(T)
    test_errors = np.empty(T)

    for num_trees in range(T):
        # A 0-tree ensemble is undefined; leave a NaN gap in the plots.
        if num_trees == 0:
            train_errors[num_trees] = None
            test_errors[num_trees] = None
            test_losses[num_trees] = None
            continue

        # progress indicator
        if num_trees % 20 == 0:
            print(num_trees)

        model = AdaBoost(num_trees)
        model.fit(Xtrain, Ytrain)

        test_acc, test_loss = model.score(Xtest, Ytest)
        train_acc, _ = model.score(Xtrain, Ytrain)

        train_errors[num_trees] = 1 - train_acc
        test_errors[num_trees] = 1 - test_acc
        test_losses[num_trees] = test_loss

        if num_trees == T - 1:
            print("final train error:", 1 - train_acc)
            print("final test error:", 1 - test_acc)

    # Plot test error vs. exponential loss.
    plt.plot(test_errors, label='test errors')
    plt.plot(test_losses, label='test losses')
    plt.legend()
    plt.show()

    # Plot train vs. test error.
    plt.plot(train_errors, label='train errors')
    plt.plot(test_errors, label='test errors')
    plt.legend()
    plt.show()