"""Stochastic optimization methods for MLP
"""
# Authors: Jiyuan Qian <[email protected]>
# License: BSD 3 clause
import numpy as np


class BaseOptimizer:
    """Base (Stochastic) gradient descent optimizer

    Parameters
    ----------
    learning_rate_init : float, default=0.1
        The initial learning rate used. It controls the step-size in updating
        the weights

    Attributes
    ----------
    learning_rate : float
        the current learning rate
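
    Examples
    --------
    Subclasses are expected to implement ``_get_updates``; a minimal sketch
    of a plain (no momentum) gradient descent subclass, using the
    hypothetical name ``PlainSGD``, shows how ``update_params`` adds the
    returned updates to the parameters in place:

    >>> import numpy as np
    >>> class PlainSGD(BaseOptimizer):  # hypothetical subclass for illustration
    ...     def _get_updates(self, grads):
    ...         # step against the gradient, scaled by the current learning rate
    ...         return [-self.learning_rate * grad for grad in grads]
    >>> params = [np.zeros((1, 2)), np.zeros(2)]
    >>> optimizer = PlainSGD(learning_rate_init=0.1)
    >>> optimizer.update_params(params, [np.ones((1, 2)), np.ones(2)])
    >>> params[1]  # updated in place: 0 - 0.1 * 1
    array([-0.1, -0.1])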
"""

    def __init__(self, learning_rate_init=0.1):
        self.learning_rate_init = learning_rate_init
        self.learning_rate = float(learning_rate_init)

    def update_params(self, params, grads):
        """Update parameters with given gradients

        Parameters
        ----------
        params : list of length = len(coefs_) + len(intercepts_)
            The concatenated list containing coefs_ and intercepts_ in MLP
            model. Used for initializing velocities and updating params

        grads : list of length = len(params)
            Gradients with respect to coefs_ and intercepts_ in the MLP
            model; its length must match that of params
        """
        updates = self._get_updates(grads)
        for param, update in zip((p for p in params), updates):
            # in-place update so the arrays referenced by the MLP model change
            param += update

    def iteration_ends(self, time_step):
        """Perform update to learning rate and potentially other states at the
        end of an iteration
        """
        pass

    def trigger_stopping(self, msg, verbose):
        """Decides whether it is time to stop training

        Parameters
        ----------
        msg : str
            Message passed in for verbose output

        verbose : bool
            Print message to stdout if True

        Returns
        -------
        is_stopping : bool
            True if training needs to stop
        """
        if verbose:
            print(msg + " Stopping.")
        return True


class SGDOptimizer(BaseOptimizer):
    """Stochastic gradient descent optimizer with momentum

    Parameters
    ----------
    params : list, length = len(coefs_) + len(intercepts_)
        The concatenated list containing coefs_ and intercepts_ in MLP model.
        Used for initializing velocities and updating params

    learning_rate_init : float, default=0.1
        The initial learning rate used. It controls the step-size in updating
        the weights

    lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant'
        Learning rate schedule for weight updates.

        - 'constant' is a constant learning rate given by
          'learning_rate_init'.

        - 'invscaling' gradually decreases the learning rate 'learning_rate_'
          at each time step 't' using an inverse scaling exponent of
          'power_t'.
          learning_rate_ = learning_rate_init / pow(t, power_t)

        - 'adaptive' keeps the learning rate constant at 'learning_rate_init'
          as long as the training loss keeps decreasing. Each time 2
          consecutive epochs fail to decrease the training loss by tol, or
          fail to increase validation score by tol if 'early_stopping' is on,
          the current learning rate is divided by 5.

    momentum : float, default=0.9
        Value of momentum used, must be larger than or equal to 0

    nesterov : bool, default=True
        Whether to use nesterov's momentum or not. Use nesterov's if True

    power_t : float, default=0.5
        Power of time step 't' in inverse scaling. See `lr_schedule` for
        more details.

    Attributes
    ----------
    learning_rate : float
        the current learning rate

    velocities : list, length = len(params)
        velocities that are used to update params
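
    Examples
    --------
    A minimal illustrative sketch of a single momentum update on toy
    parameters (shapes are arbitrary; ``nesterov=False`` keeps the
    arithmetic easy to follow):

    >>> import numpy as np
    >>> params = [np.zeros((1, 2)), np.zeros(2)]
    >>> optimizer = SGDOptimizer(params, learning_rate_init=0.1,
    ...                          momentum=0.9, nesterov=False)
    >>> grads = [np.ones((1, 2)), np.ones(2)]
    >>> optimizer.update_params(params, grads)
    >>> params[1]  # velocity = 0.9 * 0 - 0.1 * 1 = -0.1, added in place
    array([-0.1, -0.1])

    With the 'invscaling' schedule the learning rate decays at the end of
    each iteration:

    >>> optimizer = SGDOptimizer(params, learning_rate_init=0.1,
    ...                          lr_schedule='invscaling', power_t=0.5)
    >>> optimizer.iteration_ends(time_step=3)
    >>> optimizer.learning_rate  # 0.1 / (3 + 1) ** 0.5
    0.05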
"""

    def __init__(
        self,
        params,
        learning_rate_init=0.1,
        lr_schedule="constant",
        momentum=0.9,
        nesterov=True,
        power_t=0.5,
    ):
        super().__init__(learning_rate_init)

        self.lr_schedule = lr_schedule
        self.momentum = momentum
        self.nesterov = nesterov
        self.power_t = power_t
        self.velocities = [np.zeros_like(param) for param in params]

    def iteration_ends(self, time_step):
        """Perform updates to learning rate and potentially other states at
        the end of an iteration

        Parameters
        ----------
        time_step : int
            number of training samples trained on so far, used to update
            learning rate for 'invscaling'
        """
        if self.lr_schedule == "invscaling":
            # inverse scaling: learning_rate_init / (t + 1) ** power_t
            self.learning_rate = (
                float(self.learning_rate_init) / (time_step + 1) ** self.power_t
            )

    def trigger_stopping(self, msg, verbose):
        if self.lr_schedule != "adaptive":
            if verbose:
                print(msg + " Stopping.")
            return True

        if self.learning_rate <= 1e-6:
            if verbose:
                print(msg + " Learning rate too small. Stopping.")
            return True

        # adaptive schedule: shrink the learning rate and keep training
        self.learning_rate /= 5.0
        if verbose:
            print(msg + " Setting learning rate to %f" % self.learning_rate)
        return False

    def _get_updates(self, grads):
        """Get the values used to update params with given gradients

        Parameters
        ----------
        grads : list, length = len(coefs_) + len(intercepts_)
            Gradients with respect to coefs_ and intercepts_ in the MLP
            model; its length must match that of params

        Returns
        -------
        updates : list, length = len(grads)
            The values to add to params
        """
        # classical momentum: v <- momentum * v - learning_rate * grad
        updates = [
            self.momentum * velocity - self.learning_rate * grad
            for velocity, grad in zip(self.velocities, grads)
        ]
        self.velocities = updates

        if self.nesterov:
            # Nesterov momentum: recompute the step from the already-updated
            # velocity (a look-ahead step)
            updates = [
                self.momentum * velocity - self.learning_rate * grad
                for velocity, grad in zip(self.velocities, grads)
            ]

        return updates


class AdamOptimizer(BaseOptimizer):
    """Stochastic gradient descent optimizer with Adam

    Note: All default values are from the original Adam paper

    Parameters
    ----------
    params : list, length = len(coefs_) + len(intercepts_)
        The concatenated list containing coefs_ and intercepts_ in MLP model.
        Used for initializing the moment vectors and updating params

    learning_rate_init : float, default=0.001
        The initial learning rate used. It controls the step-size in updating
        the weights

    beta_1 : float, default=0.9
        Exponential decay rate for estimates of first moment vector, should be
        in [0, 1)

    beta_2 : float, default=0.999
        Exponential decay rate for estimates of second moment vector, should be
        in [0, 1)

    epsilon : float, default=1e-8
        Value for numerical stability

    Attributes
    ----------
    learning_rate : float
        The current learning rate

    t : int
        Timestep

    ms : list, length = len(params)
        First moment vectors

    vs : list, length = len(params)
        Second moment vectors

    References
    ----------
    :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) "Adam: A method for
    stochastic optimization." <1412.6980>`
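
    Examples
    --------
    A minimal illustrative sketch of a single Adam step on toy parameters
    (shapes and gradients are arbitrary); on the first step the magnitude of
    the update is close to ``learning_rate_init``:

    >>> import numpy as np
    >>> params = [np.zeros((1, 2)), np.zeros(2)]
    >>> optimizer = AdamOptimizer(params, learning_rate_init=0.001)
    >>> grads = [np.ones((1, 2)), np.ones(2)]
    >>> optimizer.update_params(params, grads)
    >>> optimizer.t
    1
    >>> bool(np.allclose(params[1], -0.001))
    True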
"""

    def __init__(
        self, params, learning_rate_init=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
    ):
        super().__init__(learning_rate_init)

        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.t = 0
        self.ms = [np.zeros_like(param) for param in params]
        self.vs = [np.zeros_like(param) for param in params]

    def _get_updates(self, grads):
        """Get the values used to update params with given gradients

        Parameters
        ----------
        grads : list, length = len(coefs_) + len(intercepts_)
            Gradients with respect to coefs_ and intercepts_ in the MLP
            model; its length must match that of params

        Returns
        -------
        updates : list, length = len(grads)
            The values to add to params
        """
        self.t += 1

        # update biased first and second moment estimates
        self.ms = [
            self.beta_1 * m + (1 - self.beta_1) * grad
            for m, grad in zip(self.ms, grads)
        ]
        self.vs = [
            self.beta_2 * v + (1 - self.beta_2) * (grad**2)
            for v, grad in zip(self.vs, grads)
        ]

        # fold the bias corrections for both moments into the learning rate
        self.learning_rate = (
            self.learning_rate_init
            * np.sqrt(1 - self.beta_2**self.t)
            / (1 - self.beta_1**self.t)
        )

        updates = [
            -self.learning_rate * m / (np.sqrt(v) + self.epsilon)
            for m, v in zip(self.ms, self.vs)
        ]
        return updates