function [xj, fval, it] = LBFGS0(fg, x0, lmax, scale, progTol, optTol, ...
max_iter, report_iter, varargin)
%
%
% S. Ulbrich, May 2002
% Modified by Truyen, Oct 2004.
% Limited memory BFGS-method with Powell-Wolfe stepsize rule.
%
% Input:
%   fg          : handle to a MATLAB function [f,g] = fg(x, ...) that
%                 returns the value and gradient of the objective,
%                 depending on the number of requested output arguments
%   x0          : starting point (the code below treats x0 and the
%                 gradient returned by fg as column vectors)
%   lmax        : maximal number of stored update pairs for the
%                 limited-memory BFGS approximation (e.g. 60)
%   scale       : initial step length, to avoid numerical problems
%   progTol     : stopping tolerance on the relative change of the
%                 objective value between iterations
%   optTol      : stopping tolerance on the infinity norm of the gradient
%   max_iter    : maximal number of iterations
%   report_iter : print progress every report_iter iterations (0 = silent)
%   varargin    : extra arguments passed through to fg
% Output:
%   xj   : final solution
%   fval : final objective value
%   it   : number of iterations performed
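%
% Example (a minimal sketch; the quadratic objective and the tolerance
% settings below are illustrative, not part of the original code):
%   fg = @(x) deal(x'*x, 2*x);               % f(x) = ||x||^2, gradient 2*x
%   [x, fval, it] = LBFGS0(fg, ones(50,1), 60, 1, 1e-9, 1e-6, 200, 10);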
if report_iter > 0
fprintf('%10s %15s\n', 'Iteration', 'Func Val');
end
% constants 0<del<theta<1, del<1/2 for Wolfe condition
del = 0.001;
theta = 0.9;
% constant 0<al<1 for sufficient decrease condition
al = 0.001;
%lmax = 60;
f_len = length(x0);
P = zeros(f_len,lmax);
D = zeros(f_len,lmax);
ga = zeros(lmax,1);
rho = zeros(lmax,1);
l = 0;
gak = 1;
xj = x0;
% [f,g] = vector_convert(fg,xj,opt);
[f, g] = fg(xj, varargin{:});
fval = f;
if max(abs(g)) <= optTol
return;
end
nmg0 = norm(g);
nmg = nmg0;
it = 0;
l = 0;
ln = 1;
stp = zeros(1,f_len);
% main loop
while it < max_iter
%sig = r;
% compute BFGS-step s = B*g;
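% The two loops below implement the standard L-BFGS two-loop recursion:
% P holds recent differences of iterates and D recent differences of
% gradients (both with reversed sign relative to the usual s_k, y_k
% notation, which leaves rho = 1/(y'*s) unchanged), and gak scales the
% initial inverse-Hessian guess. The result s = r approximates H*g,
% with H the limited-memory inverse-Hessian approximation.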
q = g;
for j = 1:l
i = mod(ln-j-1,lmax) + 1;
ga(i) = rho(i)*(P(:,i)'*q);
q = q-ga(i)*D(:,i);
end
r = gak*q;
for j = l:-1:1
i = mod(ln-j-1,lmax) + 1;
be = rho(i)*(D(:,i)'*r);
r = r + (ga(i)-be)*P(:,i);
end
s = r;
step = 'LBFGS';
% check if BFGS-step provides sufficient decrease; else take gradient
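% The test below requires s'*g >= min(al, ||g||)*||g||*||s||, i.e. the
% cosine of the angle between the L-BFGS direction and the gradient must
% stay bounded away from zero; otherwise the step falls back to steepest
% descent (s = g, applied with a minus sign in the line search).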
stg = s'*g;
if stg < min(al,nmg)*nmg*norm(s)
s = g;
stg = s'*g;
step = 'Grad';
end
% choose sig by Powell-Wolfe stepsize rule
[xn,fn,gn] = wolfe(xj,s,stg,fg,f,g,del,theta,scale,varargin{:});
% update BFGS-matrix
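% The pair (p, d) = (xj - xn, g - gn) is stored only when the curvature
% condition d'*p > 0 holds (with a small safeguard relative to
% ||d||*||p||), which keeps the implicit inverse-Hessian approximation
% positive definite. Note that gak = (d'*p)/(d'*d), the usual gamma_k
% scaling of the initial inverse-Hessian guess, is only refreshed here
% once the buffer of stored pairs is full.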
d = g - gn;
p = xj - xn;
dtp = d'*p;
if dtp >= 1e-8*norm(d)*norm(p)
rho(ln) = 1/dtp;
D(:,ln) = d;
P(:,ln) = p;
l = min(l + 1,lmax);
ln = mod(ln,lmax) + 1;
if l == lmax
gak = dtp/(d'*d);
end
end
xj = xn;
g = gn;
f = fn;
nmg = norm(g);
it = it + 1;
prev_fval = fval;
fval = f;
if report_iter > 0 && mod(it, report_iter) == 0
fprintf('%10d %15.5e\n', it, f);
end
if isnan(f) %it is numerically unstable for some reason, Truyen 20th May, 2005
break;
end
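% Two stopping tests follow: the loop stops once the relative change of
% the objective between consecutive iterations falls below progTol, or
% once the infinity norm of the gradient falls below optTol.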
if it > 1 && prev_fval ~= 0 && abs(fval/prev_fval -1) < progTol
% if abs(fval-prev_fval) < progTol
break; %stop the optimiser
end
if max(abs(g)) < optTol
break;
end
end
end
%----------------------------------------------------------------
function [best_x,best_f,best_g] = wolfe(xj,s,stg,fct,f,g,del,theta,scale,varargin)
%
% S. Ulbrich, May 2002
% Modified:
% Truyen, Oct 2004 to control the step size,
% avoiding ill-conditioning and unsuccessful step size searches
% Truyen, Nov 2005 to use the best solution found so far
% This code comes with no guarantee or warranty of any kind.
%
% function [best_x,best_f,best_g] = wolfe(xj,s,stg,fct,f,g,del,theta,scale,varargin)
%
% Determines a stepsize satisfying the Powell-Wolfe conditions
%
% Input:  xj     current point
%         s      search direction (xn = xj - sig*s)
%         stg    stg = s'*g
%         fct    handle to a MATLAB function [f,g] = fct(x, ...) that
%                returns the value (and gradient) of the objective
%         f      current objective value f = fct(xj)
%         g      current gradient
%         del    constant 0 < del < 1/2 in the Armijo condition
%                f - fn >= sig*del*stg
%         theta  constant del < theta < 1 in the Wolfe condition
%                gn'*s <= theta*stg
%         scale  initial stepsize (usually scale = 1)
%
% Output: best_x best point found during the line search
%         best_f objective value at best_x
%         best_g gradient at best_x
%
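% Line-search strategy, as read from the code below: the initial stepsize
% sig1 = scale/max(abs(s)) is halved (at most 5 times) until the Armijo
% condition holds; if the initial stepsize already satisfies Armijo, it is
% doubled (at most 5 times) while the Wolfe condition fails and the doubled
% step still satisfies Armijo; finally, bisection between sig and sigp
% refines the step until the Wolfe condition holds or the trial budget is
% exhausted. The best point evaluated with its gradient during the search
% (initialised at xj) is returned.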
max_s = max(abs(s));
sig1 = scale/max_s;
sig = sig1;
xn = xj - sig*s;
best_x = xj;
best_f = f;
best_g = g;
% [fn,gn] = vector_convert(fct,xn,opt);
[fn, gn] = fct(xn, varargin{:});
% Determine the largest sig = sig1/2^k (at most 5 halvings) satisfying Armijo
count = 0;
while (count < 5 && f-fn < del*sig*stg)
count = count + 1;
sig = 0.5*sig;
xn = xj-sig*s;
% [fn] = vector_convert(fct,xn,opt);
fn = fct(xn, varargin{:});
end
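% Note: inside the loop above only the function value is requested from
% fct; if the stepsize was reduced, the gradient at the accepted point is
% re-evaluated below before the Wolfe test.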
if sig~=sig1
% [fn,gn] = vector_convert(fct,xn,opt);
[fn, gn] = fct(xn, varargin{:});
end
if fn < best_f
best_x = xn;
best_f = fn;
best_g = gn;
end
% If the initial sig = sig1 satisfies Armijo then try sig = 2^k*sig1
% until sig also satisfies the Wolfe condition
% or until sigp = 2^(k+1)*sig1 violates the Armijo condition
if (sig == sig1)
xnn = xj - 2*sig*s;
% [fnn,gnn] = vector_convert(fct,xnn,opt);
[fnn, gnn] = fct(xnn, varargin{:});
count = 0;
while count < 5 && (gn'*s > theta*stg) && (f - fnn >= 2*del*sig*stg)
count = count + 1;
sig = 2*sig;
xn = xnn;
fn = fnn;
gn = gnn;
xnn = xj - 2*sig*s;
% [fnn,gnn] = vector_convert(fct,xnn,opt);
[fnn, gnn] = fct(xnn, varargin{:});
if fnn < best_f
best_x = xnn;
best_f = fnn;
best_g = gnn;
end
end
end
sigp = 2*sig;
% Perform bisection until sig also satisfies the Wolfe condition
count = 0;
while count < 5 && (gn'*s > theta*stg)
count = count + 1;
sigb = 0.5*(sig + sigp);
xb = xj - sigb*s;
% [fnn,gnn] = vector_convert(fct,xb, opt);
[fnn, gnn] = fct(xb, varargin{:});
if fnn < best_f
best_x = xb;
best_f = fnn;
best_g = gnn;
end
if (f - fnn >= del*sigb*stg)
sig = sigb;
xn = xb;
fn = fnn;
gn = gnn;
else
sigp = sigb;
end
end
end