"""
SPProC: Sequential learning with Physical Probabilistic Constraints
@authors: Armi Tiihonen, Felipe Oviedo, Shreyaa Raghavan, Zhe Liu
MIT Photovoltaics Laboratory
"""
import psutil
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Key data fusion functionalities
from hper_fun import determine_data_fusion_points
# Helper functions that are independent of the GPyOpt and GPy Bayesian
# optimization and Gaussian process regression packages.
from hper_util_bo import (fill_accum_df_with_this_round,
query_target_data_from_model,
query_data_fusion_data_from_model,
create_optima_arrays, plot_basic)
from hper_plots_target import plotBO
from hper_plots_data_fusion import plotDF
# Helper functions that need to be modified if you switch away from GPy + GPyOpt.
from hper_util_bo_gpyopt import build_constraint_str, run_bo
from hper_util_gp import GP_model, evaluate_GP_model_constraints, constrain_optimize_GP_model, extract_gpmodel_params


def bo_sim_target(targetprop_data_source,
human_data_source = None,
materials=['CsPbI', 'MAPbI', 'FAPbI'], rounds=10,
init_points=None, batch_size=1,
acquisition_function='EI', acq_fun_params=None,
df_data_coll_params=None, no_plots=False,
results_folder='./Results/', noise_target = 1,
seed = None, save_memory = True, close_figs = True):
'''
Simulates a Bayesian Optimization cycle using the Gaussian Process
Regression model given in targetprop_data_source as the ground truth for
sampling the BO target property. NOTE: These functions assume that the
ground truth model contains non-scaled Y data as an argument. GPyOpt BO
does not produce this automatically for its surrogate model - it needs
to be added manually.

Acquisition function options are the ones available by default in
GPyOpt; additionally, there is 'EI_DF'. With EI_DF, one MUST provide the
correct acq_fun_params, otherwise the code fails. This option uses
acq_fun_params for building a Gaussian Process Regression model that
serves as the basis for data fusion-aided acquisition.

Explanation of acq_fun_params: a three-element vector is assumed. The
first element contains parameters built using ei_df_param_builder(). The
second element is the name of the data fusion property (valid options
are 'dft', 'visual', or 'cutoff'; the 'cutoff' implementation has not
been tested). The third element is a string defining how the data fusion
data will be used (valid options are 'files', i.e., forward the data
fusion data directly to EI_DF, which is to be used with IRL data; and
'model_none'/'model_all'/'model_necessary', which all fit a GPR model on
all the provided data to serve as a ground truth source, and sample from
it never/for all the samples/when the algorithm deems it necessary,
respectively).
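
Example (a minimal usage sketch, not taken from the original code base;
'gt_model' stands for a hypothetical pre-fitted GPy ground truth model):

    results = bo_sim_target(targetprop_data_source=gt_model,
                            materials=['CsPbI', 'MAPbI', 'FAPbI'],
                            rounds=10, batch_size=1,
                            acquisition_function='EI',
                            results_folder='./Results/')
    next_suggestions, optimum = results[0], results[1]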
'''
###########################################################################
# HARD-CODED SETTINGS (CONSIDER TURNING INTO FUNCTION PARAMS)
# The sum of the amounts of each material is limited to this range. If you
# need everything to sum up to 100% within 1%-unit accuracy, choose
# [0.995, 1] (the default value). If the code runs too long with this
# value, choose [0.99, 1] or an even wider range.
composition_total = [0.99, 1] # Previously [0.995, 1]
# Input parameter values are within this range (along all the dimensions).
domain_boundaries = [0, 1]
# These variables are related to the Bayesian optimization.
num_cores = 1 # Not a parallel run
if isinstance(targetprop_data_source, list):
# List of experimental data dataframes. A feature that has not been tested fully.
simulated_bo = False
else:
# No real experiments; data is queried online from a function.
simulated_bo = True
###########################################################################
# SET BO DATA SOURCE
if simulated_bo:
gt_model_targetprop = targetprop_data_source
else:
data_targetprop = targetprop_data_source # Not really tested. The idea is that there is a list of csv files for each round.
###########################################################################
# SET DATA FUSION DATA SOURCE AND PARAMETERS
if df_data_coll_params is not None:
# Do data fusion.
if df_data_coll_params['use_model'] is False:
# List of real human observations in dataframes. A feature that has not been tested fully.
data_datafusionprop = human_data_source
else:
# No real experiments; data is queried online from a function.
gt_model_datafusionprop = human_data_source
# The data fusion data is cleared here because it has already been used
# for fitting the model. New data will be sampled only when the BO
# algorithm requires it.
acq_fun_params['df_data'] = [pd.DataFrame(
columns=acq_fun_params['df_input_var'] +
[acq_fun_params['df_target_var']])]
###########################################################################
# INITIALIZE VARIABLES.
if simulated_bo:
# "exact_feval" is a GPyOpt BO setting related to the expected noise
# level of the black-box function. The BO might throw errors if the
# noise level of the target is mis-specified, so the variable value
# needs to be set correctly.
exact_feval = (noise_target == 0)
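# (In GPyOpt, exact_feval=True declares that the objective function
# evaluations are noiseless.)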
# Material composition needs to sum up to 1 within the accuracy defined in
# 'composition_total'.
if composition_total is None:
constraints = None
ternary = False
elif np.isclose(composition_total[1], 1,
atol = np.abs(composition_total[1] - composition_total[0])):
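# For reference: GPyOpt expects inequality constraints of the form
# g(x) <= 0, expressed as strings evaluated on a 2D array x. A sketch of
# the assumed output of build_constraint_str() for three materials:
#   c0 = '0.99 - x[:,0] - x[:,1] - x[:,2]'  # sum >= lower limit
#   c1 = 'x[:,0] + x[:,1] + x[:,2] - 1'     # sum <= upper limit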
c0, c1 = build_constraint_str(materials, composition_total)
# Search constraints.
constraints = []
for k in range(rounds):
constraints.append([{'name': 'constr_1', 'constraint': c0},
{'name': 'constr_2', 'constraint': c1}])
ternary = True
else:
raise Exception('The constraint requested has not been implemented or tested.')
# Boundaries of the search domain.
bounds = []
for j in range(len(materials)):
bounds.append({'name': materials[j], 'type': 'continuous',
'domain': domain_boundaries})
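# E.g., with the default materials list this yields three continuous
# GPyOpt variables named after the materials, each on domain [0, 1].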
# A dummy parameter at the moment; will be needed in IRL BO to determine
# if target variable values are being read from a function or from a table
# provided by the user.
function = True
###
# Variables related to the optimization target variable:
# Requests for the next locations to be sampled.
x_next = [None for j in range(rounds)]
# Same in nice dataframes.
x_next_df = [None for j in range(rounds)]
# Data actually collected during the round in question (in DataFrames);
# could in experiments be different than the ones actually requested at the
# end of the previous round.
X_rounds = [None for j in range(rounds)]
Y_rounds = [None for j in range(rounds)]
# All the data actually collected by that round (in numpy arrays for GPyOpt):
X_accum = [None for j in range(rounds)]
Y_accum = [None for j in range(rounds)]
# BO objects for each BO round (fitted using data acquired by the round in
# question).
BO_objects = [None for j in range(rounds)]
if not simulated_bo:
# The actually collected optimization target points from the previous
# rounds exist already and can be filled in to the variables.
raise Exception('Not implemented!')
lengthscales = [np.nan for j in range(rounds)]
variances = [np.nan for j in range(rounds)]
max_gradients = [np.nan for j in range(rounds)]
gaussian_noises = [np.nan for j in range(rounds)]
###
# Variables related to data fusion.
if df_data_coll_params is not None:
# Requests for the next locations to be queried (in DataFrames).
data_fusion_x_next = [None for j in range(rounds)]
# Data actually collected during the round in question (in DataFrames);
# in experiments, these could differ from the points requested at the
# end of the previous round.
data_fusion_XZ_rounds = [None for j in range(rounds)]
# All the data actually collected by that round (in DataFrames):
data_fusion_XZ_accum = [None for j in range(rounds)]
# GP models for each BO round (fitted using data acquired by the round
# in question, note that object is added only when the model
# changes after having acquired more data fusion data - this is to save
# memory):
data_fusion_models = [None for j in range(rounds)]
data_fusion_lengthscales = [None for j in range(rounds)]
data_fusion_variances = [None for j in range(rounds)]
data_fusion_gaussian_noises = [None for j in range(rounds)]
if df_data_coll_params['use_model'] is False:
# The actually collected data fusion points from the previous
# rounds do already exist and can be filled in to the variables.
for j in range(len(acq_fun_params['df_data'])):
data_fusion_XZ_rounds[j] = acq_fun_params['df_data'][j].copy()
# Note that acq_fun_params['df_data'] will be wiped multiple times
# during the BO loop. Data is not lost because it has been
# saved into data_fusion_XZ_rounds here.
# Now, the first round has at minimum been initialized to empty
# DataFrames. There may still be 'None' slots in
# data_fusion_XZ_rounds.
for k in range(rounds):
if data_fusion_XZ_rounds[k] is None:
# Data would already have been added in the previous loop if there
# had been any. Fill in an empty DataFrame.
data_fusion_XZ_rounds[k] = pd.DataFrame(
columns=acq_fun_params['df_input_var'] +
[acq_fun_params['df_target_var']])
# TO DO: This part works but does unnecessary redefinitions.
# Clean up and test.
data_fusion_XZ_accum = fill_accum_df_with_this_round(
data_fusion_XZ_accum, data_fusion_XZ_rounds, k)
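# fill_accum_df_with_this_round() is assumed to concatenate round k's
# DataFrame onto the cumulative record, so that data_fusion_XZ_accum[k]
# holds all data fusion points collected up to and including round k.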
###############################################################################
# BEGIN BAYESIAN OPTIMIZATION
for k in range(rounds):
if function:
# Query target variable values from the provided ground truth model
# and update X_rounds, Y_rounds, X_accum, Y_accum in place.
X_rounds, Y_rounds, X_accum, Y_accum = query_target_data_from_model(
k, X_rounds, Y_rounds, X_accum, Y_accum, init_points, x_next,
gt_model_targetprop, rounds, materials, noise_level = noise_target,
seed = seed)
if df_data_coll_params is not None:
# Do data fusion. Start by querying data fusion data from the model
data_fusion_XZ_rounds, data_fusion_XZ_accum = query_data_fusion_data_from_model(
k, data_fusion_XZ_rounds, data_fusion_XZ_accum, init_points,
data_fusion_x_next, gt_model_datafusionprop, rounds, materials,
acq_fun_params, noise_level = df_data_coll_params['noise_df'],
seed = seed)
# Save the data fusion data collected this far into the params that will be sent to the BO.
acq_fun_params['df_data'] = data_fusion_XZ_accum[k]
if df_data_coll_params is not None:
if k == 0:
# Create a data fusion model with the data fusion data collected this far.
current_df_model = GP_model(data_fusion_XZ_accum[k],
data_fusion_target_variable = acq_fun_params['df_target_var'],
lengthscale = acq_fun_params['df_kernel_lengthscale'],
variance = acq_fun_params['df_kernel_variance'],
noise_variance = acq_fun_params['df_noise_variance'],
data_fusion_input_variables = acq_fun_params['df_input_var'],
domain_boundaries = domain_boundaries)
else:
if not data_fusion_XZ_rounds[k].empty:
# Update existing data fusion model with the new observations.
current_df_model.set_XY(X = acq_fun_params['df_data'][
acq_fun_params['df_input_var']].values,
Y = acq_fun_params['df_data'][[
acq_fun_params['df_target_var']]].values)
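# set_XY() is GPy's standard method for replacing a model's training
# data in place; the hyperparameters are re-optimized separately below.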
init_hyperpars_df, lims_kernel_var_df, lims_noise_var_df = evaluate_GP_model_constraints(
Y = current_df_model.Y,
noise_variance = acq_fun_params['df_noise_variance'],
kernel_variance = acq_fun_params['df_kernel_variance'],
lengthscale = acq_fun_params['df_kernel_lengthscale'],
domain_boundaries = domain_boundaries)
constrain_optimize_GP_model(current_df_model,
init_hyperpars = init_hyperpars_df,
lims_kernel_var = lims_kernel_var_df,
lims_noise_var = lims_noise_var_df)
#if (save_memory is False) or (k<2) or (no_plots == False):
data_fusion_models[k] = current_df_model.copy()
# Add the model in any case to the acquisition parameters.
acq_fun_params['df_model'] = current_df_model.copy()
# Define and fit BO object.
BO_objects[k], x_next[k], current_surrogate_model_params = run_bo(
X= X_accum[k], Y = Y_accum[k], bounds = bounds,
# 'constraints' is None when composition_total is None.
constraints = None if constraints is None else constraints[k],
acquisition_function = acquisition_function,
acq_fun_params = acq_fun_params, batch_size = batch_size,
exact_feval = exact_feval)
# The next points to sample as a list of DataFrames for the convenience
# of the user.
x_next_df[k] = pd.DataFrame(x_next[k], columns=materials)
lengthscales[k] = current_surrogate_model_params['lengthscale']
variances[k] = current_surrogate_model_params['variance']
gradients = current_surrogate_model_params['gradients']
gaussian_noises[k] = current_surrogate_model_params['gaussian_noise']
max_gradients[k] = current_surrogate_model_params['max_gradient']
if df_data_coll_params is not None:
# Do data fusion.
# Estimate if data fusion should be requested for (some of) the
# suggestions for the next round.
data_fusion_x_next[k] = determine_data_fusion_points(
data_fusion_XZ_accum, df_data_coll_params, acq_fun_params,
x_next, current_surrogate_model_params, materials, bounds, k)
data_fusion_lengthscales[k], data_fusion_variances[k], data_fusion_gaussian_noises[k] = extract_gpmodel_params(current_df_model)
###########################################################################
# DATA TREATMENT, PLOTTING, SAVING
print('Before plots and saves:\n')
print('RAM memory % used:', psutil.virtual_memory()[2])
message = 'Last suggestions for the next sampling points: ' + str(x_next[-1])
#logging.log(21, message)
# Save the model as a backup
# dbfile = open('Backup-model-{date:%Y%m%d%H%M%S}'.format(date=datetime.datetime.now()), 'ab')
# pickle.dump([BO_batch, x_next_df, x_next, X_rounds, Y_rounds], dbfile)
# dbfile.close()
surrogate_model_params = {'lengthscales': lengthscales,
'variances': variances,
'max_gradients': max_gradients,
'gaussian_noise_variances': gaussian_noises}
if df_data_coll_params is not None:
data_fusion_params = {'df_data_rounds': data_fusion_XZ_rounds,
'df_data_accum': data_fusion_XZ_accum,
'df_data_hyperpars': {'df_data_lengthscales': data_fusion_lengthscales,
'df_data_variances': data_fusion_variances,
'df_data_gaussian_noise_variances': data_fusion_gaussian_noises}
}
else:
data_fusion_params = None
optimum, model_optimum = create_optima_arrays(BO_objects, X_accum, Y_accum,
rounds, materials, ternary,
domain_boundaries)
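# 'optimum' is assumed to track the best sample observed by each round,
# and 'model_optimum' the surrogate model's predicted optimum location.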
if not no_plots:
time_now = '{date:%Y%m%d%H%M}'.format(date=datetime.datetime.now())
# Plot ternary-specific plots.
plotBO(rounds, x_next_df, BO_objects, materials, X_rounds, Y_rounds,
Y_accum, X_accum, x_next, limit_file_number=False,
time_str=time_now, results_folder=results_folder,
minimize = True, close_figs = close_figs)
if acquisition_function in ('EI_DF', 'LCB_DF'):
# Plot ternary-specific plots regarding data fusion.
plotDF(rounds, materials, data_fusion_models,
data_fusion_XZ_accum, acq_fun_params['df_target_var'],
acq_fun_params['p_beta'], acq_fun_params['p_midpoint'],
limit_file_number=False, time_str=time_now,
results_folder=results_folder, close_figs = close_figs)
message = ('Data fusion:\nGaussian noise variances in this run: ' +
str(np.mean(data_fusion_gaussian_noises)) + '+-' +
str(np.std(data_fusion_gaussian_noises)) + '\n' +
'Lengthscales in this run: ' +
str(np.mean(data_fusion_lengthscales, axis=0)) + '+-' +
str(np.std(data_fusion_lengthscales)) + '\n' +
'Variances in this run: ' +
str(np.mean(data_fusion_variances)) + '+-' +
str(np.std(data_fusion_variances)) + '\n' +
'Results are saved into the given folder.')
#logging.log(21, message)
print(message)
# Plots that work with any dimensionality.
plot_basic(Y_accum, X_accum, optimum,
model_optimum, rounds,
time_str = time_now,
results_folder = results_folder, saveas = not no_plots,
hyperpars = surrogate_model_params,
data_fusion_hyperpars = data_fusion_params,
close_figs = close_figs)
message = ('Target property:\nGaussian noise variances in this run: ' +
str(np.mean(gaussian_noises)) + '+-' +
str(np.std(gaussian_noises)) + '\n' +
'Lengthscales in this run: ' + str(np.mean(lengthscales, axis=0)) + '+-' +
str(np.std(lengthscales)) + '\n' +
'Variances in this run: ' + str(np.mean(variances)) + '+-' +
str(np.std(variances)) + '\n' +
'Max gradients in this run: ' + str(np.mean(max_gradients)) + '+-' +
str(np.std(max_gradients)) + '\n' +
'Results are saved into the given folder.')
#logging.log(21, message)
print(message)
# Not sure if these copies are needed anymore. I used to have some
# memory issues that seemed to get fixed by adding them; I did not
# debug the root cause at the time.
next_suggestions = x_next_df.copy()
optimum = optimum.copy()
X_rounds = X_rounds.copy()
Y_rounds = Y_rounds.copy()
print('After plots and saves:\n')
print('RAM memory % used:', psutil.virtual_memory()[2])
plt.close('all') # Close all remaining figures.
print('After closing figs:\n')
print('RAM memory % used:', psutil.virtual_memory()[2])
if save_memory:
BO_objects = [None] * len(BO_objects)
if acquisition_function in ('EI_DF', 'LCB_DF'):
data_fusion_models = [None] * len(data_fusion_models)
print('After clear-outs:\n')
print('RAM memory % used:', psutil.virtual_memory()[2])
return (next_suggestions, optimum, model_optimum, X_rounds, Y_rounds,
X_accum, Y_accum, surrogate_model_params, data_fusion_params,
BO_objects)