hper_bo.py
"""
SPProC: Sequential learning with Physical Probabilistic Constraints
@authors: Armi Tiihonen, Felipe Oviedo, Shreyaa Raghavan, Zhe Liu
MIT Photovoltaics Laboratory
"""
import pandas as pd
import numpy as np
import GPyOpt
from GPyOpt.acquisitions.EI_DFT import GP_model
import pickle
import datetime
import matplotlib.pyplot as plt
from plotting_v2 import plotBO
from plotting_data_fusion import plotDF
#import logging
def predict_points(GP_model, x_points, Y_data=None):
    '''
    Predict the posterior mean and variance of a GPy GP regression model at
    the given points.
    '''
    posterior_mean, posterior_var = GP_model.predict(x_points)
    # If the model has been trained with already-scaled (zero-mean, unit-
    # variance) data, the provided training data 'Y_data' is used to scale the
    # predictions back to the original units.
if Y_data is not None:
posterior_mean_true_units = posterior_mean * \
np.std(Y_data) + np.mean(Y_data)
posterior_var_true_units = (np.std(Y_data))**2 * posterior_var
posterior_mean = posterior_mean_true_units
posterior_var = posterior_var_true_units
return posterior_mean, posterior_var
def predict_points_noisy(GP_model, x_points, Y_data=None, noise_proportion = 1):
# Predictions.
posterior_mean, posterior_var = predict_points(
GP_model, x_points, Y_data=Y_data)
# Adding Gaussian noise to the mean predictions.
posterior_mean_noisy = np.random.normal(
posterior_mean, np.sqrt(posterior_var)*noise_proportion)
return posterior_mean_noisy, posterior_var, posterior_mean
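# A minimal usage sketch of the two prediction helpers above (not part of the
# pipeline). The names 'gp' and 'Y_train' are placeholders for this example
# only: 'gp' is any fitted GPy GP regression model and 'Y_train' is its
# unscaled training targets.
def _example_predictions(gp, x_points, Y_train=None):
    # Noise-free posterior mean and variance (rescaled to the original units
    # if Y_train is given).
    mean, var = predict_points(gp, x_points, Y_data=Y_train)
    # One noisy draw around the posterior mean, used to emulate a noisy
    # measurement of the ground truth.
    noisy_mean, var, clean_mean = predict_points_noisy(gp, x_points,
                                                       Y_data=Y_train)
    return mean, var, noisy_mean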
def ei_dft_param_builder(acquisition_type, data_fusion_target_property='dft',
data_fusion_input_variables=[
'CsPbI', 'MAPbI', 'FAPbI'],
optional_acq_params=None):
'''
    This function builds a properly formatted parameter dictionary for the
    EI_DFT acquisition function when using bo_sim_target().

    The only acquisition_type implemented is 'EI_DFT'. The allowed options for
    data_fusion_target_property are 'dft', 'visual', and 'cutoff'. Any of the
    remaining arguments that are None fall back to the hard-coded default
    values listed below.
'''
if acquisition_type == 'EI_DFT':
# Init all params.
list_p = ['files', 'lengthscale', 'variance',
'beta', 'midpoint', 'df_target_variable']
p = {'df_input_variables': data_fusion_input_variables}
for i in range(len(list_p)):
p[list_p[i]] = None
if optional_acq_params is not None:
for i in range(len(list_p)):
if list_p[i] in optional_acq_params:
p.update({list_p[i]: optional_acq_params[list_p[i]]})
if data_fusion_target_property == 'dft':
if p['files'] is None:
# These files contain DFT data that is integrated into the optimization loop as
# a soft constraint (starting from the round it is first listed - if the vector
# is shorter than the number of rounds, it is assumed that no data is being
# added to data fusion in the rounds not listed).
files = [['./phasestability/CsFA/fulldata/CsFA_T300_above.csv',
'./phasestability/FAMA/fulldata/FAMA_T300_above.csv',
'./phasestability/CsMA/fulldata/CsMA_T300_above.csv']
]
if p['df_target_variable'] == None:
variable = 'dGmix (ev/f.u.)'
if p['lengthscale'] == None: # For Gaussian process regression
lengthscale = 0.03
if p['variance'] == None: # For GPR
variance = 2
if p['beta'] == None: # For probability model
beta = 0.025
if p['midpoint'] == None: # For P model
midpoint = 0 # For P
elif data_fusion_target_property == 'visual':
if p['files'] is None:
# Visual quality of the samples as a constraint.
files = [
['./Source_data/visualquality/visualquality_round_0-1.csv']]
if p['df_target_variable'] == None:
variable = 'Quality'
if p['lengthscale'] == None: # For GP regression
lengthscale = 0.1
if p['variance'] == None: # For GPR
variance = 0.1
if p['beta'] == None: # For probability model
beta = 0.1
if p['midpoint'] == None: # For P model
midpoint = 0 # For P
elif data_fusion_target_property == 'cutoff':
if p['files'] is None:
# Arbitrary function (such as a direct cutoff) as a constraint.
files = [['./arbitrary_function/cutoff.csv']]
if p['df_target_variable'] == None:
variable = 'Cutoff'
if p['lengthscale'] == None: # For Gaussian process regression
lengthscale = 0.05
if p['variance'] == None: # For GPR
variance = 0.05
if p['beta'] == None: # For probability model
beta = 0.025
if p['midpoint'] == None: # For P model
midpoint = 0 # For P
        else:
            raise Exception('Data fusion target property ' + str(data_fusion_target_property) +
                            ' has not been implemented in the parameter builder. Please provide another property name.')
        # Values provided via optional_acq_params override the hard-coded
        # defaults assigned above.
        if p['files'] is not None:
            files = p['files']
        if p['df_target_variable'] is not None:
            variable = p['df_target_variable']
        if p['lengthscale'] is not None:
            lengthscale = p['lengthscale']
        if p['variance'] is not None:
            variance = p['variance']
        if p['beta'] is not None:
            beta = p['beta']
        if p['midpoint'] is not None:
            midpoint = p['midpoint']
        if files is not None:
# Retrieve the data.
data_fusion_data = []
for i in range(len(files)):
data_fusion_data.append([])
for j in range(len(files[i])):
data_fusion_data[i].append(pd.read_csv(files[i][j]))
data_fusion_data[i] = pd.concat(data_fusion_data[i])
ei_dft_params = {'df_data': data_fusion_data,
'df_target_var': variable,
'df_target_prop': data_fusion_target_property,
'df_input_var': p['df_input_variables'],
'gp_lengthscale': lengthscale,
'gp_variance': variance,
'p_beta': beta,
'p_midpoint': midpoint
}
else:
raise Exception(
'The parameter builder is not required for this acquisition function. The builder has been implemented only for EI_DFT.')
return ei_dft_params
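# Illustrative sketch of calling the builder (the optional override shown here
# is only an example value, not a recommendation):
#   ei_dft_params = ei_dft_param_builder('EI_DFT',
#                                        data_fusion_target_property='visual',
#                                        optional_acq_params={'lengthscale': 0.2})
# The returned dictionary carries 'df_data', 'df_target_var', 'df_target_prop',
# 'df_input_var', 'gp_lengthscale', 'gp_variance', 'p_beta' and 'p_midpoint',
# which bo_sim_target() forwards to the EI_DFT acquisition function.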
def acq_param_builder(acquisition_function, data_fusion_property=None,
data_fusion_input_variables=None, optional_acq_params=None):
if data_fusion_property is None:
acq_fun_params = {}
else:
if acquisition_function == 'EI_DFT':
acq_fun_params = ei_dft_param_builder(acquisition_function,
data_fusion_target_property=data_fusion_property,
data_fusion_input_variables=data_fusion_input_variables,
optional_acq_params=optional_acq_params)
else:
raise Exception(
'Data fusion has not been implemented for this acquisition function.')
acq_fun_params['jitter'] = 0.01
if optional_acq_params is not None:
if 'jitter' in optional_acq_params:
acq_fun_params['jitter'] = optional_acq_params['jitter']
return acq_fun_params
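# Illustrative sketch of the higher-level builder: for 'EI_DFT' it wraps
# ei_dft_param_builder() and always adds the 'jitter' key (default 0.01 unless
# overridden via optional_acq_params):
#   acq_params = acq_param_builder('EI_DFT', data_fusion_property='visual',
#                                  data_fusion_input_variables=['CsPbI', 'MAPbI', 'FAPbI'])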
def acq_fun_param2descr(acq_fun, acq_fun_params=None):
output_str = acq_fun
if acq_fun == 'EI_DFT':
ei_dft_params = acq_fun_params
output_str = (output_str + '-dftarget-' + ei_dft_params['df_target_prop'] +
'-lengthscale-' + str(ei_dft_params['gp_lengthscale']) +
'-variance-' + str(ei_dft_params['gp_variance']) +
'-beta-' + str(ei_dft_params['p_beta']) +
'-midpoint-' + str(ei_dft_params['p_midpoint']))
if acq_fun_params is not None:
if 'jitter' in acq_fun_params:
output_str = output_str + '-jitter-' + str(acq_fun_params['jitter'])
return output_str
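# Illustrative example of the description string: for 'EI_DFT' with the
# 'visual' defaults above and the default jitter of 0.01, this returns
# 'EI_DFT-dftarget-visual-lengthscale-0.1-variance-0.1-beta-0.1-midpoint-0-jitter-0.01'.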
def df_data_coll_param_builder(df_method=None, gradient_param=None, exclusion_param=None):
if df_method is None:
# No data fusion is done.
df_data_coll_params = None
else:
if df_method.find('model') != -1:
use_model = True
else:
use_model = False
df_data_coll_params = {'use_model': use_model}
if df_method.find('_necessary') != -1:
if df_method.find('_eig') != -1:
method = 'eig'
c_g = gradient_param
c_e = exclusion_param
use_and = False
df_data_coll_params['c_eig'] = c_e
elif df_method.find('_excl') != -1:
method = 'exclz'
c_g = gradient_param
c_e = exclusion_param
use_and = False
df_data_coll_params['c_exclz'] = c_e
            else:
                # This is a deprecated version that just exists for
                # compatibility with old results.
                method = 'necessary_depr'
                # Defaults matching the branches above so that the keys set
                # below are always defined.
                c_g = gradient_param
                use_and = False
df_data_coll_params['c_grad'] = c_g
df_data_coll_params['use_and'] = use_and
elif df_method.find('_all') != -1:
method = 'all'
elif df_method.find('_none') != -1:
method = 'none'
else:
raise Exception(
'Data fusion data collection has not been implemented for this case.')
df_data_coll_params['method'] = method
return df_data_coll_params
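# Illustrative example (the numeric values are arbitrary):
#   df_data_coll_param_builder('model_necessary_eig',
#                              gradient_param=0.1, exclusion_param=0.5)
# returns {'use_model': True, 'c_eig': 0.5, 'c_grad': 0.1, 'use_and': False,
# 'method': 'eig'}.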
def df_data_coll_method_param2descr(df_data_coll_params):
output_str = ''
if df_data_coll_params is not None:
if df_data_coll_params['use_model']:
output_str = output_str + '_model'
else:
output_str = output_str + '_live'
if ((df_data_coll_params['method'] == 'all') or
(df_data_coll_params['method'] == 'none')):
output_str = output_str + '_' + df_data_coll_params['method']
else:
output_str = output_str + '_necessary'
if df_data_coll_params['method'] != 'necessary_depr':
output_str = (output_str + '_' + df_data_coll_params['method'] +
'-grad_lim-' + str(df_data_coll_params['c_grad']))
if df_data_coll_params['method'] == 'eig':
output_str = (output_str + '-eig_lim-' +
str(df_data_coll_params['c_eig']))
elif df_data_coll_params['method'] == 'exclz':
output_str = (output_str + '-r-' +
str(df_data_coll_params['c_exclz']))
if len(output_str) == 0:
raise Exception(
'Description has not been implemented for this case.')
return output_str
def build_constraint_str(materials, composition_total, prefix='x[:,',
postfix=']'):
c1 = ''
c0 = str(composition_total[0])
for idx in range(len(materials)):
c1 = c1 + prefix + str(idx) + postfix + ' + '
c0 = c0 + ' - ' + prefix + str(idx) + postfix
c1 = c1[0:-2] + '- ' + str(composition_total[1])
return c0, c1
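# Worked example of the strings this builder produces (shown for documentation
# only). With materials=['CsPbI', 'MAPbI', 'FAPbI'] and
# composition_total=[0.995, 1]:
#   c0 = '0.995 - x[:,0] - x[:,1] - x[:,2]'
#   c1 = 'x[:,0] + x[:,1] + x[:,2] - 1'
# GPyOpt treats a constraint as satisfied when the expression evaluates to a
# non-positive value, so together these keep the composition sum within
# [0.995, 1].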
def load_ground_truth(path_model):
# Load already existing stability data as the "ground truth" of stability.
with open(path_model, 'rb') as f:
target_model_raw = pickle.load(f)
# The current implementation assumes the model is GPy gp_regression.
# Uncomment the following lines if you want to use GPyOpt GPModel, instead.
# The saved model is GPyOpt GPModel that is a wrapper of GPy gp_regression.
# GPyOpt is not maintained anymore, so for easier adaptation to other
# packages, we use here GPy gp_regression model. Let's dig it out.
#target_model = target_model_raw.model
target_model = target_model_raw
return target_model
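# Illustrative sketch: load the default ground truth stability model shipped
# with this repository (the same path used as the default argument of
# bo_sim_target() below).
#   stability_gt = load_ground_truth(
#       './Source_data/C2a_GPR_model_with_unscaled_ydata-20190730172222')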
def query_data_fusion_data_from_model(k, data_fusion_XZ_rounds,
data_fusion_XZ_accum, init_points,
data_fusion_x_next, data_fusion_gt_model,
rounds, materials,
acq_fun_params, noise = False):
# Data fusion data is generated using the ground truth
# model.
if k == 0:
# Sample the initial points given by the user.
# Predict.
# Note: There's no need to add train data to this GP model
# because it has not been trained with scaled Y data (unlike
# the target data model).
if noise == False:
data_fusion_z_next = predict_points(
data_fusion_gt_model, np.array(init_points))[0]
else:
data_fusion_z_next = predict_points_noisy(
data_fusion_gt_model, np.array(init_points))[0]
# Add to the list.
data_fusion_XZ_rounds[k] = pd.DataFrame(
np.concatenate((init_points,
data_fusion_z_next), axis=1),
columns=acq_fun_params['df_input_var'] +
[acq_fun_params['df_target_var']])
else:
# There may be point(s) suggested in the previous round
# that need to be evaluated.
if data_fusion_x_next[k-1].empty is False:
# Note: There's no need to add train data to this GP model
# (see above).
# Predict.
if noise == False:
data_fusion_z_next = predict_points(
data_fusion_gt_model, data_fusion_x_next[k-1].values)[0]
else:
data_fusion_z_next = predict_points_noisy(
data_fusion_gt_model, data_fusion_x_next[k-1].values)[0]
# Add to the list.
data_fusion_XZ_rounds[k] = pd.DataFrame(
np.concatenate((data_fusion_x_next[k-1].values,
data_fusion_z_next), axis=1),
columns=acq_fun_params['df_input_var'] +
[acq_fun_params['df_target_var']])
else:
data_fusion_XZ_rounds[k] = pd.DataFrame(
columns=acq_fun_params['df_input_var'] +
[acq_fun_params['df_target_var']])
data_fusion_XZ_accum = fill_accum_df_with_this_round(
data_fusion_XZ_accum, data_fusion_XZ_rounds, k)
return data_fusion_XZ_rounds, data_fusion_XZ_accum
def fill_accum_df_with_this_round(accum_df, rounds_df, k):
if k == 0:
accum_df[k] = rounds_df[k].copy()
else:
accum_df[k] = pd.concat([accum_df[k-1], rounds_df[k]],
ignore_index=True)
return accum_df
def fill_accum_ar_with_this_round(accum_ar, rounds_df, k):
if k == 0:
accum_ar[k] = rounds_df[k].values.copy()
else:
accum_ar[k] = np.append(accum_ar[k-1], rounds_df[k].values, axis=0)
return accum_ar
def query_target_data_from_model(k, X_rounds, Y_rounds, X_accum, Y_accum,
init_points, x_next, stability_model, rounds,
materials, noise=True):
if (k == 0):
# Initialization with the given grid points.
X_rounds[k] = pd.DataFrame(init_points, columns=materials)
else:
# The locations suggested after the previous BO round will be
# sampled in this round.
X_rounds[k] = pd.DataFrame(x_next[k-1], columns=materials)
#compositions_input.append(pd.DataFrame(x_next[k-1], columns = materials))
#df = compositions_input[k].copy()
    # Function predictions. Each round, the BO model is trained from scratch,
    # which is not computationally efficient; consider refactoring at some
    # point (or keep the per-round BO models intact on purpose).
    # TO DO: clean up so that this works for any number of dimensions.
if noise == True:
preds = predict_points_noisy(stability_model, X_rounds[k].values,
stability_model.Y)[0]
else:
preds = predict_points(stability_model, X_rounds[k].values,
stability_model.Y)[0]
    Y_rounds[k] = pd.DataFrame(preds, columns=['Target value'])
    # X is a vector of compositions, Y is a vector of merit values.
Y_accum = fill_accum_ar_with_this_round(Y_accum, Y_rounds, k)
X_accum = fill_accum_ar_with_this_round(X_accum, X_rounds, k)
    # For each BO round, only the data collected by that round is included
    # (accumulated above with fill_accum_ar_with_this_round).
return X_rounds, Y_rounds, X_accum, Y_accum
def determine_data_fusion_points(data_fusion_XZ_accum,
df_data_coll_params, acq_fun_params, x_next,
current_surrogate_model_params, materials, bounds, k):
# The locations from which the data fusion data for _the next round_ will
# be queried are determined here.
if df_data_coll_params['method'] == 'none':
# 'model_none' or 'live_none'
# Never sample from the model or query human, just add an empty df for
# records.
result = pd.DataFrame(columns=acq_fun_params['df_input_var'])
elif df_data_coll_params['method'] == 'all':
# Always sample human.
result = pd.DataFrame(x_next[k],
columns=acq_fun_params['df_input_var'])
if df_data_coll_params['use_model'] == False:
# Request humans to give feedback from the specified samples.
print('Give feedback on sample quality of these samples:\n',
result)
elif ((df_data_coll_params['method'] == 'exclz') or
(df_data_coll_params['method'] == 'eig')):
# Gradient criterion.
# Constant for the gradient limit.
c_grad = df_data_coll_params['c_grad']
# Parameters of the current surrogate model for the optimization target.
lengthscale_s = current_surrogate_model_params['lengthscale']
variance_s = current_surrogate_model_params['variance']
# Sample only if the suggested point has a larger gradient than this
# limit.
gradient_limit = (np.sqrt(variance_s)/lengthscale_s)*c_grad
# Max. gradients of the next suggested points in the surrogate model.
grad_max_s_next = current_surrogate_model_params['max_gradient']
# Pick new points for which the surrogate model has a high gradient, no
# matter if there is an earlier data fusion point nearby.
new_df_points_x_g = x_next[k][grad_max_s_next > gradient_limit]
# THIS OPTION IS NOT IN USE.
# Pick new points for which the surrogate model does not have
# a high gradient but they are located far away from the
# previously sampled points (in unknown region).
new_df_points_x_u = x_next[k][grad_max_s_next <= gradient_limit]
# If there are points to be considered based on exclusion zone or
# gradient criterion.
if new_df_points_x_g.shape[0] > 0:
# Drop the points that are excluded from the points to be queried
# because there are previous human evaluations nearby.
if df_data_coll_params['method'] == 'exclz':
# Drop points with an earlier data fusion point nearby.
# 'Nearby' is X% of the domain length here.
c_exclz = df_data_coll_params['c_exclz']
r_limit = (bounds[0]['domain'][1] - bounds[0]
['domain'][0])*c_exclz/100
index = 0
for l in range(len(new_df_points_x_g)): # Should be u finally!
if data_fusion_XZ_accum[k].shape[0] > 0:
if np.any(np.sqrt(np.sum((
data_fusion_XZ_accum[k].iloc[:, 0:len(materials)] -
new_df_points_x_g[index])**2, axis=1)) <
r_limit):
new_df_points_x_g = np.delete(new_df_points_x_g,
index, axis=0)
# TO DO: Test if index works correctly when batch BO is used!
message = 'Deleted a point based on r exclusion.'
#logging.log(21, message)
else:
index = index + 1
elif df_data_coll_params['method'] == 'eig':
# If the data fusion model exists already.
if data_fusion_XZ_accum[k].shape[0] > 0:
# Drop points if the expected information gain for the
# human opinion model is too low.
# Let's re-create the human opinion model for EIG test.
current_df_model = GP_model(data_fusion_XZ_accum[k],
acq_fun_params['df_target_var'],
acq_fun_params['gp_lengthscale'],
acq_fun_params['gp_variance'],
acq_fun_params['df_input_var'])
# Variance on each point x (pred. from the data fusion
# model).
var_d_next = predict_points(
current_df_model, new_df_points_x_g)[1]
# Data fusion model y variance estimate.
vary_d = current_df_model.Gaussian_noise.variance[0]
message = 'Data fusion Gaussian noise variance: ' + str(vary_d)
#logging.log(21, message)
index = 0
for l in range(len(new_df_points_x_g)):
eig = 0.5 * (np.log10(var_d_next[l, 0]/vary_d + 1))
c_eig = df_data_coll_params['c_eig']
# Let's scale EIG scale factor
# based on max EIG (i.e., c_eig = 1 samples very little,
# c_eig = 0 does not limit at all).
eig_max = - np.log10(1/2)
if eig < (eig_max * c_eig):
new_df_points_x_g = np.delete(
new_df_points_x_g, index, axis=0)
message = 'Deleted a point based on EIG.'
#logging.log(21, message)
else:
index = index + 1
# NOT IN USE AT THE MOMENT.
# Combine the two criteria.
# np.append(new_df_points_x_g, new_df_points_x_u, axis = 0)
new_df_points_x = new_df_points_x_g
if new_df_points_x.shape[0] > 0:
result = pd.DataFrame(new_df_points_x,
columns=acq_fun_params['df_input_var'])
if df_data_coll_params['use_model'] == False:
# Request humans to give feedback from the specified samples.
print('Give feedback on sample quality of these samples:\n',
result)
else:
# Add empty DataFrame
result = pd.DataFrame(columns=acq_fun_params['df_input_var'])
return result
def bo_sim_target(bo_ground_truth_model_path='./Source_data/C2a_GPR_model_with_unscaled_ydata-20190730172222',
materials=['CsPbI', 'MAPbI', 'FAPbI'], rounds=10,
init_points=None, batch_size=1,
acquisition_function='EI', acq_fun_params=None,
df_data_coll_params=None, no_plots=False,
results_folder='./Results/', noise_df = False,
noise_target = False):
'''
    Simulates a Bayesian optimization cycle, using the Gaussian process
    regression model given in bo_ground_truth_model_path as the ground truth
    from which the BO target property is sampled. NOTE: These functions assume
    that the ground truth model stores the unscaled Y data as an attribute.
    GPyOpt BO does not produce this automatically for its surrogate model, so
    it needs to be added manually.

    Acquisition function options are the ones available by default in GPyOpt,
    plus 'EI_DFT'. With 'EI_DFT', you MUST provide a correctly built
    acq_fun_params, otherwise the code fails. Those parameters are used for
    building the Gaussian process regression model that serves as the basis
    for the data-fusion-aided acquisition.

    Explanation of acq_fun_params: a dictionary built with acq_param_builder()
    (which, for 'EI_DFT', wraps ei_dft_param_builder()). It contains the data
    fusion data ('df_data'), the name of the data fusion property
    ('df_target_prop'; valid options are 'dft', 'visual', or 'cutoff', of which
    'cutoff' has not been tested), the data fusion target and input variable
    names, the GP and probability-model parameters, and the acquisition
    'jitter'. How the data fusion data is collected is defined separately by
    df_data_coll_params: with a 'live' method, the provided data is forwarded
    to EI_DFT directly (to be used with IRL data), whereas the 'model_none'/
    'model_all'/'model_necessary' methods fit a GPR model on all the provided
    data to serve as a ground truth source, and sample from it never/for all
    the suggested samples/when the algorithm deems it necessary.
'''
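    # A minimal end-to-end sketch of calling this function (illustrative only;
    # 'my_init_points' is a placeholder for the user's initial sampling grid,
    # an array of shape (n_points, len(materials))):
    #
    #   acq_params = acq_param_builder('EI_DFT', data_fusion_property='visual',
    #                                  data_fusion_input_variables=['CsPbI', 'MAPbI', 'FAPbI'])
    #   df_params = df_data_coll_param_builder(df_method='model_all')
    #   bo_sim_target(materials=['CsPbI', 'MAPbI', 'FAPbI'], rounds=5,
    #                 init_points=my_init_points, acquisition_function='EI_DFT',
    #                 acq_fun_params=acq_params, df_data_coll_params=df_params)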
###########################################################################
# HARD-CODED SETTINGS (CONSIDER TURNING INTO FUNCTION PARAMS)
    # The sum of the amounts of all materials is limited to this range. If you
    # need everything to sum up to 100% within 1%-unit accuracy, choose
    # [0.995, 1] (the default value). If the code runs too long with this
    # value, choose [0.99, 1] or an even wider range.
    composition_total = [0.995, 1]
# Input parameter values are within this range (all the dimensions).
domain_boundaries = [0, 1]
# These variables are related to the Bayesian optimization.
num_cores = 1 # Not a parallel run
jitter = 0.01 # The level of exploration.
# No real experiments but data is queried online from a function.
simulated_bo = True
###########################################################################
# SET DATA FUSION PARAMETERS
if simulated_bo == True:
stability_model = load_ground_truth(bo_ground_truth_model_path)
if df_data_coll_params is not None:
# Do data fusion.
if df_data_coll_params['use_model']:
# Fit a Gaussian process regression model to simulate the
# data fusion variable. During the BO loop, data fusion observations
# will be observed by sampling this model with noise.
# ALL the data fusion data will be used for fitting the model.
data_fusion_gt_model = GP_model(pd.concat(acq_fun_params['df_data']),
acq_fun_params['df_target_var'],
acq_fun_params['gp_lengthscale'],
acq_fun_params['gp_variance'],
acq_fun_params['df_input_var'])
# Then data_fusion_data is cleared now that it has been used for
# fitting the model. New data will be sampled only when the BO algo
# requires it.
acq_fun_params['df_data'] = [pd.DataFrame(columns=acq_fun_params['df_input_var'] +
[acq_fun_params['df_target_var']
])]
if acq_fun_params['df_data'] is None:
# Initialize to empty.
acq_fun_params['df_data'] = [pd.DataFrame(columns=acq_fun_params['df_input_var'] +
[acq_fun_params['df_target_var']
])]
# Else, the data fusion data is already in acq_fun_params['df_data'] in
# the required format (list of DataFrames for each round).
###########################################################################
# INITIALIZE VARIABLES.
# Material composition needs to sum up to 1 within the accuracy defined in
# 'composition_total'.
c0, c1 = build_constraint_str(materials, composition_total)
# Search constraints.
constraints = []
for k in range(rounds):
constraints.append([{'name': 'constr_1', 'constraint': c0},
{'name': 'constr_2', 'constraint': c1}])
# Boundaries of the search domain.
bounds = []
for j in range(len(materials)):
bounds.append({'name': materials[j], 'type': 'continuous',
'domain': domain_boundaries})
# A dummy parameter at the moment; will be needed in IRL BO to determine
# if target variable values are being read from a function or from a table
# provided by the user.
function = True
###
# Variables related to the optimization target variable:
# Requests for the next locations to be sampled.
x_next = [None for j in range(rounds)]
# Same in nice dataframes.
x_next_df = [None for j in range(rounds)]
    # Data actually collected during the round in question (in DataFrames);
    # in experiments, these could differ from the points actually requested at
    # the end of the previous round.
X_rounds = [None for j in range(rounds)]
Y_rounds = [None for j in range(rounds)]
    # All the data actually collected by that round (in numpy arrays for GPyOpt):
X_accum = [None for j in range(rounds)]
Y_accum = [None for j in range(rounds)]
# BO objects for each BO round (fitted using data acquired by the round in
# question).
BO_objects = [None for j in range(rounds)]
if simulated_bo == False:
# The actually collected optimization target points from the previous
# rounds exist already and can be filled in to the variables.
raise Exception('Not implemented!')
lengthscales = [np.nan for j in range(rounds)]
variances = [np.nan for j in range(rounds)]
max_gradients = [np.nan for j in range(rounds)]
gaussian_noises = [np.nan for j in range(rounds)]
###
# Variables related to data fusion.
if df_data_coll_params is not None:
# Requests for the next locations to be queried (in DataFrames).
data_fusion_x_next = [None for j in range(rounds)]
        # Data actually collected during the round in question (in DataFrames);
        # in experiments, these could differ from the points actually requested
        # at the end of the previous round.
data_fusion_XZ_rounds = [None for j in range(rounds)]
        # All the data actually collected by that round (in DataFrames):
data_fusion_XZ_accum = [None for j in range(rounds)]
# GP models for each BO round (fitted using data acquired by the round
# in question).
data_fusion_models = [None for j in range(rounds)]
if df_data_coll_params['use_model'] == False:
# The actually collected data fusion points from the previous
# rounds do already exist and can be filled in to the variables.
for j in range(len(acq_fun_params['df_data'])):
data_fusion_XZ_rounds[j] = acq_fun_params['df_data'][j].copy()
# Note that acq_fun_param[df_data] will be wiped multiple times
# during the BO loop. Data is not lost because it has been
# saved into data_fusion_XZ_rounds here.
# Now, the first round has at minimum been initialized to empty
# DataFrames. There may still be 'None' slots in
# data_fusion_XZ_rounds.
for k in range(rounds):
if data_fusion_XZ_rounds[k] is None:
# Data would have already been added in the previous loop
# if there would have been any. Fill in an empty DataFrame.
data_fusion_XZ_rounds[k] = pd.DataFrame(
columns=acq_fun_params['df_input_var'] +
[acq_fun_params['df_target_var']])
# TO DO: This part works but does unnecessary redefinitions.
# Clean up and test.
data_fusion_XZ_accum = fill_accum_df_with_this_round(
data_fusion_XZ_accum, data_fusion_XZ_rounds, k)
###############################################################################
# BEGIN BAYESIAN OPTIMIZATION
for k in range(rounds):
if (function == True):
# Query target variable values from the provided ground truth model
# and update X_rounds, Y_rounds, X_accum, Y_accum in place.
X_rounds, Y_rounds, X_accum, Y_accum = query_target_data_from_model(
k, X_rounds, Y_rounds, X_accum, Y_accum, init_points, x_next,
stability_model, rounds, materials, noise = noise_target)
if df_data_coll_params is not None:
# Do data fusion.
data_fusion_XZ_rounds, data_fusion_XZ_accum = query_data_fusion_data_from_model(
k, data_fusion_XZ_rounds, data_fusion_XZ_accum, init_points,
data_fusion_x_next, data_fusion_gt_model, rounds, materials,
acq_fun_params, noise = noise_df)
# Save the data fusion data for this round to the params that will
# be sent to the BO.
acq_fun_params['df_data'] = data_fusion_XZ_accum[k]
# Define and fit BO object.
# f=None because this code will be adapted in future for experimental
# BO cycles.
BO_objects[k] = GPyOpt.methods.BayesianOptimization(f=None,
domain=bounds,
constraints=constraints[k],
acquisition_type=acquisition_function,
normalize_Y=True,
X=X_accum[k],
Y=Y_accum[k],
evaluator_type='local_penalization',
batch_size=batch_size,
acquisition_jitter=acq_fun_params['jitter'],
acq_fun_params=acq_fun_params,
noise_var = 0.1*(Y_accum[k]/Y_accum[k].max()).var(), # GPyOpt assumes normalized Y data at the point when variance is defined.
optimize_restarts = 10,
max_iters = 2000,
exact_feval = (not noise_target)
)
# Suggest next points (samples to prepare).
x_next[k] = BO_objects[k].suggest_next_locations()
# Same as a list of DataFrames for the convenience of the user.
x_next_df[k] = pd.DataFrame(x_next[k], columns=materials)
# Fetch surrogate model parameters to be forwarded for the user and, if
# data fusion is enabled, used in evaluating if data fusion data should
# be queried.
lengthscales[k] = BO_objects[k].model.model.kern.lengthscale[0]
variances[k] = BO_objects[k].model.model.kern.variance[0]
gradients = BO_objects[k].model.model.predictive_gradients(x_next[k])[
0][:, :, 0]
gaussian_noises[k] = BO_objects[k].model.model.Gaussian_noise.variance[0]
# Maximum gradient element value to any direction of the search space
# for each x_next point.
max_gradients[k] = np.max(np.abs(gradients), axis=1)
current_surrogate_model_params = {'lengthscale': lengthscales[k],
'variance': variances[k],
'max_gradient': max_gradients[k]}
if df_data_coll_params is not None:
# Do data fusion.
# Estimate if data fusion should be requested for (some of) the
# suggestions for the next round.
data_fusion_x_next[k] = determine_data_fusion_points(
data_fusion_XZ_accum, df_data_coll_params, acq_fun_params,
x_next, current_surrogate_model_params, materials, bounds, k)
###########################################################################
# DATA TREATMENT, PLOTTING, SAVING
message = 'Last suggestions for the next sampling points: ' + str(x_next[-1])
#logging.log(21, message)
    # Save the model as a backup.
# dbfile = open('Backup-model-{date:%Y%m%d%H%M%S}'.format(date=datetime.datetime.now()), 'ab')
# pickle.dump([BO_batch, x_next_df, x_next, X_rounds, Y_rounds], dbfile)
# dbfile.close()