Skip to content

Commit

Permalink
initial NAP commit
Browse files Browse the repository at this point in the history
  • Loading branch information
a80055247 committed Jun 8, 2023
1 parent 58ba247 commit 6c7b38a
Show file tree
Hide file tree
Showing 121 changed files with 10,587 additions and 0 deletions.
46 changes: 46 additions & 0 deletions NAP/HPOB_data/convertHPOB.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json
import pickle

import numpy as np

# Based on the HPO-B benchmark: https://arxiv.org/pdf/2106.06257.pdf

# Download the raw data with:
#   wget https://rewind.tf.uni-freiburg.de/index.php/s/xdrJQPCTNi2zbfL/download/hpob-data.zip

# Search-space id (key in the meta-dataset JSON files) -> model label used in
# the names of the pickle files written below.
ids = {
    "5860": "glmnet",
    "4796": "rpart_preproc",
    "5906": "xgboost",
    "5859": "rpart",
    "5889": "ranger",
    "5527": "svm",
}

# For every split, dump each (search space, dataset) pair to its own pickle
# holding the inputs under "domain" and the first target column under "accs".
for dstype in ['test', 'train', 'validation']:
    # The training split is read from the augmented meta-train file.
    if 'train' in dstype:
        dataset_name = "meta-train-dataset-augmented.json"
    else:
        dataset_name = f"meta-{dstype}-dataset.json"
    with open(dataset_name, "r") as f:
        data = json.load(f)

    for space_id, label in ids.items():
        space = data[space_id]
        for index, dataset_key in enumerate(space):
            entry = space[dataset_key]
            domain = np.array(entry["X"])
            accs = np.array(entry["y"])[..., 0]
            hpo_format = {"domain": domain, "accs": accs}

            # Targets are expected to lie in [0, 1].
            highest = accs.max()
            assert highest <= 1.0
            lowest = accs.min()
            assert lowest >= 0.0

            path = f"{label}_{dstype}_{index}.pkl"
            print(path)

            if dstype == "test" and index == 0:
                # Report the input dimensionality once per search space.
                print(f"problem {label} dim {domain.shape[1]}")
            elif dstype == "train":
                print("number of points", accs.shape[0], "min y",
                      lowest, "max y", highest)

            with open(path, 'wb') as out:
                pickle.dump(hpo_format, out)
129 changes: 129 additions & 0 deletions NAP/HPOB_data/create_task_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import json
import os
import pickle as pkl

import botorch
import numpy as np
import torch
from botorch import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from botorch.optim.fit import fit_gpytorch_mll_torch
from gpytorch import ExactMarginalLogLikelihood

from pathlib import Path
import os, sys
# Make the directory two levels above this file importable by putting it at
# the front of sys.path.
_this_file = Path(os.path.realpath(__file__))
ROOT = str(_this_file.parent.parent)
sys.path.insert(0, ROOT)

# Datasets whose targets have a standard deviation below this value are skipped.
_MIN_Y_STD = 1e-3


def _load_pickle(path):
    """Load one pickled dataset and close the file handle afterwards."""
    # NOTE(review): pickle must only be used on trusted files; these are
    # expected to be the local files written by convertHPOB.py -- confirm.
    with open(path, 'rb') as f:
        return pkl.load(f)


def _dataset_number(filename):
    """Return the integer that follows the last ``_`` in ``<name>_<n>.pkl``."""
    return int(filename.split(".pkl")[0].split("_")[-1])


def _select_training_rows(Y):
    """Pick the row indices used to fit the GP.

    Rows are ranked by ``y / (number of rows sharing that exact y value)``,
    highest first, and the top ``3 * (number of distinct y values)`` rows are
    kept. The kept indices are shuffled with the global NumPy RNG.

    Args:
        Y: 1-D array of targets.

    Returns:
        1-D integer array of selected row indices, in random order.
    """
    yuniq, inverse, ycount = np.unique(Y, return_inverse=True, return_counts=True)
    # ycount[inverse][i] is the number of rows whose target equals Y[i].
    logits = Y / ycount[inverse]
    freq_idx = logits.argsort()[::-1]
    selected_rows = freq_idx[:(3 * len(yuniq))]
    np.random.shuffle(selected_rows)
    return selected_rows


def _fit_gp(mll, dataset):
    """Fit the GP hyper-parameters, first on CPU and then, on failure, on GPU.

    Args:
        mll: marginal log-likelihood wrapping the model to fit.
        dataset: dataset file name, used in log messages only.

    Returns:
        True if one of the two attempts succeeded, False otherwise.
    """
    try:
        mll.cpu()
        fit_gpytorch_mll(mll=mll)
        return True
    except (RuntimeError, botorch.exceptions.errors.ModelFittingError) as e:
        print(e)

    # On a CPU-only torch build ``.cuda()`` raises an AssertionError that the
    # handler below would not catch, so check for a device before trying.
    if not torch.cuda.is_available():
        print(f'Error during the GP fit on {dataset}.')
        print('No CUDA device available for the GPU fallback.')
        return False

    try:
        print('Try fit on GPU')
        mll.cuda()
        fit_gpytorch_mll_torch(mll)
        return True
    except RuntimeError as e:
        print(f'Error during the GP fit on {dataset}.')
        print(e)
        return False


def _fit_and_save_gp(hpob_data_root, dataset):
    """Fit a GP on a sub-sample of one dataset and save it under ``gps/``.

    Nothing is fitted when the target file already exists; nothing is saved
    when the fit fails.

    Args:
        hpob_data_root: directory holding the dataset pickles and ``gps/``.
        dataset: file name of the dataset pickle inside ``hpob_data_root``.
    """
    gp_name = dataset.split('.pkl')[0] + '_gp.pt'
    gp_path = os.path.join(hpob_data_root, 'gps', gp_name)

    data = _load_pickle(os.path.join(hpob_data_root, dataset))
    X = data['domain']  # X is already normalised across all datasets (train, val, test)
    if os.path.exists(gp_path):
        print(f'{dataset} GP already fit and saved: {X.shape[0]} points in {X.shape[1]} dims.')
        return
    Y = data['accs']

    # Sub-sample dataset
    selected_rows = _select_training_rows(Y)
    X = X[selected_rows]
    Y = Y[selected_rows]
    stdY = (Y - Y.mean()) / Y.std()

    # Fit and save GP
    print(f'Fit GP on dataset {dataset} containing {X.shape[0]} points...')
    normX = torch.from_numpy(X).to(dtype=torch.float64)
    stdY = torch.from_numpy(stdY).to(dtype=torch.float64)

    model = SingleTaskGP(train_X=normX, train_Y=stdY.view(-1, 1))
    mll = ExactMarginalLogLikelihood(model.likelihood, model)

    if _fit_gp(mll, dataset):
        # NOTE(review): if the GPU fallback was used the model is presumably
        # still on the GPU when pickled, so loading it on a CPU-only machine
        # would need ``map_location`` -- confirm against the loading code.
        with torch.no_grad():
            torch.save(model, gp_path)
        print(f"saved model at {gp_path}")

    # Drop the references before releasing cached GPU memory.
    del normX, stdY, model, mll
    torch.cuda.empty_cache()


def main():
    """Fit and store one GP per usable training dataset of every model."""
    models = ['glmnet', 'rpart_preproc', 'xgboost', 'ranger', 'rpart', 'svm']
    hpob_data_root = os.path.join(ROOT, 'HPOB_data')

    os.makedirs(os.path.join(hpob_data_root, 'gps'), exist_ok=True)

    name_ids = {
        "glmnet": "5860",
        "rpart_preproc": "4796",
        "xgboost": "5906",
        "ranger": "5889",
        "rpart": "5859",
        "svm": "5527",
    }

    with open(os.path.join(hpob_data_root, "meta-dataset-descriptors.json")) as f:
        descriptor = json.load(f)

    for model_name in models:
        # The descriptor content is not used below; the check only keeps the
        # early failure for a search space missing from the descriptor file.
        search_space_id = name_ids[model_name]
        if search_space_id not in descriptor:
            raise KeyError(search_space_id)

        train_datasets = sorted(
            d for d in os.listdir(hpob_data_root)
            if model_name + '_train' in d and 'pkl' in d
        )

        # First pass: find datasets whose targets cannot be standardised.
        skipped, skipped_n = set(), []
        for dataset in train_datasets:
            Y = _load_pickle(os.path.join(hpob_data_root, dataset))['accs']
            y_std = Y.std()
            stdY = (Y - Y.mean()) / y_std

            if np.isnan(stdY).any():
                shown = "NaN"
            elif y_std < _MIN_Y_STD:
                shown = f"{y_std:.10f}"
            else:
                continue
            print(f"({model_name}) Dataset #{dataset} Y.std()={shown} Skipped")
            skipped.add(dataset)
            skipped_n.append(_dataset_number(dataset))

        print(f"({model_name}) skipped datasets {skipped_n}")

        # Second pass: fit a GP on every remaining dataset.
        for dataset in train_datasets:
            if dataset not in skipped:
                _fit_and_save_gp(hpob_data_root, dataset)


if __name__ == '__main__':
    main()
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
25 changes: 25 additions & 0 deletions NAP/HPO_data/fit_xgb_gps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os
import torch
import pickle
import numpy as np

from nap.RL.util import compute_cond_gps
from nap.environment.hpo import get_hpo_specs
from nap.environment.objectives import get_HPO_domain
from pathlib import Path


if __name__ == '__main__':
    # Directory that contains the folder this script lives in.
    rootdir = os.path.dirname(Path(os.path.realpath(__file__)).parent)
    hpo_type = "hpobenchXGB"
    # Only ``train_datasets`` is used below; the other values are unpacked
    # to match the arity of what get_hpo_specs returns.
    dims, points, train_datasets, valid_datasets, test_datasets, kernel_lengthscale, kernel_variance, \
        noise_variance, X_mean, X_std = get_hpo_specs(hpo_type, rootdir)

    # The output folder sits next to the first training dataset file.
    saved_models_dir = os.path.join(os.path.dirname(train_datasets[0]), 'GPs/train_sets')
    os.makedirs(saved_models_dir, exist_ok=True)

    # NOTE(review): pickle must only be used on trusted files -- confirm that
    # the training datasets are the ones shipped with the repository.
    loaded_datasets = []
    for dataset in train_datasets:
        with open(dataset, "rb") as f:
            loaded_datasets.append(pickle.load(f))

    # Stack the inputs of all training datasets into one 2-D array so the
    # per-dimension mean and std are computed over every training point.
    # assumes every dataset yields a domain of the same shape -- TODO confirm
    all_X = np.array([get_HPO_domain(data=dataset) for dataset in loaded_datasets])
    all_X = all_X.reshape(-1, all_X.shape[-1])
    compute_cond_gps(train_datasets, saved_models_dir, trainXmean=all_X.mean(0), trainXstd=all_X.std(0))
Binary file added NAP/HPO_data/hpobenchXGB_0_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_10_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_11_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_12_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_13_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_14_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_15_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_16_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_17_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_18_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_19_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_1_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_20_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_21_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_22_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_23_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_24_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_25_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_26_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_27_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_28_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_29_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_2_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_30_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_31_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_32_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_33_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_34_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_35_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_36_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_37_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_38_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_39_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_3_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_40_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_41_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_42_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_43_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_44_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_45_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_46_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_47_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_4_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_5_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_6_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_7_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_8_eq.pkl
Binary file not shown.
Binary file added NAP/HPO_data/hpobenchXGB_9_eq.pkl
Binary file not shown.
4 changes: 4 additions & 0 deletions NAP/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2023
# Copyright holder of the paper "End-to-End Meta-Bayesian Optimisation with Transformer Neural Processes".
# Submitted to NeurIPS 2023 for review.
# All rights reserved.
Loading

0 comments on commit 6c7b38a

Please sign in to comment.