experimentation
Ishika Agarwal committed Sep 23, 2024
1 parent 48752ed commit 265a491
Showing 9 changed files with 293 additions and 65 deletions.
21 changes: 14 additions & 7 deletions subset_selection/src/utils/dist_utils/select_it_baseline.py
@@ -86,7 +86,7 @@ def construction_rps(self, prompts, references):
# return token_level_score

def sentence_level_self_reflection(self, prompts, references, alpha=0.2, k=5):
self.model.to('cuda')
# self.model.to('cuda')
rps = self.construction_rps(prompts, references)
pro = []
for idx, p in enumerate(rps):
@@ -99,14 +99,21 @@ def sentence_level_self_reflection(self, prompts, references, alpha=0.2, k=5):
predictions = outputs[0]
logits = predictions[:, -1, :]
softmax_logits = torch.softmax(logits.float(), dim=-1)
for index in range(1):
tmp_res = [float(softmax_logits[index][29896]), float(softmax_logits[index][29906]),
float(softmax_logits[index][29941]), float(softmax_logits[index][29946]),
float(softmax_logits[index][29945])]
pro.append(tmp_res)
if "Phi" in str(type(self.model)):
for index in range(1):
tmp_res = [float(softmax_logits[index][29896]), float(softmax_logits[index][29906]),
float(softmax_logits[index][29941]), float(softmax_logits[index][29946]),
float(softmax_logits[index][29945])]
pro.append(tmp_res)
elif "Qwen" in str(type(self.model)):
for index in range(1):
tmp_res = [float(softmax_logits[index][16]), float(softmax_logits[index][17]),
float(softmax_logits[index][18]), float(softmax_logits[index][19]),
float(softmax_logits[index][20])]
pro.append(tmp_res)
except Exception as ex:
print(ex)
self.model.to('cpu')
# self.model.to('cpu')
pro_softmax = []
for item in pro:
tmp_pro_softmax = item
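Note: the hardcoded indices above are the vocabulary IDs of the rating tokens "1" through "5" (29896, 29906, 29941, 29946, 29945 in the Llama/Phi-3 vocabulary; 16-20 in Qwen2's), which is why the commit branches on the model class. A tokenizer-agnostic sketch, not part of this commit and assuming each rating string encodes to a single trailing token, would look the IDs up at runtime:

# Sketch, not part of this commit: resolve the rating-token IDs from the tokenizer
# instead of hardcoding per-model vocabulary indices.
from transformers import AutoTokenizer

def rating_token_ids(tokenizer, ratings=("1", "2", "3", "4", "5")):
    """One vocabulary ID per rating string (last piece, in case a prefix token is added)."""
    return [tokenizer.encode(r, add_special_tokens=False)[-1] for r in ratings]

def rating_probs(softmax_logits, token_ids, index=0):
    """Probability mass the model assigns to each rating token at the scored position."""
    return [float(softmax_logits[index][tid]) for tid in token_ids]

# usage sketch:
# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
# pro.append(rating_probs(softmax_logits, rating_token_ids(tokenizer)))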
11 changes: 4 additions & 7 deletions subset_selection/subset_fl.py
@@ -14,9 +14,8 @@ def create_subset(self, data_sijs, k=0.3):
n, _ = data_sijs.shape

# use facility location to find subset
fl = sb.functions.facilityLocation.FacilityLocationFunction(n, mode='dense', sijs=data_sijs)
subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=True, verbose=True)

fl = sb.functions.facilityLocation.FacilityLocationFunction(n, mode='dense', sijs=data_sijs, separate_rep=False)
subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=False, verbose=True)
return subset

def create_conditional_gain_subset(self, data_sijs, private_sijs, k=0.3):
@@ -28,8 +27,7 @@ def create_conditional_gain_subset(self, data_sijs, private_sijs, k=0.3):

# use facility location to find subset
fl = sb.functions.facilityLocationConditionalGain.FacilityLocationConditionalGainFunction(n, num_privates, data_sijs=data_sijs, private_sijs=private_sijs)
subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=True, verbose=True)

subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=False, verbose=True)
return subset

def create_mutual_information_subset(self, data_sijs, query_sijs, k=0.3):
@@ -41,8 +39,7 @@ def create_mutual_information_subset(self, data_sijs, query_sijs, k=0.3):

# use facility location to find subset
fl = sb.functions.facilityLocationMutualInformation.FacilityLocationMutualInformationFunction(n, num_privates, data_sijs=data_sijs, query_sijs=query_sijs)
subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=True, verbose=True)

subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=False, verbose=True)
return subset


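Note: both flags changed here (separate_rep and stopIfNegativeGain) are passed straight through to submodlib's lazy-greedy maximizer; the selectors themselves only need a dense pairwise similarity kernel. A minimal usage sketch, not part of this commit, assuming a selector instance exposing create_subset as above and that maximize() returns (index, gain) pairs:

# Sketch, not part of this commit: build a dense cosine-similarity kernel and
# read the selected row indices back out of the (index, gain) pairs.
import numpy as np

def cosine_kernel(embeddings: np.ndarray) -> np.ndarray:
    """Pairwise cosine similarities for an (n, d) embedding matrix."""
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit = embeddings / np.clip(norms, 1e-12, None)
    return unit @ unit.T

# data_sijs = cosine_kernel(embeddings)               # embeddings: (n, d) array from any encoder
# subset = selector.create_subset(data_sijs, k=0.3)   # hypothetical SubsetFL-style instance
# chosen = [idx for idx, gain in subset]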
15 changes: 11 additions & 4 deletions visualization/create_embeddings.py
@@ -145,16 +145,22 @@ def parse_qa_datasets():

input = ""
temp = i['supporting_facts']
keep = True
for x in range(len(temp['title'])):
ind = i['context']['title'].index(temp['title'][x])
input += i['context']['sentences'][ind][temp['sent_id'][x]] + " "
try:
input += i['context']['sentences'][ind][temp['sent_id'][x]] + " "
except:
keep = False

data.append(f"Instruction:\nContext: {instruction}\nInput:\n{input}\nOutput:\n{output}\n")
if keep:
data.append(f"Instruction:\nContext: {instruction}\nInput:\n{input}\nOutput:\n{output}\n")

if len(data) > 10:
if len(data) > 1000:
break
break

x = len(data)
train_ds = pd.DataFrame(data[:int(0.7*x)], columns=['data'])
valid_ds = pd.DataFrame(data[int(0.7*x):int(0.9*x)], columns=['data'])
test_ds = pd.DataFrame(data[int(0.9*x):], columns=['data'])
@@ -170,10 +176,11 @@ def parse_qa_datasets():

data.append(f"Instruction:\nContext: {instruction}\nInput:\n{input}\nOutput:\n{output}\n")

if len(data) > 10:
if len(data) > 1000:
break
break

x = len(data)
train_ds = pd.DataFrame(data[:int(0.7*x)], columns=['data'])
valid_ds = pd.DataFrame(data[int(0.7*x):int(0.9*x)], columns=['data'])
test_ds = pd.DataFrame(data[int(0.9*x):], columns=['data'])
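Note: the 70/20/10 split repeated after each dataset branch can be read as one helper; a small sketch, not part of the commit, restating the logic above:

# Sketch, not part of this commit: the 70/20/10 train/valid/test split used above.
import pandas as pd

def split_70_20_10(data):
    """Split a list of formatted examples into train/valid/test DataFrames."""
    n = len(data)
    train_ds = pd.DataFrame(data[:int(0.7 * n)], columns=['data'])
    valid_ds = pd.DataFrame(data[int(0.7 * n):int(0.9 * n)], columns=['data'])
    test_ds = pd.DataFrame(data[int(0.9 * n):], columns=['data'])
    return train_ds, valid_ds, test_ds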
11 changes: 9 additions & 2 deletions visualization/folder_names.py
@@ -5,6 +5,8 @@ def parse_file_name(dataset_name, exp_config):
if "before" in exp_config:
k = re.findall(".*-.*-(.*-.*)", exp_config)[0]
return f"before_exp-{k}_{dataset_name}"
elif "initial" in exp_config:
return f"{exp_config.replace('initial', 'before_exp')}_{dataset_name}"
return f"{exp_config}_{dataset_name}"

class FolderNames:
@@ -41,11 +43,13 @@ def __init__(self, model_name, cache_name="version_cache"):
self.subset_folder = os.path.join(self.main_folder, "utility")
self.select_it_subset_file = lambda dataset_name: os.path.join(self.subset_folder, f"select_it_subset_{dataset_name}.pkl")
self.model_dep_utility_file = lambda dataset_name: os.path.join(self.subset_folder, f"model_dep_utility_{dataset_name}.pkl")
self.superfiltering_utility_file = lambda dataset_name: os.path.join(self.subset_folder, f"superfiltering_utility_{dataset_name}.pkl")
self.model_ind_utility_file = lambda dataset_name: os.path.join(self.subset_folder, f"model_ind_utility_{dataset_name}.pkl")
if not os.path.exists(self.subset_folder): os.mkdir(self.subset_folder)

# store the knowledge after the experiments
self.exp_knowledge_file = lambda dataset_name, exp_config, prefix="": os.path.join(self.main_folder, "generated_text", f"{prefix}{parse_file_name(dataset_name, exp_config)}.pkl")
self.exp_prefix = ""
self.exp_knowledge_file = lambda dataset_name, exp_config, prefix="": os.path.join(self.main_folder, f"{self.exp_prefix}generated_text", f"{prefix}{parse_file_name(dataset_name, exp_config)}.pkl")
if not os.path.exists(os.path.join(self.main_folder, "generated_text")): os.mkdir(os.path.join(self.main_folder, "generated_text"))

# store individual similarity metrics
@@ -58,4 +62,7 @@ def __init__(self, model_name, cache_name="version_cache"):

# peft specific
self.peft_ft_model = lambda dataset_name, exp_config: os.path.join(self.main_folder, "peft_ft_models", parse_file_name(dataset_name, exp_config))
if not os.path.exists(os.path.join(self.main_folder, "peft_ft_models")): os.mkdir(os.path.join(self.main_folder, "peft_ft_models"))
if not os.path.exists(os.path.join(self.main_folder, "peft_ft_models")): os.mkdir(os.path.join(self.main_folder, "peft_ft_models"))

# less specific
self.less_subset_file = lambda model, dataset_name: f"/u/ishikaa2/selected_data/{model}-{dataset_name}_indicies.pkl"
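Note: parse_file_name now resolves three kinds of experiment codes; a quick illustration with hypothetical config strings (the "initial" branch added in this commit rewrites the prefix to before_exp):

# Illustration only; the config strings are examples, not cached values.
parse_file_name("mix-instruct", "mod_dep_fl-PEFT-0.3")
# -> "mod_dep_fl-PEFT-0.3_mix-instruct"        (default branch)
parse_file_name("mix-instruct", "initial-ICL-0.3")
# -> "before_exp-ICL-0.3_mix-instruct"         ("initial" branch added here)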
36 changes: 10 additions & 26 deletions visualization/lm_knowledge.py
@@ -2,6 +2,7 @@
import numpy as np
import evaluate
import torch
import similarity

rouge_metric = evaluate.load('rouge')
bleu_metric = evaluate.load('bleu')
@@ -14,38 +15,20 @@ def calculate_similarity(predictions, references, score="rouge", return_invidiua
Args:
predictions: list of strings for the hypothesis
references: list of strings for the reference
score: either "rouge" or "bleu" to calculate either metric
score: one of "rouge", "bleu", "bertscore", "bge", "promedeus"
return_invidiual: if True, it will return the individual scores for corresponding prediction-reference pairs
Returns:
np array of metrics of size 1x1 if return_invidiual is False, else 1x|predictions|
"""
if not return_invidiual:
predictions = [predictions]
references = [references]
if "rouge" in score or "bleu" in score or "bertscore" in score:
return similarity.calculate_evaluate_metric(predictions, references, score, return_invidiual)
elif "bge" in score:
return similarity.calculate_bge(predictions, references, return_invidiual)
else:
predictions = [[p] for p in predictions]
references = [[r] for r in references]
raise ValueError(f"Invalid similarity metric: {score}")


if score == "rouge":
sim_metric = rouge_metric
metric_key = "rouge1"
elif score == "bleu":
sim_metric = bleu_metric
metric_key = "bleu"
else:
sim_metric = bert_metric
metric_key = "f1"

# sim_metric = rouge_metric if score == "rouge" elif score == "bleu" bleu_metric else bert_metric
# metric_key = "rouge1" if score == "rouge" else "bleu"
metrics = []
for p, r in zip(predictions, references):
metrics.append(sim_metric.compute(predictions=p, references=r)[metric_key])
return np.array(metrics)


def perform_inference(model, tokenizer, prompts, references, batch_size=4):
def perform_inference(model, tokenizer, prompts, references, batch_size=2):
"""
Performs inference on prompts and computes the ROUGE between the generated text and corresponding reference
@@ -67,7 +50,8 @@ def perform_inference(model, tokenizer, prompts, references, batch_size=4):
all_gen_texts = []

# Process prompts in batches
for i in tqdm(range(0, len(prompts), batch_size)):
max_len = max(len(prompts), 200)
for i in tqdm(range(0, max_len, batch_size)):
batch_prompts = prompts[i:i+batch_size]
batch_references = references[i:i+batch_size]

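Note: calculate_similarity now dispatches to a separate similarity module that is not part of this diff. Below is a rough sketch of what an evaluate-based calculate_evaluate_metric could look like, mirroring the wrapping logic removed above; the function name and behaviour are assumptions about that module, not its actual contents.

# Sketch, not part of this commit: one plausible shape for
# similarity.calculate_evaluate_metric built on Hugging Face evaluate.
import evaluate
import numpy as np

def calculate_evaluate_metric(predictions, references, score, return_invidiual):
    name, key = (("rouge", "rouge1") if "rouge" in score
                 else ("bleu", "bleu") if "bleu" in score
                 else ("bertscore", "f1"))
    metric = evaluate.load(name)
    if not return_invidiual:
        predictions, references = [predictions], [references]      # one aggregate score
    else:
        predictions = [[p] for p in predictions]                    # one score per pair
        references = [[r] for r in references]
    kwargs = {"lang": "en"} if name == "bertscore" else {}
    scores = []
    for p, r in zip(predictions, references):
        value = metric.compute(predictions=p, references=r, **kwargs)[key]
        scores.append(float(np.mean(value)) if isinstance(value, list) else float(value))
    return np.array(scores)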
130 changes: 130 additions & 0 deletions visualization/load_results.py
@@ -0,0 +1,130 @@
from visualization import load_subset_experiment, calculate_test_performance
from data_object import DataObject, DataObjectConstants
from folder_names import FolderNames
from plotting import Plotting
from models import Models
import numpy as np
import traceback
import argparse
import pickle
import torch
import os

import wandb
wandb.login(anonymous='allow', key='fa20f5af73ec4d1dedb50a817a8de53fbef1bade')

def main(model_names, existing_data_name, new_data_name, threshold, subset_percentage):
run = wandb.init(
project="Optimizing Data Selection",
)

# all experimental configurations
uc_labels = ["Model Dependent + CG FL", "SelectIT", "Model Independent + CG FL", "Random", "Full Dataset"]
ucl_shorthand = ["mod_dep_fl", "select_it", "mod_ind_fl", "random", "full_data"]
# uc_labels = ["SelectIT", "Model Independent + CG FL", "Random", "Full Dataset"]
# ucl_shorthand = ["select_it", "mod_ind_fl", "random", "full_data"]
# uc_labels = ["Model Dependent ICL Utility", "Model Dependent Gradient Utility", "Model Independent", "Random", "Full Dataset"]
# ucl_shorthand = ["mod_dep_icl", "mod_dep_grad", "mod_ind", "random", "full_data"]
sl_labels = ["ICL", "PEFT"]

# loop through each of the model names
for model_name in model_names:
if existing_data_name == new_data_name:
fn = FolderNames(model_name, "same_data_cache")
elif "benchmark" in new_data_name:
fn = FolderNames(model_name, "benchmark_cache")
else:
fn = FolderNames(model_name, "version_cache")

models = Models(language_model_name=model_name)

with open(fn.visualization_cache_file, 'rb') as f:
vis_dims, all_data = pickle.load(f)

labels = [label.split('.')[0] for label in os.listdir(fn.dataset_pkl_folder) if 'all_data' not in label]
existing_data_ind = labels.index(existing_data_name)
new_data_ind = labels.index(new_data_name)

# set up training and validation sets for the DataObject instance
num_exist_train, num_new_train = len(all_data[existing_data_ind][0]), len(all_data[new_data_ind][0])
num_exist_valid, num_new_valid = len(all_data[existing_data_ind][1]), len(all_data[new_data_ind][1])
exist_point_labels = [np.array([f"{existing_data_ind}-{i}" for i in range(len(all_data[existing_data_ind][0]))]),
np.array([f"{existing_data_ind}-{num_exist_train+i}" for i in range(len(all_data[existing_data_ind][1]))]),
np.array([f"{existing_data_ind}-{num_exist_train+num_exist_valid+i}" for i in range(len(all_data[existing_data_ind][2]))]),]
new_point_labels = [np.array([f"{new_data_ind}-{i}" for i in range(len(all_data[new_data_ind][0]))]),
np.array([f"{new_data_ind}-{num_new_train+i}" for i in range(len(all_data[new_data_ind][1]))]),
np.array([f"{new_data_ind}-{num_new_train+num_new_valid+i}" for i in range(len(all_data[new_data_ind][2]))])]

# create a DataObject instance
# lim = 200 if "Qwen" in model_name else 100000
if "Qwen" in model_name:
lim = 200
if existing_data_name == new_data_name:
fn.exp_prefix = "200"
else:
lim = 200
if existing_data_name == new_data_name:
data = DataObject([existing_data_name], [existing_data_ind], [new_data_name], [new_data_ind], [all_data[existing_data_ind][0]], [vis_dims[existing_data_ind][0]], [exist_point_labels[0]],
[all_data[new_data_ind][1]], [vis_dims[new_data_ind][1]], [new_point_labels[1]],
case=DataObjectConstants.DATA_OBJECT_SAME_DATSET, lim=lim)
elif "benchmark" in new_data_name:
data = DataObject(existing_data_name, existing_data_ind, new_data_name, new_data_ind, all_data[existing_data_ind], vis_dims[existing_data_ind], exist_point_labels,
all_data[new_data_ind], vis_dims[new_data_ind], new_point_labels,
case=DataObjectConstants.DATA_OBJECT_BENCHMARK, lim=lim)
else:
data = DataObject(existing_data_name, existing_data_ind, new_data_name, new_data_ind, all_data[existing_data_ind], vis_dims[existing_data_ind], exist_point_labels,
all_data[new_data_ind], vis_dims[new_data_ind], new_point_labels,
case=DataObjectConstants.DATA_OBJECT_NEW_VERSION, lim=lim)

# define the dataset configuration code (a code that indicates the combination of datasets one is using)
dataset_config_code = fn.dataset_config_file_code(existing_data_name, new_data_name)
data.set_dataset_config_code(dataset_config_code)

# create a Plotting instance
plotting = Plotting(data, labels, models, fn)

# loop through all combinations of experiments
for subset_learning in reversed(sl_labels):
for utility_criteria in (uc_labels + ["initial"]):

# define the experiment configuration (a shorthand code that helps store experiment results in the cache)
# exp_config = ucl_shorthand[uc_labels.index(utility_criteria)] + "-" + subset_learning + "-" + str(subset_percentage)
# print('NEW EXPERIMENT\n', exp_config, utility_criteria, '\n\n\n\n')
# load_subset_experiment(existing_data_name, existing_data_ind, new_data_name, new_data_ind, exp_config, utility_criteria, subset_learning,
# subset_percentage, threshold, labels, data, plotting, models, fn)
# calculate_test_performance(all_data[new_data_ind][2], data, exp_config, models, fn, score="rouge")
# try:
if "initial" in utility_criteria:
exp_config = utility_criteria + "-" + subset_learning + "-" + str(subset_percentage)
else:
exp_config = ucl_shorthand[uc_labels.index(utility_criteria)] + "-" + subset_learning + "-" + str(subset_percentage)
print('NEW EXPERIMENT\n', exp_config, utility_criteria, '\n\n\n\n')
# load_subset_experiment(existing_data_name, existing_data_ind, new_data_name, new_data_ind, exp_config, utility_criteria, subset_learning,
# subset_percentage, threshold, labels, data, plotting, models, fn)

rouge_val, _ = calculate_test_performance(all_data[new_data_ind][1], data, exp_config, models, fn, score="rouge")
bleu_val, _ = calculate_test_performance(all_data[new_data_ind][1], data, exp_config, models, fn, score="bge")
# bert_val = [-100.0]
# bert_val, _ = calculate_test_performance(all_data[new_data_ind][1], data, exp_config, models, fn, score="bertscore")

my_table = wandb.Table(columns=['ROUGE', 'BGE']) #, 'BERTScore'])
my_table.add_data(rouge_val[0], bleu_val[0]) #, bert_val[0])
run.log({f"{data.use_case} - {model_name}, {exp_config}": my_table})
# except Exception as e:
# with open('failures.txt', 'a+') as f:
# f.write(f'{exp_config} on {existing_data_name} and {new_data_name}')
# f.write('\n\n')
# f.write(str(traceback.format_exc()))
# f.write('\n---------------------------------------------------------------------\n')

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--threshold", type=float, default=0.7)
parser.add_argument("--subset_percentage", type=float, default=0.3)
parser.add_argument("--existing_data_name", type=str, default="mix-instruct")
parser.add_argument("--new_data_name", type=str, default="mix-instruct")
parser.add_argument("--model_name", type=str, default="Qwen/Qwen2-7B-Instruct") #microsoft/Phi-3-mini-128k-instruct Qwen/Qwen2-7B-Instruct
args = parser.parse_args()

main([args.model_name], args.existing_data_name, args.new_data_name, args.threshold, args.subset_percentage)
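Note: every cache file and wandb table above is keyed by an exp_config shorthand assembled from the utility criterion, the subset-learning method, and the subset fraction. Pulled out as a tiny helper purely for illustration (not part of the commit):

# Illustration only: how the exp_config strings are assembled in the loop above.
uc_labels = ["Model Dependent + CG FL", "SelectIT", "Model Independent + CG FL", "Random", "Full Dataset"]
ucl_shorthand = ["mod_dep_fl", "select_it", "mod_ind_fl", "random", "full_data"]

def build_exp_config(utility_criteria, subset_learning, subset_percentage):
    if "initial" in utility_criteria:
        return f"{utility_criteria}-{subset_learning}-{subset_percentage}"
    return f"{ucl_shorthand[uc_labels.index(utility_criteria)]}-{subset_learning}-{subset_percentage}"

# build_exp_config("SelectIT", "PEFT", 0.3)  -> "select_it-PEFT-0.3"
# build_exp_config("initial", "ICL", 0.3)    -> "initial-ICL-0.3"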
