experimentation
Ishika Agarwal committed Sep 23, 2024
1 parent 48752ed commit 265a491
Showing 9 changed files with 293 additions and 65 deletions.
21 changes: 14 additions & 7 deletions subset_selection/src/utils/dist_utils/select_it_baseline.py
@@ -86,7 +86,7 @@ def construction_rps(self, prompts, references):
# return token_level_score

def sentence_level_self_reflection(self, prompts, references, alpha=0.2, k=5):
self.model.to('cuda')
# self.model.to('cuda')
rps = self.construction_rps(prompts, references)
pro = []
for idx, p in enumerate(rps):
@@ -99,14 +99,21 @@ def sentence_level_self_reflection(self, prompts, references, alpha=0.2, k=5):
predictions = outputs[0]
logits = predictions[:, -1, :]
softmax_logits = torch.softmax(logits.float(), dim=-1)
for index in range(1):
tmp_res = [float(softmax_logits[index][29896]), float(softmax_logits[index][29906]),
float(softmax_logits[index][29941]), float(softmax_logits[index][29946]),
float(softmax_logits[index][29945])]
pro.append(tmp_res)
if "Phi" in str(type(self.model)):
for index in range(1):
tmp_res = [float(softmax_logits[index][29896]), float(softmax_logits[index][29906]),
float(softmax_logits[index][29941]), float(softmax_logits[index][29946]),
float(softmax_logits[index][29945])]
pro.append(tmp_res)
elif "Qwen" in str(type(self.model)):
for index in range(1):
tmp_res = [float(softmax_logits[index][16]), float(softmax_logits[index][17]),
float(softmax_logits[index][18]), float(softmax_logits[index][19]),
float(softmax_logits[index][20])]
pro.append(tmp_res)
except Exception as ex:
print(ex)
self.model.to('cpu')
# self.model.to('cpu')
pro_softmax = []
for item in pro:
tmp_pro_softmax = item
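Note: the hardcoded indices above are the vocabulary IDs of the rating tokens "1" through "5" (29896, 29906, 29941, 29946, 29945 in the Llama/Phi-3 vocabulary; 16-20 in Qwen2's), which is why the commit branches on the model class. A tokenizer-agnostic sketch, not part of this commit and assuming each rating string encodes to a single trailing token, would look the IDs up at runtime:

# Sketch, not part of this commit: resolve the rating-token IDs from the tokenizer
# instead of hardcoding per-model vocabulary indices.
from transformers import AutoTokenizer

def rating_token_ids(tokenizer, ratings=("1", "2", "3", "4", "5")):
    """One vocabulary ID per rating string (last piece, in case a prefix token is added)."""
    return [tokenizer.encode(r, add_special_tokens=False)[-1] for r in ratings]

def rating_probs(softmax_logits, token_ids, index=0):
    """Probability mass the model assigns to each rating token at the scored position."""
    return [float(softmax_logits[index][tid]) for tid in token_ids]

# usage sketch:
# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
# pro.append(rating_probs(softmax_logits, rating_token_ids(tokenizer)))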
11 changes: 4 additions & 7 deletions subset_selection/subset_fl.py
@@ -14,9 +14,8 @@ def create_subset(self, data_sijs, k=0.3):
n, _ = data_sijs.shape

# use facility location to find subset
fl = sb.functions.facilityLocation.FacilityLocationFunction(n, mode='dense', sijs=data_sijs)
subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=True, verbose=True)

fl = sb.functions.facilityLocation.FacilityLocationFunction(n, mode='dense', sijs=data_sijs, separate_rep=False)
subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=False, verbose=True)
return subset

def create_conditional_gain_subset(self, data_sijs, private_sijs, k=0.3):
@@ -28,8 +27,7 @@ def create_conditional_gain_subset(self, data_sijs, private_sijs, k=0.3):

# use facility location to find subset
fl = sb.functions.facilityLocationConditionalGain.FacilityLocationConditionalGainFunction(n, num_privates, data_sijs=data_sijs, private_sijs=private_sijs)
subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=True, verbose=True)

subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=False, verbose=True)
return subset

def create_mutual_information_subset(self, data_sijs, query_sijs, k=0.3):
@@ -41,8 +39,7 @@ def create_mutual_information_subset(self, data_sijs, query_sijs, k=0.3):

# use facility location to find subset
fl = sb.functions.facilityLocationMutualInformation.FacilityLocationMutualInformationFunction(n, num_privates, data_sijs=data_sijs, query_sijs=query_sijs)
subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=True, verbose=True)

subset = fl.maximize(budget=int(k * n), optimizer='LazyGreedy', stopIfZeroGain=False, stopIfNegativeGain=False, verbose=True)
return subset


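Note: both flags changed here (separate_rep and stopIfNegativeGain) are passed straight through to submodlib's lazy-greedy maximizer; the selectors themselves only need a dense pairwise similarity kernel. A minimal usage sketch, not part of this commit, assuming a selector instance exposing create_subset as above and that maximize() returns (index, gain) pairs:

# Sketch, not part of this commit: build a dense cosine-similarity kernel and
# read the selected row indices back out of the (index, gain) pairs.
import numpy as np

def cosine_kernel(embeddings: np.ndarray) -> np.ndarray:
    """Pairwise cosine similarities for an (n, d) embedding matrix."""
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit = embeddings / np.clip(norms, 1e-12, None)
    return unit @ unit.T

# data_sijs = cosine_kernel(embeddings)               # embeddings: (n, d) array from any encoder
# subset = selector.create_subset(data_sijs, k=0.3)   # hypothetical SubsetFL-style instance
# chosen = [idx for idx, gain in subset]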
15 changes: 11 additions & 4 deletions visualization/create_embeddings.py
@@ -145,16 +145,22 @@ def parse_qa_datasets():

input = ""
temp = i['supporting_facts']
keep = True
for x in range(len(temp['title'])):
ind = i['context']['title'].index(temp['title'][x])
input += i['context']['sentences'][ind][temp['sent_id'][x]] + " "
try:
input += i['context']['sentences'][ind][temp['sent_id'][x]] + " "
except:
keep = False

data.append(f"Instruction:\nContext: {instruction}\nInput:\n{input}\nOutput:\n{output}\n")
if keep:
data.append(f"Instruction:\nContext: {instruction}\nInput:\n{input}\nOutput:\n{output}\n")

if len(data) > 10:
if len(data) > 1000:
break
break

x = len(data)
train_ds = pd.DataFrame(data[:int(0.7*x)], columns=['data'])
valid_ds = pd.DataFrame(data[int(0.7*x):int(0.9*x)], columns=['data'])
test_ds = pd.DataFrame(data[int(0.9*x):], columns=['data'])
@@ -170,10 +176,11 @@ def parse_qa_datasets():

data.append(f"Instruction:\nContext: {instruction}\nInput:\n{input}\nOutput:\n{output}\n")

if len(data) > 10:
if len(data) > 1000:
break
break

x = len(data)
train_ds = pd.DataFrame(data[:int(0.7*x)], columns=['data'])
valid_ds = pd.DataFrame(data[int(0.7*x):int(0.9*x)], columns=['data'])
test_ds = pd.DataFrame(data[int(0.9*x):], columns=['data'])
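Note: the 70/20/10 split repeated after each dataset branch can be read as one helper; a small sketch, not part of the commit, restating the logic above:

# Sketch, not part of this commit: the 70/20/10 train/valid/test split used above.
import pandas as pd

def split_70_20_10(data):
    """Split a list of formatted examples into train/valid/test DataFrames."""
    n = len(data)
    train_ds = pd.DataFrame(data[:int(0.7 * n)], columns=['data'])
    valid_ds = pd.DataFrame(data[int(0.7 * n):int(0.9 * n)], columns=['data'])
    test_ds = pd.DataFrame(data[int(0.9 * n):], columns=['data'])
    return train_ds, valid_ds, test_ds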
11 changes: 9 additions & 2 deletions visualization/folder_names.py
@@ -5,6 +5,8 @@ def parse_file_name(dataset_name, exp_config):
if "before" in exp_config:
k = re.findall(".*-.*-(.*-.*)", exp_config)[0]
return f"before_exp-{k}_{dataset_name}"
elif "initial" in exp_config:
return f"{exp_config.replace('initial', 'before_exp')}_{dataset_name}"
return f"{exp_config}_{dataset_name}"

class FolderNames:
@@ -41,11 +43,13 @@ def __init__(self, model_name, cache_name="version_cache"):
self.subset_folder = os.path.join(self.main_folder, "utility")
self.select_it_subset_file = lambda dataset_name: os.path.join(self.subset_folder, f"select_it_subset_{dataset_name}.pkl")
self.model_dep_utility_file = lambda dataset_name: os.path.join(self.subset_folder, f"model_dep_utility_{dataset_name}.pkl")
self.superfiltering_utility_file = lambda dataset_name: os.path.join(self.subset_folder, f"superfiltering_utility_{dataset_name}.pkl")
self.model_ind_utility_file = lambda dataset_name: os.path.join(self.subset_folder, f"model_ind_utility_{dataset_name}.pkl")
if not os.path.exists(self.subset_folder): os.mkdir(self.subset_folder)

# store the knowledge after the experiments
self.exp_knowledge_file = lambda dataset_name, exp_config, prefix="": os.path.join(self.main_folder, "generated_text", f"{prefix}{parse_file_name(dataset_name, exp_config)}.pkl")
self.exp_prefix = ""
self.exp_knowledge_file = lambda dataset_name, exp_config, prefix="": os.path.join(self.main_folder, f"{self.exp_prefix}generated_text", f"{prefix}{parse_file_name(dataset_name, exp_config)}.pkl")
if not os.path.exists(os.path.join(self.main_folder, "generated_text")): os.mkdir(os.path.join(self.main_folder, "generated_text"))

# store individual similarity metrics
@@ -58,4 +62,7 @@ def __init__(self, model_name, cache_name="version_cache"):

# peft specific
self.peft_ft_model = lambda dataset_name, exp_config: os.path.join(self.main_folder, "peft_ft_models", parse_file_name(dataset_name, exp_config))
if not os.path.exists(os.path.join(self.main_folder, "peft_ft_models")): os.mkdir(os.path.join(self.main_folder, "peft_ft_models"))
if not os.path.exists(os.path.join(self.main_folder, "peft_ft_models")): os.mkdir(os.path.join(self.main_folder, "peft_ft_models"))

# less specific
self.less_subset_file = lambda model, dataset_name: f"/u/ishikaa2/selected_data/{model}-{dataset_name}_indicies.pkl"
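Note: parse_file_name now resolves three kinds of experiment codes; a quick illustration with hypothetical config strings (the "initial" branch added in this commit rewrites the prefix to before_exp):

# Illustration only; the config strings are examples, not cached values.
parse_file_name("mix-instruct", "mod_dep_fl-PEFT-0.3")
# -> "mod_dep_fl-PEFT-0.3_mix-instruct"        (default branch)
parse_file_name("mix-instruct", "initial-ICL-0.3")
# -> "before_exp-ICL-0.3_mix-instruct"         ("initial" branch added here)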
36 changes: 10 additions & 26 deletions visualization/lm_knowledge.py
@@ -2,6 +2,7 @@
import numpy as np
import evaluate
import torch
import similarity

rouge_metric = evaluate.load('rouge')
bleu_metric = evaluate.load('bleu')
@@ -14,38 +15,20 @@ def calculate_similarity(predictions, references, score="rouge", return_invidiua
Args:
predictions: list of strings for the hypothesis
references: list of strings for the reference
score: either "rouge" or "bleu" to calculate either metric
score: one of "rouge", "bleu", "bertscore", "bge", "promedeus"
return_invidiual: if True, it will return the individual scores for corresponding prediction-reference pairs
Returns:
np array of metrics of size 1x1 if return_invidiual is False, else 1x|predictions|
"""
if not return_invidiual:
predictions = [predictions]
references = [references]
if "rouge" in score or "bleu" in score or "bertscore" in score:
return similarity.calculate_evaluate_metric(predictions, references, score, return_invidiual)
elif "bge" in score:
return similarity.calculate_bge(predictions, references, return_invidiual)
else:
predictions = [[p] for p in predictions]
references = [[r] for r in references]
raise ValueError(f"Invalid similarity metric: {score}")


if score == "rouge":
sim_metric = rouge_metric
metric_key = "rouge1"
elif score == "bleu":
sim_metric = bleu_metric
metric_key = "bleu"
else:
sim_metric = bert_metric
metric_key = "f1"

# sim_metric = rouge_metric if score == "rouge" elif score == "bleu" bleu_metric else bert_metric
# metric_key = "rouge1" if score == "rouge" else "bleu"
metrics = []
for p, r in zip(predictions, references):
metrics.append(sim_metric.compute(predictions=p, references=r)[metric_key])
return np.array(metrics)


def perform_inference(model, tokenizer, prompts, references, batch_size=4):
def perform_inference(model, tokenizer, prompts, references, batch_size=2):
"""
Performs inference on prompts and computes the ROUGE between the generated text and corresponding reference
@@ -67,7 +50,8 @@ def perform_inference(model, tokenizer, prompts, references, batch_size=4):
all_gen_texts = []

# Process prompts in batches
for i in tqdm(range(0, len(prompts), batch_size)):
max_len = max(len(prompts), 200)
for i in tqdm(range(0, max_len, batch_size)):
batch_prompts = prompts[i:i+batch_size]
batch_references = references[i:i+batch_size]

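Note: calculate_similarity now dispatches to a separate similarity module that is not part of this diff. Below is a rough sketch of what an evaluate-based calculate_evaluate_metric could look like, mirroring the wrapping logic removed above; the function name and behaviour are assumptions about that module, not its actual contents.

# Sketch, not part of this commit: one plausible shape for
# similarity.calculate_evaluate_metric built on Hugging Face evaluate.
import evaluate
import numpy as np

def calculate_evaluate_metric(predictions, references, score, return_invidiual):
    name, key = (("rouge", "rouge1") if "rouge" in score
                 else ("bleu", "bleu") if "bleu" in score
                 else ("bertscore", "f1"))
    metric = evaluate.load(name)
    if not return_invidiual:
        predictions, references = [predictions], [references]      # one aggregate score
    else:
        predictions = [[p] for p in predictions]                    # one score per pair
        references = [[r] for r in references]
    kwargs = {"lang": "en"} if name == "bertscore" else {}
    scores = []
    for p, r in zip(predictions, references):
        value = metric.compute(predictions=p, references=r, **kwargs)[key]
        scores.append(float(np.mean(value)) if isinstance(value, list) else float(value))
    return np.array(scores)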
130 changes: 130 additions & 0 deletions visualization/load_results.py
@@ -0,0 +1,130 @@
from visualization import load_subset_experiment, calculate_test_performance
from data_object import DataObject, DataObjectConstants
from folder_names import FolderNames
from plotting import Plotting
from models import Models
import numpy as np
import traceback
import argparse
import pickle
import torch
import os

import wandb
wandb.login(anonymous='allow', key='fa20f5af73ec4d1dedb50a817a8de53fbef1bade')

def main(model_names, existing_data_name, new_data_name, threshold, subset_percentage):
run = wandb.init(
project="Optimizing Data Selection",
)

# all experimental configurations
uc_labels = ["Model Dependent + CG FL", "SelectIT", "Model Independent + CG FL", "Random", "Full Dataset"]
ucl_shorthand = ["mod_dep_fl", "select_it", "mod_ind_fl", "random", "full_data"]
# uc_labels = ["SelectIT", "Model Independent + CG FL", "Random", "Full Dataset"]
# ucl_shorthand = ["select_it", "mod_ind_fl", "random", "full_data"]
# uc_labels = ["Model Dependent ICL Utility", "Model Dependent Gradient Utility", "Model Independent", "Random", "Full Dataset"]
# ucl_shorthand = ["mod_dep_icl", "mod_dep_grad", "mod_ind", "random", "full_data"]
sl_labels = ["ICL", "PEFT"]

# loop through each of the model names
for model_name in model_names:
if existing_data_name == new_data_name:
fn = FolderNames(model_name, "same_data_cache")
elif "benchmark" in new_data_name:
fn = FolderNames(model_name, "benchmark_cache")
else:
fn = FolderNames(model_name, "version_cache")

models = Models(language_model_name=model_name)

with open(fn.visualization_cache_file, 'rb') as f:
vis_dims, all_data = pickle.load(f)

labels = [label.split('.')[0] for label in os.listdir(fn.dataset_pkl_folder) if 'all_data' not in label]
existing_data_ind = labels.index(existing_data_name)
new_data_ind = labels.index(new_data_name)

# set up training and validation sets for the DataObject instance
num_exist_train, num_new_train = len(all_data[existing_data_ind][0]), len(all_data[new_data_ind][0])
num_exist_valid, num_new_valid = len(all_data[existing_data_ind][1]), len(all_data[new_data_ind][1])
exist_point_labels = [np.array([f"{existing_data_ind}-{i}" for i in range(len(all_data[existing_data_ind][0]))]),
np.array([f"{existing_data_ind}-{num_exist_train+i}" for i in range(len(all_data[existing_data_ind][1]))]),
np.array([f"{existing_data_ind}-{num_exist_train+num_exist_valid+i}" for i in range(len(all_data[existing_data_ind][2]))]),]
new_point_labels = [np.array([f"{new_data_ind}-{i}" for i in range(len(all_data[new_data_ind][0]))]),
np.array([f"{new_data_ind}-{num_new_train+i}" for i in range(len(all_data[new_data_ind][1]))]),
np.array([f"{new_data_ind}-{num_new_train+num_new_valid+i}" for i in range(len(all_data[new_data_ind][2]))])]

# create a DataObject instance
# lim = 200 if "Qwen" in model_name else 100000
if "Qwen" in model_name:
lim = 200
if existing_data_name == new_data_name:
fn.exp_prefix = "200"
else:
lim = 200
if existing_data_name == new_data_name:
data = DataObject([existing_data_name], [existing_data_ind], [new_data_name], [new_data_ind], [all_data[existing_data_ind][0]], [vis_dims[existing_data_ind][0]], [exist_point_labels[0]],
[all_data[new_data_ind][1]], [vis_dims[new_data_ind][1]], [new_point_labels[1]],
case=DataObjectConstants.DATA_OBJECT_SAME_DATSET, lim=lim)
elif "benchmark" in new_data_name:
data = DataObject(existing_data_name, existing_data_ind, new_data_name, new_data_ind, all_data[existing_data_ind], vis_dims[existing_data_ind], exist_point_labels,
all_data[new_data_ind], vis_dims[new_data_ind], new_point_labels,
case=DataObjectConstants.DATA_OBJECT_BENCHMARK, lim=lim)
else:
data = DataObject(existing_data_name, existing_data_ind, new_data_name, new_data_ind, all_data[existing_data_ind], vis_dims[existing_data_ind], exist_point_labels,
all_data[new_data_ind], vis_dims[new_data_ind], new_point_labels,
case=DataObjectConstants.DATA_OBJECT_NEW_VERSION, lim=lim)

# define the dataset configuration code (a code that indicates the combination of datasets one is using)
dataset_config_code = fn.dataset_config_file_code(existing_data_name, new_data_name)
data.set_dataset_config_code(dataset_config_code)

# create a Plotting instance
plotting = Plotting(data, labels, models, fn)

# loop through all combinations of experiments
for subset_learning in reversed(sl_labels):
for utility_criteria in (uc_labels + ["initial"]):

# define the experiment configuration (a shorthand code that helps store experiment results in the cache)
# exp_config = ucl_shorthand[uc_labels.index(utility_criteria)] + "-" + subset_learning + "-" + str(subset_percentage)
# print('NEW EXPERIMENT\n', exp_config, utility_criteria, '\n\n\n\n')
# load_subset_experiment(existing_data_name, existing_data_ind, new_data_name, new_data_ind, exp_config, utility_criteria, subset_learning,
# subset_percentage, threshold, labels, data, plotting, models, fn)
# calculate_test_performance(all_data[new_data_ind][2], data, exp_config, models, fn, score="rouge")
# try:
if "initial" in utility_criteria:
exp_config = utility_criteria + "-" + subset_learning + "-" + str(subset_percentage)
else:
exp_config = ucl_shorthand[uc_labels.index(utility_criteria)] + "-" + subset_learning + "-" + str(subset_percentage)
print('NEW EXPERIMENT\n', exp_config, utility_criteria, '\n\n\n\n')
# load_subset_experiment(existing_data_name, existing_data_ind, new_data_name, new_data_ind, exp_config, utility_criteria, subset_learning,
# subset_percentage, threshold, labels, data, plotting, models, fn)

rouge_val, _ = calculate_test_performance(all_data[new_data_ind][1], data, exp_config, models, fn, score="rouge")
bleu_val, _ = calculate_test_performance(all_data[new_data_ind][1], data, exp_config, models, fn, score="bge")
# bert_val = [-100.0]
# bert_val, _ = calculate_test_performance(all_data[new_data_ind][1], data, exp_config, models, fn, score="bertscore")

my_table = wandb.Table(columns=['ROUGE', 'BGE']) #, 'BERTScore'])
my_table.add_data(rouge_val[0], bleu_val[0]) #, bert_val[0])
run.log({f"{data.use_case} - {model_name}, {exp_config}": my_table})
# except Exception as e:
# with open('failures.txt', 'a+') as f:
# f.write(f'{exp_config} on {existing_data_name} and {new_data_name}')
# f.write('\n\n')
# f.write(str(traceback.format_exc()))
# f.write('\n---------------------------------------------------------------------\n')

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--threshold", type=float, default=0.7)
parser.add_argument("--subset_percentage", type=float, default=0.3)
parser.add_argument("--existing_data_name", type=str, default="mix-instruct")
parser.add_argument("--new_data_name", type=str, default="mix-instruct")
parser.add_argument("--model_name", type=str, default="Qwen/Qwen2-7B-Instruct") #microsoft/Phi-3-mini-128k-instruct Qwen/Qwen2-7B-Instruct
args = parser.parse_args()

main([args.model_name], args.existing_data_name, args.new_data_name, args.threshold, args.subset_percentage)
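Note: every cache file and wandb table above is keyed by an exp_config shorthand assembled from the utility criterion, the subset-learning method, and the subset fraction. Pulled out as a tiny helper purely for illustration (not part of the commit):

# Illustration only: how the exp_config strings are assembled in the loop above.
uc_labels = ["Model Dependent + CG FL", "SelectIT", "Model Independent + CG FL", "Random", "Full Dataset"]
ucl_shorthand = ["mod_dep_fl", "select_it", "mod_ind_fl", "random", "full_data"]

def build_exp_config(utility_criteria, subset_learning, subset_percentage):
    if "initial" in utility_criteria:
        return f"{utility_criteria}-{subset_learning}-{subset_percentage}"
    return f"{ucl_shorthand[uc_labels.index(utility_criteria)]}-{subset_learning}-{subset_percentage}"

# build_exp_config("SelectIT", "PEFT", 0.3)  -> "select_it-PEFT-0.3"
# build_exp_config("initial", "ICL", 0.3)    -> "initial-ICL-0.3"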
