diff --git a/.env.template b/.env.template
new file mode 100644
index 0000000..04113da
--- /dev/null
+++ b/.env.template
@@ -0,0 +1,3 @@
+GOOGLE_API_KEY=""
+OPENAI_API_KEY=""
+HF_TOKEN=""
\ No newline at end of file
diff --git a/kvcache.py b/kvcache.py
new file mode 100644
index 0000000..0ec4d7f
--- /dev/null
+++ b/kvcache.py
@@ -0,0 +1,511 @@
+import torch
+import pandas as pd
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from transformers.cache_utils import DynamicCache
+import argparse
+import os
+from time import time
+import json
+import random
+
+def get_env():
+    env_dict = {}
+    with open(file=".env" if os.path.exists(".env") else "env", mode="r") as f:
+        for line in f:
+            # Split on the first "=" only, so values containing "=" survive
+            key, value = line.strip().split("=", 1)
+            env_dict[key] = value.strip('"')
+    return env_dict
+
+"""Hugging Face Llama model"""
+HF_TOKEN = get_env()["HF_TOKEN"]
+
+# Module-level handles; assigned in __main__ before any test runs
+model_name, model, tokenizer = None, None, None
+rand_seed = None
+
+def generate(
+    model,
+    input_ids: torch.Tensor,
+    past_key_values,
+    max_new_tokens: int = 300
+) -> torch.Tensor:
+    """
+    Greedy decoding with proper device handling for HuggingFace models
+    loaded with device_map="auto".
+
+    Args:
+        model: HuggingFace model with automatic device mapping
+        input_ids: Input token ids
+        past_key_values: Previous KV cache
+        max_new_tokens: Maximum number of new tokens to generate
+    """
+    # Get the device of the embedding layer
+    embed_device = model.model.embed_tokens.weight.device
+
+    origin_ids = input_ids
+    # Move input to the same device as the embedding layer
+    input_ids = input_ids.to(embed_device)
+
+    # Initialize output tensor on the embedding device
+    output_ids = input_ids.clone()
+    next_token = input_ids
+
+    # Main generation loop
+    with torch.no_grad():
+        for _ in range(max_new_tokens):
+            # The first step processes the full prompt; afterwards only the
+            # single most recent token, with the KV cache covering the rest
+            outputs = model(
+                input_ids=next_token,
+                past_key_values=past_key_values,
+                use_cache=True
+            )
+
+            # Greedy next-token prediction (logits live on the last device)
+            next_token_logits = outputs.logits[:, -1, :]
+            next_token = next_token_logits.argmax(dim=-1).unsqueeze(-1)
+
+            # Move next token to the embedding device for the next iteration
+            next_token = next_token.to(embed_device)
+
+            # Update KV cache
+            past_key_values = outputs.past_key_values
+
+            # Append prediction
+            output_ids = torch.cat([output_ids, next_token], dim=1)
+
+            # Stop at EOS; eos_token_id may be a single id or a list of ids
+            eos_ids = model.config.eos_token_id
+            if isinstance(eos_ids, int):
+                eos_ids = [eos_ids]
+            if next_token.item() in eos_ids:
+                break
+    # Return only the newly generated tokens
+    return output_ids[:, origin_ids.shape[-1]:]
+
+# Example usage:
+# model = AutoModelForCausalLM.from_pretrained("your-model")
+# tokenizer = AutoTokenizer.from_pretrained("your-model")
+# input_text = "Your input text here"
+# input_ids = tokenizer.encode(input_text, return_tensors="pt")
+# output_ids = generate(model, input_ids, past_key_values=None)
+# output_text = tokenizer.decode(output_ids[0])
+
+"""KV Cache test"""
+# Allowlist the DynamicCache class so torch.load(weights_only=True) accepts it
+torch.serialization.add_safe_globals([DynamicCache])
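+# The pickled cache may also reference plain Python `set` objects, so `set`
+# has to be allowlisted as well before torch.load(weights_only=True) will
+# accept the cache file.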
+torch.serialization.add_safe_globals([set])
+
+def get_kv_cache(
+    model,
+    tokenizer,
+    prompt: str,
+) -> DynamicCache:
+    """
+    Prepare KV cache for a model distributed across multiple GPUs using device_map="auto"
+
+    Args:
+        model: HuggingFace model with automatic device mapping
+        tokenizer: HuggingFace tokenizer
+        prompt: Input text to generate KV cache for
+
+    Returns:
+        DynamicCache: Distributed KV cache
+    """
+    # Get embedding layer device
+    embed_device = model.model.embed_tokens.weight.device
+
+    # Encode and move input to embedding device
+    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(embed_device)
+
+    # Initialize dynamic cache
+    past_key_values = DynamicCache()
+
+    # Generate KV cache with proper device placement
+    with torch.no_grad():
+        outputs = model(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            use_cache=True,
+            output_attentions=False,
+            output_hidden_states=False
+        )
+
+    # The model's device mapping will automatically place each layer's
+    # KV cache on the correct device
+    return outputs.past_key_values
+
+def write_kv_cache(kv: DynamicCache, path: str):
+    torch.save(kv, path)
+
+def clean_up(kv: DynamicCache, origin_len: int):
+    # Truncate every layer's keys and values back to the original prompt
+    # length, discarding entries appended during the previous generation
+    for i in range(len(kv.key_cache)):
+        kv.key_cache[i] = kv.key_cache[i][:, :, :origin_len, :]
+        kv.value_cache[i] = kv.value_cache[i][:, :, :origin_len, :]
+
+def read_kv_cache(path: str) -> DynamicCache:
+    kv = torch.load(path, weights_only=True)
+    return kv
+
+"""Sentence-BERT for evaluating semantic similarity"""
+from sentence_transformers import SentenceTransformer, util
+bert_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight sentence-transformer
+
+def get_bert_similarity(response, ground_truth):
+    # Encode the response and the ground truth
+    query_embedding = bert_model.encode(response, convert_to_tensor=True)
+    text_embedding = bert_model.encode(ground_truth, convert_to_tensor=True)
+
+    # Compute the cosine similarity between the two embeddings
+    cosine_score = util.pytorch_cos_sim(query_embedding, text_embedding)
+
+    return cosine_score.item()
+
+def prepare_kvcache(documents, filepath: str = "./data_cache/cache_knowledges.pt", answer_instruction: str = None):
+    # Prepare the knowledge KV cache
+    if answer_instruction is None:
+        answer_instruction = "Answer the question with a super short answer."
+    knowledges = f"""
+    <|begin_of_text|>
+    <|start_header_id|>system<|end_header_id|>
+    You are an assistant for giving short answers based on given context.<|eot_id|>
+    <|start_header_id|>user<|end_header_id|>
+    Context information is below.
+    ------------------------------------------------
+    {documents}
+    ------------------------------------------------
+    {answer_instruction}
+    Question:
+    """
+    # Get the knowledge cache
+    t1 = time()
+    try:
+        kv = get_kv_cache(model, tokenizer, knowledges)
+        print("kvlen: ", kv.key_cache[0].shape[-2])
+        write_kv_cache(kv, filepath)
+        t2 = time()
+        return kv, t2 - t1
+    except Exception as e:
+        print("Error: ", e)
+        # Re-raise so the caller does not silently unpack a None result
+        raise
+
+def get_kis_dataset(filepath: str):
+    df = pd.read_csv(filepath)
+    dataset = zip(df['sample_question'], df['sample_ground_truth'])
+    text_list = df["ki_text"].to_list()
+
+    return text_list, dataset
+
+def parse_squad_data(raw):
+    dataset = {"ki_text": [], "qas": []}
+
+    for k_id, data in enumerate(raw['data']):
+        article = []
+        for p_id, para in enumerate(data['paragraphs']):
+            article.append(para['context'])
+            for qa in para['qas']:
+                ques = qa['question']
+                answers = [ans['text'] for ans in qa['answers']]
+                dataset['qas'].append({"title": data['title'], "paragraph_index": (k_id, p_id), "question": ques, "answers": answers})
+        dataset['ki_text'].append({"id": k_id, "title": data['title'], "paragraphs": article})
+
+    return dataset
+
+def get_squad_dataset(filepath: str, max_knowledge: int = None, max_paragraph: int = None, max_questions: int = None):
+    # Open and read the JSON file
+    with open(filepath, 'r') as file:
+        data = json.load(file)
+    # Parse the SQuAD data
+    parsed_data = parse_squad_data(data)
+
+    print("max_knowledge", max_knowledge, "max_paragraph", max_paragraph, "max_questions", max_questions)
+
+    # Cap the number of articles; use all articles if max_knowledge is None or exceeds the article count
+    max_knowledge = max_knowledge if max_knowledge is not None and max_knowledge < len(parsed_data['ki_text']) else len(parsed_data['ki_text'])
+
+    # Shuffle the articles and questions
+    if rand_seed is not None:
+        random.seed(rand_seed)
+        random.shuffle(parsed_data["ki_text"])
+        random.shuffle(parsed_data["qas"])
+    k_ids = [i['id'] for i in parsed_data["ki_text"][:max_knowledge]]
+
+    text_list = []
+    # Collect at most max_knowledge articles
+    for article in parsed_data['ki_text'][:max_knowledge]:
+        max_para = max_paragraph if max_paragraph is not None and max_paragraph < len(article['paragraphs']) else len(article['paragraphs'])
+        text_list.append(article['title'])
+        text_list.append('\n'.join(article['paragraphs'][0:max_para]))
+
+    # Keep only the QA pairs whose article (and paragraph) made it into the knowledge set
+    questions = [qa['question'] for qa in parsed_data['qas'] if qa['paragraph_index'][0] in k_ids and (max_paragraph is None or qa['paragraph_index'][1] < max_paragraph)]
+    answers = [qa['answers'][0] for qa in parsed_data['qas'] if qa['paragraph_index'][0] in k_ids and (max_paragraph is None or qa['paragraph_index'][1] < max_paragraph)]
+
+    dataset = zip(questions, answers)
+
+    return text_list, dataset
+
+def get_hotpotqa_dataset(filepath: str, max_knowledge: int = None):
+    # Open and read the JSON
+    with open(filepath, "r") as file:
+        data = json.load(file)
+
+    if rand_seed is not None:
+        random.seed(rand_seed)
+        random.shuffle(data)
+
+    questions = [qa['question'] for qa in data]
+    answers = [qa['answer'] for qa in data]
+    dataset = zip(questions, answers)
+
+    if max_knowledge is None:
+        max_knowledge = len(data)
+    else:
+        max_knowledge = min(max_knowledge, len(data))
+
+    text_list = []
+    for qa in data[:max_knowledge]:
+        # Each context entry is (title, [sentences]); join them into one article
+        context = [c[0] + ": \n" + "".join(c[1]) for c in qa['context']]
+        article = "\n\n".join(context)
+
+        text_list.append(article)
+
+    return text_list, dataset
+
+def kvcache_test(args: argparse.Namespace):
+    answer_instruction = None
+    if args.dataset == "kis_sample":
+        datapath = "./datasets/rag_sample_qas_from_kis.csv"
+        text_list, dataset = get_kis_dataset(datapath)
+    if args.dataset == "kis":
+        datapath = "./datasets/synthetic_knowledge_items.csv"
+        text_list, dataset = get_kis_dataset(datapath)
+    if args.dataset == "squad-dev":
+        datapath = "./datasets/squad/dev-v1.1.json"
+        text_list, dataset = get_squad_dataset(datapath, max_knowledge=args.maxKnowledge, max_paragraph=args.maxParagraph, max_questions=args.maxQuestion)
+    if args.dataset == "squad-train":
+        datapath = "./datasets/squad/train-v1.1.json"
+        text_list, dataset = get_squad_dataset(datapath, max_knowledge=args.maxKnowledge, max_paragraph=args.maxParagraph, max_questions=args.maxQuestion)
+        answer_instruction = "Answer the question with a super short answer."
+    if args.dataset == "hotpotqa-dev":
+        datapath = "./datasets/hotpotqa/hotpot_dev_fullwiki_v1.json"
+        text_list, dataset = get_hotpotqa_dataset(datapath, args.maxKnowledge)
+        answer_instruction = "Answer the question with a super short answer."
+    if args.dataset == "hotpotqa-test":
+        datapath = "./datasets/hotpotqa/hotpot_test_fullwiki_v1.json"
+        text_list, dataset = get_hotpotqa_dataset(datapath, args.maxKnowledge)
+        answer_instruction = "Answer the question with a super short answer."
+    if args.dataset == "hotpotqa-train":
+        datapath = "./datasets/hotpotqa/hotpot_train_v1.1.json"
+        text_list, dataset = get_hotpotqa_dataset(datapath, args.maxKnowledge)
+        answer_instruction = "Answer the question with a super short answer."
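+    # At this point `text_list` holds the knowledge passages and `dataset`
+    # yields (question, ground_truth) pairs; illustratively, roughly:
+    #   text_list = ["Article title", "Paragraph text ..."]
+    #   dataset   = zip(["Who wrote Hamlet?"], ["William Shakespeare"])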
+
+    kvcache_path = "./data_cache/cache_knowledges.pt"
+
+    knowledges = '\n\n\n\n\n\n'.join(text_list)
+    knowledge_cache, prepare_time = prepare_kvcache(knowledges, filepath=kvcache_path, answer_instruction=answer_instruction)
+    kv_len = knowledge_cache.key_cache[0].shape[-2]
+    print(f"KVcache prepared in {prepare_time} seconds")
+    with open(args.output, "a") as f:
+        f.write(f"KVcache prepared in {prepare_time} seconds\n")
+
+    results = {
+        "cache_time": [],
+        "generate_time": [],
+        "similarity": [],
+        "prompts": [],
+        "responses": []
+    }
+
+    dataset = list(dataset)  # Convert the dataset to a list
+
+    max_questions = min(len(dataset), args.maxQuestion) if args.maxQuestion is not None else len(dataset)
+
+    for id, (question, ground_truth) in enumerate(dataset[:max_questions]):
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+
+        # Re-reading the knowledge cache from file is disabled; the in-memory
+        # cache is reused instead, so cache time is effectively zero
+        cache_t1 = time()
+        # if args.kvcache == "file":
+        #     knowledge_cache = read_kv_cache(kvcache_path)
+        cache_t2 = time()
+
+        # Generate a response for the question
+        knowledges = '\n\n\n'.join(text_list)
+
+        if args.usePrompt:
+            prompt = f"""
+    <|begin_of_text|>
+    <|start_header_id|>system<|end_header_id|>
+    You are an assistant for giving short answers based on given context.<|eot_id|>
+    <|start_header_id|>user<|end_header_id|>
+    Context information is below.
+    ------------------------------------------------
+    {knowledges}
+    ------------------------------------------------
+    {answer_instruction}
+    Question:
+    {question}<|eot_id|>
+    <|start_header_id|>assistant<|end_header_id|>
+    """
+            generate_t1 = time()
+            input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
+            output = generate(model, input_ids, DynamicCache())
+            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+            generate_t2 = time()
+        else:
+            prompt = f"""
+    {question}<|eot_id|>
+    <|start_header_id|>assistant<|end_header_id|>
+    """
+            generate_t1 = time()
+            # Truncate the cache back to the knowledge-only prefix before reuse
+            clean_up(knowledge_cache, kv_len)
+            input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
+            output = generate(model, input_ids, knowledge_cache)
+            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+            generate_t2 = time()
+
+        print("Q: ", question)
+        print("A: ", generated_text)
+
+        # Evaluate semantic similarity with Sentence-BERT
+        similarity = get_bert_similarity(generated_text, ground_truth)
+
+        print(f"[{id}]: Semantic Similarity: {round(similarity, 5)},",
+              f"cache time: {cache_t2 - cache_t1},",
+              f"generate time: {generate_t2 - generate_t1}"
+              )
+        with open(args.output, "a") as f:
+            f.write(f"[{id}]: Semantic Similarity: {round(similarity, 5)},\t cache time: {cache_t2 - cache_t1},\t generate time: {generate_t2 - generate_t1}\n")
+
+        results["prompts"].append(question)
+        results["responses"].append(generated_text)
+        results["cache_time"].append(cache_t2 - cache_t1)
+        results["generate_time"].append(generate_t2 - generate_t1)
+        results["similarity"].append(similarity)
+
+        with open(args.output, "a") as f:
+            f.write(f"[{id}]: [Cumulative]: "
+                    + f"Semantic Similarity: {round(sum(results['similarity']) / (len(results['similarity'])), 5)},"
+                    + f"\t cache time: {sum(results['cache_time']) / (len(results['cache_time']))},"
+                    + f"\t generate time: {sum(results['generate_time']) / (len(results['generate_time']))}\n")
+
+    avg_similarity = sum(results["similarity"]) / len(results["similarity"])
+    avg_cache_time = sum(results["cache_time"]) / len(results["cache_time"])
+    avg_generate_time = sum(results["generate_time"]) / len(results["generate_time"])
+    print()
+    print(f"Prepare time: {prepare_time}")
+    print(f"Average Semantic Similarity: {avg_similarity}")
+    print(f"cache time: {avg_cache_time},\t generate time: {avg_generate_time}")
+    print()
+    with open(args.output, "a") as f:
+        f.write("\n")
+        f.write(f"Result for {args.output}\n")
+        f.write(f"Prepare time: {prepare_time}\n")
+        f.write(f"Average Semantic Similarity: {avg_similarity}\n")
+        f.write(f"cache time: {avg_cache_time},\t generate time: {avg_generate_time}\n")
+
+# Define quantization configuration
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,                      # Load model in 4-bit precision
+    bnb_4bit_quant_type="nf4",              # Normalized float 4 quantization
+    bnb_4bit_compute_dtype=torch.float16,   # Compute dtype for 4-bit base matrices
+    bnb_4bit_use_double_quant=True          # Use nested quantization
+)
+
+def load_quantized_model(model_name, hf_token=None):
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_name,
+        token=hf_token
+    )
+
+    # Load model with quantization
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        quantization_config=bnb_config,
+        device_map="auto",          # Automatically choose best device
+        trust_remote_code=True,     # Required for some models
+        token=hf_token
+    )
+
+    return tokenizer, model
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run KV cache test with specified parameters.")
+    parser.add_argument('--modelname', required=False, default="meta-llama/Llama-3.2-1B-Instruct", type=str, help='Model name to use')
+    parser.add_argument('--quantized', action="store_true", help='Load the model in 4-bit quantized form')
+    parser.add_argument('--kvcache', choices=['file'], required=True, help='How to load the KV cache (currently only "file")')
+    parser.add_argument('--similarity', choices=['bertscore'], required=True, help='Similarity metric to use (bertscore)')
+    parser.add_argument('--output', required=True, type=str, help='Output file to save the results')
+    parser.add_argument('--maxQuestion', required=False, default=None, type=int, help='Maximum number of questions to test')
+    parser.add_argument('--maxKnowledge', required=False, default=None, type=int, help='Maximum number of knowledge items to use')
+    parser.add_argument('--maxParagraph', required=False, default=None, type=int, help='Maximum number of paragraphs to use')
+    parser.add_argument('--usePrompt', default=False, action="store_true", help='Put the knowledge in the prompt instead of using the KV cache')
+    parser.add_argument('--dataset', required=True, help='Dataset to use',
+                        choices=['kis', 'kis_sample',
+                                 'squad-dev', 'squad-train',
+                                 'hotpotqa-dev', 'hotpotqa-train', 'hotpotqa-test'])
+    parser.add_argument('--randomSeed', required=False, default=None, type=int, help='Random seed to use')
+    # 48 articles; each article has on average 40~50 paragraphs, each with 5~10 questions
+
+    args = parser.parse_args()
+
+    print("maxKnowledge", args.maxKnowledge, "maxParagraph", args.maxParagraph, "maxQuestion", args.maxQuestion, "randomSeed", args.randomSeed)
+
+    model_name = args.modelname
+    rand_seed = args.randomSeed
+
+    if args.quantized:
+        tokenizer, model = load_quantized_model(model_name=model_name, hf_token=HF_TOKEN)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            token=HF_TOKEN
+        )
+
+    def unique_path(path, i=0):
+        if os.path.exists(path):
+            return unique_path(path + "_" + str(i), i + 1)
+        return path
+
+    if os.path.exists(args.output):
+        args.output = unique_path(args.output)
+
+    kvcache_test(args)
diff --git a/rag.py b/rag.py
new file mode 100644
index 0000000..12387ed
--- /dev/null
+++ b/rag.py
@@ -0,0 +1,407 @@
+import torch
+import pandas as pd
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from llama_index.core import VectorStoreIndex, Document
+from transformers.cache_utils import DynamicCache
+import argparse
+import os
+import json
+import random
+from time import time
+
+def get_env():
+    env_dict = {}
+    with open(file=".env" if os.path.exists(".env") else "env", mode="r") as f:
+        for line in f:
+            # Split on the first "=" only, so values containing "=" survive
+            key, value = line.strip().split("=", 1)
+            env_dict[key] = value.strip('"')
+    return env_dict
+
+"""Hugging Face Llama model"""
+HF_TOKEN = get_env()["HF_TOKEN"]
+
+# Module-level handles; assigned in __main__ before any test runs
+model_name, model, tokenizer = None, None, None
+rand_seed = None
+
+# Allowlist the classes needed to load a DynamicCache with torch.load(weights_only=True)
+torch.serialization.add_safe_globals([DynamicCache])
+torch.serialization.add_safe_globals([set])
+
+"""Sentence-BERT for evaluating semantic similarity"""
+from sentence_transformers import SentenceTransformer, util
+bert_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight sentence-transformer
+
+def get_bert_similarity(response, ground_truth):
+    # Encode the response and the ground truth
+    query_embedding = bert_model.encode(response, convert_to_tensor=True)
+    text_embedding = bert_model.encode(ground_truth, convert_to_tensor=True)
+
+    # Compute the cosine similarity between the two embeddings
+    cosine_score = util.pytorch_cos_sim(query_embedding, text_embedding)
+
+    return cosine_score.item()
+
+from llama_index.core import Settings
+
+def getOpenAIRetriever(documents: list[Document], similarity_top_k: int = 1):
+    """OpenAI RAG model"""
+    import openai
+    openai.api_key = get_env()["OPENAI_API_KEY"]
+
+    from llama_index.embeddings.openai import OpenAIEmbedding
+    # Set the embed_model in llama_index
+    # model_name options: "text-embedding-3-small", "text-embedding-3-large"
+    Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small", api_key=get_env()["OPENAI_API_KEY"], title="openai-embedding")
+
+    # Create the OpenAI retriever
+    t1 = time()
+    index = VectorStoreIndex.from_documents(documents)
+    OpenAI_retriever = index.as_retriever(similarity_top_k=similarity_top_k)
+    t2 = time()
+
+    return OpenAI_retriever, t2 - t1
+
+def getGeminiRetriever(documents: list[Document], similarity_top_k: int = 1):
+    """Gemini Embedding RAG model"""
+    GOOGLE_API_KEY = get_env()["GOOGLE_API_KEY"]
+    from llama_index.embeddings.gemini import GeminiEmbedding
+    model_name = "models/embedding-001"
+    # Set the embed_model in llama_index
+    Settings.embed_model = GeminiEmbedding(model_name=model_name, api_key=GOOGLE_API_KEY, title="gemini-embedding")
+
+    # Create the Gemini retriever
+    t1 = time()
+    index = VectorStoreIndex.from_documents(documents)
+    Gemini_retriever = index.as_retriever(similarity_top_k=similarity_top_k)
+    t2 = time()
+
+    return Gemini_retriever, t2 - t1
+
+def getBM25Retriever(documents: list[Document], similarity_top_k: int = 1):
+    from llama_index.core.node_parser import SentenceSplitter
+    from llama_index.retrievers.bm25 import BM25Retriever
+    import Stemmer
+
+    splitter = SentenceSplitter(chunk_size=512)
+
+    t1 = time()
+    nodes = splitter.get_nodes_from_documents(documents)
+    # We can pass in the index, docstore, or list of nodes to create the retriever
+    bm25_retriever = BM25Retriever.from_defaults(
+        nodes=nodes,
+        similarity_top_k=similarity_top_k,
+        stemmer=Stemmer.Stemmer("english"),
+        language="english",
+    )
+    t2 = time()
+    bm25_retriever.persist("./bm25_retriever")
+
+    return bm25_retriever, t2 - t1
+
+def get_kis_dataset(filepath: str):
+    df = pd.read_csv(filepath)
+    dataset = zip(df['sample_question'], df['sample_ground_truth'])
+    text_list = df["ki_text"].to_list()
+
+    return text_list, dataset
+
+def parse_squad_data(raw):
+    dataset = {"ki_text": [], "qas": []}
+
+    for k_id, data in enumerate(raw['data']):
+        article = []
+        for p_id, para in enumerate(data['paragraphs']):
+            article.append(para['context'])
+            for qa in para['qas']:
+                ques = qa['question']
+                answers = [ans['text'] for ans in qa['answers']]
+                dataset['qas'].append({"title": data['title'], "paragraph_index": (k_id, p_id), "question": ques, "answers": answers})
+        dataset['ki_text'].append({"id": k_id, "title": data['title'], "paragraphs": article})
+
+    return dataset
+
+def get_squad_dataset(filepath: str, max_knowledge: int = None, max_paragraph: int = None, max_questions: int = None):
+    # Open and read the JSON file
+    with open(filepath, 'r') as file:
+        data = json.load(file)
+    # Parse the SQuAD data
+    parsed_data = parse_squad_data(data)
+
+    print("max_knowledge", max_knowledge, "max_paragraph", max_paragraph, "max_questions", max_questions)
+
+    # Cap the number of articles; use all articles if max_knowledge is None or exceeds the article count
+    max_knowledge = max_knowledge if max_knowledge is not None and max_knowledge < len(parsed_data['ki_text']) else len(parsed_data['ki_text'])
+
+    # Shuffle the articles and questions
+    if rand_seed is not None:
+        random.seed(rand_seed)
+        random.shuffle(parsed_data["ki_text"])
+        random.shuffle(parsed_data["qas"])
+    k_ids = [i['id'] for i in parsed_data["ki_text"][:max_knowledge]]
+
+    text_list = []
+    # Collect at most max_knowledge articles
+    for article in parsed_data['ki_text'][:max_knowledge]:
+        max_para = max_paragraph if max_paragraph is not None and max_paragraph < len(article['paragraphs']) else len(article['paragraphs'])
+        text_list.append(article['title'])
+        text_list.append('\n'.join(article['paragraphs'][0:max_para]))
+
+    # Keep only the QA pairs whose article (and paragraph) made it into the knowledge set
+    questions = [qa['question'] for qa in parsed_data['qas'] if qa['paragraph_index'][0] in k_ids and (max_paragraph is None or qa['paragraph_index'][1] < max_paragraph)]
+    answers = [qa['answers'][0] for qa in parsed_data['qas'] if qa['paragraph_index'][0] in k_ids and (max_paragraph is None or qa['paragraph_index'][1] < max_paragraph)]
+
+    dataset = zip(questions, answers)
+
+    return text_list, dataset
+
+def get_hotpotqa_dataset(filepath: str, max_knowledge: int = None):
+    # Open and read the JSON
+    with open(filepath, "r") as file:
+        data = json.load(file)
+
+    if rand_seed is not None:
+        random.seed(rand_seed)
+        random.shuffle(data)
+
+    questions = [qa['question'] for qa in data]
+    answers = [qa['answer'] for qa in data]
+    dataset = zip(questions, answers)
+
+    if max_knowledge is None:
+        max_knowledge = len(data)
+    else:
+        max_knowledge = min(max_knowledge, len(data))
+
+    text_list = []
+    for qa in data[:max_knowledge]:
+        # Each context entry is (title, [sentences]); join them into one article
+        context = [c[0] + ": \n" + "".join(c[1]) for c in qa['context']]
+        article = "\n\n".join(context)
+
+        text_list.append(article)
+
+    return text_list, dataset
+
+def rag_test(args: argparse.Namespace):
+    answer_instruction = None
+    if args.dataset == "kis_sample":
+        datapath = "./datasets/rag_sample_qas_from_kis.csv"
+        text_list, dataset = get_kis_dataset(datapath)
+    if args.dataset == "kis":
+        datapath = "./datasets/synthetic_knowledge_items.csv"
+        text_list, dataset = get_kis_dataset(datapath)
+    if args.dataset == "squad-dev":
+        datapath = "./datasets/squad/dev-v1.1.json"
+        text_list, dataset = get_squad_dataset(datapath, max_knowledge=args.maxKnowledge, max_paragraph=args.maxParagraph, max_questions=args.maxQuestion)
+    if args.dataset == "squad-train":
+        datapath = "./datasets/squad/train-v1.1.json"
+        text_list, dataset = get_squad_dataset(datapath, max_knowledge=args.maxKnowledge, max_paragraph=args.maxParagraph, max_questions=args.maxQuestion)
+        answer_instruction = "Answer the question with a super short answer."
+    if args.dataset == "hotpotqa-dev":
+        datapath = "./datasets/hotpotqa/hotpot_dev_fullwiki_v1.json"
+        text_list, dataset = get_hotpotqa_dataset(datapath, args.maxKnowledge)
+        answer_instruction = "Answer the question with a super short answer."
+    if args.dataset == "hotpotqa-test":
+        datapath = "./datasets/hotpotqa/hotpot_test_fullwiki_v1.json"
+        text_list, dataset = get_hotpotqa_dataset(datapath, args.maxKnowledge)
+        answer_instruction = "Answer the question with a super short answer."
+    if args.dataset == "hotpotqa-train":
+        datapath = "./datasets/hotpotqa/hotpot_train_v1.1.json"
+        text_list, dataset = get_hotpotqa_dataset(datapath, args.maxKnowledge)
+        answer_instruction = "Answer the question with a super short answer."
+
+    # Fall back to the generic short-answer instruction when the dataset did not set one
+    if answer_instruction is None:
+        answer_instruction = "Answer the question with a super short answer."
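+    # Illustrative retriever round trip (hypothetical values): given
+    #   retriever, _ = getBM25Retriever([Document(text="Paris is the capital of France.")])
+    # retriever.retrieve("What is the capital of France?") returns scored nodes
+    # whose .text fields are joined into the prompt context below.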
+ + kvcache_path = "./data_cache/cache_knowledges.pt" + # document indexing for the rag retriever + documents = [Document(text=t) for t in text_list] + + if args.index == "gemini": + retriever, prepare_time = getGeminiRetriever(documents, similarity_top_k=args.topk) + if args.index == "openai": + retriever, prepare_time = getOpenAIRetriever(documents, similarity_top_k=args.topk) + if args.index == "bm25": + retriever, prepare_time = getBM25Retriever(documents, similarity_top_k=args.topk) + + print(f"Retriever {args.index.upper()} prepared in {prepare_time} seconds") + with open(args.output, "a") as f: + f.write(f"Retriever {args.index.upper()} prepared in {prepare_time} seconds\n") + + results = { + "retrieve_time": [], + "generate_time": [], + "similarity": [], + "prompts": [], + "responses": [] + } + + dataset = list(dataset) # Convert the dataset to a list + + max_questions = min(len(dataset), args.maxQuestion) if args.maxQuestion != None else len(dataset) + + for id, (question, ground_truth) in enumerate(dataset[:max_questions]): # Retrieve the knowledge from the vector database + retrieve_t1 = time() + nodes = retriever.retrieve(question) + retrieve_t2 = time() + + knowledge = "\n---------------------\n".join([node.text for node in nodes]) + # short_knowledge = knowledge[:knowledge.find("**Step 4")] + + prompt = f""" + <|begin_of_text|> + <|start_header_id|>system<|end_header_id|> + You are an assistant for giving short answers based on given context.<|eot_id|> + <|start_header_id|>user<|end_header_id|> + Context information is bellow. + ------------------------------------------------ + {knowledge} + ------------------------------------------------ + {answer_instruction} + Question: + {question} + <|eot_id|> + <|start_header_id|>assistant<|end_header_id|> + """ + + # Generate Response for the question + generate_t1 = time() + input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device) + output = model.generate( + input_ids, + max_new_tokens=300, # Set the maximum length of the generated text + do_sample=False, # Ensures greedy decoding, + temperature=None + ) + generated_text = tokenizer.decode(output[0], skip_special_tokens=True) + generate_t2 = time() + + generated_text = generated_text[generated_text.find(question) + len(question):] + generated_text = generated_text[generated_text.find('assistant') + len('assistant'):].lstrip() + + # print("R: ", knowledge) + print("Q: ", question) + print("A: ", generated_text) + + # Evaluate bert-score similarity + similarity = get_bert_similarity(generated_text, ground_truth) + + print(f"[{id}]: Semantic Similarity: {round(similarity, 5)},\t", + f"retrieve time: {retrieve_t2 - retrieve_t1},\t", + f"generate time: {generate_t2 - generate_t1}" + ) + with open(args.output, "a") as f: + f.write(f"[{id}]: Semantic Similarity: {round(similarity, 5)},\t retrieve time: {retrieve_t2 - retrieve_t1},\t generate time: {generate_t2 - generate_t1}\n") + + results["prompts"].append(prompt) + results["responses"].append(generated_text) + results["retrieve_time"].append(retrieve_t2 - retrieve_t1) + results["generate_time"].append(generate_t2 - generate_t1) + results["similarity"].append(similarity) + + with open(args.output, "a") as f: + f.write(f"[{id}]: [Cumulative]: " + + f"Semantic Similarity: {round(sum(results['similarity']) / (len(results['similarity'])) , 5)}," + + f"\t retrieve time: {sum(results['retrieve_time']) / (len(results['retrieve_time'])) }," + + f"\t generate time: {sum(results['generate_time']) / 
(len(results['generate_time'])) }\n") + + + avg_similarity = sum(results["similarity"]) / len(results["similarity"]) + avg_retrieve_time = sum(results["retrieve_time"]) / len(results["retrieve_time"]) + avg_generate_time = sum(results["generate_time"]) / len(results["generate_time"]) + print() + print(f"Prepare time: {prepare_time}") + print(f"Average Semantic Similarity: {avg_similarity}") + print(f"retrieve time: {avg_retrieve_time},\t generate time: {avg_generate_time}") + print() + with open(args.output, "a") as f: + f.write("\n") + f.write(f"Result for {args.output}\n") + f.write(f"Prepare time: {prepare_time}\n") + f.write(f"Average Semantic Similarity: {avg_similarity}\n") + f.write(f"retrieve time: {avg_retrieve_time},\t generate time: {avg_generate_time}\n") + + +# Define quantization configuration +bnb_config = BitsAndBytesConfig( + load_in_4bit=True, # Load model in 4-bit precision + bnb_4bit_quant_type="nf4", # Normalize float 4 quantization + bnb_4bit_compute_dtype=torch.float16, # Compute dtype for 4-bit base matrices + bnb_4bit_use_double_quant=True # Use nested quantization +) + +def load_quantized_model(model_name, hf_token=None): + tokenizer = AutoTokenizer.from_pretrained( + model_name, + token=hf_token + ) + + # Load model with quantization + model = AutoModelForCausalLM.from_pretrained( + model_name, + quantization_config=bnb_config, + device_map="auto", # Automatically choose best device + trust_remote_code=True, # Required for some models + token=hf_token + ) + + return tokenizer, model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run RAG test with specified parameters.") + # parser.add_argument('--method', choices=['rag', 'kvcache'], required=True, help='Method to use (rag or kvcache)') + parser.add_argument('--modelname', required=False, default="meta-llama/Llama-3.2-1B-Instruct", type=str, help='Model name to use') + parser.add_argument('--quantized', required=False, default=False, type=bool, help='Quantized model') + parser.add_argument('--index', choices=['gemini', 'openai', 'bm25'], required=True, help='Index to use (gemini, openai, bm25)') + parser.add_argument('--similarity', choices=['bertscore'], required=True, help='Similarity metric to use (bertscore)') + parser.add_argument('--output', required=True, type=str, help='Output file to save the results') + parser.add_argument('--maxQuestion', required=False, default=None ,type=int, help='Maximum number of questions to test') + parser.add_argument('--maxKnowledge', required=False, default=None ,type=int, help='Maximum number of knowledge items to use') + parser.add_argument('--maxParagraph', required=False, default=None ,type=int, help='Maximum number of paragraph to use') + parser.add_argument('--topk', required=False, default=1, type=int, help='Top K retrievals to use') + parser.add_argument('--dataset', required=True, help='Dataset to use (kis, kis_sample, squad-dev, squad-train)', + choices=['kis', 'kis_sample', + 'squad-dev', 'squad-train', + 'hotpotqa-dev', 'hotpotqa-train', 'hotpotqa-test']) + parser.add_argument('--randomSeed', required=False, default=None, type=int, help='Random seed to use') + + # 48 Articles, each article average 40~50 paragraph, each average 5~10 questions + + args = parser.parse_args() + + print("maxKnowledge", args.maxKnowledge, "maxParagraph", args.maxParagraph, "maxQuestion", args.maxQuestion, "randomSeed", args.randomSeed) + + model_name = args.modelname + rand_seed = args.randomSeed if args.randomSeed != None else None + + if args.quantized: 
+ tokenizer, model = load_quantized_model(model_name=model_name, hf_token=HF_TOKEN) + else: + tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN) + model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.float16, + device_map="auto", + token=HF_TOKEN + ) + + def unique_path(path, i=0): + if os.path.exists(path): + return unique_path(path + "_" + str(i), i + 1) + return path + + if os.path.exists(args.output): + args.output = unique_path(args.output) + + rag_test(args) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..efdcf70 Binary files /dev/null and b/requirements.txt differ diff --git a/scripts/random-hotpot-k16-k80.sh b/scripts/random-hotpot-k16-k80.sh new file mode 100644 index 0000000..6229840 --- /dev/null +++ b/scripts/random-hotpot-k16-k80.sh @@ -0,0 +1,125 @@ +#!/bin/bash +logfilename="./log/random-hotpot-k16-k80.log" +# while log file exists, create a new one called random_i.log +i=1 +while [ -f $logfilename ]; do + echo "log file ${logfilename} exists, create a new one" + logfilename="./log/random-hotpot-k16-k80_$i.log" + i=$(($i+1)) +done + +# # all k = 7405 article, tokens = 10,038,084 +# # when k = 1, tokens = 1,400 +# # when k = 16, tokens = 22,400 +# # when k = 24, tokens = 33,667 +# # when k = 32, tokens = 44,800 +# # when k = 48, tokens = 64,000 +# # when k = 64, tokens = 85,000 +# # when k = 80, tokens = 106,000 + +datasets=("hotpotqa-train") +# models=("3.1-8B" "3.2-3B" "3.2-1B") +models=("3.1-8B") +indices=("openai" "bm25") +# k=("16" "32" "48" "64" "80") +k=("16") +maxQuestions=("500") +top_k=("1" "3" "5" "10" "20") + + +for dataset in "${datasets[@]}"; do + for model in "${models[@]}"; do + for maxQuestion in "${maxQuestions[@]}"; do + batch=k + # iteration = maxQuestion / batch + iteration=$(($maxQuestion / $batch)) + + for i in $(seq 1 $iteration); do + randomSeed=$(shuf -i 1-100000 -n 1) + echo "Random seed: $randomSeed" >> $logfilename + + # Run KVCACHE without cache + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" --usePrompt \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache_nokv.txt_${i}" + + # Run KVCACHE + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache.txt_${i}" + + # Run RAG + for topk in "${top_k[@]}"; do + for index in "${indices[@]}"; do + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" >> $logfilename + python ./rag.py --index "$index" --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" 
--topk "$topk" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_rag_Index_${index}_top${topk}.txt_${i}" + done + done + + done + done + done +done + +datasets=("hotpotqa-train") +# models=("3.1-8B" "3.2-3B" "3.2-1B") +models=("3.1-8B") +indices=("openai" "bm25") +# k=("16" "32" "48" "64" "80") +k=("80") +maxQuestions=("500") +top_k=("1" "3" "5" "10" "20") + +for dataset in "${datasets[@]}"; do + for model in "${models[@]}"; do + for maxQuestion in "${maxQuestions[@]}"; do + batch=k + # iteration = maxQuestion / batch + iteration=$(($maxQuestion / $batch)) + + for i in $(seq 1 $iteration); do + randomSeed=$(shuf -i 1-100000 -n 1) + echo "Random seed: $randomSeed" >> $logfilename + + # Run KVCACHE without cache + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" --usePrompt \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache_nokv.txt_${i}" + + # Run KVCACHE + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache.txt_${i}" + + # Run RAG + for topk in "${top_k[@]}"; do + for index in "${indices[@]}"; do + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" >> $logfilename + python ./rag.py --index "$index" --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" --topk "$topk" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_rag_Index_${index}_top${topk}.txt_${i}" + done + done + + + done + done + done +done + +echo "All done" >> $logfilename \ No newline at end of file diff --git a/scripts/random-hotpot-k24-k64.sh b/scripts/random-hotpot-k24-k64.sh new file mode 100644 index 0000000..74ba48d --- /dev/null +++ b/scripts/random-hotpot-k24-k64.sh @@ -0,0 +1,125 @@ +#!/bin/bash +logfilename="./log/random-hotpot-k24-k64.log" +# while log file exists, create a new one called random_i.log +i=1 +while [ -f $logfilename ]; do + echo "log file ${logfilename} exists, create a new one" + logfilename="./log/random-hotpot-k24-k64_$i.log" + i=$(($i+1)) +done + +# # all k = 7405 article, tokens = 10,038,084 +# # when k = 1, tokens = 1,400 +# # when k = 16, tokens = 22,400 +# # when k = 24, tokens = 33,667 +# # when k = 32, tokens = 44,800 +# # when k = 48, tokens = 64,000 +# # when k = 64, tokens = 85,000 +# # when k = 80, tokens = 106,000 + +datasets=("hotpotqa-train") +# models=("3.1-8B" "3.2-3B" "3.2-1B") +models=("3.1-8B") +indices=("openai" "bm25") +# k=("16" "32" "48" "64" "80") +k=("24") +maxQuestions=("500") +top_k=("1" "3" "5" "10" "20") + + +for dataset in "${datasets[@]}"; do + for model in "${models[@]}"; do + for maxQuestion in "${maxQuestions[@]}"; do + batch=k + # 
iteration = maxQuestion / batch + iteration=$(($maxQuestion / $batch)) + + for i in $(seq 1 $iteration); do + randomSeed=$(shuf -i 1-100000 -n 1) + echo "Random seed: $randomSeed" >> $logfilename + + # Run KVCACHE without cache + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" --usePrompt \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache_nokv.txt_${i}" + + # Run KVCACHE + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache.txt_${i}" + + # Run RAG + for topk in "${top_k[@]}"; do + for index in "${indices[@]}"; do + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" >> $logfilename + python ./rag.py --index "$index" --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" --topk "$topk" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_rag_Index_${index}_top${topk}.txt_${i}" + done + done + + done + done + done +done + +datasets=("hotpotqa-train") +# models=("3.1-8B" "3.2-3B" "3.2-1B") +models=("3.1-8B") +indices=("openai" "bm25") +# k=("24" "32" "48" "64" "64") +k=("64") +maxQuestions=("500") +top_k=("1" "3" "5" "10" "20") + +for dataset in "${datasets[@]}"; do + for model in "${models[@]}"; do + for maxQuestion in "${maxQuestions[@]}"; do + batch=k + # iteration = maxQuestion / batch + iteration=$(($maxQuestion / $batch)) + + for i in $(seq 1 $iteration); do + randomSeed=$(shuf -i 1-100000 -n 1) + echo "Random seed: $randomSeed" >> $logfilename + + # Run KVCACHE without cache + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" --usePrompt \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache_nokv.txt_${i}" + + # Run KVCACHE + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache.txt_${i}" + + # Run RAG + for topk in "${top_k[@]}"; do + for index in 
"${indices[@]}"; do + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" >> $logfilename + python ./rag.py --index "$index" --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" --topk "$topk" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_rag_Index_${index}_top${topk}.txt_${i}" + done + done + + + done + done + done +done + +echo "All done" >> $logfilename \ No newline at end of file diff --git a/scripts/random-hotpot-k32-k48.sh b/scripts/random-hotpot-k32-k48.sh new file mode 100644 index 0000000..5ee5c52 --- /dev/null +++ b/scripts/random-hotpot-k32-k48.sh @@ -0,0 +1,125 @@ +#!/bin/bash +logfilename="./log/random-hotpot-k32-k48.log" +# while log file exists, create a new one called random_i.log +i=1 +while [ -f $logfilename ]; do + echo "log file ${logfilename} exists, create a new one" + logfilename="./log/random-hotpot-k32-k48_$i.log" + i=$(($i+1)) +done + +# # all k = 7405 article, tokens = 10,038,084 +# # when k = 1, tokens = 1,400 +# # when k = 16, tokens = 22,400 +# # when k = 24, tokens = 33,667 +# # when k = 32, tokens = 44,800 +# # when k = 48, tokens = 64,000 +# # when k = 64, tokens = 85,000 +# # when k = 80, tokens = 106,000 + +datasets=("hotpotqa-train") +# models=("3.1-8B" "3.2-3B" "3.2-1B") +models=("3.1-8B") +indices=("openai" "bm25") +# k=("16" "32" "48" "64" "80") +k=("32") +maxQuestions=("500") +top_k=("1" "3" "5" "10" "20") + + +for dataset in "${datasets[@]}"; do + for model in "${models[@]}"; do + for maxQuestion in "${maxQuestions[@]}"; do + batch=k + # iteration = maxQuestion / batch + iteration=$(($maxQuestion / $batch)) + + for i in $(seq 1 $iteration); do + randomSeed=$(shuf -i 1-100000 -n 1) + echo "Random seed: $randomSeed" >> $logfilename + + # Run KVCACHE without cache + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" --usePrompt \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache_nokv.txt_${i}" + + # Run KVCACHE + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache.txt_${i}" + + # Run RAG + for topk in "${top_k[@]}"; do + for index in "${indices[@]}"; do + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" >> $logfilename + python ./rag.py --index "$index" --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" --topk "$topk" \ + --modelname 
"meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_rag_Index_${index}_top${topk}.txt_${i}" + done + done + + done + done + done +done + +datasets=("hotpotqa-train") +# models=("3.1-8B" "3.2-3B" "3.2-1B") +models=("3.1-8B") +indices=("openai" "bm25") +# k=("16" "32" "48" "64" "80") +k=("48") +maxQuestions=("500") +top_k=("1" "3" "5" "10" "20") + +for dataset in "${datasets[@]}"; do + for model in "${models[@]}"; do + for maxQuestion in "${maxQuestions[@]}"; do + batch=k + # iteration = maxQuestion / batch + iteration=$(($maxQuestion / $batch)) + + for i in $(seq 1 $iteration); do + randomSeed=$(shuf -i 1-100000 -n 1) + echo "Random seed: $randomSeed" >> $logfilename + + # Run KVCACHE without cache + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" --usePrompt \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache_nokv.txt_${i}" + + # Run KVCACHE + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache.txt_${i}" + + # Run RAG + for topk in "${top_k[@]}"; do + for index in "${indices[@]}"; do + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" >> $logfilename + python ./rag.py --index "$index" --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$k" --topk "$topk" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/${k}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_rag_Index_${index}_top${topk}.txt_${i}" + done + done + + + done + done + done +done + +echo "All done" >> $logfilename \ No newline at end of file diff --git a/scripts/random-squad.sh b/scripts/random-squad.sh new file mode 100644 index 0000000..6f35080 --- /dev/null +++ b/scripts/random-squad.sh @@ -0,0 +1,194 @@ +#!/bin/bash +logfilename="./log/random-squad-k3.log" +# while log file exists, create a new one called random_i.log +i=1 +while [ -f $logfilename ]; do + echo "log file ${logfilename} exists, create a new one" + logfilename="./log/random-squad$i-k3_$i.log" + i=$(($i+1)) +done + +# datasets=("squad-train") +# when k = 3, tokens = 21,000 +# when k = 4, tokens = 32,000 +# when k = 7, tokens = 50,000 + +# 在這裡自訂 k 和 p 的值 +k=3 # 設定 k 值 +p=100 # 設定 p 值 + +datasets=("squad-train") +# models=("3.1-8B" "3.2-3B" "3.2-1B") +models=("3.1-8B") +indices=("openai" "bm25") +maxQuestions=("500") +top_k=("1" "3" "5" "10" "20") + +for dataset in "${datasets[@]}"; do + for model in "${models[@]}"; do + for maxQuestion in "${maxQuestions[@]}"; do + + randomSeed=$(shuf -i 1-100000 -n 1) + echo "Random seed: $randomSeed" >> $logfilename + + # Run KVCACHE without cache + echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" + echo 
"Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" --usePrompt \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/k${k}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_kvcache_nokv.txt_${i}" + + # Run KVCACHE + echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" + echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/k${k}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_kvcache.txt_${i}" + + # Run RAG + for topk in "${top_k[@]}"; do + for index in "${indices[@]}"; do + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" >> $logfilename + python ./rag.py --index "$index" --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" --topk "$topk" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/k${k}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_rag_Index_${index}_top${topk}.txt_${i}" + done + done + + done + done +done + +logfilename="./log/random-squad-k3.log" +# while log file exists, create a new one called random_i.log +i=1 +while [ -f $logfilename ]; do + echo "log file ${logfilename} exists, create a new one" + logfilename="./log/random-squad$i-k3_$i.log" + i=$(($i+1)) +done + +# datasets=("squad-train") +# when k = 3, tokens = 21,000 +# when k = 4, tokens = 32,000 +# when k = 7, tokens = 50,000 + +# 在這裡自訂 k 和 p 的值 +k=5 # 設定 k 值 +p=100 # 設定 p 值 + +datasets=("squad-train") +# models=("3.1-8B" "3.2-3B" "3.2-1B") +models=("3.1-8B") +indices=("openai" "bm25") +maxQuestions=("500") +top_k=("1" "3" "5" "10" "20") + +for dataset in "${datasets[@]}"; do + for model in "${models[@]}"; do + for maxQuestion in "${maxQuestions[@]}"; do + + randomSeed=$(shuf -i 1-100000 -n 1) + echo "Random seed: $randomSeed" >> $logfilename + + # Run KVCACHE without cache + echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" + echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" --usePrompt \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/k${k}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_kvcache_nokv.txt_${i}" + + # Run KVCACHE + echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" + echo "Running KVCACHE for 
$dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/k${k}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_kvcache.txt_${i}" + + # Run RAG + for topk in "${top_k[@]}"; do + for index in "${indices[@]}"; do + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" >> $logfilename + python ./rag.py --index "$index" --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" --topk "$topk" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/k${k}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_rag_Index_${index}_top${topk}.txt_${i}" + done + done + + done + done +done + +logfilename="./log/random-squad-k3.log" +# while log file exists, create a new one called random_i.log +i=1 +while [ -f $logfilename ]; do + echo "log file ${logfilename} exists, create a new one" + logfilename="./log/random-squad$i-k3_$i.log" + i=$(($i+1)) +done + +# datasets=("squad-train") +# when k = 3, tokens = 21,000 +# when k = 4, tokens = 32,000 +# when k = 7, tokens = 50,000 + +# 在這裡自訂 k 和 p 的值 +k=7 # 設定 k 值 +p=100 # 設定 p 值 + +datasets=("squad-train") +# models=("3.1-8B" "3.2-3B" "3.2-1B") +models=("3.1-8B") +indices=("openai" "bm25") +maxQuestions=("500") +top_k=("1" "3" "5" "10" "20") + +for dataset in "${datasets[@]}"; do + for model in "${models[@]}"; do + for maxQuestion in "${maxQuestions[@]}"; do + + randomSeed=$(shuf -i 1-100000 -n 1) + echo "Random seed: $randomSeed" >> $logfilename + + # Run KVCACHE without cache + echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" + echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" --usePrompt \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/k${k}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_kvcache_nokv.txt_${i}" + + # Run KVCACHE + echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" + echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" >> $logfilename + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/k${k}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_kvcache.txt_${i}" + + # Run RAG + for topk in "${top_k[@]}"; do + for index in "${indices[@]}"; do + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model 
$model, topk ${topk}" + echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" >> $logfilename + python ./rag.py --index "$index" --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" --topk "$topk" \ + --modelname "meta-llama/Llama-${model}-Instruct" --randomSeed "$randomSeed" \ + --output "./random_results/${dataset}/k${k}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_rag_Index_${index}_top${topk}.txt_${i}" + done + done + + done + done +done + +echo "Finished running random-squad.sh" >> $logfilename \ No newline at end of file diff --git a/scripts/run.sh b/scripts/run.sh new file mode 100644 index 0000000..f00d298 --- /dev/null +++ b/scripts/run.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# 在這裡自訂 k 和 p 的值 + +k=3 # 設定 k 值 +p=100 # 設定 p 值 + +datasets=("squad-train") +# models=("3.1-8B" "3.2-3B" "3.2-1B") +models=("3.1-8B") +indices=("openai" "bm25") +maxQuestions=("1000") +top_k=("1" "3" "5" "10" "20") + +for dataset in "${datasets[@]}"; do + for model in "${models[@]}"; do + for maxQuestion in "${maxQuestions[@]}"; do + + # Run KVCACHE without cache + echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" + python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" --usePrompt \ + --modelname "meta-llama/Llama-${model}-Instruct" \ + --output "./results/${dataset}/${maxQuestion}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_kvcache_nokv.txt" + + # # Run KVCACHE + # echo "Running KVCACHE for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model" + # python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + # --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" \ + # --modelname "meta-llama/Llama-${model}-Instruct" \ + # --output "./results/${dataset}/${maxQuestion}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_kvcache.txt" + + # # Run RAG + # for topk in "${top_k[@]}"; do + # for index in "${indices[@]}"; do + # echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" + # python ./rag.py --index "$index" --dataset "$dataset" --similarity bertscore \ + # --maxKnowledge "$k" --maxParagraph "$p" --maxQuestion "$maxQuestion" --topk "$topk" \ + # --modelname "meta-llama/Llama-${model}-Instruct" \ + # --output "./results/${dataset}/${maxQuestion}/result_${model}_k${k}_p${p}_q${maxQuestion}_${dataset}_bertscore_rag_Index_${index}.txt_top${topk}" + # done + # done + + done + done +done + + +datasets=("hotpotqa-train") +# models=("3.1-8B" "3.2-3B" "3.2-1B") +models=("3.1-8B") +indices=("openai" "bm25") +# maxQuestions=("16" "24" "32" "48" "64" "80") +maxQuestions=("80") +top_k=("1" "3" "5" "10" "20") +# # all k = 7405 article, tokens = 10,038,084 +# # when k = 16, tokens = 21,000 +# # when k = 24, tokens = 32,667 +# # when k = 32, tokens = 43,000 +# # when k = 48, tokens = 64,000 +# # when k = 64, tokens = 85,000 +# # when k = 80, tokens = 106,000 + + +for dataset in "${datasets[@]}"; do + for model in "${models[@]}"; do + for maxQuestion in "${maxQuestions[@]}"; do + k=$maxQuestion + + # Run KVCACHE without cache + echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" + python ./kvcache.py --kvcache file 
--dataset "$dataset" --similarity bertscore \ + --maxKnowledge "$k" --maxQuestion "$maxQuestion" --usePrompt \ + --modelname "meta-llama/Llama-${model}-Instruct" \ + --output "./results/${dataset}/${maxQuestion}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache_nokv.txt" + + # # Run KVCACHE + # echo "Running KVCACHE for $dataset, maxQuestion $maxQuestion, model $model" + # python ./kvcache.py --kvcache file --dataset "$dataset" --similarity bertscore \ + # --maxKnowledge "$k" --maxQuestion "$maxQuestion" \ + # --modelname "meta-llama/Llama-${model}-Instruct" \ + # --output "./results/${dataset}/${maxQuestion}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_kvcache.txt" + + # # Run RAG + # for topk in "${top_k[@]}"; do + # for index in "${indices[@]}"; do + # echo "Running RAG with $index for $dataset, maxKnowledge $k, maxParagraph $p, maxQuestion $maxQuestion, model $model, topk ${topk}" + # python ./rag.py --index "$index" --dataset "$dataset" --similarity bertscore \ + # --maxKnowledge "$k" --maxQuestion "$maxQuestion" --topk "$topk" \ + # --modelname "meta-llama/Llama-${model}-Instruct" \ + # --output "./results/${dataset}/${maxQuestion}/result_${model}_k${k}_q${maxQuestion}_${dataset}_bertscore_rag_Index_${index}.txt_top${topk}" + # done + # done + + done + done +done