
Commit d454541: prometheus
Ishika Agarwal committed Sep 23, 2024 (parent: 265a491)
Showing 1 changed file with 66 additions and 4 deletions.

visualization/similarity.py
@@ -2,10 +2,14 @@
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch
+from prometheus_eval.vllm import VLLM
+from prometheus_eval import PrometheusEval
+from prometheus_eval.prompts import ABSOLUTE_PROMPT, SCORE_RUBRIC_TEMPLATE
+from tqdm import tqdm

rouge_metric = evaluate.load('rouge')
-bleu_metric = evaluate.load('bleu')
-bert_metric = evaluate.load('bertscore')
+#bleu_metric = evaluate.load('bleu')
+#bert_metric = evaluate.load('bertscore')

def calculate_evaluate_metric(predictions, references, score="rouge", return_individual=True):
"""
@@ -65,7 +69,7 @@ def find_embedding(prompt):
with torch.no_grad():
model_output = embedding_model(**encoded_input)
sentence_embeddings = model_output[0][:, 0]
-sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=1, dim=1).squeeze()
+sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings).squeeze()
return sentence_embeddings

metrics = []
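The change above drops the explicit p=1 argument, so torch.nn.functional.normalize falls back to its defaults (p=2, dim=1), i.e. L2 rather than L1 normalization. A small sketch, not from the file, of why L2-normalized embeddings are convenient: their dot product equals cosine similarity.

# Sketch, not part of the commit: with the default p=2, each embedding is unit-length,
# so a plain dot product between two embeddings is exactly their cosine similarity.
import torch
import torch.nn.functional as F

a = F.normalize(torch.randn(1, 384)).squeeze()  # defaults: p=2, dim=1
b = F.normalize(torch.randn(1, 384)).squeeze()
dot = torch.dot(a, b)
cos = F.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0)).squeeze()
assert torch.allclose(dot, cos, atol=1e-6)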
@@ -78,4 +82,62 @@ def find_embedding(prompt):
    embedding_model.to('cpu')
    del embedding_model
    del embedding_tokenizer
-    return np.array(metrics)
+
+    if return_individual:
+        return np.array(metrics)
+    else:
+        return np.array(metrics).mean()
+
+def calculate_prometheus(predictions, refs, return_individual=False):
+    model = VLLM(model="prometheus-eval/prometheus-7b-v2.0")
+    judge = PrometheusEval(model=model, absolute_grade_template=ABSOLUTE_PROMPT)
+
+    instructions = refs[0]
+    references = refs[1]
+
+    rubric_data = {
+        "criteria": """Evaluate the model's ability to follow instructions and deliver a high-quality response across the following dimensions:
+1. **Instruction Following**: How accurately and fully does the model adhere to the given instruction?
+2. **Accuracy**: Is the information correct, reliable, and factually sound?
+3. **Relevance**: Does the response directly address the question or task without unnecessary information?
+4. **Completeness**: Does the response cover all essential aspects of the instruction or question?
+5. **Depth**: How thoroughly does the response explore the topic? Does it demonstrate insightful analysis where appropriate?
+6. **Clarity**: Is the response well-organized, easy to follow, and free from ambiguity or confusion?
+7. **Creativity**: Does the response offer original or innovative approaches where applicable?
+8. **Helpfulness**: Does the response effectively meet the user's needs and provide value in solving the problem or addressing the query?""",
+
+        "score1_description": "The response fails to meet expectations across most or all criteria. It does not follow the instruction, contains significant errors or misinformation, lacks relevance, is incomplete or shallow, unclear, unoriginal, and unhelpful.",
+
+        "score2_description": "The response shows major deficiencies across several criteria. It partially follows the instruction but includes significant inaccuracies, is often irrelevant, incomplete, or lacks depth, clarity, creativity, and helpfulness.",
+
+        "score3_description": "The response is average, meeting some but not all criteria. It follows the instruction but may fall short in terms of accuracy, depth, relevance, or helpfulness. Improvements in clarity and insightfulness may be needed.",
+
+        "score4_description": "The response is strong, performing well across most criteria. It follows the instruction closely, is mostly accurate and relevant, provides good depth, and is well-structured. Minor improvements could enhance clarity, creativity, or helpfulness.",
+
+        "score5_description": "The response excels in all or nearly all criteria. It fully follows the instruction, is highly accurate, directly relevant, complete, and demonstrates depth and insight. The response is well-organized, creative where appropriate, and very helpful in addressing the user's needs.",
+    }
+
+    score_rubric = SCORE_RUBRIC_TEMPLATE.format(**rubric_data)
+
+    metrics = []
+    for i, r, p in tqdm(zip(instructions, references, predictions)):
+        feedback, score = judge.single_absolute_grade(
+            instruction=i,
+            response=p,
+            rubric=score_rubric,
+            reference_answer=r
+        )
+
+        metrics.append(score)
+
+        with open('prometheus_log.txt', 'a+') as f:
+            f.write(f'{i}, {p}, {r}, {score}, {feedback}\n')
+
+    del model
+    del judge
+    torch.cuda.empty_cache()
+
+    if return_individual:
+        return np.array(metrics)
+    else:
+        return np.array(metrics).mean()
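A hypothetical call site for the new calculate_prometheus helper, assuming refs is passed as an (instructions, references) pair as the unpacking above implies; the example strings are illustrative:

# Hypothetical usage -- not part of the commit. refs[0] holds instructions,
# refs[1] the reference answers, matching the unpacking inside calculate_prometheus.
instructions = ["What is the capital of France?"]
references = ["Paris is the capital of France."]
predictions = ["The capital of France is Paris."]

mean_score = calculate_prometheus(predictions, (instructions, references))
print(mean_score)  # absolute grade on a 1-5 scale, averaged since return_individual=False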
