
Commit d454541: prometheus
Ishika Agarwal committed Sep 23, 2024 (parent: 265a491)
Showing 1 changed file with 66 additions and 4 deletions.

visualization/similarity.py
@@ -2,10 +2,14 @@
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch
+from prometheus_eval.vllm import VLLM
+from prometheus_eval import PrometheusEval
+from prometheus_eval.prompts import ABSOLUTE_PROMPT, SCORE_RUBRIC_TEMPLATE
+from tqdm import tqdm

rouge_metric = evaluate.load('rouge')
-bleu_metric = evaluate.load('bleu')
-bert_metric = evaluate.load('bertscore')
+#bleu_metric = evaluate.load('bleu')
+#bert_metric = evaluate.load('bertscore')

def calculate_evaluate_metric(predictions, references, score="rouge", return_individual=True):
"""
@@ -65,7 +69,7 @@ def find_embedding(prompt):
with torch.no_grad():
model_output = embedding_model(**encoded_input)
sentence_embeddings = model_output[0][:, 0]
-sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=1, dim=1).squeeze()
+sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings).squeeze()
return sentence_embeddings

metrics = []
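The change above drops the explicit p=1 argument, so torch.nn.functional.normalize falls back to its defaults (p=2, dim=1), i.e. L2 rather than L1 normalization. A small sketch, not from the file, of why L2-normalized embeddings are convenient: their dot product equals cosine similarity.

# Sketch, not part of the commit: with the default p=2, each embedding is unit-length,
# so a plain dot product between two embeddings is exactly their cosine similarity.
import torch
import torch.nn.functional as F

a = F.normalize(torch.randn(1, 384)).squeeze()  # defaults: p=2, dim=1
b = F.normalize(torch.randn(1, 384)).squeeze()
dot = torch.dot(a, b)
cos = F.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0)).squeeze()
assert torch.allclose(dot, cos, atol=1e-6)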
@@ -78,4 +82,62 @@ def find_embedding(prompt):
    embedding_model.to('cpu')
    del embedding_model
    del embedding_tokenizer
-    return np.array(metrics)
+
+    if return_individual:
+        return np.array(metrics)
+    else:
+        return np.array(metrics).mean()
+
+def calculate_prometheus(predictions, refs, return_individual=False):
+    model = VLLM(model="prometheus-eval/prometheus-7b-v2.0")
+    judge = PrometheusEval(model=model, absolute_grade_template=ABSOLUTE_PROMPT)
+
+    instructions = refs[0]
+    references = refs[1]
+
+    rubric_data = {
+        "criteria": """Evaluate the model's ability to follow instructions and deliver a high-quality response across the following dimensions:
+1. **Instruction Following**: How accurately and fully does the model adhere to the given instruction?
+2. **Accuracy**: Is the information correct, reliable, and factually sound?
+3. **Relevance**: Does the response directly address the question or task without unnecessary information?
+4. **Completeness**: Does the response cover all essential aspects of the instruction or question?
+5. **Depth**: How thoroughly does the response explore the topic? Does it demonstrate insightful analysis where appropriate?
+6. **Clarity**: Is the response well-organized, easy to follow, and free from ambiguity or confusion?
+7. **Creativity**: Does the response offer original or innovative approaches where applicable?
+8. **Helpfulness**: Does the response effectively meet the user's needs and provide value in solving the problem or addressing the query?""",
+
+        "score1_description": "The response fails to meet expectations across most or all criteria. It does not follow the instruction, contains significant errors or misinformation, lacks relevance, is incomplete or shallow, unclear, unoriginal, and unhelpful.",
+
+        "score2_description": "The response shows major deficiencies across several criteria. It partially follows the instruction but includes significant inaccuracies, is often irrelevant, incomplete, or lacks depth, clarity, creativity, and helpfulness.",
+
+        "score3_description": "The response is average, meeting some but not all criteria. It follows the instruction but may fall short in terms of accuracy, depth, relevance, or helpfulness. Improvements in clarity and insightfulness may be needed.",
+
+        "score4_description": "The response is strong, performing well across most criteria. It follows the instruction closely, is mostly accurate and relevant, provides good depth, and is well-structured. Minor improvements could enhance clarity, creativity, or helpfulness.",
+
+        "score5_description": "The response excels in all or nearly all criteria. It fully follows the instruction, is highly accurate, directly relevant, complete, and demonstrates depth and insight. The response is well-organized, creative where appropriate, and very helpful in addressing the user's needs.",
+    }
+
+    score_rubric = SCORE_RUBRIC_TEMPLATE.format(**rubric_data)
+
+    metrics = []
+    for i, r, p in tqdm(zip(instructions, references, predictions)):
+        feedback, score = judge.single_absolute_grade(
+            instruction=i,
+            response=p,
+            rubric=score_rubric,
+            reference_answer=r
+        )
+
+        metrics.append(score)
+
+        with open('prometheus_log.txt', 'a+') as f:
+            f.write(f'{i}, {p}, {r}, {score}, {feedback}\n')
+
+    del model
+    del judge
+    torch.cuda.empty_cache()
+
+    if return_individual:
+        return np.array(metrics)
+    else:
+        return np.array(metrics).mean()
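A hypothetical call site for the new calculate_prometheus helper, assuming refs is passed as an (instructions, references) pair as the unpacking above implies; the example strings are illustrative:

# Hypothetical usage -- not part of the commit. refs[0] holds instructions,
# refs[1] the reference answers, matching the unpacking inside calculate_prometheus.
instructions = ["What is the capital of France?"]
references = ["Paris is the capital of France."]
predictions = ["The capital of France is Paris."]

mean_score = calculate_prometheus(predictions, (instructions, references))
print(mean_score)  # absolute grade on a 1-5 scale, averaged since return_individual=False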
