Merge pull request EvolvingLMMs-Lab#250 from EvolvingLMMs-Lab/dev/add_raw_score_wb

Add raw score to wildvision bench
Luodian authored Sep 13, 2024
2 parents e77fb31 + 01d6045 commit 07eee00
Showing 2 changed files with 42 additions and 2 deletions.
28 changes: 26 additions & 2 deletions lmms_eval/tasks/wild_vision_bench/_default_template_yaml
@@ -16,9 +16,33 @@ generation_kwargs:
 process_results: !function utils.wild_vision_process_results
 # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
 metric_list:
-  - metric: gpt_eval_score
-    aggregation: !function utils.wild_vision_aggregation
+  - metric: raw_scores
+    aggregation: !function utils.wild_vision_aggregation_raw_scores
     higher_is_better: true
+  - metric: elo_scores
+    aggregation: !function utils.wild_vision_aggregation_elo_scores
+    higher_is_better: true
+  - metric: win_rates
+    aggregation: !function utils.wild_vision_aggregation_win_rates
+    higher_is_better: true
+  - metric: judgements_better
+    aggregation: !function utils.wild_vision_aggregation_judgements_better
+    higher_is_better: true
+  - metric: judgements_better_plus
+    aggregation: !function utils.wild_vision_aggregation_judgements_better_plus
+    higher_is_better: true
+  - metric: judgements_worse
+    aggregation: !function utils.wild_vision_aggregation_judgements_worse
+    higher_is_better: false
+  - metric: judgements_worse_plus
+    aggregation: !function utils.wild_vision_aggregation_judgements_worse_plus
+    higher_is_better: false
+  - metric: judgements_tie
+    aggregation: !function utils.wild_vision_aggregation_judgements_tie
+    higher_is_better: false
+  - metric: judgements_unclear
+    aggregation: !function utils.wild_vision_aggregation_judgements_unclear
+    higher_is_better: false
 metadata:
   judge_model: gpt-4o
   baseline_model: claude-3-sonnet-20240229
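
For readers unfamiliar with the lmms-eval metric plumbing, the contract behind this YAML is: wild_vision_process_results returns a dict keyed by metric name (raw_scores, elo_scores, ...), the harness collects each key's value across all documents, and passes the resulting list to that metric's aggregation function. Below is a minimal sketch of that contract; run_metrics, docs, and responses are illustrative names, not lmms-eval's actual harness code.

# Rough sketch of the metric_list contract, for illustration only.
def run_metrics(docs, responses, process_results, aggregations):
    # aggregations maps metric name -> aggregation fn, mirroring metric_list,
    # e.g. {"raw_scores": wild_vision_aggregation_raw_scores, ...}
    per_metric = {name: [] for name in aggregations}
    for doc, resp in zip(docs, responses):
        outputs = process_results(doc, resp)  # one dict per document
        for name in per_metric:
            per_metric[name].append(outputs[name])
    # Each aggregation sees the full per-document list for its key.
    return {name: agg(per_metric[name]) for name, agg in aggregations.items()}
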
16 changes: 16 additions & 0 deletions lmms_eval/tasks/wild_vision_bench/utils.py
@@ -158,25 +158,34 @@ def wild_vision_process_results(doc, results):
     score = resps

     if "A>B" in score:
+        raw_score = -1
         winner = "model_a"
         judgement = "Worse"  # Baseline better
     elif "A>>B" in score:
+        raw_score = -2
         winner = "model_a"
         judgement = "Worse++"
     elif "A=B" in score:
+        raw_score = 0
         winner = "tie"
         judgement = "Tie"
     elif "B>A" in score:
+        raw_score = 1
         winner = "model_b"
         judgement = "Better"
     elif "B>>A" in score:
+        raw_score = 2
         winner = "model_b"
         judgement = "Better++"
     else:
+        raw_score = 0
         winner = "tie"
         judgement = "Unclear"

     return {
+        "raw_scores": {
+            "final_score": raw_score,
+        },
         "elo_scores": {
             "question": doc["instruction"],
             "model_a": BASELINE_MODEL_NAME,
@@ -292,6 +301,13 @@ def get_win_rate_column(df, column, baseline):
     return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))


+def wild_vision_aggregation_raw_scores(results):
+    total_score = 0
+    for result in results:
+        total_score += result["final_score"]
+    return total_score
+
+
 def wild_vision_aggregation_elo_scores(results):
     battles = prepare_elo_data(results)
     elo_ratings = compute_mle_elo(battles, BASELINE_MODEL_NAME)
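
To see the new metric end to end, here is a self-contained example that mirrors (rather than imports) the parsing and summation added above, applied to a few hypothetical judge verdicts; per the diff, A is the baseline (claude-3-sonnet-20240229) and B is the evaluated model.

# Standalone mirror of the new raw-score logic, for illustration only.
def parse_raw_score(score):
    # Checking "A>B" before "A>>B" is safe: in "A>>B" the character
    # after "A>" is another ">", not "B", so "A>B" never matches it.
    if "A>B" in score:
        return -1  # baseline slightly preferred
    elif "A>>B" in score:
        return -2  # baseline strongly preferred
    elif "A=B" in score:
        return 0   # tie
    elif "B>A" in score:
        return 1   # model slightly preferred
    elif "B>>A" in score:
        return 2   # model strongly preferred
    return 0       # unparseable verdict falls back to "Unclear"

judgements = ["B>>A", "A=B", "A>B"]  # hypothetical judge outputs
results = [{"final_score": parse_raw_score(j)} for j in judgements]
# Same summation as wild_vision_aggregation_raw_scores: 2 + 0 - 1
print(sum(r["final_score"] for r in results))  # -> 1

As the diff shows, the aggregation returns an unnormalized sum, so the raw score's magnitude scales with the number of battles evaluated.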
