Commit cc56da9: Add files via upload
LengSicong authored Oct 8, 2024 (1 parent: 0fddee9)
Showing 1 changed file with 156 additions and 0 deletions: evaluation/calculate_score.py
import os
import json

if __name__ == '__main__':
    # open output file
    with open('./output.json', 'r') as f:
        lines = f.readlines()
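    # Assumed input schema (not documented in this commit): each line of
    # output.json is a standalone JSON record, e.g.
    #   {"answer": "yes", "pred": "Yes, it appears in the video.",
    #    "sub_category": "...", "granularity": "object-level"}
    # Only the 'answer', 'pred', 'sub_category', and 'granularity' keys are
    # read below; the values shown here are illustrative.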

    ### Overall Accuracy, Perception Accuracy, and Hallucination Severity for the entire dataset
    total = 0
    correct = 0
    total_yes_in_answer = 0
    correct_yes = 0
    total_no_in_answer = 0
    correct_no = 0

    for line in lines:
        sample = json.loads(line)
        answer = sample['answer'].strip().lower()
        prediction = sample['pred'].strip().lower()
        total += 1

        # Check if the prediction is correct (for overall accuracy)
        if answer in prediction[:5]:
            correct += 1

        # Check for "yes" or "no" in the answers
        if 'yes' in answer[:5]:
            total_yes_in_answer += 1
            if 'yes' in prediction[:5]:
                correct_yes += 1
        elif 'no' in answer[:5]:
            total_no_in_answer += 1
            if 'no' in prediction[:5]:
                correct_no += 1
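    # Note on the matching above: comparing against answer[:5] and
    # prediction[:5] assumes the ground-truth answer and the model's verdict
    # ("yes"/"no") appear within the first five characters of each string.
    # A prediction that buries its verdict later in a long sentence would be
    # counted as incorrect.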

    # Calculate and print overall accuracy, perception accuracy, and hallucination severity for the entire dataset
    overall_accuracy = correct / total if total > 0 else 0
    overall_perception_accuracy = correct_yes / total_yes_in_answer if total_yes_in_answer > 0 else 0
    overall_hallucination_severity = correct_no / total_no_in_answer if total_no_in_answer > 0 else 0

    print(f'Overall Accuracy (entire dataset): {overall_accuracy:.4f}')
    print(f'Overall Perception Accuracy (entire dataset): {overall_perception_accuracy:.4f}')
    print(f'Overall Hallucination Severity (entire dataset): {overall_hallucination_severity:.4f}\n')
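    # Worked example (illustrative numbers): with 5 "yes" answers of which 4
    # are predicted "yes", and 3 "no" answers of which 2 are predicted "no",
    # perception accuracy = 4/5 = 0.8000, hallucination severity = 2/3 =
    # 0.6667, and overall accuracy = 6/8 = 0.7500.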

    ### Perception Accuracy, Hallucination Severity, and Accuracy for each sub_category
    sub_category_dict = {}

    # First loop to accumulate counts for each sub_category
    for line in lines:
        sample = json.loads(line)
        sub_category = sample['sub_category']
        if sub_category not in sub_category_dict:
            sub_category_dict[sub_category] = {
                'correct_yes': 0, 'total_yes_in_answer': 0,  # For perception accuracy
                'correct_no': 0, 'total_no_in_answer': 0,    # For hallucination severity
                'correct': 0, 'total': 0                     # For overall accuracy
            }

        answer = sample['answer'].strip().lower()
        prediction = sample['pred'].strip().lower()

        # Update total for counting predictions and answers
        sub_category_dict[sub_category]['total'] += 1

        # Check for overall accuracy
        if answer in prediction[:5]:
            sub_category_dict[sub_category]['correct'] += 1

        # Check for "yes" or "no" in the answers
        if 'yes' in answer[:5]:
            sub_category_dict[sub_category]['total_yes_in_answer'] += 1
            if 'yes' in prediction[:5]:
                sub_category_dict[sub_category]['correct_yes'] += 1
        elif 'no' in answer[:5]:
            sub_category_dict[sub_category]['total_no_in_answer'] += 1
            if 'no' in prediction[:5]:
                sub_category_dict[sub_category]['correct_no'] += 1

    # Print the combined result in a table format with alignment
    print(f'{"sub_category":<35}{"perception accuracy":<20}{"hallucination severity":<25}{"accuracy":<10}')
    for sub_category in sub_category_dict:
        correct_yes = sub_category_dict[sub_category]['correct_yes']
        total_yes_in_answer = sub_category_dict[sub_category]['total_yes_in_answer']
        correct_no = sub_category_dict[sub_category]['correct_no']
        total_no_in_answer = sub_category_dict[sub_category]['total_no_in_answer']
        correct = sub_category_dict[sub_category]['correct']
        total = sub_category_dict[sub_category]['total']

        # Calculate perception accuracy, hallucination severity, and accuracy
        perception_accuracy = correct_yes / total_yes_in_answer if total_yes_in_answer > 0 else 0
        hallucination_severity = correct_no / total_no_in_answer if total_no_in_answer > 0 else 0
        accuracy = correct / total if total > 0 else 0

        print(f'{sub_category:<35}{perception_accuracy:<20.4f}{hallucination_severity:<25.4f}{accuracy:<10.4f}')
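    # Example of the table above (values illustrative; sub_category names
    # come from the 'sub_category' field of output.json):
    # sub_category                       perception accuracy hallucination severity   accuracy
    # object_relation                    0.8000              0.6667                   0.7500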

    ### Perception Accuracy, Hallucination Severity, and Accuracy for object-level and event-level within each sub_category
    granularity_dict = {
        'object-level': {},
        'event-level': {}
    }

    # First loop to accumulate counts for both levels
    for line in lines:
        sample = json.loads(line)
        sub_category = sample['sub_category']
        granularity = sample['granularity']  # Determine the level: object-level or event-level

        if granularity not in granularity_dict:
            continue  # Skip if the granularity is not recognized

        if sub_category not in granularity_dict[granularity]:
            granularity_dict[granularity][sub_category] = {
                'correct_yes': 0, 'total_yes_in_answer': 0,  # For perception accuracy
                'correct_no': 0, 'total_no_in_answer': 0,    # For hallucination severity
                'correct': 0, 'total': 0                     # For overall accuracy
            }

        answer = sample['answer'].strip().lower()
        prediction = sample['pred'].strip().lower()

        # Update total for counting predictions and answers
        granularity_dict[granularity][sub_category]['total'] += 1

        # Check for overall accuracy
        if answer in prediction[:5]:
            granularity_dict[granularity][sub_category]['correct'] += 1

        # Check for "yes" or "no" in the answers
        if 'yes' in answer[:5]:
            granularity_dict[granularity][sub_category]['total_yes_in_answer'] += 1
            if 'yes' in prediction[:5]:
                granularity_dict[granularity][sub_category]['correct_yes'] += 1
        elif 'no' in answer[:5]:
            granularity_dict[granularity][sub_category]['total_no_in_answer'] += 1
            if 'no' in prediction[:5]:
                granularity_dict[granularity][sub_category]['correct_no'] += 1

    # Print the combined result for both levels in a table format with alignment
    print(f'\n{"Granularity":<15}{"sub_category":<35}{"perception accuracy":<20}{"hallucination severity":<25}{"accuracy":<10}')
    for granularity in granularity_dict:
        print(f'{granularity.upper()} RESULTS:')
        for sub_category in granularity_dict[granularity]:
            correct_yes = granularity_dict[granularity][sub_category]['correct_yes']
            total_yes_in_answer = granularity_dict[granularity][sub_category]['total_yes_in_answer']
            correct_no = granularity_dict[granularity][sub_category]['correct_no']
            total_no_in_answer = granularity_dict[granularity][sub_category]['total_no_in_answer']
            correct = granularity_dict[granularity][sub_category]['correct']
            total = granularity_dict[granularity][sub_category]['total']

            # Calculate perception accuracy, hallucination severity, and accuracy
            perception_accuracy = correct_yes / total_yes_in_answer if total_yes_in_answer > 0 else 0
            hallucination_severity = correct_no / total_no_in_answer if total_no_in_answer > 0 else 0
            accuracy = correct / total if total > 0 else 0

            print(f'{granularity:<15}{sub_category:<35}{perception_accuracy:<20.4f}{hallucination_severity:<25.4f}{accuracy:<10.4f}')
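    # Usage (assumed): run this script from the directory that contains
    # output.json, e.g.
    #   python calculate_score.py
    # since the path './output.json' is resolved relative to the current
    # working directory.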
