Update evaluation scripts and instructions (lm-sys#223)
infwinston authored Apr 6, 2023
1 parent 7b8fa6d commit 6d98710
Showing 24 changed files with 1,143 additions and 141 deletions.
19 changes: 14 additions & 5 deletions fastchat/eval/README.md
@@ -18,18 +18,25 @@ Unfortunately, Bard has not released its public APIs till now. You may have to en

### Vicuna and others

To generate answers with Vicuna or other models, specify path to the model checkpoint. Then run:
To generate answers with Vicuna or other models, specify the path to the model checkpoint and a desired model ID, then run:
```bash
python model_qa.py --model-name /model/path --question-file tables/question.jsonl --answer-file table/answer/answer.jsonl
python get_model_answer.py --model-id [MODEL-ID] --model-path /model/path --question-file tables/question.jsonl --answer-file table/answer/answer.jsonl --num-gpus [NUM-GPUS]
```
Then the answers to the questions will be saved in `table/answer/answer.jsonl`.
Note: we assume the model can be loaded with a single GPU.
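For reference, each line of the answer file is a JSON object (one object per line); based on the fields written by `get_model_answer.py`, an entry looks roughly like the following (shown pretty-printed here, with illustrative values):
```json
{
  "question_id": 1,
  "text": "The model's answer to question 1 ...",
  "answer_id": "<shortuuid>",
  "model_id": "vicuna-13b",
  "metadata": {}
}
```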

## Evaluate Answers Automatically

### Generete Reviews with GPT-4

PS: If you do not currently have access to the GPT-4 API but you have access to the GPT-4 chatbot, you can evaluate the answers manually, according to the instructions in the **Data Format** section. `table/review/*.jsonl` are some examples of reviews.
Note: the script below requires access to the GPT-4 API. If you only have access to GPT-4 through the web interface, you can evaluate the answers by manually formatting the prompt; see the **Reviewers** and **Prompts** sections under **Data Format** for details.
It is critical to follow the prompt templates; otherwise GPT-4 may not give fair reviews. `table/review/*.jsonl` contains some review examples generated by GPT-4; you can also view them on our eval [webpage](https://vicuna.lmsys.org/eval/).
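As a rough illustration of the manual route, the sketch below mirrors the prompt assembly done by `eval_gpt_review.py` in this commit; the reviewer/prompt field names come from that script, while the category, question, and answers are placeholders you fill in yourself:
```python
# Sketch only: build a review prompt by hand so it can be pasted into the GPT-4 web UI.
import json

def load_jsonl(path):
    with open(path) as f:
        return [json.loads(line) for line in f]

prompts = load_jsonl('table/prompt.jsonl')
reviewers = load_jsonl('table/reviewer.jsonl')

category = 'general'                 # category of the question under review
question = '...'                     # question text (placeholder)
answer_1, answer_2 = '...', '...'    # the two answers to compare (placeholders)

# Pick the reviewer matching the category, falling back to the first (general) one.
reviewer = next((r for r in reviewers if r['category'] == category), reviewers[0])
prompt_json = prompts[reviewer['prompt_id'] - 1]

system_prompt = prompt_json['system_prompt']
user_prompt = prompt_json['prompt_template'].format(
    question=question, answer_1=answer_1, answer_2=answer_2,
    **prompt_json['defaults'])

# Paste system_prompt first, then user_prompt, into the chat window.
print(system_prompt + '\n\n' + user_prompt)
```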

TODO: add instructions
To use the script for generating reviews with GPT-4, you need to `export` your OpenAI API key as an environment variable. Then run:
```bash
python eval_gpt_review.py -q table/question.jsonl -a /path/to/answer_1.jsonl /path/to/answer_2.jsonl -p table/prompt.jsonl -r table/reviewer.jsonl -o /path/to/review_output.jsonl
```
The GPT-4 reviews will be saved in `/path/to/review_output.jsonl`. Note: we implement some simple parsing code to extract the score pairs from GPT-4's reviews, but you should double-check whether the parsed score pairs are correct. The parsing logic may sometimes fail if GPT-4 does not give a structured answer.
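A quick way to spot those failures is to scan the output for the `[-1, -1]` placeholder that `eval_gpt_review.py` writes when parsing fails; a minimal sketch (not part of this commit) could look like:
```python
# Sketch only: list reviews whose score pair could not be parsed and must be fixed by hand.
import json

with open('/path/to/review_output.jsonl') as f:
    for line in f:
        review = json.loads(line)
        if review['score'] == [-1, -1]:
            print(f"Review {review['review_id']} (question {review['question_id']}) needs a manual score:")
            print(review['text'])
            print()
```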

## Visualize Results

@@ -108,6 +115,7 @@ For example:
* `prompt_id` (str): The ID of the prompt given to the reviewer (e.g., an AI assistant). Different prompts could result in different reviewing performance.
* `metadata` (dict): Metadata of a reviewer about its configurations.
* `description` (str): A description of the reviewer.
* `category` (str): The category that the reviewer belongs to.

For example:

@@ -117,7 +125,8 @@
"prompt_id": 1,
"temperature": 0.2,
"max_tokens": 8192,
"description": "GPT-4 for generic questions."
"description": "GPT-4 for general questions.",
"category": "general"
}
```

175 changes: 106 additions & 69 deletions fastchat/eval/eval_gpt_review.py
@@ -1,31 +1,45 @@
import argparse
import json
import os
import time

import openai
import tqdm
import ray
import time

import shortuuid
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_API_RETRY = 5
REQ_TIME_GAP = 10

@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
try:
response = openai.ChatCompletion.create(
model='gpt-4',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
except Exception as e:
print(e)
return 'error'
return response['choices'][0]['message']['content']
def get_eval(sys_prompt, user_prompt: str, max_tokens: int):
logging.basicConfig(level=logging.INFO)
for i in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model='gpt-4',
messages=[{
'role': 'system',
'content': sys_prompt
}, {
'role': 'user',
'content': user_prompt,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
content = response['choices'][0]['message']['content']
logger.info(content)
return content
except Exception as e:
logger.error(e)
time.sleep(5)
logger.error(f'Failed after {MAX_API_RETRY} retries.')
return 'error'


def parse_score(review):
@@ -36,70 +50,93 @@ def parse_score(review):
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
raise Exception('Invalid score pair.')
except Exception as e:
print(e)
print('error', review)
logger.error(f'{e}\nContent: {review}\n'
'You must manually fix the score pair.')
return [-1, -1]


def gen_prompt(reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2):
# Default to general category (index=0)
reviewer_idx = 0
for idx, reviewer in enumerate(reviewer_jsons):
if reviewer['category'] == cat:
reviewer_idx = idx
break
prompt_id = reviewer_jsons[reviewer_idx]['prompt_id']
prompt_json = prompt_jsons[prompt_id-1]
assert prompt_json['prompt_id'] == prompt_id

sys_prompt = prompt_json['system_prompt']
prompt_template = prompt_json['prompt_template']
defaults = prompt_json['defaults']
prompt = prompt_template.format(question=ques, answer_1=ans1, answer_2=ans2, **defaults)

return sys_prompt, prompt, reviewer_idx+1


def get_json_list(file_path):
file_path = os.path.expanduser(file_path)
with open(file_path, 'r') as f:
json_list = []
for line in f:
json_list.append(json.loads(line))
return json_list


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
# parser.add_argument('-a', '--answer')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('-q', '--question-file')
parser.add_argument('-a', '--answer-file-list', nargs='+', default=[])
parser.add_argument('-p', '--prompt-file')
parser.add_argument('-r', '--reviewer-file')
parser.add_argument('-o', '--output-review-file')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()

ray.init()

f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
question_jsons = get_json_list(args.question_file)
answer1_jsons = get_json_list(args.answer_file_list[0])
answer2_jsons = get_json_list(args.answer_file_list[1])
reviewer_jsons = get_json_list(args.reviewer_file)
prompt_jsons = get_json_list(args.prompt_file)

review_file = open(f'{args.output}', 'w')
# check if # of questions, answers are the same
assert len(question_jsons) == len(answer1_jsons) == len(answer2_jsons)

js_list = []
handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
# if idx == 10:
# break

ques = json.loads(ques_js)['question']
ans1 = json.loads(ans1_js)['answer']
ans2 = json.loads(ans2_js)['answer']

category = json.loads(ques_js)['category']
if category in rule_dict:
rule = rule_dict[category]
else:
rule = rule_dict['default']
prompt = rule['prompt']
role = rule['role']
content = (f'[Question]\n{ques}\n\n'
f'[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
js_list.append({
'id': idx+1,
'question': ques,
'answer1': ans1,
'answer2': ans2,
'category': category})
idx += 1
handles.append(get_eval.remote(content, args.max_tokens))
review_jsons = []
total_len = len(question_jsons)
question_idx_list = list(range(total_len))

for i in question_idx_list:
assert answer1_jsons[i]['question_id'] == question_jsons[i]['question_id'] == answer2_jsons[i]['question_id']

ques = question_jsons[i]['text']
cat = question_jsons[i]['category']
ans1 = answer1_jsons[i]['text']
ans2 = answer2_jsons[i]['text']
sys_prompt, prompt, reviewer_id = gen_prompt(reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2)
review_id = shortuuid.uuid()
review_jsons.append({
'review_id': review_id,
'question_id': question_jsons[i]['question_id'],
'answer1_id': answer1_jsons[i]['answer_id'],
'answer2_id': answer2_jsons[i]['answer_id'],
'reviewer_id': reviewer_id,
'metadata': {},
})
# To avoid the rate limit set by OpenAI
time.sleep(10)
handles.append(get_eval.remote(sys_prompt, prompt, args.max_tokens))
logger.info(f'Waiting for {REQ_TIME_GAP} seconds before sending the next request.')
time.sleep(REQ_TIME_GAP)

reviews = ray.get(handles)
for idx, review in enumerate(reviews):
scores = parse_score(review)
js_list[idx]['content'] = review
js_list[idx]['tuple'] = scores
review_file.write(json.dumps(js_list[idx]) + '\n')
review_file.close()
with open(f'{args.output_review_file}', 'w') as output_review_file:
for idx, review in enumerate(reviews):
scores = parse_score(review)
review_jsons[idx]['text'] = review
review_jsons[idx]['score'] = scores
output_review_file.write(json.dumps(review_jsons[idx]) + '\n')
85 changes: 85 additions & 0 deletions fastchat/eval/get_model_answer.py
@@ -0,0 +1,85 @@
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import json
from tqdm import tqdm
import shortuuid
import ray

from fastchat.conversation import default_conversation
from fastchat.utils import disable_torch_init


def run_eval(model_path, model_id, question_file, answer_file, num_gpus):
# split question file into num_gpus files
ques_jsons = []
with open(os.path.expanduser(question_file), "r") as ques_file:
for line in ques_file:
ques_jsons.append(line)

chunk_size = len(ques_jsons) // num_gpus
ans_handles = []
for i in range(0, len(ques_jsons), chunk_size):
ans_handles.append(get_model_answers.remote(model_path, model_id, ques_jsons[i:i + chunk_size]))

ans_jsons = []
for ans_handle in ans_handles:
ans_jsons.extend(ray.get(ans_handle))

with open(os.path.expanduser(answer_file), "w") as ans_file:
for line in ans_jsons:
ans_file.write(json.dumps(line) + "\n")


@ray.remote(num_gpus=1)
@torch.inference_mode()
def get_model_answers(model_path, model_id, question_jsons):
disable_torch_init()
model_path = os.path.expanduser(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16).cuda()

ans_jsons = []
for i, line in enumerate(tqdm(question_jsons)):
ques_json = json.loads(line)
idx = ques_json["question_id"]
qs = ques_json["text"]
conv = default_conversation.copy()
conv.append_message(conv.roles[0], qs)
prompt = conv.get_prompt()
inputs = tokenizer([prompt])
output_ids = model.generate(
torch.as_tensor(inputs.input_ids).cuda(),
do_sample=True,
temperature=0.7,
max_new_tokens=1024)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
try:
index = outputs.index(conv.sep, len(prompt))
except ValueError:
outputs += conv.sep
index = outputs.index(conv.sep, len(prompt))

outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
ans_id = shortuuid.uuid()
ans_jsons.append({"question_id": idx,
"text": outputs,
"answer_id": ans_id,
"model_id": model_id,
"metadata": {}})
return ans_jsons


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, required=True)
parser.add_argument("--model-id", type=str, required=True)
parser.add_argument("--question-file", type=str, required=True)
parser.add_argument("--answer-file", type=str, default="answer.jsonl")
parser.add_argument("--num-gpus", type=int, default=1)
args = parser.parse_args()

ray.init()
run_eval(args.model_path, args.model_id, args.question_file, args.answer_file, args.num_gpus)
