Update evaluation scripts and instructions (lm-sys#223)
infwinston authored Apr 6, 2023
1 parent 7b8fa6d commit 6d98710
Showing 24 changed files with 1,143 additions and 141 deletions.
19 changes: 14 additions & 5 deletions fastchat/eval/README.md
@@ -18,18 +18,25 @@ Unfortunately, Bard has not released its public APIs till now. You may have to en

### Vicuna and others

To generate answers with Vicuna or other models, specify path to the model checkpoint. Then run:
To generate answers with Vicuna or other models, specify the path to the model checkpoint and a desired model ID, then run:
```bash
python model_qa.py --model-name /model/path --question-file tables/question.jsonl --answer-file table/answer/answer.jsonl
python get_model_answer.py --model-id [MODEL-ID] --model-path /model/path --question-file tables/question.jsonl --answer-file table/answer/answer.jsonl --num-gpus [NUM-GPUS]
```
Then the answers to the questions will be saved in `table/answer/answer.jsonl`.
Note: we assume the model can be loaded with a single GPU.
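For reference, each line of the answer file is a JSON object (one object per line); based on the fields written by `get_model_answer.py`, an entry looks roughly like the following (shown pretty-printed here, with illustrative values):
```json
{
  "question_id": 1,
  "text": "The model's answer to question 1 ...",
  "answer_id": "<shortuuid>",
  "model_id": "vicuna-13b",
  "metadata": {}
}
```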

## Evaluate Answers Automatically

### Generete Reviews with GPT-4

PS: If you do not currently have access to the GPT-4 API but you have access to the GPT-4 chatbot, you can evaluate the answers manually, according to the instructions in the **Data Format** section. `table/review/*.jsonl` are some examples of reviews.
Note: the script below requires access to the GPT-4 API. If you only have access to GPT-4 through the web interface, you can evaluate the answers by manually formatting the prompt; see the **Reviewers** and **Prompts** sections under **Data Format** for details.
It is critical to follow the prompt templates; otherwise GPT-4 may not give fair reviews. `table/review/*.jsonl` contains some review examples generated by GPT-4; you can also view them on our eval [webpage](https://vicuna.lmsys.org/eval/).
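As a rough illustration of the manual route, the sketch below mirrors the prompt assembly done by `eval_gpt_review.py` in this commit; the reviewer/prompt field names come from that script, while the category, question, and answers are placeholders you fill in yourself:
```python
# Sketch only: build a review prompt by hand so it can be pasted into the GPT-4 web UI.
import json

def load_jsonl(path):
    with open(path) as f:
        return [json.loads(line) for line in f]

prompts = load_jsonl('table/prompt.jsonl')
reviewers = load_jsonl('table/reviewer.jsonl')

category = 'general'                 # category of the question under review
question = '...'                     # question text (placeholder)
answer_1, answer_2 = '...', '...'    # the two answers to compare (placeholders)

# Pick the reviewer matching the category, falling back to the first (general) one.
reviewer = next((r for r in reviewers if r['category'] == category), reviewers[0])
prompt_json = prompts[reviewer['prompt_id'] - 1]

system_prompt = prompt_json['system_prompt']
user_prompt = prompt_json['prompt_template'].format(
    question=question, answer_1=answer_1, answer_2=answer_2,
    **prompt_json['defaults'])

# Paste system_prompt first, then user_prompt, into the chat window.
print(system_prompt + '\n\n' + user_prompt)
```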

TODO: add instructions
To use the script for generating reviews with GPT-4, you need to `export` your OpenAI API key as an environment variable. Then run:
```bash
python eval_gpt_review.py -q table/question.jsonl -a /path/to/answer_1.jsonl /path/to/answer_2.jsonl -p table/prompt.jsonl -r table/reviewer.jsonl -o /path/to/review_output.jsonl
```
The GPT-4 reviews will be saved in `/path/to/review_output.jsonl`. Note: we implement some simple parsing code to extract the score pairs from GPT-4's reviews, but you should double-check whether the parsed score pairs are correct. The parsing logic may sometimes fail if GPT-4 does not give a structured answer.
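A quick way to spot those failures is to scan the output for the `[-1, -1]` placeholder that `eval_gpt_review.py` writes when parsing fails; a minimal sketch (not part of this commit) could look like:
```python
# Sketch only: list reviews whose score pair could not be parsed and must be fixed by hand.
import json

with open('/path/to/review_output.jsonl') as f:
    for line in f:
        review = json.loads(line)
        if review['score'] == [-1, -1]:
            print(f"Review {review['review_id']} (question {review['question_id']}) needs a manual score:")
            print(review['text'])
            print()
```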

## Visualize Results

@@ -108,6 +115,7 @@ For example:
* `prompt_id` (str): The ID of the prompt given to the reviewer (e.g., an AI assistant). Different prompts could result in different reviewing performance.
* `metadata` (dict): Metadata of a reviewer about its configurations.
* `description` (str): A description of the reviewer.
* `category` (str): The category that the reviewer belongs to.

For example:

@@ -117,7 +125,8 @@
"prompt_id": 1,
"temperature": 0.2,
"max_tokens": 8192,
"description": "GPT-4 for generic questions."
"description": "GPT-4 for general questions.",
"category": "general"
}
```

175 changes: 106 additions & 69 deletions fastchat/eval/eval_gpt_review.py
@@ -1,31 +1,45 @@
import argparse
import json
import os
import time

import openai
import tqdm
import ray
import time

import shortuuid
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_API_RETRY = 5
REQ_TIME_GAP = 10

@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
try:
response = openai.ChatCompletion.create(
model='gpt-4',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
except Exception as e:
print(e)
return 'error'
return response['choices'][0]['message']['content']
def get_eval(sys_prompt, user_prompt: str, max_tokens: int):
logging.basicConfig(level=logging.INFO)
for i in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model='gpt-4',
messages=[{
'role': 'system',
'content': sys_prompt
}, {
'role': 'user',
'content': user_prompt,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
content = response['choices'][0]['message']['content']
logger.info(content)
return content
except Exception as e:
logger.error(e)
time.sleep(5)
logger.error(f'Failed after {MAX_API_RETRY} retries.')
return 'error'


def parse_score(review):
@@ -36,70 +50,93 @@ def parse_score(review):
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
raise Exception('Invalid score pair.')
except Exception as e:
print(e)
print('error', review)
logger.error(f'{e}\nContent: {review}\n'
'You must manually fix the score pair.')
return [-1, -1]


def gen_prompt(reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2):
# Default to general category (index=0)
reviewer_idx = 0
for idx, reviewer in enumerate(reviewer_jsons):
if reviewer['category'] == cat:
reviewer_idx = idx
break
prompt_id = reviewer_jsons[reviewer_idx]['prompt_id']
prompt_json = prompt_jsons[prompt_id-1]
assert prompt_json['prompt_id'] == prompt_id

sys_prompt = prompt_json['system_prompt']
prompt_template = prompt_json['prompt_template']
defaults = prompt_json['defaults']
prompt = prompt_template.format(question=ques, answer_1=ans1, answer_2=ans2, **defaults)

return sys_prompt, prompt, reviewer_idx+1


def get_json_list(file_path):
file_path = os.path.expanduser(file_path)
with open(file_path, 'r') as f:
json_list = []
for line in f:
json_list.append(json.loads(line))
return json_list


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
# parser.add_argument('-a', '--answer')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('-q', '--question-file')
parser.add_argument('-a', '--answer-file-list', nargs='+', default=[])
parser.add_argument('-p', '--prompt-file')
parser.add_argument('-r', '--reviewer-file')
parser.add_argument('-o', '--output-review-file')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()

ray.init()

f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
question_jsons = get_json_list(args.question_file)
answer1_jsons = get_json_list(args.answer_file_list[0])
answer2_jsons = get_json_list(args.answer_file_list[1])
reviewer_jsons = get_json_list(args.reviewer_file)
prompt_jsons = get_json_list(args.prompt_file)

review_file = open(f'{args.output}', 'w')
# check if # of questions, answers are the same
assert len(question_jsons) == len(answer1_jsons) == len(answer2_jsons)

js_list = []
handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
# if idx == 10:
# break

ques = json.loads(ques_js)['question']
ans1 = json.loads(ans1_js)['answer']
ans2 = json.loads(ans2_js)['answer']

category = json.loads(ques_js)['category']
if category in rule_dict:
rule = rule_dict[category]
else:
rule = rule_dict['default']
prompt = rule['prompt']
role = rule['role']
content = (f'[Question]\n{ques}\n\n'
f'[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
js_list.append({
'id': idx+1,
'question': ques,
'answer1': ans1,
'answer2': ans2,
'category': category})
idx += 1
handles.append(get_eval.remote(content, args.max_tokens))
review_jsons = []
total_len = len(question_jsons)
question_idx_list = list(range(total_len))

for i in question_idx_list:
assert answer1_jsons[i]['question_id'] == question_jsons[i]['question_id'] == answer2_jsons[i]['question_id']

ques = question_jsons[i]['text']
cat = question_jsons[i]['category']
ans1 = answer1_jsons[i]['text']
ans2 = answer2_jsons[i]['text']
sys_prompt, prompt, reviewer_id = gen_prompt(reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2)
review_id = shortuuid.uuid()
review_jsons.append({
'review_id': review_id,
'question_id': question_jsons[i]['question_id'],
'answer1_id': answer1_jsons[i]['answer_id'],
'answer2_id': answer2_jsons[i]['answer_id'],
'reviewer_id': reviewer_id,
'metadata': {},
})
# To avoid the rate limit set by OpenAI
time.sleep(10)
handles.append(get_eval.remote(sys_prompt, prompt, args.max_tokens))
logger.info(f'Waiting for {REQ_TIME_GAP} seconds before sending the next request.')
time.sleep(REQ_TIME_GAP)

reviews = ray.get(handles)
for idx, review in enumerate(reviews):
scores = parse_score(review)
js_list[idx]['content'] = review
js_list[idx]['tuple'] = scores
review_file.write(json.dumps(js_list[idx]) + '\n')
review_file.close()
with open(f'{args.output_review_file}', 'w') as output_review_file:
for idx, review in enumerate(reviews):
scores = parse_score(review)
review_jsons[idx]['text'] = review
review_jsons[idx]['score'] = scores
output_review_file.write(json.dumps(review_jsons[idx]) + '\n')
85 changes: 85 additions & 0 deletions fastchat/eval/get_model_answer.py
@@ -0,0 +1,85 @@
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import json
from tqdm import tqdm
import shortuuid
import ray

from fastchat.conversation import default_conversation
from fastchat.utils import disable_torch_init


def run_eval(model_path, model_id, question_file, answer_file, num_gpus):
# split question file into num_gpus files
ques_jsons = []
with open(os.path.expanduser(question_file), "r") as ques_file:
for line in ques_file:
ques_jsons.append(line)

chunk_size = len(ques_jsons) // num_gpus
ans_handles = []
for i in range(0, len(ques_jsons), chunk_size):
ans_handles.append(get_model_answers.remote(model_path, model_id, ques_jsons[i:i + chunk_size]))

ans_jsons = []
for ans_handle in ans_handles:
ans_jsons.extend(ray.get(ans_handle))

with open(os.path.expanduser(answer_file), "w") as ans_file:
for line in ans_jsons:
ans_file.write(json.dumps(line) + "\n")


@ray.remote(num_gpus=1)
@torch.inference_mode()
def get_model_answers(model_path, model_id, question_jsons):
disable_torch_init()
model_path = os.path.expanduser(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16).cuda()

ans_jsons = []
for i, line in enumerate(tqdm(question_jsons)):
ques_json = json.loads(line)
idx = ques_json["question_id"]
qs = ques_json["text"]
conv = default_conversation.copy()
conv.append_message(conv.roles[0], qs)
prompt = conv.get_prompt()
inputs = tokenizer([prompt])
output_ids = model.generate(
torch.as_tensor(inputs.input_ids).cuda(),
do_sample=True,
temperature=0.7,
max_new_tokens=1024)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
try:
index = outputs.index(conv.sep, len(prompt))
except ValueError:
outputs += conv.sep
index = outputs.index(conv.sep, len(prompt))

outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
ans_id = shortuuid.uuid()
ans_jsons.append({"question_id": idx,
"text": outputs,
"answer_id": ans_id,
"model_id": model_id,
"metadata": {}})
return ans_jsons


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, required=True)
parser.add_argument("--model-id", type=str, required=True)
parser.add_argument("--question-file", type=str, required=True)
parser.add_argument("--answer-file", type=str, default="answer.jsonl")
parser.add_argument("--num-gpus", type=int, default=1)
args = parser.parse_args()

ray.init()
run_eval(args.model_path, args.model_id, args.question_file, args.answer_file, args.num_gpus)
