[eval] GPT-3.5 QA baseline (lm-sys#48)
suquark authored Mar 25, 2023
1 parent c91c005 commit 9fca947
Showing 1 changed file with 41 additions and 26 deletions.
chatserver/eval/qa_baseline_gpt-3.5-turbo.py (67 changes: 41 additions & 26 deletions)
@@ -4,24 +4,34 @@
 import json
 import os
 import time
+import concurrent.futures

 import openai
 import tqdm


-def get_eval(question: str, max_tokens: int):
-    response = openai.ChatCompletion.create(
-        model='gpt-3.5-turbo',
-        messages=[{
-            'role': 'system',
-            'content': 'You are a helpful assistant.'
-        }, {
-            'role': 'user',
-            'content': question,
-        }],
-        max_tokens=max_tokens,
-    )
-    return response['choices'][0]['message']['content']
+MODEL = 'gpt-3.5-turbo'
+
+def get_answer(question_id: int, question: str, max_tokens: int):
+    for retries in range(3):
+        try:
+            response = openai.ChatCompletion.create(
+                model=MODEL,
+                messages=[{
+                    'role': 'system',
+                    'content': 'You are a helpful assistant.'
+                }, {
+                    'role': 'user',
+                    'content': question,
+                }],
+                max_tokens=max_tokens,
+            )
+            answer = response['choices'][0]['message']['content']
+            return {'id': question_id, 'answer': answer, 'model': MODEL}
+        except Exception as e:
+            print('[ERROR]', e)
+            time.sleep(1)
+    return {'id': question_id, 'answer': '#ERROR#', 'model': MODEL}


 if __name__ == '__main__':
@@ -31,22 +41,27 @@ def get_eval(question: str, max_tokens: int):
     parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
     args = parser.parse_args()

+    questions_dict = {}
     with open(os.path.expanduser(args.question)) as f:
-        question = json.load(f)
-        questions_dict = {q['id']: q['question'] for q in question['questions']}
+        for line in f:
+            if not line:
+                continue
+            q = json.loads(line)
+            questions_dict[q['id']] = q['question']

     answers = []

-    for qid, question in tqdm.tqdm(questions_dict.items()):
-        for retries in range(3):
-            try:
-                eval_result = get_eval(question, args.max_tokens)
-                answers.append({'id': qid, 'answer': eval_result})
-                break
-            except Exception as e:
-                print('Error: ', e)
-                if retries == 2:
-                    answers.append({'id': qid, 'answer': '#ERROR#'})
+    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
+        futures = []
+        for qid, question in questions_dict.items():
+            future = executor.submit(get_answer, qid, question, args.max_tokens)
+            futures.append(future)
+
+        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
+            answers.append(future.result())
+
+    answers.sort(key=lambda x: x['id'])

     with open(os.path.expanduser(args.output), 'w') as f:
-        json.dump({'model': 'gpt-3.5-turbo', 'answers': answers}, f)
+        table = [json.dumps(ans) for ans in answers]
+        f.write('\n'.join(table))
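The main change in this commit is swapping the serial retry loop for a thread pool: each OpenAI request is I/O-bound, so up to 32 can be in flight at once, and the per-question retries now live inside get_answer. Below is a minimal, self-contained sketch of the same fan-out/collect pattern, with a stub standing in for the API call; the stub and sample data are illustrative, not part of the commit.

    import concurrent.futures

    def fake_answer(qid: int, question: str) -> dict:
        # Stand-in for openai.ChatCompletion.create; the real get_answer
        # also retries up to 3 times and falls back to '#ERROR#'.
        return {'id': qid, 'answer': f'echo: {question}', 'model': 'stub'}

    questions = {3: 'third', 1: 'first', 2: 'second'}
    answers = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        futures = [executor.submit(fake_answer, qid, q)
                   for qid, q in questions.items()]
        # as_completed yields futures in completion order, not submit order,
        # which is why the script sorts by id afterwards.
        for future in concurrent.futures.as_completed(futures):
            answers.append(future.result())
    answers.sort(key=lambda x: x['id'])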
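The file format also moves from a single JSON document to JSON Lines on both ends: questions are read one object per line, and answers are written the same way. A hypothetical sketch of that contract follows; the file name, sample questions, and the --question/--output flag spellings are inferred from args.question, args.output, and the parsing loop, since the argparse lines defining them sit in the collapsed part of the diff.

    # questions.jsonl -- one object per line, as the parsing loop expects:
    #   {"id": 1, "question": "What is the capital of France?"}
    # Each output line then has the shape produced by get_answer:
    #   {"id": 1, "answer": "...", "model": "gpt-3.5-turbo"}
    import json

    samples = [(1, 'What is the capital of France?'),
               (2, 'How does a binary search work?')]
    with open('questions.jsonl', 'w') as f:
        for qid, q in samples:
            f.write(json.dumps({'id': qid, 'question': q}) + '\n')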
