Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
yuyou-dev authored Sep 13, 2023
1 parent 44b3069 commit 3b89d8c
Showing 1 changed file with 18 additions and 26 deletions.
44 changes: 18 additions & 26 deletions scripts/preprocess_data.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,23 @@
# -*- coding: utf-8 -*-
import json
import random

def transform_jsonl(input_file_path, output_file_path):
entries = []
with open(input_file_path, 'r') as file:
for line in file:
entry = json.loads(line)
entries.append(entry)
# Load every record from the raw dataset: one JSON object per line (JSONL).
# The file is opened explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('../data/raw/test_datasets.jsonl', 'r', encoding='utf-8') as file:
    # Read the whole file. The previous version had a stray `break` that
    # stopped after the first record, which made the later 100-item
    # random.sample() call fail with "Sample larger than population".
    # Blank lines (e.g. a trailing newline at EOF) are skipped because
    # json.loads() would reject them.
    raw_data = [json.loads(line) for line in file if line.strip()]

# 随机抽取100个条目
sampled_entries = random.sample(entries, 100)
# Randomly draw 100 records to use for a training trial run (the original note
# says the dataset holds 6000 records). random.sample() picks without
# replacement and raises ValueError if raw_data has fewer than 100 items.
raw_messages = random.sample(raw_data,100)

with open(output_file_path, 'w') as outfile:
for entry in sampled_entries:
messages = []
messages.append({"role": "system", "content": "你是一个医疗助手"})
user_message = {"role": "user", "content": entry["questions"]}
assistant_message = {"role": "assistant", "content": entry["answers"]}
messages.extend([user_message, assistant_message])
result = {"messages": messages}
json.dump(result, outfile, ensure_ascii=False)
outfile.write('\n')

# Path to the prepared input file (a .jsonl file, per its extension)
input_file_path = "../data/raw/test_datasets.jsonl"
# Path of the JSONL file to write
output_file_path = "../data/processed/chat_data.jsonl"

# Run the conversion with the two paths above
transform_jsonl(input_file_path, output_file_path)
# Write the sampled records as chat-format JSONL: one JSON object per line,
# each shaped as {"messages": [system, user, assistant]}.
# The file is written as UTF-8 because ensure_ascii=False emits the non-ASCII
# text verbatim rather than as \uXXXX escapes.
with open('../data/processed/chat_data.jsonl', 'w', encoding='utf-8') as data_file:
    for item in raw_messages:
        # One conversation per record: a fixed system prompt, the record's
        # question as the user turn, and its answer as the assistant turn.
        data = {
            "messages": [
                {"role": "system", "content": "你是一个医疗助手"},
                {"role": "user", "content": item["questions"]},
                {"role": "assistant", "content": item["answers"]},
            ]
        }
        # Dump `data` directly. The previous version wrapped it a second time
        # ({"messages": data}), producing {"messages": {"messages": [...]}}.
        json.dump(data, data_file, ensure_ascii=False)
        data_file.write('\n')

0 comments on commit 3b89d8c

Please sign in to comment.