Skip to content

Commit

Permalink
Create word2vec.py
Browse files Browse the repository at this point in the history
  • Loading branch information
wyl-willing authored Aug 26, 2023
1 parent 2d59da4 commit b114317
Showing 1 changed file with 32 additions and 0 deletions.
32 changes: 32 additions & 0 deletions pre-training/chatdoctor5k/word2vec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import json
import re


# Non-greedy pattern capturing the text between a <CLS> marker and the next <SEP>.
re3 = r"<CLS>(.*?)<SEP>"
# Directory whose files are read as the raw documents for the corpus.
docs_dir = './data/chatdoctor5k/document'

# Read every file in docs_dir in full; each file contributes one text.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# corpus order reproducible, and skip anything that is not a regular file
# (open() would raise on a sub-directory).
docs = []
for file in sorted(os.listdir(docs_dir)):
    path = os.path.join(docs_dir, file)
    if not os.path.isfile(path):
        continue
    with open(path, 'r', encoding='utf-8') as f:
        doc = f.read()
    docs.append(doc)
# Collect one question string per record of the NER output file.
# The file is parsed line by line, one JSON object per line, and the text is
# taken from the "qustion_output" key (spelling as it appears in the data).
questions = []
# Explicit utf-8 so decoding does not depend on the platform default and
# matches how the document files are read.
with open("./data/chatdoctor5k/NER_chatgpt.json", "r", encoding="utf-8") as f:
    for line in f:
        # A blank line is not a JSON document; skip it instead of crashing.
        if not line.strip():
            continue
        record = json.loads(line)
        # Named `text`, not `input`, so the builtin input() is not shadowed.
        text = record["qustion_output"]
        # Newlines are removed because "." in the pattern does not match them.
        text = text.replace("\n", "")
        text = text.replace("<OOS>", "<EOS>")
        text = text.replace(":", "") + "<END>"
        # Keep only the first <CLS>...<SEP> span; records without one are dropped.
        match = re.search(re3, text)
        if match is None:
            continue
        questions.append(match.group(1))

# Whitespace-tokenise every text (documents first, then questions); each
# token list is one training sentence.
sentences = []
for text in docs + questions:
    sentences.append(text.split())

# Train the embedding model on the combined corpus and persist it to disk.
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save("./data/chatdoctor5k/word2vec.model")

0 comments on commit b114317

Please sign in to comment.