Create word2vec.py

xutianhan · Aug 26, 2023 · b114317 · b114317
1 parent 2d59da4
commit b114317
Showing 1 changed file with 32 additions and 0 deletions.
diff --git a/pre-training/chatdoctor5k/word2vec.py b/pre-training/chatdoctor5k/word2vec.py
@@ -0,0 +1,32 @@
+import os
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from gensim.models import Word2Vec
+import json
+import re
+
+
+# Train Word2Vec on the chatdoctor5k documents plus the extracted question spans.
+re3 = r"<CLS>(.*?)<SEP>"
+docs_dir = './data/chatdoctor5k/document'
+
+docs = []
+for file in sorted(os.listdir(docs_dir)):  # sorted: listdir order is arbitrary
+    path = os.path.join(docs_dir, file)
+    if os.path.isfile(path):  # open() would raise on a sub-directory
+        with open(path, 'r', encoding='utf-8') as f:
+            docs.append(f.read())
+questions = []
+# One JSON object per line; decode as UTF-8 explicitly, like the documents.
+with open("./data/chatdoctor5k/NER_chatgpt.json", "r", encoding="utf-8") as f:
+    for line in filter(str.strip, f):  # json.loads raises on blank lines
+        text = json.loads(line)["qustion_output"].replace("\n", "")
+        text = text.replace("<OOS>", "<EOS>").replace(":", "") + "<END>"
+        match = re.search(re3, text)  # first <CLS>...<SEP> span, if any
+        if match:
+            questions.append(match.group(1))
+
+# Whitespace tokenisation; min_count=1 keeps every token in the vocabulary.
+sentences = [doc.split() for doc in docs + questions]
+model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
+model.save("./data/chatdoctor5k/word2vec.model")