Rank: Supervise and Unsupervise
wzzzd committed Jan 17, 2023
1 parent da4fb32 commit 0396105
Showing 6 changed files with 129 additions and 32 deletions.
11 changes: 1 addition & 10 deletions FAQ.py
@@ -87,11 +87,6 @@ def query(self, query, recall_size=1000, prerank_size=100, rank_size=10):
question = line['question']
target = line['answer']
answer = faq.query(question)
# print('- question:{}'.format(question))
# # print('- answer: {}'.format(target))
# for line in answer:
# print(' - question:{} '.format(line['question']))
# # print(' - question:{} - answer:{}'.format(line['question'],line['answer']))
# top1 num
module = answer[0]['answer']
if target==module:
@@ -114,16 +109,12 @@ def query(self, query, recall_size=1000, prerank_size=100, rank_size=10):
if target==module:
recall_num += 1
break
# top1 acc
# acc
top1_acc = top1_num/max(len(data), 1)
# top3 acc
top3_acc = top3_num/max(len(data), 1)
# top10 acc
top10_acc = top10_num/max(len(data), 1)
# recall
recall = recall_num/max(len(data), 1)


print('top 1 acc={}/{}={}'.format(top1_num, max(len(data), 1), top1_acc))
print('top 3 acc={}/{}={}'.format(top3_num, max(len(data), 1), top3_acc))
print('top 10 acc={}/{}={}'.format(top10_num, max(len(data), 1), top10_acc))
10 changes: 8 additions & 2 deletions Module/PreRank.py
@@ -4,6 +4,7 @@
from Module.BM25.BM25 import BM25
from Module.Ngram.Ngram import Ngram
from Module.Word2Vec.Word2Vec import W2V
from Module.Word2Vec.Word2VecTX import W2VTX
# from Module.Word2Vec.train import W2V
from Module.LM.LMEmbedding import LMEmbedding
from Utils.Logger import init_logger
@@ -18,7 +19,7 @@ def __init__(self, config):
self.logger = init_logger() # 'PreRank model'
self.logger.info(' - model: {}'.format(self.model_name))

# Initialization: Word2Vec
# Initialization: Word2Vec (self-trained)
if self.model_name == 'word2vec':
self.model = W2V(config)
## Train a word2vec model if none exists yet
@@ -34,6 +35,11 @@ def __init__(self, config):
self.logger.info(' - model: exist')
self.logger.info(' - model: loading ...')
self.model.load(self.config.path_w2v_model)
# Initialization: Word2Vec (Tencent)
if self.model_name == 'word2vec-tx':
self.model = W2VTX(config)
self.logger.info(' - model: loading ...')
self.model.load(self.config.path_w2v_tx)
# Initialization: BM25
if self.model_name == 'bm25':
# self.model = BM25()
@@ -73,7 +79,7 @@ def rank(self, query, query_token, corpus, size=100):
elif self.model_name == 'ngram':
score = self.model.compute_similarity(query, question)
## Word2Vec
elif self.model_name == 'word2vec':
elif self.model_name in ['word2vec', 'word2vec-tx']:
query_vec = self.model.get_embedding(query_token)
corpus_vec = [self.model.get_embedding(line) for line in question_token]
score = self.model.compute_similarity(query_vec, corpus_vec)
4 changes: 2 additions & 2 deletions Module/RankUnsupervise.py
@@ -51,10 +51,10 @@ def rank(self, query, corpus, size=10):
## Distilbert
if self.model_name in ['simcse-distilbert', 'simcse-bert']:
# Get the embedding output for the query
in_query = self.tokenizer(query, return_tensors="pt")
inputs_token = self.tokenizer(query, return_tensors="pt")
if self.config.use_cuda:
inputs_token = inputs_token.to(self.device)
out_query = self.model(**in_query)
out_query = self.model(**inputs_token)
emb_query = out_query.last_hidden_state[:,0]
emb_query = emb_query.transpose(0,1)
# Get the embedding output for the questions
3 changes: 0 additions & 3 deletions Module/Word2Vec/Word2Vec.py
@@ -5,7 +5,6 @@
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import common_texts

# from gensim.models.word2vec import Word2Vec
import jieba
import numpy as np
@@ -63,8 +62,6 @@ def get_word_embedding(self, word):
Get the embedding vector of a word;
OOV words also need to be handled
"""
# dict_word = list(self.model.wv.key_to_index.keys())
# dict_word = self.model.wv.vocab.keys()
if word in self.dict_word:
# the word is in the vocabulary
vec = self.model.wv[word] # get numpy vector of a word
102 changes: 102 additions & 0 deletions Module/Word2Vec/Word2VecTX.py
@@ -0,0 +1,102 @@


import os
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import common_texts
# from gensim.models.word2vec import Word2Vec
import jieba
import numpy as np




class W2VTX(object):

def __init__(self, config):
self.config = config
path_stopword = self.config.path_stopword
self.min_count = 1
self.window = 5
self.workers = 8
self.size = 100
self.model = Word2Vec(min_count=self.min_count, window=self.window, workers=self.workers, vector_size=self.size)
self.stopword = [ w.strip() for w in open(path_stopword, 'r', encoding='utf8').readlines()]
# self.dict_word = list(self.model.wv.key_to_index.keys())


def tokenizer(self, sentence):
"""
Tokenize a single sentence
"""
words = jieba.cut(sentence, cut_all=False) # precise mode
# words = [ w for w in words if w not in self.stopword]
words = [ w for w in words]
return words


def get_corpus(self, corpus):
"""
Tokenize every sentence in the corpus
"""
print('word2vec: tokenizer...')
sentences = [ self.tokenizer(sen) for sen in corpus]
return sentences


def get_word_embedding(self, word):
"""
Get the embedding vector of a word;
OOV words also need to be handled
"""
try:
vec = self.model.word_vec(word)
except KeyError:
vec = np.zeros(self.size, dtype=np.float32)
num = 0
for w in word:
if w in self.dict_word:
vec += self.model.word_vec(w)
num += 1
vec /= max(1, num)
return vec


def get_embedding(self, sentence):
"""
Get the embedding vector of a sentence
"""
vec = np.zeros(self.size, dtype=np.float32)
for word in sentence:
vec += self.get_word_embedding(word)
vec /= max(1, len(sentence))
return vec


def compute_similarity(self, query, corpus):
"""计算文本与语料库的相似度
Args:
query (np.ndarray): sentence embedding vector of the query
corpus (list): list of sentence embedding vectors for the corpus
"""
score = []
for i,c in enumerate(corpus):
# cos_sim = query.dot(c) / np.linalg.norm(query) * np.linalg.norm(c)
dot = float(np.dot(query, c))
norm = np.linalg.norm(query)*np.linalg.norm(c)
cos_sim = dot / max(norm, 1)
score.append([i, cos_sim])
scores_rank = sorted(score, key=lambda x: x[1], reverse=True)
return scores_rank


def load(self, path):
"""
Load the model / word vectors
"""
self.model = KeyedVectors.load_word2vec_format(path, binary=False)
self.dict_word = list(self.model.key_to_index.keys())
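
A minimal usage sketch of the new `W2VTX` class, assuming a stub config object; the stopword path, embedding file name and example sentences are placeholders, and 100-dimensional vectors are assumed to match `self.size`.

```python
# Hypothetical usage of W2VTX; config fields and file names below are placeholders.
from types import SimpleNamespace
from Module.Word2Vec.Word2VecTX import W2VTX

config = SimpleNamespace(path_stopword='file/stopword.txt')        # assumed stopword path
model = W2VTX(config)
model.load('file/model/tencent_ailab_embedding.txt')               # plain-text word2vec format

corpus = ['如何重置密码', '怎样修改绑定的手机号']                    # example corpus questions
query = '忘记密码怎么办'

# Average word vectors into sentence vectors, then rank the corpus by cosine similarity.
corpus_vecs = [model.get_embedding(model.tokenizer(sen)) for sen in corpus]
query_vec = model.get_embedding(model.tokenizer(query))
print(model.compute_similarity(query_vec, corpus_vecs))            # [[index, score], ...] best first
```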



31 changes: 16 additions & 15 deletions README.md
@@ -1,6 +1,6 @@
# FAQ

FAQ question answering service
FAQ intelligent question answering system

Implements FAQ question-template matching with several different methods.

@@ -38,7 +38,9 @@ FAQ问答服务
- N-gram
- Extracts word fragments with `2-gram`, `3-gram` and `4-gram`, then computes `jaccard` similarity as the ranking score.
- Word2Vec
- A `word2vec` model trained with the `gensim` framework, using this project's QA dataset as the corpus.
- Two ways of obtaining word vectors are provided:
- (1) A `word2vec` model trained with the `gensim` framework, using this project's QA dataset as the corpus.
- (2) The word vectors open-sourced by Tencent AI Lab ([Embedding](https://ai.tencent.com/ailab/nlp/en/embedding.html)). The file needs to be placed under the directory `file/model/`. The project defaults to the vectors marked [Original size: 1.8G; tar.gz size: 763M]; if a different vector file is used, change the `path_w2v_tx` variable in `Config.py`.
- The similarity between word vectors is used as the ranking score.

### 3. Fine ranking
@@ -135,14 +137,14 @@ $ pip install -r requirements.txt
### 2. Metrics
On a test set of 100 samples, the top-1, top-3 and top-10 accuracy of the recalled results is reported.
| Module configuration | Top1 Acc | Top3 Acc | Top10 Acc |
| :-----| :---- | :---- | :---- |
| PreRank(bm25) | 0.77 | 0.86 | 0.92 |
| PreRank(ngram) | 0.52 | 0.69 | 0.84 |
| PreRank(word2vec) | 0.43 | 0.56 | 0.69 |
| PreRank(bm25) + Rank(lm-mini)(Unsup) | 0.45 | 0.54 | 0.64 |
| PreRank(bm25) + Rank(simcse-bert)(Unsup) | 0.43 | 0.51 | 0.61 |
| PreRank(bm25) + Rank(bert)(sup) | 0.99 | 1.0 | 1.0 |
| Pre-rank model | Rank model | Top1 Acc | Top3 Acc | Top10 Acc |
| :-----| :---- | :---- | :---- | :---- |
| bm25 | - | 0.77 | 0.86 | 0.92 |
| ngram | - | 0.52 | 0.69 | 0.84 |
| word2vec(tencent) | - | 0.76 | 0.83 | 0.88 |
| bm25 | lm-mini(Unsup) | 0.45 | 0.54 | 0.64 |
| bm25 | simcse-bert(Unsup) | 0.43 | 0.51 | 0.61 |
| bm25 | bert(sup) | 0.99 | 1.0 | 1.0 |



@@ -151,12 +153,12 @@
The `Config.py` file in the root directory is the configuration file. The following parameters can be adjusted (a hypothetical sketch follows the list below):
- `es_ip`: address of the ES search engine. By default ES is assumed to run in the same environment; if it is deployed on another server, change this to that server's address.
- `es_index`: name of the ES index in which the corpus is stored.
- `model_name`: ranking method, one of `bm25`/`ngram`/`word2vec`/`lm`
- `model_name`: ranking method, one of `bm25/ngram/word2vec/word2vec-tx`
- `dataset`: name of the dataset, i.e. the data folder name under `./data/`.
- `use_rank`: whether to use the fine-ranking module. True enables it, False disables it.
- `use_supervise`: whether the fine-ranking module uses the supervised or the unsupervised method. True means supervised, False means unsupervised.
- `unsup_rank_name`: model type used in unsupervised fine ranking, one of `simcse-distilbert`/`simcse-bert`/`lm-mini`
- `sup_rank_name`: model type used in supervised fine ranking, one of `distilbert`/`bert`
- `unsup_rank_name`: model type used in unsupervised fine ranking, one of `simcse-distilbert/simcse-bert/lm-mini`
- `sup_rank_name`: model type used in supervised fine ranking, one of `distilbert/bert`
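
For orientation, a hypothetical sketch of how these fields might be laid out in `Config.py`; the attribute names follow the list above, but the class structure and all concrete values (ES address, index name, dataset folder, embedding path) are placeholder assumptions rather than the project's defaults.

```python
# Hypothetical Config.py values; every concrete value below is a placeholder.
class Config(object):
    def __init__(self):
        self.es_ip = '127.0.0.1'                # address of the ES search engine
        self.es_index = 'faq_corpus'            # ES index that stores the corpus
        self.dataset = 'my_dataset'             # folder name under ./data/
        self.model_name = 'word2vec-tx'         # pre-rank: bm25 / ngram / word2vec / word2vec-tx
        self.use_rank = True                    # enable the fine-ranking module
        self.use_supervise = False              # False -> unsupervised fine ranking
        self.unsup_rank_name = 'simcse-bert'    # simcse-distilbert / simcse-bert / lm-mini
        self.sup_rank_name = 'bert'             # distilbert / bert
        self.path_w2v_tx = 'file/model/tencent_ailab_embedding.txt'  # Tencent word-vector file
```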


### 2. Project initialization
@@ -256,7 +258,6 @@ $ curl --request POST \
| Update time | Version | Description |
| :-----| :---- | :---- |
| 2023-01-05 | V1.0 | Project initialization: recall + ranking algorithm framework and the web service |
| 2023-01-11 | V1.1 | Algorithm framework changed to: recall + pre-rank + fine rank |
| 2023-01-15 | V1.1 | Added the supervised (supervise) method for fine ranking |
| 2023-01-17 | V1.1 | Algorithm framework changed to: recall + pre-rank + fine rank; added the supervised (supervise) method for fine ranking; introduced external word vectors into the pre-rank methods |
| ... | ... | ... |
