Rank: Supervise and Unsupervise
wzzzd committed Jan 17, 2023
1 parent da4fb32 commit 0396105
Showing 6 changed files with 129 additions and 32 deletions.
11 changes: 1 addition & 10 deletions FAQ.py
@@ -87,11 +87,6 @@ def query(self, query, recall_size=1000, prerank_size=100, rank_size=10):
question = line['question']
target = line['answer']
answer = faq.query(question)
# print('- question:{}'.format(question))
# # print('- answer: {}'.format(target))
# for line in answer:
# print(' - question:{} '.format(line['question']))
# # print(' - question:{} - answer:{}'.format(line['question'],line['answer']))
# top1 num
module = answer[0]['answer']
if target==module:
@@ -114,16 +109,12 @@ def query(self, query, recall_size=1000, prerank_size=100, rank_size=10):
if target==module:
recall_num += 1
break
# top1 acc
# acc
top1_acc = top1_num/max(len(data), 1)
# top3 acc
top3_acc = top3_num/max(len(data), 1)
# top10 acc
top10_acc = top10_num/max(len(data), 1)
# recall
recall = recall_num/max(len(data), 1)


print('top 1 acc={}/{}={}'.format(top1_num, max(len(data), 1), top1_acc))
print('top 3 acc={}/{}={}'.format(top3_num, max(len(data), 1), top3_acc))
print('top 10 acc={}/{}={}'.format(top10_num, max(len(data), 1), top10_acc))
10 changes: 8 additions & 2 deletions Module/PreRank.py
@@ -4,6 +4,7 @@
from Module.BM25.BM25 import BM25
from Module.Ngram.Ngram import Ngram
from Module.Word2Vec.Word2Vec import W2V
from Module.Word2Vec.Word2VecTX import W2VTX
# from Module.Word2Vec.train import W2V
from Module.LM.LMEmbedding import LMEmbedding
from Utils.Logger import init_logger
@@ -18,7 +19,7 @@ def __init__(self, config):
self.logger = init_logger() # 'PreRank model'
self.logger.info(' - model: {}'.format(self.model_name))

# Initialization: Word2Vec
# Initialization: Word2Vec (self-trained)
if self.model_name == 'word2vec':
self.model = W2V(config)
## Train a word2vec model if none exists yet
@@ -34,6 +35,11 @@ def __init__(self, config):
self.logger.info(' - model: exist')
self.logger.info(' - model: loading ...')
self.model.load(self.config.path_w2v_model)
# Initialization: Word2Vec (Tencent)
if self.model_name == 'word2vec-tx':
self.model = W2VTX(config)
self.logger.info(' - model: loading ...')
self.model.load(self.config.path_w2v_tx)
# Initialization: BM25
if self.model_name == 'bm25':
# self.model = BM25()
@@ -73,7 +79,7 @@ def rank(self, query, query_token, corpus, size=100):
elif self.model_name == 'ngram':
score = self.model.compute_similarity(query, question)
## Word2Vec
elif self.model_name == 'word2vec':
elif self.model_name in ['word2vec', 'word2vec-tx']:
query_vec = self.model.get_embedding(query_token)
corpus_vec = [self.model.get_embedding(line) for line in question_token]
score = self.model.compute_similarity(query_vec, corpus_vec)
4 changes: 2 additions & 2 deletions Module/RankUnsupervise.py
@@ -51,10 +51,10 @@ def rank(self, query, corpus, size=10):
## Distilbert
if self.model_name in ['simcse-distilbert', 'simcse-bert']:
# Get the embedding output for the query
in_query = self.tokenizer(query, return_tensors="pt")
inputs_token = self.tokenizer(query, return_tensors="pt")
if self.config.use_cuda:
inputs_token = inputs_token.to(self.device)
out_query = self.model(**in_query)
out_query = self.model(**inputs_token)
emb_query = out_query.last_hidden_state[:,0]
emb_query = emb_query.transpose(0,1)
# Get the embedding output for the questions
3 changes: 0 additions & 3 deletions Module/Word2Vec/Word2Vec.py
@@ -5,7 +5,6 @@
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import common_texts

# from gensim.models.word2vec import Word2Vec
import jieba
import numpy as np
@@ -63,8 +62,6 @@ def get_word_embedding(self, word):
Get the embedding vector of a word;
OOV words also need to be handled
"""
# dict_word = list(self.model.wv.key_to_index.keys())
# dict_word = self.model.wv.vocab.keys()
if word in self.dict_word:
# the word is in the vocabulary
vec = self.model.wv[word] # get numpy vector of a word
102 changes: 102 additions & 0 deletions Module/Word2Vec/Word2VecTX.py
@@ -0,0 +1,102 @@


import os
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import common_texts
# from gensim.models.word2vec import Word2Vec
import jieba
import numpy as np




class W2VTX(object):

def __init__(self, config):
self.config = config
path_stopword = self.config.path_stopword
self.min_count = 1
self.window = 5
self.workers = 8
self.size = 100
self.model = Word2Vec(min_count=self.min_count, window=self.window, workers=self.workers, vector_size=self.size)
self.stopword = [ w.strip() for w in open(path_stopword, 'r', encoding='utf8').readlines()]
# self.dict_word = list(self.model.wv.key_to_index.keys())


def tokenizer(self, sentence):
"""
Tokenize a single sentence
"""
words = jieba.cut(sentence, cut_all=False) # precise mode
# words = [ w for w in words if w not in self.stopword]
words = [ w for w in words]
return words


def get_corpus(self, corpus):
"""
Tokenize every sentence in the corpus
"""
print('word2vec: tokenizer...')
sentences = [ self.tokenizer(sen) for sen in corpus]
return sentences


def get_word_embedding(self, word):
"""
Get the embedding vector of a word;
OOV words also need to be handled
"""
try:
vec = self.model.word_vec(word)
except KeyError:
vec = np.zeros(self.size, dtype=np.float32)
num = 0
for w in word:
if w in self.dict_word:
vec += self.model.word_vec(w)
num += 1
vec /= max(1, num)
return vec


def get_embedding(self, sentence):
"""
Get the embedding vector of a sentence
"""
vec = np.zeros(self.size, dtype=np.float32)
for word in sentence:
vec += self.get_word_embedding(word)
vec /= max(1, len(sentence))
return vec


def compute_similarity(self, query, corpus):
"""计算文本与语料库的相似度
Args:
query (np.ndarray): sentence embedding vector of the query
corpus (list): list of sentence embedding vectors for the corpus
"""
score = []
for i,c in enumerate(corpus):
# cos_sim = query.dot(c) / np.linalg.norm(query) * np.linalg.norm(c)
dot = float(np.dot(query, c))
norm = np.linalg.norm(query)*np.linalg.norm(c)
cos_sim = dot / max(norm, 1)
score.append([i, cos_sim])
scores_rank = sorted(score, key=lambda x: x[1], reverse=True)
return scores_rank


def load(self, path):
"""
Load the model / word vectors
"""
self.model = KeyedVectors.load_word2vec_format(path, binary=False)
self.dict_word = list(self.model.key_to_index.keys())
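
A minimal usage sketch of the new `W2VTX` class, assuming a stub config object; the stopword path, embedding file name and example sentences are placeholders, and 100-dimensional vectors are assumed to match `self.size`.

```python
# Hypothetical usage of W2VTX; config fields and file names below are placeholders.
from types import SimpleNamespace
from Module.Word2Vec.Word2VecTX import W2VTX

config = SimpleNamespace(path_stopword='file/stopword.txt')        # assumed stopword path
model = W2VTX(config)
model.load('file/model/tencent_ailab_embedding.txt')               # plain-text word2vec format

corpus = ['如何重置密码', '怎样修改绑定的手机号']                    # example corpus questions
query = '忘记密码怎么办'

# Average word vectors into sentence vectors, then rank the corpus by cosine similarity.
corpus_vecs = [model.get_embedding(model.tokenizer(sen)) for sen in corpus]
query_vec = model.get_embedding(model.tokenizer(query))
print(model.compute_similarity(query_vec, corpus_vecs))            # [[index, score], ...] best first
```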



31 changes: 16 additions & 15 deletions README.md
@@ -1,6 +1,6 @@
# FAQ

FAQ question answering service
FAQ intelligent question answering system

Implements FAQ question-template matching with several different methods.

@@ -38,7 +38,9 @@ FAQ问答服务
- N-gram
- Extracts word fragments with `2-gram`, `3-gram` and `4-gram`, then computes `jaccard` similarity as the ranking score.
- Word2Vec
- A `word2vec` model trained with the `gensim` framework, using this project's QA dataset as the corpus.
- Two ways of obtaining word vectors are provided:
- (1) A `word2vec` model trained with the `gensim` framework, using this project's QA dataset as the corpus.
- (2) The word vectors open-sourced by Tencent AI Lab ([Embedding](https://ai.tencent.com/ailab/nlp/en/embedding.html)). The file needs to be placed under the directory `file/model/`. The project defaults to the vectors marked [Original size: 1.8G; tar.gz size: 763M]; if a different vector file is used, change the `path_w2v_tx` variable in `Config.py`.
- The similarity between word vectors is used as the ranking score.

### 3. Fine ranking
@@ -135,14 +137,14 @@ $ pip install -r requirements.txt
### 2. Metrics
On a test set of 100 samples, the top-1, top-3 and top-10 accuracy of the recalled results is reported.
| Module configuration | Top1 Acc | Top3 Acc | Top10 Acc |
| :-----| :---- | :---- | :---- |
| PreRank(bm25) | 0.77 | 0.86 | 0.92 |
| PreRank(ngram) | 0.52 | 0.69 | 0.84 |
| PreRank(word2vec) | 0.43 | 0.56 | 0.69 |
| PreRank(bm25) + Rank(lm-mini)(Unsup) | 0.45 | 0.54 | 0.64 |
| PreRank(bm25) + Rank(simcse-bert)(Unsup) | 0.43 | 0.51 | 0.61 |
| PreRank(bm25) + Rank(bert)(sup) | 0.99 | 1.0 | 1.0 |
| Pre-rank model | Rank model | Top1 Acc | Top3 Acc | Top10 Acc |
| :-----| :---- | :---- | :---- | :---- |
| bm25 | - | 0.77 | 0.86 | 0.92 |
| ngram | - | 0.52 | 0.69 | 0.84 |
| word2vec(tencent) | - | 0.76 | 0.83 | 0.88 |
| bm25 | lm-mini(Unsup) | 0.45 | 0.54 | 0.64 |
| bm25 | simcse-bert(Unsup) | 0.43 | 0.51 | 0.61 |
| bm25 | bert(sup) | 0.99 | 1.0 | 1.0 |



@@ -151,12 +153,12 @@
The `Config.py` file in the root directory is the configuration file. The following parameters can be adjusted (a hypothetical sketch follows the list below):
- `es_ip`: address of the ES search engine. By default ES is assumed to run in the same environment; if it is deployed on another server, change this to that server's address.
- `es_index`: name of the ES index in which the corpus is stored.
- `model_name`: ranking method, one of `bm25`/`ngram`/`word2vec`/`lm`
- `model_name`: ranking method, one of `bm25/ngram/word2vec/word2vec-tx`
- `dataset`: name of the dataset, i.e. the data folder name under `./data/`.
- `use_rank`: whether to use the fine-ranking module. True enables it, False disables it.
- `use_supervise`: whether the fine-ranking module uses the supervised or the unsupervised method. True means supervised, False means unsupervised.
- `unsup_rank_name`: model type used in unsupervised fine ranking, one of `simcse-distilbert`/`simcse-bert`/`lm-mini`
- `sup_rank_name`: model type used in supervised fine ranking, one of `distilbert`/`bert`
- `unsup_rank_name`: model type used in unsupervised fine ranking, one of `simcse-distilbert/simcse-bert/lm-mini`
- `sup_rank_name`: model type used in supervised fine ranking, one of `distilbert/bert`
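
For orientation, a hypothetical sketch of how these fields might be laid out in `Config.py`; the attribute names follow the list above, but the class structure and all concrete values (ES address, index name, dataset folder, embedding path) are placeholder assumptions rather than the project's defaults.

```python
# Hypothetical Config.py values; every concrete value below is a placeholder.
class Config(object):
    def __init__(self):
        self.es_ip = '127.0.0.1'                # address of the ES search engine
        self.es_index = 'faq_corpus'            # ES index that stores the corpus
        self.dataset = 'my_dataset'             # folder name under ./data/
        self.model_name = 'word2vec-tx'         # pre-rank: bm25 / ngram / word2vec / word2vec-tx
        self.use_rank = True                    # enable the fine-ranking module
        self.use_supervise = False              # False -> unsupervised fine ranking
        self.unsup_rank_name = 'simcse-bert'    # simcse-distilbert / simcse-bert / lm-mini
        self.sup_rank_name = 'bert'             # distilbert / bert
        self.path_w2v_tx = 'file/model/tencent_ailab_embedding.txt'  # Tencent word-vector file
```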


### 2. Project initialization
@@ -256,7 +258,6 @@ $ curl --request POST \
| Update time | Version | Description |
| :-----| :---- | :---- |
| 2023-01-05 | V1.0 | Project initialization: recall + ranking algorithm framework and the web service |
| 2023-01-11 | V1.1 | Algorithm framework changed to: recall + pre-rank + fine rank |
| 2023-01-15 | V1.1 | Added the supervised (supervise) method for fine ranking |
| 2023-01-17 | V1.1 | Algorithm framework changed to: recall + pre-rank + fine rank; added the supervised (supervise) method for fine ranking; introduced external word vectors into the pre-rank methods |
| ... | ... | ... |
