forked from MachineLP/TextMatch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbm25.py
70 lines (60 loc) · 2.41 KB
/
bm25.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding:utf-8 -*-
'''
-------------------------------------------------
Description : bm25
Author : machinelp
Date : 2020-06-03
-------------------------------------------------
'''
import os
import jieba
import pickle
import logging
import numpy as np
from rank_bm25 import BM25Okapi
from textmatch.models.text_embedding.stop_words import StopWords
from textmatch.config.config import Config as conf
from textmatch.config.constant import Constant as const
from textmatch.models.model_base.model_base import ModelBase
class BM25(ModelBase):
    '''BM25 text matcher backed by ``rank_bm25.BM25Okapi``.

    Call ``init`` with the candidate texts to build the index, then call
    ``predict`` with a query text to score every indexed candidate.
    '''

    def __init__(self, stop_word=StopWords):
        '''Create the matcher.

        Args:
            stop_word: zero-argument callable (a class by default) that
                returns the stop-word filter. The returned object must
                provide ``del_stopwords(tokens)``.
        '''
        self.stop_word = stop_word()

    def init(self, words_list=None, update=True):
        '''Build the BM25 index from the candidate texts.

        Args:
            words_list: iterable of candidate texts. Entries that are empty
                or not ``str`` are skipped, so the scores returned by
                ``predict`` line up with the remaining texts only.
            update: not used by this model; kept so the signature stays
                compatible with existing callers.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: if there is no usable text to index.
        '''
        if words_list is None:
            words_list = []
        corpus_tokens = self._tokenize(words_list)
        if not corpus_tokens:
            # Fail early with a clear message instead of letting the index
            # construction break on an empty corpus.
            raise ValueError(
                'BM25.init requires at least one non-empty str in words_list')
        # BM25Okapi iterates every document as a sequence of tokens, so it
        # must be given token lists. A space-joined string would be indexed
        # character by character, separator spaces included.
        self.bm25 = BM25Okapi(corpus_tokens)
        return self

    def _tokenize(self, words_list, jieba_flag=conf.JIEBA_FLAG,
                  del_stopword=conf.DEL_STOPWORD):
        '''Split every usable text into a list of tokens.

        Args:
            words_list: iterable of texts. Entries that are empty or not
                ``str`` are skipped.
            jieba_flag: if true, segment with ``jieba.cut``; otherwise every
                character of the text is one token.
            del_stopword: if true, pass the tokens through
                ``self.stop_word.del_stopwords``.

        Returns:
            A list of token lists, one per kept text, in input order.
        '''
        token_lists = []
        for text in words_list:
            if not isinstance(text, str) or text == '':
                continue
            tokens = jieba.cut(text) if jieba_flag else text
            if del_stopword:
                tokens = self.stop_word.del_stopwords(tokens)
            token_lists.append(list(tokens))
        return token_lists

    def _seg_word(self, words_list, jieba_flag=conf.JIEBA_FLAG,
                  del_stopword=conf.DEL_STOPWORD):
        '''Return every usable text as one string of space-separated tokens.

        Takes the same arguments and applies the same filtering as
        ``_tokenize``; only the output format differs.

        Returns:
            A list of strings, one per kept text, in input order.
        '''
        return [' '.join(tokens)
                for tokens in self._tokenize(words_list, jieba_flag, del_stopword)]

    def predict(self, words):
        '''Score a query text against every indexed text.

        Args:
            words: the query text. An empty or non-``str`` query is scored
                as a query without any tokens.

        Returns:
            The result of ``BM25Okapi.get_scores`` for the query tokens.
        '''
        query_tokens = self._tokenize([words])
        # _tokenize drops an empty or non-str query; fall back to an empty
        # token list instead of failing with IndexError on [0].
        tokens = query_tokens[0] if query_tokens else []
        return self.bm25.get_scores(tokens)