Skip to content

Commit

Permalink
update sim result.
Browse files: browse the repository at this point in the history
  • Loading branch information
shibing624 committed Dec 27, 2019
1 parent e99dc1d commit 0c90c9e
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 13 deletions.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,16 +230,18 @@ b = '花呗更改绑定银行卡'
c = '我什么时候开通了花呗'
corpus = [a, b, c]
print(corpus)
search_sim = SearchSimilarity(corpus=corpus)
print(search_sim.get_scores(query=a))
print(search_sim.get_similarities(query=a))
print(a, 'scores:', search_sim.get_scores(query=a))
print(a, 'rank similarities:', search_sim.get_similarities(query=a))
```

output:
```
[ 0.9527457 -0.07449248 -0.03204909]
[['如何', '更换', '花', '呗', '绑定', '银行卡'], ['我', '什么', '时候', '开通', '了', '花', '呗'], ['花', '呗', '更改', '绑定', '银行卡']]
['如何更换花呗绑定银行卡', '花呗更改绑定银行卡', '我什么时候开通了花呗']
如何更换花呗绑定银行卡 scores: [ 0.9527457 -0.07449248 -0.03204909]
如何更换花呗绑定银行卡 rank similarities: ['如何更换花呗绑定银行卡', '我什么时候开通了花呗', '花呗更改绑定银行卡']
```


Expand Down
5 changes: 3 additions & 2 deletions examples/similarity_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
from text2vec import SearchSimilarity

corpus = [a, b, c]
print(corpus)
search_sim = SearchSimilarity(corpus=corpus)

print(search_sim.get_scores(query=a))
print(search_sim.get_similarities(query=a))
print(a, 'scores:', search_sim.get_scores(query=a))
print(a, 'rank similarities:', search_sim.get_similarities(query=a))
18 changes: 11 additions & 7 deletions text2vec/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
@description:
"""

import numpy as np

from text2vec.algorithm.distance import cosine_distance
from text2vec.algorithm.rank_bm25 import BM25Okapi
from text2vec.utils.logger import get_logger
Expand Down Expand Up @@ -87,24 +89,26 @@ def init(self):
if isinstance(self.corpus, str):
self.corpus = [self.corpus]

self.corpus_seg = [self.tokenizer.tokenize(i) for i in self.corpus]
self.bm25_instance = BM25Okapi(corpus=self.corpus_seg)
self.corpus_seg = {k: self.tokenizer.tokenize(k) for k in self.corpus}
self.bm25_instance = BM25Okapi(corpus=list(self.corpus_seg.values()))

def get_similarities(self, query, n=5):
"""
Get similarity between `query` and this docs.
:param query: str
:param n: int, num_best
:return: float scores
:return: result, dict, float scores, docs rank
"""
self.init()
tokens = self.tokenizer.tokenize(query)
return self.bm25_instance.get_top_n(tokens, self.corpus_seg, n=n)
scores = self.get_scores(query)
rank_n = np.argsort(scores)[::-1]
if n > 0:
rank_n = rank_n[:n]
return [self.corpus[i] for i in rank_n]

def get_scores(self, query):
"""
Get scores between query and docs
:param query:
:param query: input str
:return: numpy array, scores for query between docs
"""
self.init()
Expand Down

0 comments on commit 0c90c9e

Please sign in to comment.