fastbm25.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
"""This module contains function of computing rank scores for documents in
corpus and helper class `BM25` used in calculations. Original algorithm
descibed in [1]_, also you may check Wikipedia page [2]_.
.. [1] Robertson, Stephen; Zaragoza, Hugo (2009). The Probabilistic Relevance Framework: BM25 and Beyond,
http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf
.. [2] Okapi BM25 on Wikipedia, https://en.wikipedia.org/wiki/Okapi_BM25
Data:
-----
.. data:: PARAM_K1 - Free smoothing parameter for BM25.
.. data:: PARAM_B - Free smoothing parameter for BM25.
.. data:: EPSILON - Constant used for negative idf of document in corpus.
"""
import heapq
import math
from collections.abc import Iterable

PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25
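
# For reference, the scoring function implemented below is the standard Okapi
# BM25 form from [1]_:
#
#     score(q, d) = sum over terms t in q of
#         idf(t) * f(t, d) * (k1 + 1)
#         / (f(t, d) + k1 * (1 - b + b * |d| / avgdl))
#
# where f(t, d) is the frequency of term t in document d, |d| is the length of
# d, and avgdl is the average document length over the corpus.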

class fastbm25(object):
    """Implementation of the Best Matching 25 ranking function.

    Attributes
    ----------
    corpus_size : int
        Size of the corpus (number of documents).
    avgdl : float
        Average length of a document in `corpus`.
    doc_freqs : list of dicts of int
        Term frequencies for each document in `corpus`. Words are used as
        keys and frequencies as values.
    idf : dict
        Inverse document frequencies for the whole `corpus`. Words are used
        as keys and idf values as values.
    doc_len : dict of int
        Mapping from document index to document length.
    """

    def __init__(self, corpus):
        """
        Parameters
        ----------
        corpus : list of list of str
            Given corpus.
        """
        self.corpus_size = len(corpus)
        self.avgdl = 0
        self.doc_freqs = []
        self.idf = {}
        self.doc_len = {}
        self._initialize(corpus)
        self.corpus = corpus
        self.get_score_by_reversed_index_all_documents(corpus)

    def _initialize(self, corpus):
        """Calculates frequencies of terms in documents and in the corpus.
        Also computes inverse document frequencies."""
        nd = {}  # word -> number of documents containing the word
        num_doc = 0
        for j, document in enumerate(corpus):
            self.doc_len[j] = len(document)
            num_doc += len(document)

            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.doc_freqs.append(frequencies)

            for word in frequencies:
                if word not in nd:
                    nd[word] = 0
                nd[word] += 1

        self.avgdl = float(num_doc) / self.corpus_size
        # collect idf sum to calculate an average idf for the epsilon value
        idf_sum = 0
        # collect words with negative idf to set them to a special epsilon value;
        # idf can be negative if a word is contained in more than half of the documents
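        # (e.g. in a corpus of 4 documents, a word found in 3 of them gets
        # idf = log(4 - 3 + 0.5) - log(3 + 0.5) ~ -0.85, which is replaced
        # by eps below)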
        negative_idfs = []
        self.nd = nd
        for word, freq in nd.items():
            idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)
        self.average_idf = float(idf_sum) / len(self.idf)

        eps = EPSILON * self.average_idf
        for word in negative_idfs:
            self.idf[word] = eps

    def similarity_bm25(self, document_a, document_b):
        """Computes the BM25 score of `document_a` (treated as the query) in
        relation to `document_b` (the document being scored).

        Parameters
        ----------
        document_a : list of str
            Query document.
        document_b : list of str
            Document to be scored.

        Returns
        -------
        float
            BM25 score.
        """
        assert isinstance(document_a, Iterable), 'document_a is not iterable'
        assert isinstance(document_b, Iterable), 'document_b is not iterable'
        score = 0
        doc_freqs = {}
        for word in document_b:
            if word not in doc_freqs:
                doc_freqs[word] = 0
            doc_freqs[word] += 1
        # fallback idf for words not seen in the corpus, computed as if the
        # word appeared in exactly one document
        freq = 1
        default_idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
        for word in document_a:
            if word not in doc_freqs:
                continue
            score += (self.idf.get(word, default_idf) * doc_freqs[word] * (PARAM_K1 + 1)
                      / (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * len(document_b) / self.avgdl)))
        return score

    def get_score_by_reversed_index_all_documents(self, corpus):
        """
        Build an inverted index over the documents, shaped like
        {word: {document index: score}}.
        """
        document_score = {}
        for index, document in enumerate(corpus):
            doc_freqs = self.doc_freqs[index]
            for word in document:
                if word not in doc_freqs:
                    continue
                score = (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1)
                         / (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
                if word not in document_score:
                    document_score[word] = {index: round(score, 2)}
                else:
                    document_score[word][index] = round(score, 2)
        self.document_score = document_score
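        # For a hypothetical two-document corpus [["a", "b"], ["b", "c"]],
        # document_score ends up shaped like
        #     {"a": {0: s0}, "b": {0: s1, 1: s2}, "c": {1: s3}}
        # where each s_i is that word's rounded BM25 weight in that document.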

    def top_k_sentence(self, document, k=1):
        """Retrieves the top-k documents in the corpus for the given query.

        Parameters
        ----------
        document : Iterable
            Query document to retrieve matches for.

        Returns
        -------
        list
            List of tuples (nearest sentence, index, score).
        """
        assert isinstance(document, Iterable), 'document is not iterable'
        score_overall = {}
        for word in document:
            if word not in self.document_score:
                continue
            for key, value in self.document_score[word].items():
                if key not in score_overall:
                    score_overall[key] = value
                else:
                    score_overall[key] += value
        k_keys_sorted = heapq.nlargest(k, score_overall, key=score_overall.__getitem__)
        return [(self.corpus[item], item, score_overall.get(item, None)) for item in k_keys_sorted]
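
# A minimal usage sketch with a hypothetical toy corpus; callers are expected
# to tokenize their own text before building the index.
if __name__ == "__main__":
    corpus = [
        ["hello", "world"],
        ["machine", "learning", "is", "fun"],
        ["hello", "machine", "learning"],
    ]
    model = fastbm25(corpus)
    query = ["hello", "machine"]
    # Retrieve the two highest-scoring documents for the query.
    print(model.top_k_sentence(query, k=2))
    # Pairwise BM25 score of the query against a single document.
    print(model.similarity_bm25(query, corpus[2]))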