forked from manticoresoftware/manticoresearch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathglobal_idf.cpp
339 lines (275 loc) · 8.18 KB
/
global_idf.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
//
// Copyright (c) 2017-2023, Manticore Software LTD (https://manticoresearch.com)
// Copyright (c) 2001-2016, Andrew Aksyonoff
// Copyright (c) 2008-2016, Sphinx Technologies Inc
// All rights reserved
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License. You should have
// received a copy of the GPL license along with this program; if you
// did not, you can find it at http://www.gnu.org/
//
#include "global_idf.h"
#include "sphinxint.h"
#include "fileutils.h"
#include <math.h>
// One record of the global IDF table: a word id paired with its document count.
// Preread() reads the file payload directly into an array of these records, so the
// in-memory layout is the file layout; 4-byte packing keeps the record at 12 bytes
// (enforced by the size assert below) instead of letting the compiler pad it.
#pragma pack(push, 4)
struct IDFWord_t
{
// word id; GetDocs() matches it against sphFNV64() of the word text
uint64_t m_uWordID;
// document count stored for this word (the value GetDocs() returns)
DWORD m_iDocs;
};
#pragma pack(pop)
// guards the record size that the loader's size arithmetic depends on
STATIC_SIZE_ASSERT ( IDFWord_t, 12 );
/// width, in bits, of the bucket index of the word lookup table (1<<HASH_BITS buckets)
static constexpr int HASH_BITS = 16;
using namespace sph;
/// global IDF
/// Holds the contents of one global IDF file in memory: the total document count,
/// the table of per-word document counts, and an optional bucket lookup table that
/// narrows the binary search when the word table is large.
class CSphGlobalIDF final : public IDFer_c
{
protected:
// destructor is not publicly accessible
// NOTE(review): presumably instances are released through the ref-counting base — confirm
~CSphGlobalIDF() final = default;
public:
// re-stats the file and reports whether its mtime differs from the cached one
bool Touch ( const CSphString& sFilename );
// loads the file into memory and builds the lookup table; false on failure
bool Preread ( const CSphString& sFilename, CSphString& sError );
// IDF value for a word, using the larger of the local and the stored document count
float GetIDF ( const CSphString& sWord, int64_t iDocsLocal, bool bPlainIDF ) const final;
private:
// stored document count for a word, 0 when the word is not in the table
DWORD GetDocs ( const CSphString& sWord ) const;
// total document count read from the head of the file
int64_t m_iTotalDocuments = 0;
// number of IDFWord_t records in m_pWords
int64_t m_iTotalWords = 0;
// file modification time cached by Touch()
SphOffset_t m_uMTime = 0;
// word records as read from the file
CSphLargeBuffer<IDFWord_t> m_pWords;
// lookup table: slot 0 holds the shift, slots 1.. hold bucket start indexes into m_pWords;
// left unallocated for small word tables
CSphLargeBuffer<int64_t> m_pHash;
};
// ref-counted handle to a loaded global IDF
using CSphGlobalIDFRefPtr_c = CSphRefcountedPtr<CSphGlobalIDF>;
// check if backend file was modified
bool CSphGlobalIDF::Touch ( const CSphString& sFilename )
{
// update m_uMTime, return true if modified
struct_stat tStat = { 0 };
if ( stat ( sFilename.cstr (), &tStat )<0 )
tStat.st_mtime = 0;
bool bModified = ( m_uMTime!=tStat.st_mtime );
m_uMTime = tStat.st_mtime;
return bModified;
}
bool CSphGlobalIDF::Preread ( const CSphString& sFilename, CSphString& sError )
{
Touch ( sFilename );
CSphAutofile tFile;
if ( tFile.Open ( sFilename, SPH_O_READ, sError )<0 )
return false;
const SphOffset_t iSize = sphGetFileSize ( tFile.GetFD (), nullptr ) - sizeof ( SphOffset_t );
sphReadThrottled ( tFile.GetFD (), &m_iTotalDocuments, sizeof ( SphOffset_t ));
m_iTotalWords = iSize / sizeof ( IDFWord_t );
// allocate words cache
CSphString sWarning;
if ( !m_pWords.Alloc ( m_iTotalWords, sError ))
return false;
// allocate lookup table if needed
int iHashSize = ( int ) ( U64C( 1 ) << HASH_BITS );
if ( m_iTotalWords>iHashSize * 8 )
{
if ( !m_pHash.Alloc ( iHashSize + 2, sError ))
return false;
}
// read file into memory (may exceed 2GB)
int64_t iRead = sphReadThrottled ( tFile.GetFD (), m_pWords.GetWritePtr (), iSize );
if ( iRead!=iSize )
return false;
if ( sphInterrupted ())
return false;
// build lookup table
if ( m_pHash.GetLengthBytes ())
{
int64_t* pHash = m_pHash.GetWritePtr ();
uint64_t uFirst = m_pWords[0].m_uWordID;
uint64_t uRange = m_pWords[m_iTotalWords - 1].m_uWordID - uFirst;
DWORD iShift = 0;
while ( uRange>=( U64C( 1 ) << HASH_BITS ))
{
iShift++;
uRange >>= 1;
}
pHash[0] = iShift;
pHash[1] = 0;
DWORD uLastHash = 0;
for ( int64_t i = 1; i<m_iTotalWords; i++ )
{
// check for interrupt (throttled for speed)
if (( i & 0xffff )==0 && sphInterrupted ())
return false;
auto uHash = ( DWORD ) (( m_pWords[i].m_uWordID - uFirst ) >> iShift );
if ( uHash==uLastHash )
continue;
while ( uLastHash<uHash )
pHash[++uLastHash + 1] = i;
uLastHash = uHash;
}
pHash[++uLastHash + 1] = m_iTotalWords;
}
return true;
}
/// Look up the stored document count of a word.
///
/// The word is hashed with sphFNV64() and searched for in the word table; when the
/// bucket lookup table is present it first narrows the search to one bucket.
/// A word given in exact form (leading '=') has that first character replaced with
/// MAGIC_WORD_HEAD_NONSTEMMED before hashing.
///
/// @param sWord word text
/// @return stored document count, or 0 when the word is not in the table
DWORD CSphGlobalIDF::GetDocs ( const CSphString& sWord ) const
{
	// nothing loaded, nothing to find
	if ( m_iTotalWords<=0 )
		return 0;

	const char* s = sWord.cstr ();

	// replace = to MAGIC_WORD_HEAD_NONSTEMMED for exact terms
	char sBuf[3 * SPH_MAX_WORD_LEN + 4];
	if ( s && *s=='=' )
	{
		strncpy ( sBuf, s, sizeof ( sBuf ) - 1 );
		// strncpy does not terminate the copy when the source fills the buffer
		sBuf[sizeof ( sBuf ) - 1] = '\0';
		sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
		s = sBuf;
	}

	uint64_t uWordID = sphFNV64 ( s );

	int64_t iStart = 0;
	int64_t iEnd = m_iTotalWords - 1;

	auto pWords = (const IDFWord_t*)m_pWords.GetReadPtr();

	if ( m_pHash.GetLengthBytes ())
	{
		// keep the bucket index in 64 bits until it is range-checked; ids below the
		// first word wrap around and must be rejected rather than truncated
		const uint64_t uFirst = pWords[0].m_uWordID;
		const uint64_t uBucket = ( uWordID - uFirst ) >> m_pHash[0];

		// valid buckets are 0 .. (1<<HASH_BITS)-1; the table has no slot pair beyond that
		if ( uBucket>=( U64C( 1 ) << HASH_BITS ))
			return 0;

		const auto iBucket = ( int64_t ) uBucket;
		iStart = m_pHash[iBucket + 1];
		iEnd = m_pHash[iBucket + 2] - 1;
	}

	// empty bucket
	if ( iEnd<iStart )
		return 0;

	const IDFWord_t* pWord = sphBinarySearch ( pWords + iStart, pWords + iEnd,
		bind ( &IDFWord_t::m_uWordID ), uWordID );
	return pWord ? pWord->m_iDocs : 0;
}
float CSphGlobalIDF::GetIDF ( const CSphString& sWord, int64_t iDocsLocal, bool bPlainIDF ) const
{
int64_t iDocs = Max ( iDocsLocal, ( int64_t ) GetDocs ( sWord ));
int64_t iTotalClamped = Max ( m_iTotalDocuments, iDocs );
if ( !iDocs )
return 0.0f;
if ( bPlainIDF )
iTotalClamped += 1-iDocs;
float fLogTotal = logf ( float ( 1 + iTotalClamped ));
return logf ( float ( iTotalClamped ) / float ( iDocs )) / ( 2 * fLogTotal );
}
/// global idf definitions hash
/// Maps an IDF file path to its loaded, ref-counted CSphGlobalIDF; all access to the
/// hash goes through m_tLock.
class cGlobalIDF
{
// reader/writer lock protecting m_hIDFs
mutable RwLock_t m_tLock;
// loaded IDFs keyed by file path
SmallStringHash_T<CSphGlobalIDFRefPtr_c> m_hIDFs GUARDED_BY ( m_tLock );
public:
// loads the file and adds it under sPath; false (with sError set by the loader) on failure
bool LoadGlobalIDF ( const CSphString& sPath, CSphString& sError );
// loads the file again and, if sPath is registered, swaps the new copy in
bool ReloadGlobalIDF ( const CSphString& sPath, CSphString& sError );
// entry registered under sPath, or nullptr
// NOTE(review): the pointer refers to storage inside the hash and is returned after
// the read lock is released; looks like a concurrent DeleteMany()/Clear() could
// invalidate it — confirm callers are serialized
CSphGlobalIDFRefPtr_c* GetIDF ( const CSphString& sPath );
// paths of all registered IDFs
StrVec_t Collect() const;
// removes the listed paths from the hash
void DeleteMany ( const StrVec_t& dFiles );
// removes everything
void Clear ();
};
/// Access the process-wide registry of loaded global IDFs.
/// The registry is a function-local static, so it is created on first use.
cGlobalIDF& GetGlobalIDF()
{
	static cGlobalIDF tRegistry;
	return tRegistry;
}
/// Create a CSphGlobalIDF and load it from the given file.
/// @param sPath path of the IDF file
/// @param sError passed through to CSphGlobalIDF::Preread()
/// @return the loaded object, or a null pointer when loading failed
static CSphGlobalIDFRefPtr_c DoPrereadIDF ( const CSphString& sPath, CSphString& sError )
{
	CSphGlobalIDFRefPtr_c pLoaded { new CSphGlobalIDF };
	const bool bLoaded = pLoaded->Preread ( sPath, sError );

	// drop the half-initialized object on failure
	if ( !bLoaded )
		pLoaded = nullptr;

	return pLoaded;
}
/// Load an IDF file and register it under its path.
/// The file is read before the write lock is taken, so the lock only covers the insert.
/// @param sPath path of the IDF file, also used as the hash key
/// @param sError passed through to the loader
/// @return false when the file could not be loaded, true otherwise
bool cGlobalIDF::LoadGlobalIDF ( const CSphString& sPath, CSphString& sError )
{
	sphLogDebug ( "Loading global IDF (%s)", sPath.cstr ());

	auto pLoaded = DoPrereadIDF ( sPath, sError );
	if ( pLoaded )
	{
		ScWL_t tGuard ( m_tLock );
		m_hIDFs.Add ( std::move (pLoaded), sPath );
		return true;
	}

	return false;
}
/// Load an IDF file again and replace the registered copy with the fresh one.
/// When nothing is registered under sPath the fresh copy is simply dropped and the
/// call still reports success.
/// @param sPath path of the IDF file, also the hash key
/// @param sError passed through to the loader
/// @return false when the file could not be loaded, true otherwise
bool cGlobalIDF::ReloadGlobalIDF ( const CSphString& sPath, CSphString& sError )
{
	sphLogDebug ( "Reloading global IDF (%s)", sPath.cstr ());

	auto pFresh = DoPrereadIDF ( sPath, sError );
	if ( !pFresh )
		return false;

	ScWL_t tGuard ( m_tLock );
	if ( auto* ppSlot = m_hIDFs ( sPath ) )
		*ppSlot = std::exchange ( pFresh, nullptr );

	return true;
}
/// Find the entry registered under a path.
/// The lookup runs under the read lock; the lock is released on return.
/// @param sPath hash key (IDF file path)
/// @return pointer to the stored ref-counted handle, or nullptr when absent
CSphGlobalIDFRefPtr_c* cGlobalIDF::GetIDF ( const CSphString& sPath )
{
	ScRL_t tGuard ( m_tLock );
	auto* ppEntry = m_hIDFs ( sPath );
	return ppEntry;
}
/// Snapshot the paths of all registered IDFs.
/// @return the hash keys, gathered under the read lock
StrVec_t cGlobalIDF::Collect() const
{
	StrVec_t dPaths;
	{
		ScRL_t tGuard ( m_tLock );
		for ( auto& tEntry : m_hIDFs )
			dPaths.Add ( tEntry.first );
	}
	return dPaths;
}
/// Unregister a list of IDFs.
/// The write lock is held for the whole batch; each removal is logged.
/// @param dFiles hash keys (IDF file paths) to remove
void cGlobalIDF::DeleteMany ( const StrVec_t& dFiles )
{
	ScWL_t tGuard ( m_tLock );
	for ( const auto& sPath : dFiles )
	{
		sphLogDebug ( "Unloading global IDF (%s)", sPath.cstr() );
		m_hIDFs.Delete ( sPath );
	}
}
/// Unregister every IDF at once, under the write lock.
void cGlobalIDF::Clear()
{
	ScWL_t tGuard ( m_tLock );
	m_hIDFs.Reset();
}
bool sph::PrereadGlobalIDF ( const CSphString& sPath, CSphString& sError )
{
auto& tGlobalIDF = GetGlobalIDF();
auto* ppGlobalIDF = tGlobalIDF.GetIDF(sPath);
if ( !ppGlobalIDF )
return tGlobalIDF.LoadGlobalIDF ( sPath, sError );
auto& pGlobalIDF = *ppGlobalIDF;
if ( pGlobalIDF && pGlobalIDF->Touch ( sPath ))
return tGlobalIDF.ReloadGlobalIDF ( sPath, sError );
return true;
}
/// List the registered IDF paths that do not appear in dFiles.
/// @param dFiles paths that should stay loaded
/// @return registered paths missing from dFiles
static StrVec_t CollectUnlistedIn ( const StrVec_t& dFiles )
{
	const StrVec_t dLoaded = GetGlobalIDF().Collect();

	StrVec_t dUnlisted;
	for ( const auto& sLoaded : dLoaded )
	{
		if ( dFiles.Contains ( sLoaded ) )
			continue;
		dUnlisted.Add ( sLoaded );
	}
	return dUnlisted;
}
static void DeleteUnlistedIn ( const StrVec_t& dFiles )
{
auto dUnlisted = CollectUnlistedIn ( dFiles );
GetGlobalIDF().DeleteMany(dUnlisted);
}
void sph::UpdateGlobalIDFs ( const StrVec_t& dFiles )
{
// delete unlisted entries
DeleteUnlistedIn ( dFiles );
// load/rotate remaining entries
CSphString sError;
ARRAY_FOREACH ( i, dFiles )
{
const auto& sPath = dFiles[i];
if ( !PrereadGlobalIDF ( sPath, sError ))
sphLogDebug ( "Could not load global IDF (%s): %s", sPath.cstr (), sError.cstr ());
}
}
void sph::ShutdownGlobalIDFs ()
{
StrVec_t dAllIDFs = GetGlobalIDF().Collect();
GetGlobalIDF().DeleteMany ( dAllIDFs );
}
/// Get a ref-counted handle to the IDF registered under a path.
/// @param IDFPath path of the IDF file
/// @return handle to the loaded IDF, or a null handle when the path is not registered
IDFerRefPtr_c sph::GetIDFer ( const CSphString& IDFPath )
{
	IDFerRefPtr_c pIDFer;
	if ( auto* ppEntry = GetGlobalIDF().GetIDF ( IDFPath ) )
		pIDFer = *ppEntry;
	return pIDFer;
}