
Commit

update exported api and code
vcaesar committed Nov 30, 2017
1 parent 468a368 commit c2ea8b3
Showing 32 changed files with 111 additions and 111 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -76,7 +76,7 @@ var (

func main() {
// Init
- searcher.Init(types.EngineInitOptions{
+ searcher.Init(types.EngineOpts{
Using: 4,
NotUsingSegmenter: true})
defer searcher.Close()
2 changes: 1 addition & 1 deletion README_zh.md
@@ -77,7 +77,7 @@ var (

func main() {
// Init
- searcher.Init(types.EngineInitOptions{
+ searcher.Init(types.EngineOpts{
Using: 3,
SegmenterDict: "zh",
// SegmenterDict: "your gopath"+"/src/github.com/go-ego/riot/data/dict/dictionary.txt",
4 changes: 2 additions & 2 deletions core/indexer.go
@@ -49,7 +49,7 @@ type Indexer struct {
removeCache types.DocumentsId
}

- initOptions types.IndexerInitOptions
+ initOptions types.IndexerOpts
initialized bool

// this is actually an approximation of the total number of documents
@@ -71,7 +71,7 @@ type KeywordIndices struct {
}

// Init initializes the indexer
- func (indexer *Indexer) Init(options types.IndexerInitOptions) {
+ func (indexer *Indexer) Init(options types.IndexerOpts) {
if indexer.initialized == true {
log.Fatal("The Indexer can not be initialized twice.")
}
18 changes: 9 additions & 9 deletions core/indexer_test.go
@@ -9,7 +9,7 @@ import (

func TestAddKeywords(t *testing.T) {
var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
+ indexer.Init(types.IndexerOpts{IndexType: types.LocationsIndex})
indexer.AddDocumentToCache(&types.DocumentIndex{
DocId: 1,
Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
@@ -44,7 +44,7 @@ func TestAddKeywords(t *testing.T) {

func TestRemoveDoc(t *testing.T) {
var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
+ indexer.Init(types.IndexerOpts{IndexType: types.LocationsIndex})

// doc1 = "token2 token3"
indexer.AddDocumentToCache(&types.DocumentIndex{
@@ -128,7 +128,7 @@ func TestRemoveDoc(t *testing.T) {

func TestLookupLocationsIndex(t *testing.T) {
var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
+ indexer.Init(types.IndexerOpts{IndexType: types.LocationsIndex})
// doc1 = "token2 token3"
indexer.AddDocumentToCache(&types.DocumentIndex{
DocId: 1,
@@ -208,7 +208,7 @@ func TestLookupLocationsIndex(t *testing.T) {

func TestLookupDocIdsIndex(t *testing.T) {
var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.DocIdsIndex})
+ indexer.Init(types.IndexerOpts{IndexType: types.DocIdsIndex})
// doc1 = "token2 token3"
indexer.AddDocumentToCache(&types.DocumentIndex{
DocId: 1,
@@ -288,7 +288,7 @@ func TestLookupDocIdsIndex(t *testing.T) {

func TestLookupWithProximity(t *testing.T) {
var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
+ indexer.Init(types.IndexerOpts{IndexType: types.LocationsIndex})

// doc1 = "token2 token4 token4 token2 token3 token4"
indexer.AddDocumentToCache(&types.DocumentIndex{
@@ -329,7 +329,7 @@ func TestLookupWithProximity(t *testing.T) {

func TestLookupWithPartialLocations(t *testing.T) {
var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
+ indexer.Init(types.IndexerOpts{IndexType: types.LocationsIndex})
// doc1 = "token2 token4 token4 token2 token3 token4" + "label1" (not in the text)
indexer.AddDocumentToCache(&types.DocumentIndex{
DocId: 1,
@@ -358,7 +358,7 @@ func TestLookupWithPartialLocations(t *testing.T) {

func TestLookupWithBM25(t *testing.T) {
var indexer Indexer
- indexer.Init(types.IndexerInitOptions{
+ indexer.Init(types.IndexerOpts{
IndexType: types.FrequenciesIndex,
BM25Parameters: &types.BM25Parameters{
K1: 1,
@@ -393,7 +393,7 @@ func TestLookupWithBM25(t *testing.T) {

func TestLookupWithinDocIds(t *testing.T) {
var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
+ indexer.Init(types.IndexerOpts{IndexType: types.LocationsIndex})
// doc1 = "token2 token3"
indexer.AddDocumentToCache(&types.DocumentIndex{
DocId: 1,
@@ -436,7 +436,7 @@ func TestLookupWithinDocIds(t *testing.T) {

func TestLookupWithLocations(t *testing.T) {
var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
+ indexer.Init(types.IndexerOpts{IndexType: types.LocationsIndex})
// doc1 = "token2 token4 token4 token2 token3 token4"
indexer.AddDocumentToCache(&types.DocumentIndex{
DocId: 1,
2 changes: 1 addition & 1 deletion data/main.go
@@ -28,7 +28,7 @@ var (

func main() {
// Init searcher
- searcher.Init(types.EngineInitOptions{
+ searcher.Init(types.EngineOpts{
Using: 4,
SegmenterDict: "./dict/dictionary.txt"})
defer searcher.Close()
2 changes: 1 addition & 1 deletion docs/en/benchmarking.md
@@ -14,7 +14,7 @@ Changing the NumShards variable in the test program can change the number of sin

An index item here refers to a unique "search key"-"document" pair; for example, when a document contains N different search keys, it generates N index items.

- The program uses 8 shards by default; you can change this value when initializing the engine to match your requirements, see [types.EngineInitOptions.NumShards](/types/engine_init_options.go).
+ The program uses 8 shards by default; you can change this value when initializing the engine to match your requirements, see [types.EngineOpts.NumShards](/types/engine_init_options.go).
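As a rough sketch of changing that default (not part of this commit; it reuses only identifiers that appear elsewhere in this diff, and the value 16 is an arbitrary example):

```go
package main

import (
	"github.com/go-ego/riot"
	"github.com/go-ego/riot/types"
)

func main() {
	var searcher riot.Engine

	// Spread the index over 16 shards instead of the default 8.
	searcher.Init(types.EngineOpts{
		Using:             4,
		NotUsingSegmenter: true,
		NumShards:         16,
	})
	defer searcher.Close()
}
```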

# Performance analysis

4 changes: 2 additions & 2 deletions docs/en/bm25.md
@@ -6,14 +6,14 @@ BM25 is a search engine's classic sorting function that measures the relevance o
                IDF * TF * (k1 + 1)
    BM25 = sum ----------------------------
                TF + k1 * (1 - b + b * D / L)

- The sum runs over all keywords. TF (term frequency) is how often the keyword appears in the document, D is the number of words in the document, L is the average number of words across all documents, and k1 and b are constants that riot defaults to 2.0 and 0.75; they can be changed at engine initialization via [EngineInitOptions.IndexerInitOptions.BM25Parameters](/types/indexer_init_options.go). IDF (inverse document frequency) measures how common a keyword is; the riot engine uses a smoothed IDF formula:
+ The sum runs over all keywords. TF (term frequency) is how often the keyword appears in the document, D is the number of words in the document, L is the average number of words across all documents, and k1 and b are constants that riot defaults to 2.0 and 0.75; they can be changed at engine initialization via [EngineOpts.IndexerOpts.BM25Parameters](/types/indexer_init_options.go). IDF (inverse document frequency) measures how common a keyword is; the riot engine uses a smoothed IDF formula:

                     total number of documents
    IDF = log2( ----------------------------------------- + 1 )
                 number of documents containing the keyword

# Usage

- The indexer is responsible for computing BM25. To compute a document's BM25 value, the word frequencies of all keywords in the document must be stored, which requires setting [EngineInitOptions.IndexerInitOptions.IndexType](/types/indexer_init_options.go) to at least FrequenciesIndex at engine initialization (LocationsIndex also computes BM25, but that index additionally stores where words appear and consumes more memory).
+ The indexer is responsible for computing BM25. To compute a document's BM25 value, the word frequencies of all keywords in the document must be stored, which requires setting [EngineOpts.IndexerOpts.IndexType](/types/indexer_init_options.go) to at least FrequenciesIndex at engine initialization (LocationsIndex also computes BM25, but that index additionally stores where words appear and consumes more memory).

You can then read IndexedDocument.BM25 in your [custom scoring rules](/docs/en/custom_scoring_criteria.md) and use the value as scoring data. If you want to rely entirely on the BM25 score, you can use the default scoring rule, RankByBM25.
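A minimal sketch of wiring this up (not from this commit; it uses only identifiers visible elsewhere in this diff, and the B field of BM25Parameters is an assumption inferred from the k1/b description above):

```go
var searcher riot.Engine

// FrequenciesIndex stores the per-document term frequencies that the
// BM25 formula above needs; LocationsIndex would work too, at a higher
// memory cost.
searcher.Init(types.EngineOpts{
	SegmenterDict: "../../data/dict/dictionary.txt",
	IndexerOpts: &types.IndexerOpts{
		IndexType: types.FrequenciesIndex,
		// K1 appears in this commit's indexer tests; B is assumed to
		// sit alongside it, matching the defaults of 2.0 and 0.75.
		BM25Parameters: &types.BM25Parameters{
			K1: 2.0,
			B:  0.75,
		},
	},
})
defer searcher.Close()
```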

8 changes: 4 additions & 4 deletions docs/en/codelab.md
@@ -64,17 +64,17 @@ The first package defines the engine function, the second package defines the co

```go
var searcher riot.Engine
- searcher.Init(types.EngineInitOptions{
+ searcher.Init(types.EngineOpts{
SegmenterDict: "../../data/dict/dictionary.txt",
StopTokenFile: "../../data/dict/stop_tokens.txt",
- IndexerInitOptions: &types.IndexerInitOptions{
+ IndexerOpts: &types.IndexerOpts{
IndexType: types.LocationsIndex,
},
})
```
- [types.EngineInitOptions](/types/engine_init_options.go) defines the parameters that need to be set when initializing the engine, such as where to load the segmenter dictionary file and the stop-word list from, the indexer type, the BM25 parameters, and so on, as well as the default scoring rules (see the "Search" section) and the output paging options. For details, please read the comments on the struct in the code.
+ [types.EngineOpts](/types/engine_init_options.go) defines the parameters that need to be set when initializing the engine, such as where to load the segmenter dictionary file and the stop-word list from, the indexer type, the BM25 parameters, and so on, as well as the default scoring rules (see the "Search" section) and the output paging options. For details, please read the comments on the struct in the code.

- In particular, please choose the IndexerInitOptions.IndexType type carefully; there are three different types of index table:
+ In particular, please choose the IndexerOpts.IndexType type carefully; there are three different types of index table:

1. DocIdsIndex provides the most basic index; it only records the docids of the documents in which the search key appears.
2. FrequenciesIndex records, in addition to the docid, the frequency with which the search key appears in each document; if you need BM25, then FrequenciesIndex is what you need.
4 changes: 2 additions & 2 deletions docs/en/persistent_storage.md
@@ -1,10 +1,10 @@
Persistent storage
====

- The riot engine supports saving the search data to disk and restoring it from disk when the machine restarts. To use persistent storage, just set three options in EngineInitOptions:
+ The riot engine supports saving the search data to disk and restoring it from disk when the machine restarts. To use persistent storage, just set three options in EngineOpts:

```go
- type EngineInitOptions struct {
+ type EngineOpts struct {
// Skip other options

// Whether to use a persistent database, plus the directory where the database files are stored and the number of shards to split them into
2 changes: 1 addition & 1 deletion docs/en/segmenter.md
@@ -1,7 +1,7 @@
## Word segmentation rules:

```Go
- types.EngineInitOptions{
+ types.EngineOpts{
Using: 4,
}
```
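For context, a small sketch of how this option is combined with the segmenter settings in the two README hunks of this commit (nothing beyond what those hunks show):

```go
var searcher riot.Engine

// English-only usage without the segmenter, as in the README hunk:
searcher.Init(types.EngineOpts{
	Using:             4,
	NotUsingSegmenter: true,
})
defer searcher.Close()

// For Chinese segmentation with a dictionary, a separate engine would be
// initialized as in the README_zh hunk:
//
//	searcher.Init(types.EngineOpts{
//		Using:         3,
//		SegmenterDict: "zh",
//	})
```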
2 changes: 1 addition & 1 deletion docs/en/token_proximity.md
@@ -13,4 +13,4 @@ The specific calculation process is to take a P_1 first, calculate the smallest

See the computeTokenProximity function in [core/indexer.go](/core/indexer.go) for the implementation.

- Proximity computation needs to store the position of every token in the indexer, which consumes extra memory, so it is off by default. To enable it, set EngineInitOptions.IndexerInitOptions.IndexType to LocationsIndex when initializing the engine.
+ Proximity computation needs to store the position of every token in the indexer, which consumes extra memory, so it is off by default. To enable it, set EngineOpts.IndexerOpts.IndexType to LocationsIndex when initializing the engine.
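A minimal sketch of enabling it (not from this commit; it mirrors the codelab example and uses only identifiers that appear elsewhere in this diff):

```go
var searcher riot.Engine

// LocationsIndex stores every token position, which is what
// computeTokenProximity needs; expect higher memory use.
searcher.Init(types.EngineOpts{
	SegmenterDict: "../../data/dict/dictionary.txt",
	StopTokenFile: "../../data/dict/stop_tokens.txt",
	IndexerOpts: &types.IndexerOpts{
		IndexType: types.LocationsIndex,
	},
})
defer searcher.Close()
```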
2 changes: 1 addition & 1 deletion docs/zh/benchmarking.md
@@ -14,7 +14,7 @@

An index item here refers to a unique "search key"-"document" pair; for example, when a document contains N different search keys, it generates N index items.

- The program uses 8 shards by default; you can change this value when initializing the engine to match your requirements, see [types.EngineInitOptions.NumShards](/types/engine_init_options.go).
+ The program uses 8 shards by default; you can change this value when initializing the engine to match your requirements, see [types.EngineOpts.NumShards](/types/engine_init_options.go).

# Performance analysis

4 changes: 2 additions & 2 deletions docs/zh/bm25.md
@@ -6,13 +6,13 @@ BM25 is a classic search-engine ranking function that measures the relevance between a set of keywords and a document
                IDF * TF * (k1 + 1)
    BM25 = sum ----------------------------
                TF + k1 * (1 - b + b * D / L)

- The sum runs over all keywords. TF (term frequency) is how often the keyword appears in the document, D is the number of words in the document, L is the average number of words across all documents, and k1 and b are constants that riot defaults to 2.0 and 0.75; they can be changed at engine initialization via [EngineInitOptions.IndexerInitOptions.BM25Parameters](/types/indexer_init_options.go). IDF (inverse document frequency) measures how common a keyword is; the riot engine uses a smoothed IDF formula:
+ The sum runs over all keywords. TF (term frequency) is how often the keyword appears in the document, D is the number of words in the document, L is the average number of words across all documents, and k1 and b are constants that riot defaults to 2.0 and 0.75; they can be changed at engine initialization via [EngineOpts.IndexerOpts.BM25Parameters](/types/indexer_init_options.go). IDF (inverse document frequency) measures how common a keyword is; the riot engine uses a smoothed IDF formula:

                     total number of documents
    IDF = log2( ----------------------------------------- + 1 )
                 number of documents containing the keyword

# Usage

- The indexer is responsible for computing BM25. To compute a document's BM25 value, the word frequencies of all keywords in the document must be stored, which requires setting [EngineInitOptions.IndexerInitOptions.IndexType](/types/indexer_init_options.go) to at least FrequenciesIndex at engine initialization (LocationsIndex also computes BM25, but that index additionally stores where words appear and consumes more memory).
+ The indexer is responsible for computing BM25. To compute a document's BM25 value, the word frequencies of all keywords in the document must be stored, which requires setting [EngineOpts.IndexerOpts.IndexType](/types/indexer_init_options.go) to at least FrequenciesIndex at engine initialization (LocationsIndex also computes BM25, but that index additionally stores where words appear and consumes more memory).

You can then read IndexedDocument.BM25 in your [custom scoring rules](/docs/zh/custom_scoring_criteria.md) and use the value as scoring data. If you want to rely entirely on the BM25 score, you can use the default scoring rule, RankByBM25.
8 changes: 4 additions & 4 deletions docs/zh/codelab.md
@@ -64,17 +64,17 @@ import (

```go
var searcher riot.Engine
- searcher.Init(types.EngineInitOptions{
+ searcher.Init(types.EngineOpts{
SegmenterDict: "../../data/dict/dictionary.txt",
StopTokenFile: "../../data/dict/stop_tokens.txt",
- IndexerInitOptions: &types.IndexerInitOptions{
+ IndexerOpts: &types.IndexerOpts{
IndexType: types.LocationsIndex,
},
})
```
- [types.EngineInitOptions](/types/engine_init_options.go) defines the parameters that need to be set when initializing the engine, such as where to load the segmenter dictionary file and the stop-word list from, the indexer type, the BM25 parameters, and so on, as well as the default scoring rules (see the "Search" section) and the output paging options. For details, please read the comments on the struct in the code.
+ [types.EngineOpts](/types/engine_init_options.go) defines the parameters that need to be set when initializing the engine, such as where to load the segmenter dictionary file and the stop-word list from, the indexer type, the BM25 parameters, and so on, as well as the default scoring rules (see the "Search" section) and the output paging options. For details, please read the comments on the struct in the code.

- In particular, please choose the IndexerInitOptions.IndexType type carefully; there are three different types of index table:
+ In particular, please choose the IndexerOpts.IndexType type carefully; there are three different types of index table:

1. DocIdsIndex provides the most basic index; it only records the docids of the documents in which the search key appears.
2. FrequenciesIndex records, in addition to the docid, the frequency with which the search key appears in each document; if you need BM25, then FrequenciesIndex is what you need.
4 changes: 2 additions & 2 deletions docs/zh/persistent_storage.md
@@ -1,10 +1,10 @@
Persistent storage
====

- The riot engine supports saving the search data to disk and restoring it from disk when the machine restarts. To use persistent storage, just set three options in EngineInitOptions:
+ The riot engine supports saving the search data to disk and restoring it from disk when the machine restarts. To use persistent storage, just set three options in EngineOpts:

```go
- type EngineInitOptions struct {
+ type EngineOpts struct {
	// other options omitted

// Whether to use a persistent database, plus the directory where the database files are stored and the number of shards to split them into
2 changes: 1 addition & 1 deletion docs/zh/segmenter.md
@@ -1,7 +1,7 @@
## Word segmentation rules:

```Go
- types.EngineInitOptions{
+ types.EngineOpts{
Using: 4,
}
```
2 changes: 1 addition & 1 deletion docs/zh/token_proximity.md
@@ -13,4 +13,4 @@ The proximity distance of N keywords is computed with the following formula:

See the computeTokenProximity function in [core/indexer.go](/core/indexer.go) for the implementation.

- Proximity computation needs to store the position of every token in the indexer, which consumes extra memory, so it is off by default. To enable it, set EngineInitOptions.IndexerInitOptions.IndexType to LocationsIndex when initializing the engine.
+ Proximity computation needs to store the position of every token in the indexer, which consumes extra memory, so it is off by default. To enable it, set EngineOpts.IndexerOpts.IndexType to LocationsIndex when initializing the engine.
10 changes: 5 additions & 5 deletions engine.go
@@ -68,7 +68,7 @@ type Engine struct {
numDocumentsStored uint64

// record the init options
- initOptions types.EngineInitOptions
+ initOptions types.EngineOpts
initialized bool

indexers []core.Indexer
@@ -94,7 +94,7 @@ type Engine struct {
}

// Indexer initialize the indexer channel
- func (engine *Engine) Indexer(options types.EngineInitOptions) {
+ func (engine *Engine) Indexer(options types.EngineOpts) {
engine.indexerAddDocChannels = make(
[]chan indexerAddDocumentRequest, options.NumShards)
engine.indexerRemoveDocChannels = make(
@@ -115,7 +115,7 @@ func (engine *Engine) Indexer(options types.EngineInitOptions) {
}

// Ranker initialize the ranker channel
- func (engine *Engine) Ranker(options types.EngineInitOptions) {
+ func (engine *Engine) Ranker(options types.EngineOpts) {
engine.rankerAddDocChannels = make(
[]chan rankerAddDocRequest, options.NumShards)
engine.rankerRankChannels = make(
@@ -219,7 +219,7 @@ func (engine *Engine) Storage() {
}

// Init initialize the engine
- func (engine *Engine) Init(options types.EngineInitOptions) {
+ func (engine *Engine) Init(options types.EngineOpts) {
	// set the number of threads to the number of CPUs
// runtime.GOMAXPROCS(runtime.NumCPU())
// runtime.GOMAXPROCS(128)
@@ -243,7 +243,7 @@ func (engine *Engine) Init(options types.EngineInitOptions) {
// init the indexers and rankers
for shard := 0; shard < options.NumShards; shard++ {
engine.indexers = append(engine.indexers, core.Indexer{})
- engine.indexers[shard].Init(*options.IndexerInitOptions)
+ engine.indexers[shard].Init(*options.IndexerOpts)

engine.rankers = append(engine.rankers, core.Ranker{})
engine.rankers[shard].Init(options.OnlyID)
