Skip to content

Commit

Permalink
index_with_full_text
Browse files Browse the repository at this point in the history
  • Loading branch information
no398 committed Jan 15, 2024
1 parent ba512e8 commit 5c8f41f
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 36 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
/target
/search_index
/rocksdb
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,14 @@

### 将数据加载到 rocksdb 数据库中

运行 `convert` 程序。此过程会将原始数据放入 rocksdb 数据库中,数据库文件路径为 `config.toml` 中的 `db` 变量;转换后的数据大小约为 200G,转换可能会花费数小时的时间;如果中途中断,再次运行会从中断处继续。
运行 `convert config.toml` 程序。此过程会将原始数据放入 rocksdb 数据库中,数据库文件路径为 `config.toml` 中的 `db` 变量;转换后的数据大小约为 200G,转换可能会花费数小时的时间;如果中途中断,再次运行会从中断处继续。

### 创建索引
运行 `index` 程序会将数据库中的数据创建索引,索引文件路径为 `config.toml` 中的 `index_path` 变量;如果中途中断,需要删除 `index_path` 中的文件,重新运行 `index` 程序;默认情况下,不会索引案件内容,如果需要索引案件内容,需要将 index.rs 文件中对应的注释去掉(相应索引文件会超过150G)。索引大小约为 15.5G,转换可能会花费数小时的时间;
运行 `index config.toml` 程序会将数据库中的数据创建索引,索引文件路径为 `config.toml` 中的 `index_path` 变量;如果中途中断,需要删除 `index_path` 中的文件,重新运行 `index` 程序;默认情况下,不会索引案件内容,索引大小约为 15.5G,可能会花费数小时的时间。如果需要索引案件内容,需要将 `config.toml` 中的 `index_with_full_text` 设置为 `true`,但是这会使索引文件增加到150G左右,索引时间也会增加到十几个小时。

### 运行搜索服务
运行 `main` 程序,用浏览器打开网址,即可搜索。
运行 `main config.toml` 程序,用浏览器打开网址,即可搜索。

## 说明

当程序和配置文件放在同一目录下,且配置文件命名为 `config.toml` 时,可以省略配置文件路径参数。
1 change: 1 addition & 0 deletions config.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
db = "rocksdb"
index_path ="search_index"
index_with_full_text = false
addr = "127.0.0.1:8081"

# The raw data path you downloaded from the torrent, and you must NOT unzip it.
Expand Down
30 changes: 16 additions & 14 deletions src/bin/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ fn main() {
let parties = schema.get_field("parties").unwrap();
let cause = schema.get_field("cause").unwrap();
let legal_basis = schema.get_field("legal_basis").unwrap();
// let full_text = schema.get_field("full_text").unwrap();
let full_text = schema.get_field("full_text").unwrap();

let index_path = Path::new(&CONFIG.index_path);
if !index_path.exists() {
Expand Down Expand Up @@ -54,16 +54,18 @@ fn main() {
let mut rdr = csv::Reader::from_reader(file);
for result in rdr.deserialize() {
id += 1;
let case: Case = result.unwrap();
// case.full_text =
// case.full_text
// .split_whitespace()
// .fold(String::new(), |mut acc, x| {
// acc.push_str("<p>");
// acc.push_str(x);
// acc.push_str("</p>");
// acc
// });
let mut case: Case = result.unwrap();
if CONFIG.index_with_full_text {
case.full_text = case.full_text.split_whitespace().fold(
String::new(),
|mut acc, x| {
acc.push_str("<p>");
acc.push_str(x);
acc.push_str("</p>");
acc
},
);
}

let mut doc = Document::default();
doc.add_text(id_field, id);
Expand Down Expand Up @@ -100,9 +102,9 @@ fn main() {
if !case.legal_basis.is_empty() {
doc.add_text(legal_basis, &case.legal_basis);
}
// if !case.full_text.is_empty() {
// doc.add_text(full_text, &case.full_text);
// }
if CONFIG.index_with_full_text && !case.full_text.is_empty() {
doc.add_text(full_text, &case.full_text);
}
writer.add_document(doc).unwrap();

if id % 1000 == 0 {
Expand Down
1 change: 1 addition & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ pub static CONFIG: Lazy<Config> = Lazy::new(Config::load_config);
/// Application settings exposed through the global `CONFIG`, which is
/// initialized lazily by `Config::load_config`.
pub struct Config {
/// Path of the rocksdb database (the `db` key in `config.toml`).
pub db: String,
/// Directory where the search index files are stored.
pub index_path: String,
/// When `true`, the indexer also indexes each case's full text; when
/// `false`, the full text is left out of the index.
pub index_with_full_text: bool,
/// Address string such as "127.0.0.1:8081".
/// NOTE(review): presumably the listen address of the search service — confirm.
pub addr: String,
/// Optional path to the raw data.
/// NOTE(review): presumably the downloaded, still-zipped torrent data read by
/// the `convert` program — confirm against its usage.
pub raw_data_path: Option<String>,
}
Expand Down
20 changes: 1 addition & 19 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
pub use config::CONFIG;
pub use controller::{case, logo, search, style};
use rocksdb::DB;
pub use tantivy::Tan;

use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tantivy::Searcher;
pub use tantivy::Tan;

mod config;
mod controller;
Expand Down Expand Up @@ -50,20 +49,3 @@ pub struct Case {
#[serde(rename(deserialize = "全文"))]
pub full_text: String,
}

#[derive(Debug, Deserialize, Serialize)]
pub struct Meta {
pub id: u32,
pub url: String,
pub case_id: String,
pub case_name: String,
pub court: String,
pub region: String,
pub case_type: String,
pub procedure: String,
pub judgment_date: String,
pub public_date: String,
pub parties: String,
pub cause: String,
pub legal_basis: String,
}

0 comments on commit 5c8f41f

Please sign in to comment.