-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline.rs
42 lines (34 loc) · 1.16 KB
/
pipeline.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
use clap::Parser;
use serde::{Deserialize, Serialize};
use walkdir::DirEntry;
use crate::cleaners;
use crate::cleaners::html::HtmlCleaning;
use crate::normalizers;
use crate::normalizers::character::CharNormalization;
use crate::normalizers::word::WordNormalization;
// Options for the text preprocessing pipeline.
//
// Derives `Parser` so the options can be read from the command line, and
// `Serialize`/`Deserialize` (with camelCase keys) so they can be stored or
// loaded as data. Each field is a group of stage-specific options that is
// flattened into this struct's argument set.
//
// NOTE: plain `//` comments are used on purpose; `///` doc comments on a
// clap-derived struct are picked up as CLI help text.
#[derive(Parser, Debug, Clone, Serialize, Deserialize, Default)]
#[serde(rename_all = "camelCase")]
pub struct PreprocOpts {
// Options for the HTML cleaning stage (stage runs when `enabled` is set).
#[clap(flatten)]
html_clean: HtmlCleaning,
// Options for the character normalization stage (stage runs when `enabled` is set).
#[clap(flatten)]
char_normalization: CharNormalization,
// Options for the word normalization stage (stage runs when `enabled` is set).
#[clap(flatten)]
word_normalization: WordNormalization,
}
/// Reads the file behind `dir_entry` and runs its bytes through [`run`].
///
/// Returns whatever [`run`] produces for the file's contents.
///
/// # Panics
///
/// Panics if the file cannot be read. The panic message names the path and
/// the underlying I/O error so the failing input can be identified.
pub fn file_run(opts: PreprocOpts, dir_entry: DirEntry) -> String {
    let path = dir_entry.path();
    // The signature cannot carry an error, so a read failure still panics,
    // but with enough context to tell which file was responsible.
    let raw = std::fs::read(path)
        .unwrap_or_else(|err| panic!("failed to read {}: {}", path.display(), err));
    run(opts, &raw)
}
pub fn run(opts: PreprocOpts, raw_text: &[u8]) -> String {
let mut text = normalizers::encoding::to_utf8(raw_text);
if opts.html_clean.enabled {
text = cleaners::html::run(text, opts.html_clean);
}
if opts.char_normalization.enabled {
text = normalizers::character::run(text, opts.char_normalization);
}
if opts.word_normalization.enabled {
text = normalizers::word::run(text, opts.word_normalization);
}
text + "\n"
}