-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathmain.rs
103 lines (91 loc) · 3.32 KB
/
main.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
use anyhow::Result;
use structopt::StructOpt;
mod cmd;
pub mod io;
pub mod ngrams;
pub mod progress;
pub mod tokens;
pub mod util;
// Top-level command-line options for the `wimbd` binary.
//
// Plain `//` comments are used here on purpose: structopt turns `///` doc comments into help
// text, and the help text for the application is already supplied through `about` below.
//
// The reported version is the `BUILD_VERSION` environment variable captured at compile time
// when it is set, and the Cargo package version otherwise.
#[derive(Debug, StructOpt)]
#[structopt(
    name = "wimbd",
    about = "What's in my big data?",
    version = option_env!("BUILD_VERSION").unwrap_or(env!("CARGO_PKG_VERSION")),
    setting = structopt::clap::AppSettings::ColoredHelp,
)]
struct Opt {
    // The subcommand selected on the command line.
    #[structopt(subcommand)]
    cmd: WimbdCmd,
}
// The subcommands exposed by the `wimbd` binary. Each variant wraps the option struct of the
// matching module under `cmd`.
//
// NOTE: the `///` doc comments on the variants are rendered by structopt as the help text of
// each subcommand, so editing them changes what users see in `--help`.
#[derive(Debug, StructOpt)]
enum WimbdCmd {
    /// Find the top-k ngrams in a dataset of compressed JSON lines files using a counting Bloom
    /// filter.
    ///
    /// Work is parallelized over files.
    ///
    /// EXAMPLES
    ///
    /// Find the top 20 3-grams in a file:
    ///
    /// > wimbd topk c4-train.01011-of-01024.json.gz --ngram=3 --topk=20 --seed=42 --size=50GiB
    ///
    /// Find the top 20 100-grams in a file:
    ///
    /// > wimbd topk c4-train.01011-of-01024.json.gz --ngram=100 --topk=20 --seed=42 --size=50GiB
    ///
    /// You can also pass directories instead of files, in which case files will be found by
    /// globbing for '**/*.json.gz' within each directory.
    ///
    /// ACCURACY
    ///
    /// In general you should set '--size' to however many free gigabytes of RAM you have available, minus some buffer room.
    /// This minimizes the probability of incorrect counts and false positives in the top-k.
    #[structopt(setting = structopt::clap::AppSettings::ColoredHelp)]
    Topk(cmd::topk::Opt),

    /// Like 'topk' but for finding the least common ngrams.
    ///
    /// Work is parallelized over files.
    #[structopt(setting = structopt::clap::AppSettings::ColoredHelp)]
    Botk(cmd::botk::Opt),

    /// Get exact counts for given search strings. Note that the search strings will be tokenized
    /// and the search will be done over tokens instead of searching for those substrings directly.
    ///
    /// If you want to count occurrences of a regex pattern instead, use the 'search' command.
    ///
    /// Work is parallelized over files.
    #[structopt(setting = structopt::clap::AppSettings::ColoredHelp)]
    Count(cmd::count::Opt),

    /// Get exact counts for matches of given regex patterns.
    ///
    /// Work is parallelized over files.
    #[structopt(setting = structopt::clap::AppSettings::ColoredHelp)]
    Search(cmd::search::Opt),

    /// Collect summary statistics about a dataset.
    ///
    /// Work is parallelized over files.
    #[structopt(setting = structopt::clap::AppSettings::ColoredHelp)]
    Stats(cmd::stats::Opt),

    /// Estimate the number of unique ngrams in a dataset using a Bloom filter.
    ///
    /// Work is parallelized over files.
    #[structopt(setting = structopt::clap::AppSettings::ColoredHelp)]
    Unique(cmd::unique::Opt),
}
fn main() -> Result<()> {
let opt = Opt::from_args();
simple_logger::init_with_level(log::Level::Info)?;
let result = match opt.cmd {
WimbdCmd::Topk(opt) => cmd::topk::main(opt),
WimbdCmd::Count(opt) => cmd::count::main(opt),
WimbdCmd::Search(opt) => cmd::search::main(opt),
WimbdCmd::Stats(opt) => cmd::stats::main(opt),
WimbdCmd::Botk(opt) => cmd::botk::main(opt),
WimbdCmd::Unique(opt) => cmd::unique::main(opt),
};
if let Err(err) = result {
log::error!("{}", err);
std::process::exit(1);
}
Ok(())
}