forked from mozilla/DeepSpeech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_trie.cpp
64 lines (52 loc) · 1.75 KB
/
generate_trie.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#include <algorithm>
#include <iostream>
#include <string>
using namespace std;
#include "lm/model.hh"
#include "trie_node.h"
#include "alphabet.h"
// Concrete KenLM model type used throughout this tool: a quantized,
// array-packed trie n-gram model. `using` alias instead of `typedef`
// per modern C++ convention (the file already uses C++11 lambdas).
using Model = lm::ngram::QuantArrayTrieModel;
// Look up `word` in the language model's vocabulary and return its
// index. Unknown words map to whatever sentinel the vocabulary uses
// for out-of-vocabulary entries.
lm::WordIndex GetWordIndex(const Model& model, const std::string& word) {
  const auto& vocabulary = model.GetVocabulary();
  return vocabulary.Index(word);
}
float ScoreWord(const Model& model, lm::WordIndex word_index) {
// We don't need to keep state here as we're scoring the words individually.
Model::State out;
return model.FullScore(model.NullContextState(), word_index, out).prob;
}
// Build a character trie over every whitespace-separated word in
// `vocab_path`, annotating each word with its KenLM vocabulary index
// and unigram score, and serialize the trie to `trie_path`.
//
// @param alphabet_path  path to the alphabet definition file
// @param kenlm_path     path to the KenLM binary model
// @param vocab_path     path to the vocabulary (one or more words,
//                       whitespace-separated)
// @param trie_path      output path for the serialized trie
// @return 0 on success, -1 on any I/O failure
int generate_trie(const char* alphabet_path, const char* kenlm_path, const char* vocab_path, const char* trie_path) {
  Alphabet a(alphabet_path);
  lm::ngram::Config config;
  // Memory-map the model read-only where possible instead of copying
  // it into anonymous memory.
  config.load_method = util::POPULATE_OR_READ;
  Model model(kenlm_path, config);
  TrieNode root(a.GetSize());
  std::ifstream ifs(vocab_path, std::ifstream::in | std::ios::binary);
  if (!ifs) {
    std::cerr << "unable to open vocabulary file " << vocab_path << std::endl;
    return -1;
  }
  std::ofstream ofs(trie_path);
  if (!ofs) {
    std::cerr << "unable to open output file " << trie_path << std::endl;
    return -1;
  }
  std::string word;
  while (ifs >> word) {
    lm::WordIndex word_index = GetWordIndex(model, word);
    float unigram_score = ScoreWord(model, word_index);
    // The lambda maps each character (as a string) to its alphabet label.
    root.Insert(word,
                [&a](const std::string& c) {
                  return a.LabelFromString(c);
                },
                word_index, unigram_score);
  }
  root.WriteToStream(ofs);
  // Fix: previously the result of the write was never checked, so a
  // failed or partial write still reported success. Verify the stream
  // is still good before claiming the trie was written.
  if (!ofs) {
    std::cerr << "unable to write trie file " << trie_path << std::endl;
    return -1;
  }
  return 0;
}
// Entry point: validate the argument count, then delegate to
// generate_trie with the four path arguments.
int main(int argc, char** argv) {
  constexpr int kRequiredArgc = 5;  // program name + 4 paths
  if (argc == kRequiredArgc) {
    return generate_trie(argv[1], argv[2], argv[3], argv[4]);
  }
  std::cerr << "Usage: " << argv[0] << " <alphabet> <lm_model> <vocabulary> <trie_path>" << std::endl;
  return -1;
}