forked from alphacep/vosk-api
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vosk_api.h
211 lines (173 loc) · 6.93 KB
/
vosk_api.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
// Copyright 2020 Alpha Cephei Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* This header contains the C API for Vosk speech recognition system */
#ifndef VOSK_API_H
#define VOSK_API_H
#ifdef __cplusplus
extern "C" {
#endif
/** Model stores all the data required for recognition
* it contains static data and can be shared across processing
* threads. */
typedef struct VoskModel VoskModel;
/** Speaker model is the same as model but contains the data
* for speaker identification. */
typedef struct VoskSpkModel VoskSpkModel;
/** Recognizer object is the main object which processes data.
* Each recognizer usually runs in own thread and takes audio as input.
* Once audio is processed recognizer returns JSON object as a string
* which represent decoded information - words, confidences, times, n-best lists,
* speaker information and so on */
typedef struct VoskRecognizer VoskRecognizer;
/** Loads model data from the file and returns the model object
*
* @param model_path: the path of the model on the filesystem
@ @returns model object */
VoskModel *vosk_model_new(const char *model_path);
/** Releases the model memory
*
* The model object is reference-counted so if some recognizer
* depends on this model, model might still stay alive. When
* last recognizer is released, model will be released too. */
void vosk_model_free(VoskModel *model);
/** Loads speaker model data from the file and returns the model object
*
* @param model_path: the path of the model on the filesystem
* @returns model object */
VoskSpkModel *vosk_spk_model_new(const char *model_path);
/** Releases the model memory
*
* The model object is reference-counted so if some recognizer
* depends on this model, model might still stay alive. When
* last recognizer is released, model will be released too. */
void vosk_spk_model_free(VoskSpkModel *model);
/** Creates the recognizer object
*
* The recognizers process the speech and return text using shared model data
* @param sample_rate The sample rate of the audio you going to feed into the recognizer
* @returns recognizer object */
VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate);
/** Creates the recognizer object with speaker recognition
*
* With the speaker recognition mode the recognizer not just recognize
* text but also return speaker vectors one can use for speaker identification
*
* @param spk_model speaker model for speaker identification
* @param sample_rate The sample rate of the audio you going to feed into the recognizer
* @returns recognizer object */
VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, VoskSpkModel *spk_model, float sample_rate);
/** Creates the recognizer object with the phrase list
*
* Sometimes when you want to improve recognition accuracy and when you don't need
* to recognize large vocabulary you can specify a list of phrases to recognize. This
* will improve recognizer speed and accuracy but might return [unk] if user said
* something different.
*
* Only recognizers with lookahead models support this type of quick configuration.
* Precompiled HCLG graph models are not supported.
*
* @param sample_rate The sample rate of the audio you going to feed into the recognizer
* @param grammar The string with the list of phrases to recognize as JSON array of strings,
* for example "["one two three four five", "[unk]"]".
*
* @returns recognizer object */
VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, const char *grammar);
/** Accept voice data
*
* accept and process new chunk of voice data
*
* @param data - audio data in PCM 16-bit mono format
* @param length - length of the audio data
* @returns true if silence is occured and you can retrieve a new utterance with result method */
int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data, int length);
/** Same as above but the version with the short data for language bindings where you have
* audio as array of shorts */
int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *data, int length);
/** Same as above but the version with the float data for language bindings where you have
* audio as array of floats */
int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *data, int length);
/** Returns speech recognition result
*
* @returns the result in JSON format which contains decoded line, decoded
* words, times in seconds and confidences. You can parse this result
* with any json parser
*
* <pre>
* {
* "result" : [{
* "conf" : 1.000000,
* "end" : 1.110000,
* "start" : 0.870000,
* "word" : "what"
* }, {
* "conf" : 1.000000,
* "end" : 1.530000,
* "start" : 1.110000,
* "word" : "zero"
* }, {
* "conf" : 1.000000,
* "end" : 1.950000,
* "start" : 1.530000,
* "word" : "zero"
* }, {
* "conf" : 1.000000,
* "end" : 2.340000,
* "start" : 1.950000,
* "word" : "zero"
* }, {
* "conf" : 1.000000,
* "end" : 2.610000,
* "start" : 2.340000,
* "word" : "one"
* }],
* "text" : "what zero zero zero one"
* }
* </pre>
*/
const char *vosk_recognizer_result(VoskRecognizer *recognizer);
/** Returns partial speech recognition
*
* @returns partial speech recognition text which is not yet finalized.
* result may change as recognizer process more data.
*
* <pre>
* {
* "partial" : "cyril one eight zero"
* }
* </pre>
*/
const char *vosk_recognizer_partial_result(VoskRecognizer *recognizer);
/** Returns speech recognition result. Same as result, but doesn't wait for silence
* You usually call it in the end of the stream to get final bits of audio. It
* flushes the feature pipeline, so all remaining audio chunks got processed.
*
* @returns speech result in JSON format.
*/
const char *vosk_recognizer_final_result(VoskRecognizer *recognizer);
/** Releases recognizer object
*
* Underlying model is also unreferenced and if needed released */
void vosk_recognizer_free(VoskRecognizer *recognizer);
/** Set log level for Kaldi messages
*
* @param log_level the level
* 0 - default value to print info and error messages but no debug
* less than 0 - don't print info messages
* greather than 0 - more verbose mode
*/
void vosk_set_log_level(int log_level);
#ifdef __cplusplus
}
#endif
#endif /* VOSK_API_H */