forked from abb128/april-asr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample_srt.cpp
278 lines (219 loc) · 8.62 KB
/
example_srt.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#include <stdio.h>
#include <cstdlib>
#include <cstring>
#include <assert.h>
#include <time.h>
#include <errno.h>
#include "april_api.h"
#ifndef _MSC_VER
#include <unistd.h>
#else
#include <io.h>
#include <BaseTsd.h>
#define STDIN_FILENO 0
typedef SSIZE_T ssize_t;
#endif
#define BUFFER_SIZE 1024
int ends_with(const char *str, const char *suffix);
struct {
int num;
} subrip;
struct wav_header {
// RIFF Header
char riff_header[4]; // Contains "RIFF"
uint32_t wav_size; // Size of the wav portion of the file, which follows the first 8 bytes. File size - 8
char wave_header[4]; // Contains "WAVE"
// Format Header
char fmt_header[4]; // Contains "fmt " (includes trailing space)
int32_t fmt_chunk_size; // Should be 16 for PCM
int16_t audio_format; // Should be 1 for PCM. 3 for IEEE Float
int16_t num_channels;
int32_t sample_rate;
int32_t byte_rate; // Number of bytes per second. sample_rate * num_channels * Bytes Per Sample
int16_t sample_alignment; // num_channels * Bytes Per Sample
int16_t bit_depth; // Number of bits per sample
// Data
char data_header[4]; // Contains "data"
uint32_t data_bytes; // Number of bytes in data. Number of samples * num_channels * sample byte size
};
static_assert(sizeof(wav_header) == 44L, "wav header must be 44 bytes");
// In this example, the internal state is just a global struct just for testing
// In your program you can pass any pointer into userdata to access it in the
// handler
struct {
int xyz;
} some_internal_state;
// This callback function will get called every time a new result is decoded.
// It's passed into the AprilConfig along with the userdata pointer.
void handler(void *userdata, AprilResultType result, size_t count, const AprilToken *tokens) {
if(result == APRIL_RESULT_RECOGNITION_FINAL) {
for(int t=0; t<count; t++){
const AprilToken *token = &tokens[t];
size_t timeStart = token->time_ms;
size_t timeEnd = 0.0;
if((t+1) < count) timeEnd = tokens[t+1].time_ms;
else timeEnd = timeStart + 2000;
subrip.num++;
printf("%d\n", subrip.num);
int start_hours = 0;
int start_minutes = 0;
int start_seconds = 0;
int start_milliseconds = 0;
while(timeStart > 3600 * 1000) {
timeStart -= 3600 * 1000;
start_hours++;
}
while(timeStart > 60 * 1000) {
timeStart -= 60 * 1000;
start_minutes++;
}
while(timeStart > 1000) {
timeStart -= 1000;
start_seconds++;
}
start_milliseconds = timeStart;
int end_hours = 0;
int end_minutes = 0;
int end_seconds = 0;
int end_milliseconds = 0;
while(timeEnd > 3600 * 1000) {
timeEnd -= 3600 * 1000;
end_hours++;
}
while(timeEnd > 60 * 1000) {
timeEnd -= 60 * 1000;
end_minutes++;
}
while(timeEnd > 1000) {
timeEnd -= 1000;
end_seconds++;
}
end_milliseconds = timeEnd;
printf("%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\n",
start_hours, start_minutes, start_seconds, start_milliseconds,
end_hours, end_minutes, end_seconds, end_milliseconds);
for(int y=0; y<=t; y++){
printf("%s", tokens[y].token);
}
printf("\n\n");
}
}
}
int main(int argc, char *argv[]){
if(argc != 3){
fprintf(stderr, "Usage: %s [file] [modelpath]\n", argv[0]);
fprintf(stderr, " - [file] must be a 16000Hz raw PCM16 file, or may be - for stdin\n");
fprintf(stderr, " - [modelpath] must be a path to the models\n");
return 1;
}
const char *input_file = argv[1];
const char *input_model = argv[2];
// In the start of our program we should call aam_api_init.
// This should only be called once.
aam_api_init(APRIL_VERSION);
// Next we should load the model. The model by itself doesn't allow us
// to do much except for get the metadata. If loading the model
// fails, NULL is returned.
AprilASRModel model = aam_create_model(input_model);
if(model == NULL){
return 1;
}
size_t model_sample_rate = aam_get_sample_rate(model);
// To do actual speech recognition, it's necessary to create a session.
// Models and sessions are separate to allow for more efficient handling
// of multiple sessions. For example, an application may be performing
// recognition on 20 different audio streams by using 20 different sessions
// and by re-using the same AprilASRModel the relative memory use is low.
// However, it's important that the AprilASRModel does not get freed
// before all of its sessions.
AprilConfig config = { 0 };
config.handler = handler;
config.userdata = (void*)&some_internal_state;
// By default, the session runs in synchronous mode. If you want async
// processing, you may choose to set it to APRIL_CONFIG_FLAG_ASYNC_RT_BIT
// here.
config.flags = APRIL_CONFIG_FLAG_ZERO_BIT;
AprilASRSession session = aas_create_session(model, config);
if(argv[1][0] == '-' && argv[1][1] == 0) {
// Reading stdin mode. It's assumed that the input data is pcm16 audio,
// sampled in the model's sample rate.
// You can achieve this on Linux like this:
// $ parec --format=s16 --rate=16000 --channels=1 --latency-ms=100 | ./main - /path/to/model.april
char data[BUFFER_SIZE];
ssize_t r;
for(;;){
r = read(STDIN_FILENO, data, BUFFER_SIZE);
if (r == -1) {
aas_flush(session);
break;
} else
if (r <= 0) {
continue;
}
aas_feed_pcm16(session, (short *)data, r/2);
}
} else if (argv[1][0] == '?' && argv[1][1] == 0) {
// Run some blank data, mainly for memory leak testing
char data[6400];
memset(data, 0, 6400);
aas_feed_pcm16(session, (short *)data, 3200);
aas_flush(session);
} else {
// wave file mode, the file must be in PCM16 format sampled in the
// model's sample rate.
FILE *fd = fopen(argv[1], "rb");
if(fd == 0){
fprintf(stderr, "Failed to open file %s\n", argv[1]);
return 2;
}
fseek(fd, 0L, SEEK_END);
size_t sz = ftell(fd);
size_t offset = 0L;
fseek(fd, 0L, SEEK_SET);
// Verify the RIFF header if supplied
if(ends_with(argv[1], ".wav")) {
wav_header header;
fread(&header, 1L, 44L, fd);
bool is_valid_wav = (header.fmt_chunk_size == 16)
&& (header.audio_format == 1)
&& (header.sample_rate == model_sample_rate)
&& (header.num_channels == 1);
if(!is_valid_wav){
fprintf(stderr, "Wave file must be single-channel 16-bit PCM sampled in %llu Hz!\n", model_sample_rate);
return 2;
}
offset = 44L;
}
fseek(fd, 0L, SEEK_SET);
// read the file
uint8_t *file = (uint8_t *)calloc(1, sz);
if (fread(file, 1, sz, fd) != sz) {
fprintf(stderr, "reading file failed\n");
return 4;
}
int16_t *file_data = (int16_t *)(file + offset);
size_t num_shorts = (sz - offset) / 2;
// For synchronous mode, it's possible to feed the entire thing at once
// For asynchronous, you may want to break it up into smaller chunks
// over time because the size of the internal buffer is limited
aas_feed_pcm16(session, file_data, num_shorts);
// Flushing makes sure any remaining frames get processed and a final
// result is given
aas_flush(session);
fprintf(stderr, "\ndone\n");
free(file);
fclose(fd);
}
aas_free(session);
aam_free(model);
return 0;
}
int ends_with(const char *str, const char *suffix) {
if (!str || !suffix)
return 0;
size_t lenstr = strlen(str);
size_t lensuffix = strlen(suffix);
if (lensuffix > lenstr)
return 0;
return strncmp(str + lenstr - lensuffix, suffix, lensuffix) == 0;
}