Skip to content

Commit

Permalink
stream : add "max_tokens" parameter
Browse files Browse the repository at this point in the history
Used to limit the number of tokens in a segment.
Useful for combating word repetition when using a partial encoder context.
  • Loading branch information
ggerganov committed Nov 20, 2022
1 parent d351771 commit 62b5ff8
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 2 deletions.
1 change: 1 addition & 0 deletions examples/stream/stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ int main(int argc, char ** argv) {
{
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

wparams.max_tokens = 32;
wparams.print_progress = false;
wparams.print_special_tokens = params.print_special_tokens;
wparams.print_realtime = false;
Expand Down
4 changes: 3 additions & 1 deletion whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2402,6 +2402,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.thold_pt =*/ 0.01f,
/*.thold_ptsum =*/ 0.01f,
/*.max_len =*/ 0,
/*.max_tokens =*/ 0,

/*.speed_up =*/ false,

Expand Down Expand Up @@ -2443,6 +2444,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.thold_pt =*/ 0.01f,
/*.thold_ptsum =*/ 0.01f,
/*.max_len =*/ 0,
/*.max_tokens =*/ 0,

/*.speed_up =*/ false,

Expand Down Expand Up @@ -2685,7 +2687,7 @@ int whisper_full(
//}

// end of text token
if (token.id == whisper_token_eot(ctx) || (i > WHISPER_EXPERIMENT_MAX_TOKENS_PER_SEGMENT)) {
if (token.id == whisper_token_eot(ctx) || (params.max_tokens > 0 && i > params.max_tokens)) {
if (result_len == 0) {
if (seek + seek_delta + 100 >= seek_end) {
result_len = i + 1;
Expand Down
2 changes: 1 addition & 1 deletion whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
#define WHISPER_CHUNK_SIZE 30

#define WHISPER_EXPERIMENT_AUDIO_CTX 512
#define WHISPER_EXPERIMENT_MAX_TOKENS_PER_SEGMENT 32

#ifdef __cplusplus
extern "C" {
Expand Down Expand Up @@ -205,6 +204,7 @@ extern "C" {
float thold_pt; // timestamp token probability threshold (~0.01)
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
int max_len; // max segment length in characters
int max_tokens; // max tokens per segment (0 = no limit)

// [EXPERIMENTAL] speed-up techniques
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
Expand Down

0 comments on commit 62b5ff8

Please sign in to comment.