Skip to content

Commit

Permalink
Allow to continue after final result. See for discussion
Browse files Browse the repository at this point in the history
  • Loading branch information
nshmyrev committed Jun 22, 2020
1 parent c428162 commit e5c08f7
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 46 deletions.
96 changes: 54 additions & 42 deletions src/kaldi_recognizer.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2019 Alpha Cephei Inc.
// Copyright 2019-2020 Alpha Cephei Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -44,12 +44,9 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_(
model_->hclg_fst_ ? *model_->hclg_fst_ : *decode_fst_,
feature_pipeline_);

frame_offset_ = 0;
samples_processed_ = 0;
samples_round_start_ = 0;
input_finalized_ = false;
spk_feature_ = NULL;

InitState();
InitRescoring();
}

Expand Down Expand Up @@ -90,12 +87,9 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char cons
model_->hclg_fst_ ? *model_->hclg_fst_ : *decode_fst_,
feature_pipeline_);

frame_offset_ = 0;
samples_processed_ = 0;
samples_round_start_ = 0;
input_finalized_ = false;
spk_feature_ = NULL;

InitState();
InitRescoring();
}

Expand Down Expand Up @@ -124,13 +118,9 @@ KaldiRecognizer::KaldiRecognizer(Model *model, SpkModel *spk_model, float sample
model_->hclg_fst_ ? *model_->hclg_fst_ : *decode_fst_,
feature_pipeline_);

frame_offset_ = 0;
samples_processed_ = 0;
samples_round_start_ = 0;
input_finalized_ = false;

spk_feature_ = new OnlineMfcc(spk_model_->spkvector_mfcc_opts);

InitState();
InitRescoring();
}

Expand All @@ -148,6 +138,15 @@ KaldiRecognizer::~KaldiRecognizer() {
spk_model_->Unref();
}

void KaldiRecognizer::InitState()
{
frame_offset_ = 0;
samples_processed_ = 0;
samples_round_start_ = 0;

state_ = RECOGNIZER_INITIALIZED;
}

void KaldiRecognizer::InitRescoring()
{
if (model_->std_lm_fst_) {
Expand Down Expand Up @@ -178,7 +177,7 @@ void KaldiRecognizer::CleanUp()

// Also restart if we retrieved final result already

if (frame_offset_ > 20000 || input_finalized_) {
if (frame_offset_ > 20000 || state_ == RECOGNIZER_FINALIZED) {
samples_round_start_ += samples_processed_;
frame_offset_ = 0;

Expand Down Expand Up @@ -238,10 +237,11 @@ bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len)

bool KaldiRecognizer::AcceptWaveform(Vector<BaseFloat> &wdata)
{
if (input_finalized_) {
// Cleanup if we finalized previous utterance or the whole feature pipeline
if (!(state_ == RECOGNIZER_RUNNING || state_ == RECOGNIZER_INITIALIZED)) {
CleanUp();
input_finalized_ = false;
}
state_ = RECOGNIZER_RUNNING;

int step = static_cast<int>(sample_frequency_ * 0.2);
for (int i = 0; i < wdata.Dim(); i+= step) {
Expand Down Expand Up @@ -323,17 +323,10 @@ void KaldiRecognizer::GetSpkVector(Vector<BaseFloat> &xvector)
RunNnetComputation(features, spk_model_->speaker_nnet, &compiler, &xvector);
}

const char* KaldiRecognizer::Result()
const char* KaldiRecognizer::GetResult()
{

if (!input_finalized_) {
decoder_->FinalizeDecoding();
input_finalized_ = true;
}

if (decoder_->NumFramesDecoded() == 0) {
last_result_ = "{\"text\": \"\"}";
return last_result_.c_str();
return StoreReturn("{\"text\": \"\"}");
}

kaldi::CompactLattice clat;
Expand Down Expand Up @@ -405,17 +398,21 @@ const char* KaldiRecognizer::Result()
}
}

last_result_ = obj.dump();
return last_result_.c_str();
return StoreReturn(obj.dump());
}


const char* KaldiRecognizer::PartialResult()
{
if (state_ != RECOGNIZER_RUNNING) {
return StoreReturn("{\"text\": \"\"}");
}

json::JSON res;

if (decoder_->NumFramesDecoded() == 0) {
res["partial"] = "";
last_result_ = res.dump();
return last_result_.c_str();
return StoreReturn(res.dump());
}

kaldi::Lattice lat;
Expand All @@ -433,21 +430,36 @@ const char* KaldiRecognizer::PartialResult()
}
res["partial"] = text.str();

last_result_ = res.dump();
return last_result_.c_str();
return StoreReturn(res.dump());
}

const char* KaldiRecognizer::Result()
{
if (state_ != RECOGNIZER_RUNNING) {
return StoreReturn("{\"text\": \"\"}");
}
decoder_->FinalizeDecoding();
state_ = RECOGNIZER_ENDPOINT;
return GetResult();
}

const char* KaldiRecognizer::FinalResult()
{
if (!input_finalized_) {
feature_pipeline_->InputFinished();
UpdateSilenceWeights();
decoder_->AdvanceDecoding();
decoder_->FinalizeDecoding();
input_finalized_ = true;
return Result();
} else {
last_result_ = "{\"text\": \"\"}";
return last_result_.c_str();
if (state_ != RECOGNIZER_RUNNING) {
return StoreReturn("{\"text\": \"\"}");
}

feature_pipeline_->InputFinished();
UpdateSilenceWeights();
decoder_->AdvanceDecoding();
decoder_->FinalizeDecoding();
state_ = RECOGNIZER_FINALIZED;
return GetResult();
}

// Store result in recognizer and return as const string
const char *KaldiRecognizer::StoreReturn(const string &res)
{
last_result_ = res;
return last_result_.c_str();
}
12 changes: 11 additions & 1 deletion src/kaldi_recognizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@

using namespace kaldi;

enum KaldiRecognizerState {
RECOGNIZER_INITIALIZED,
RECOGNIZER_RUNNING,
RECOGNIZER_ENDPOINT,
RECOGNIZER_FINALIZED
};

class KaldiRecognizer {
public:
KaldiRecognizer(Model *model, float sample_frequency);
Expand All @@ -43,11 +50,14 @@ class KaldiRecognizer {
const char* PartialResult();

private:
void InitState();
void InitRescoring();
void CleanUp();
void UpdateSilenceWeights();
bool AcceptWaveform(Vector<BaseFloat> &wdata);
void GetSpkVector(Vector<BaseFloat> &xvector);
const char *GetResult();
const char *StoreReturn(const string &res);

Model *model_;
SingleUtteranceNnet3Decoder *decoder_;
Expand All @@ -67,6 +77,6 @@ class KaldiRecognizer {
int64 samples_processed_;
int64 samples_round_start_;

bool input_finalized_;
KaldiRecognizerState state_;
string last_result_;
};
5 changes: 2 additions & 3 deletions src/vosk_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,8 @@ const char *vosk_recognizer_partial_result(VoskRecognizer *recognizer);


/** Returns speech recognition result. Same as result, but doesn't wait for silence
* You usually call it in the end of the stream to get final bits. You can not
* recognize data after final result is retrieved, you have to release the recognizer
* and create a new one.
* You usually call it in the end of the stream to get final bits of audio. It
* flushes the feature pipeline, so all remaining audio chunks got processed.
*
* @returns speech result in JSON format.
*/
Expand Down

0 comments on commit e5c08f7

Please sign in to comment.