Allow to continue after final result. See for discussion

alphacep#146
dsjonline · Jun 22, 2020 · e5c08f7 · e5c08f7
1 parent c428162
commit e5c08f7
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 46 deletions.
diff --git a/src/kaldi_recognizer.cc b/src/kaldi_recognizer.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 Alpha Cephei Inc.
+// Copyright 2019-2020 Alpha Cephei Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -44,12 +44,9 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_(
             model_->hclg_fst_ ? *model_->hclg_fst_ : *decode_fst_,
             feature_pipeline_);
 
-    frame_offset_ = 0;
-    samples_processed_ = 0;
-    samples_round_start_ = 0;
-    input_finalized_ = false;
     spk_feature_ = NULL;
 
+    InitState();
     InitRescoring();
 }
 
@@ -90,12 +87,9 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char cons
             model_->hclg_fst_ ? *model_->hclg_fst_ : *decode_fst_,
             feature_pipeline_);
 
-    frame_offset_ = 0;
-    samples_processed_ = 0;
-    samples_round_start_ = 0;
-    input_finalized_ = false;
     spk_feature_ = NULL;
 
+    InitState();
     InitRescoring();
 }
 
@@ -124,13 +118,9 @@ KaldiRecognizer::KaldiRecognizer(Model *model, SpkModel *spk_model, float sample
             model_->hclg_fst_ ? *model_->hclg_fst_ : *decode_fst_,
             feature_pipeline_);
 
-    frame_offset_ = 0;
-    samples_processed_ = 0;
-    samples_round_start_ = 0;
-    input_finalized_ = false;
-
     spk_feature_ = new OnlineMfcc(spk_model_->spkvector_mfcc_opts);
 
+    InitState();
     InitRescoring();
 }
 
@@ -148,6 +138,15 @@ KaldiRecognizer::~KaldiRecognizer() {
          spk_model_->Unref();
 }
 
+void KaldiRecognizer::InitState()
+{
+    frame_offset_ = 0;
+    samples_processed_ = 0;
+    samples_round_start_ = 0;
+
+    state_ = RECOGNIZER_INITIALIZED;
+}
+
 void KaldiRecognizer::InitRescoring()
 {
     if (model_->std_lm_fst_) {
@@ -178,7 +177,7 @@ void KaldiRecognizer::CleanUp()
 
     // Also restart if we retrieved final result already
 
-    if (frame_offset_ > 20000 || input_finalized_) {
+    if (frame_offset_ > 20000 || state_ == RECOGNIZER_FINALIZED) {
         samples_round_start_ += samples_processed_;
         frame_offset_ = 0;
 
@@ -238,10 +237,11 @@ bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len)
 
 bool KaldiRecognizer::AcceptWaveform(Vector<BaseFloat> &wdata)
 {
-    if (input_finalized_) {
+    // Cleanup if we finalized previous utterance or the whole feature pipeline
+    if (!(state_ == RECOGNIZER_RUNNING || state_ == RECOGNIZER_INITIALIZED)) {
         CleanUp();
-        input_finalized_ = false;
     }
+    state_ = RECOGNIZER_RUNNING;
 
     int step = static_cast<int>(sample_frequency_ * 0.2);
     for (int i = 0; i < wdata.Dim(); i+= step) {
@@ -323,17 +323,10 @@ void KaldiRecognizer::GetSpkVector(Vector<BaseFloat> &xvector)
     RunNnetComputation(features, spk_model_->speaker_nnet, &compiler, &xvector);
 }
 
-const char* KaldiRecognizer::Result()
+const char* KaldiRecognizer::GetResult()
 {
-
-    if (!input_finalized_) {
-        decoder_->FinalizeDecoding();
-        input_finalized_ = true;
-    }
-
     if (decoder_->NumFramesDecoded() == 0) {
-        last_result_ = "{\"text\": \"\"}";
-        return last_result_.c_str();
+        return StoreReturn("{\"text\": \"\"}");
     }
 
     kaldi::CompactLattice clat;
@@ -405,17 +398,21 @@ const char* KaldiRecognizer::Result()
         }
     }
 
-    last_result_ = obj.dump();
-    return last_result_.c_str();
+    return StoreReturn(obj.dump());
 }
 
+
 const char* KaldiRecognizer::PartialResult()
 {
+    if (state_ != RECOGNIZER_RUNNING) {
+        return StoreReturn("{\"text\": \"\"}");
+    }
+
     json::JSON res;
+
     if (decoder_->NumFramesDecoded() == 0) {
         res["partial"] = "";
-        last_result_ = res.dump();
-        return last_result_.c_str();
+        return StoreReturn(res.dump());
     }
 
     kaldi::Lattice lat;
@@ -433,21 +430,36 @@ const char* KaldiRecognizer::PartialResult()
     }
     res["partial"] = text.str();
 
-    last_result_ = res.dump();
-    return last_result_.c_str();
+    return StoreReturn(res.dump());
+}
+
+const char* KaldiRecognizer::Result()
+{
+    if (state_ != RECOGNIZER_RUNNING) {
+        return StoreReturn("{\"text\": \"\"}");
+    }
+    decoder_->FinalizeDecoding();
+    state_ = RECOGNIZER_ENDPOINT;
+    return GetResult();
 }
 
 const char* KaldiRecognizer::FinalResult()
 {
-    if (!input_finalized_) {
-        feature_pipeline_->InputFinished();
-        UpdateSilenceWeights();
-        decoder_->AdvanceDecoding();
-        decoder_->FinalizeDecoding();
-        input_finalized_ = true;
-        return Result();
-    } else {
-        last_result_ = "{\"text\": \"\"}";
-        return last_result_.c_str();
+    if (state_ != RECOGNIZER_RUNNING) {
+        return StoreReturn("{\"text\": \"\"}");
     }
+
+    feature_pipeline_->InputFinished();
+    UpdateSilenceWeights();
+    decoder_->AdvanceDecoding();
+    decoder_->FinalizeDecoding();
+    state_ = RECOGNIZER_FINALIZED;
+    return GetResult();
+}
+
+// Store result in recognizer and return as const string
+const char *KaldiRecognizer::StoreReturn(const string &res)
+{
+    last_result_ = res;
+    return last_result_.c_str();
 }
diff --git a/src/kaldi_recognizer.h b/src/kaldi_recognizer.h
@@ -29,6 +29,13 @@
 
 using namespace kaldi;
 
+enum KaldiRecognizerState {
+    RECOGNIZER_INITIALIZED,
+    RECOGNIZER_RUNNING,
+    RECOGNIZER_ENDPOINT,
+    RECOGNIZER_FINALIZED
+};
+
 class KaldiRecognizer {
     public:
         KaldiRecognizer(Model *model, float sample_frequency);
@@ -43,11 +50,14 @@ class KaldiRecognizer {
         const char* PartialResult();
 
     private:
+        void InitState();
         void InitRescoring();
         void CleanUp();
         void UpdateSilenceWeights();
         bool AcceptWaveform(Vector<BaseFloat> &wdata);
         void GetSpkVector(Vector<BaseFloat> &xvector);
+        const char *GetResult();
+        const char *StoreReturn(const string &res);
 
         Model *model_;
         SingleUtteranceNnet3Decoder *decoder_;
@@ -67,6 +77,6 @@ class KaldiRecognizer {
         int64 samples_processed_;
         int64 samples_round_start_;
 
-        bool input_finalized_;
+        KaldiRecognizerState state_;
         string last_result_;
 };
diff --git a/src/vosk_api.h b/src/vosk_api.h
@@ -181,9 +181,8 @@ const char *vosk_recognizer_partial_result(VoskRecognizer *recognizer);
 
 
 /** Returns speech recognition result. Same as result, but doesn't wait for silence
- *  You usually call it in the end of the stream to get final bits. You can not
- *  recognize data after final result is retrieved, you have to release the recognizer
- *  and create a new one.
+ *  You usually call it in the end of the stream to get final bits of audio. It
+ *  flushes the feature pipeline, so all remaining audio chunks got processed.
  *
  *  @returns speech result in JSON format.
  */