ES8311 + ES7210

fivif · Oct 24, 2024 · a2487f4 · a2487f4
1 parent 4c6da77
commit a2487f4
Show file tree

Hide file tree

Showing 17 changed files with 3,304 additions and 163 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -4,7 +4,7 @@
 # CMakeLists in this exact order for cmake to work correctly
 cmake_minimum_required(VERSION 3.16)
 
-set(PROJECT_VER "0.3.3")
+set(PROJECT_VER "0.4.0")
 
 include($ENV{IDF_PATH}/tools/cmake/project.cmake)
 project(xiaozhi)
diff --git a/main/Application.cc b/main/Application.cc
@@ -18,25 +18,28 @@
 
 Application::Application()
     : boot_button_((gpio_num_t)CONFIG_BOOT_BUTTON_GPIO),
-      volume_up_button_((gpio_num_t)CONFIG_VOLUME_UP_BUTTON_GPIO)
+      volume_up_button_((gpio_num_t)CONFIG_VOLUME_UP_BUTTON_GPIO),
+      volume_down_button_((gpio_num_t)CONFIG_VOLUME_DOWN_BUTTON_GPIO),
+#ifdef CONFIG_USE_DISPLAY
+      display_(CONFIG_DISPLAY_SDA_PIN, CONFIG_DISPLAY_SCL_PIN),
+#endif
 #ifdef CONFIG_USE_ML307
-    , ml307_at_modem_(CONFIG_ML307_TX_PIN, CONFIG_ML307_RX_PIN, 4096),
+      ml307_at_modem_(CONFIG_ML307_TX_PIN, CONFIG_ML307_RX_PIN, 4096),
       http_(ml307_at_modem_),
-      firmware_upgrade_(http_)
 #else
-    , http_(),
-    firmware_upgrade_(http_)
-#endif
-#ifdef CONFIG_USE_DISPLAY
-    , display_(CONFIG_DISPLAY_SDA_PIN, CONFIG_DISPLAY_SCL_PIN)
+      http_(),
 #endif
+      firmware_upgrade_(http_)
 {
     event_group_ = xEventGroupCreate();
-    
-    opus_encoder_.Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 1);
+
+    opus_encoder_.Configure(16000, 1);
     opus_decoder_ = opus_decoder_create(opus_decode_sample_rate_, 1, NULL);
     if (opus_decode_sample_rate_ != CONFIG_AUDIO_OUTPUT_SAMPLE_RATE) {
-        opus_resampler_.Configure(opus_decode_sample_rate_, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
+        output_resampler_.Configure(CONFIG_AUDIO_OUTPUT_SAMPLE_RATE, opus_decode_sample_rate_);
+    }
+    if (16000 != CONFIG_AUDIO_INPUT_SAMPLE_RATE) {
+        input_resampler_.Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 16000);
     }
 
     firmware_upgrade_.SetCheckVersionUrl(CONFIG_OTA_VERSION_URL);
@@ -185,29 +188,49 @@ void Application::Start() {
     }
 #endif
 
-    audio_device_.OnInputData([this](const int16_t* data, int size) {
+    audio_device_.Initialize();
+    audio_device_.OnInputData([this](std::vector<int16_t>&& data) {
+        if (16000 != CONFIG_AUDIO_INPUT_SAMPLE_RATE) {
+            if (audio_device_.input_channels() == 2) {
+                auto left_channel = std::vector<int16_t>(data.size() / 2);
+                auto right_channel = std::vector<int16_t>(data.size() / 2);
+                for (size_t i = 0, j = 0; i < left_channel.size(); ++i, j += 2) {
+                    left_channel[i] = data[j];
+                    right_channel[i] = data[j + 1];
+                }
+                auto resampled_left = std::vector<int16_t>(input_resampler_.GetOutputSamples(left_channel.size()));
+                auto resampled_right = std::vector<int16_t>(input_resampler_.GetOutputSamples(right_channel.size()));
+                input_resampler_.Process(left_channel.data(), left_channel.size(), resampled_left.data());
+                input_resampler_.Process(right_channel.data(), right_channel.size(), resampled_right.data());
+                data.resize(resampled_left.size() + resampled_right.size());
+                for (size_t i = 0, j = 0; i < resampled_left.size(); ++i, j += 2) {
+                    data[j] = resampled_left[i];
+                    data[j + 1] = resampled_right[i];
+                }
+            } else {
+                auto resampled = std::vector<int16_t>(input_resampler_.GetOutputSamples(data.size()));
+                input_resampler_.Process(data.data(), data.size(), resampled.data());
+                data = std::move(resampled);
+            }
+        }
 #ifdef CONFIG_USE_AFE_SR
         if (audio_processor_.IsRunning()) {
-            audio_processor_.Input(data, size);
+            audio_processor_.Input(data);
         }
         if (wake_word_detect_.IsDetectionRunning()) {
-            wake_word_detect_.Feed(data, size);
+            wake_word_detect_.Feed(data);
         }
 #else
-        std::vector<int16_t> pcm(data, data + size);
-        Schedule([this, pcm = std::move(pcm)]() {
+        Schedule([this, data = std::move(data)]() {
             if (chat_state_ == kChatStateListening) {
                 std::lock_guard<std::mutex> lock(mutex_);
-                audio_encode_queue_.emplace_back(std::move(pcm));
+                audio_encode_queue_.emplace_back(std::move(data));
                 cv_.notify_all();
             }
         });
 #endif
     });
 
-    // Initialize the audio device
-    audio_device_.Start(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
-
     // OPUS encoder / decoder use a lot of stack memory
     const size_t opus_stack_size = 4096 * 8;
     audio_encode_task_stack_ = (StackType_t*)malloc(opus_stack_size);
@@ -221,9 +244,10 @@ void Application::Start() {
         Application* app = (Application*)arg;
         app->AudioPlayTask();
         vTaskDelete(NULL);
-    }, "play_audio", 4096 * 4, this, 5, NULL);
+    }, "play_audio", 4096 * 4, this, 4, NULL);
 
 #ifdef CONFIG_USE_AFE_SR
+    wake_word_detect_.Initialize(audio_device_.input_channels(), audio_device_.input_reference());
     wake_word_detect_.OnVadStateChange([this](bool speaking) {
         Schedule([this, speaking]() {
             auto& builtin_led = BuiltinLed::GetInstance();
@@ -272,6 +296,7 @@ void Application::Start() {
     });
     wake_word_detect_.StartDetection();
 
+    audio_processor_.Initialize(audio_device_.input_channels(), audio_device_.input_reference());
     audio_processor_.OnOutput([this](std::vector<int16_t>&& data) {
         Schedule([this, data = std::move(data)]() {
             if (chat_state_ == kChatStateListening) {
@@ -317,7 +342,7 @@ void Application::Start() {
         Schedule([this]() {
             auto volume = audio_device_.output_volume() + 10;
             if (volume > 100) {
-                volume = 0;
+                volume = 100;
             }
             audio_device_.SetOutputVolume(volume);
 #ifdef CONFIG_USE_DISPLAY
@@ -327,6 +352,28 @@ void Application::Start() {
     });
 
     volume_up_button_.OnLongPress([this]() {
+        Schedule([this]() {
+            audio_device_.SetOutputVolume(100);
+#ifdef CONFIG_USE_DISPLAY
+            display_.ShowNotification("Volume\n100");
+#endif
+        });
+    });
+
+    volume_down_button_.OnClick([this]() {
+        Schedule([this]() {
+            auto volume = audio_device_.output_volume() - 10;
+            if (volume < 0) {
+                volume = 0;
+            }
+            audio_device_.SetOutputVolume(volume);
+#ifdef CONFIG_USE_DISPLAY
+            display_.ShowNotification("Volume\n" + std::to_string(volume));
+#endif
+        });
+    });
+
+    volume_down_button_.OnLongPress([this]() {
         Schedule([this]() {
             audio_device_.SetOutputVolume(0);
 #ifdef CONFIG_USE_DISPLAY
@@ -449,10 +496,12 @@ BinaryProtocol* Application::AllocateBinaryProtocol(const uint8_t* payload, size
 
 void Application::AudioEncodeTask() {
     ESP_LOGI(TAG, "Audio encode task started");
+    const int max_audio_play_queue_size_ = 2;
+
     while (true) {
         std::unique_lock<std::mutex> lock(mutex_);
         cv_.wait(lock, [this]() {
-            return !audio_encode_queue_.empty() || !audio_decode_queue_.empty();
+            return !audio_encode_queue_.empty() || (!audio_decode_queue_.empty() && audio_play_queue_.size() < max_audio_play_queue_size_);
         });
 
         if (!audio_encode_queue_.empty()) {
@@ -488,9 +537,9 @@ void Application::AudioEncodeTask() {
             }
 
             if (opus_decode_sample_rate_ != CONFIG_AUDIO_OUTPUT_SAMPLE_RATE) {
-                int target_size = opus_resampler_.GetOutputSamples(frame_size);
+                int target_size = output_resampler_.GetOutputSamples(frame_size);
                 std::vector<int16_t> resampled(target_size);
-                opus_resampler_.Process(packet->pcm.data(), frame_size, resampled.data());
+                output_resampler_.Process(packet->pcm.data(), frame_size, resampled.data());
                 packet->pcm = std::move(resampled);
             }
 
@@ -513,7 +562,6 @@ void Application::HandleAudioPacket(AudioPacket* packet) {
         audio_device_.OutputData(packet->pcm);
 
         if (break_speaking_) {
-            break_speaking_ = false;
             skip_to_end_ = true;
 
             // Play a silence and skip to the end
@@ -525,12 +573,13 @@ void Application::HandleAudioPacket(AudioPacket* packet) {
         break;
     }
     case kAudioPacketTypeStart:
+        break_speaking_ = false;
+        skip_to_end_ = false;
         Schedule([this]() {
             SetChatState(kChatStateSpeaking);
         });
         break;
     case kAudioPacketTypeStop:
-        skip_to_end_ = false;
         Schedule([this]() {
             SetChatState(kChatStateListening);
         });
@@ -558,6 +607,7 @@ void Application::AudioPlayTask() {
         });
         auto packet = std::move(audio_play_queue_.front());
         audio_play_queue_.pop_front();
+        cv_.notify_all();
         lock.unlock();
 
         HandleAudioPacket(packet);
@@ -574,7 +624,7 @@ void Application::SetDecodeSampleRate(int sample_rate) {
     opus_decoder_ = opus_decoder_create(opus_decode_sample_rate_, 1, NULL);
     if (opus_decode_sample_rate_ != CONFIG_AUDIO_OUTPUT_SAMPLE_RATE) {
         ESP_LOGI(TAG, "Resampling audio from %d to %d", opus_decode_sample_rate_, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
-        opus_resampler_.Configure(opus_decode_sample_rate_, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
+        output_resampler_.Configure(opus_decode_sample_rate_, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
     }
 }
 
@@ -607,7 +657,7 @@ void Application::StartWebSocketClient() {
         std::string message = "{";
         message += "\"type\":\"hello\",";
         message += "\"audio_params\":{";
-        message += "\"format\":\"opus\", \"sample_rate\":" + std::to_string(CONFIG_AUDIO_INPUT_SAMPLE_RATE) + ", \"channels\":1";
+        message += "\"format\":\"opus\", \"sample_rate\":16000, \"channels\":1";
         message += "}}";
         ws_client_->Send(message);
     });
@@ -640,6 +690,10 @@ void Application::StartWebSocketClient() {
                         if (sample_rate != NULL) {
                             SetDecodeSampleRate(sample_rate->valueint);
                         }
+
+                        // If the device is still speaking, interrupt the playback in progress
+                        break_speaking_ = true;
+                        skip_to_end_ = true;
                     } else if (strcmp(state->valuestring, "stop") == 0) {
                         packet->type = kAudioPacketTypeStop;
                     } else if (strcmp(state->valuestring, "sentence_end") == 0) {

diff --git a/main/Application.h b/main/Application.h
@@ -1,7 +1,6 @@
 #ifndef _APPLICATION_H_
 #define _APPLICATION_H_
 
-#include "AudioDevice.h"
 #include <OpusEncoder.h>
 #include <OpusResampler.h>
 #include <WebSocket.h>
@@ -17,6 +16,7 @@
 #include <list>
 #include <condition_variable>
 
+#include "BoxAudioDevice.h"
 #include "Display.h"
 #include "FirmwareUpgrade.h"
 
@@ -86,7 +86,15 @@ class Application {
 
     Button boot_button_;
     Button volume_up_button_;
+    Button volume_down_button_;
+#ifdef CONFIG_AUDIO_CODEC_ES8311_ES7210
+    BoxAudioDevice audio_device_;
+#else
     AudioDevice audio_device_;
+#endif
+#ifdef CONFIG_USE_DISPLAY
+    Display display_;
+#endif
 #ifdef CONFIG_USE_AFE_SR
     WakeWordDetect wake_word_detect_;
     AudioProcessor audio_processor_;
@@ -98,9 +106,6 @@ class Application {
     EspHttp http_;
 #endif
     FirmwareUpgrade firmware_upgrade_;
-#ifdef CONFIG_USE_DISPLAY
-    Display display_;
-#endif
     std::mutex mutex_;
     std::condition_variable_any cv_;
     std::list<std::function<void()>> main_tasks_;
@@ -123,7 +128,8 @@ class Application {
 
     int opus_duration_ms_ = 60;
     int opus_decode_sample_rate_ = CONFIG_AUDIO_OUTPUT_SAMPLE_RATE;
-    OpusResampler opus_resampler_;
+    OpusResampler input_resampler_;
+    OpusResampler output_resampler_;
 
     TaskHandle_t check_new_version_task_ = nullptr;
     StaticTask_t check_new_version_task_buffer_;