log audio file after ocr

andreyadrian · Apr 14, 2021 · 0a4aee4 · 0a4aee4
1 parent 6451fd5
commit 0a4aee4
Show file tree

Hide file tree

Showing 12 changed files with 226 additions and 69 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,7 @@
 
 # Logs
 logs/text/*.txt
+logs/audio/
 
 # Debian Build
 build/

diff --git a/audio.py b/audio.py
@@ -2,71 +2,111 @@
 import wave 
 import os
 from config import r_config, LOG_CONFIG
+from recordaudio import RecordThread
+from time import sleep
 
-p = pyaudio.PyAudio()
-
-#Set default to first in list or ask Windows
-try:
-    default_device_index = p.get_default_input_device_info()
-except IOError:
-    default_device_index = -1
+def get_default_device_index():  # index of device 0, or -1 when there is no default input device
+    p = pyaudio.PyAudio()
+    try:
+        p.get_default_input_device_info()  # probe only: raises IOError without a default input
+    except IOError:
+        return -1
+    else:
+        return p.get_device_info_by_index(0)["index"]
+    finally:
+        p.terminate()  # runs on both returns, so the instance no longer leaks on -1
 
 #Select Device
 # print ("Available devices:\n")
 def get_audio_objects():
+    p = pyaudio.PyAudio()
     audio_objects = {}
     for i in range(0, p.get_device_count()):
         info = p.get_device_info_by_index(i)
         audio_host = p.get_host_api_info_by_index(info["hostApi"])["name"]
         if valid_output_device(info["index"]):
             audio_objects.setdefault(audio_host,[]).append({info["index"]: info["name"]})
+    p.terminate()
     return audio_objects
 
 def valid_output_device(deviceIndex):
     if not isinstance(deviceIndex, int):
         return False
+    p = pyaudio.PyAudio()
     device_info = p.get_device_info_by_index(deviceIndex)
     is_input = device_info["maxInputChannels"] > 0
     is_wasapi = (p.get_host_api_info_by_index(device_info["hostApi"])["name"]).find("WASAPI") != -1
+    p.terminate()
     if is_input:
         if is_wasapi:
             return True
         else:
             return False
     return True
 
-def record_audio_by_device_index(deviceIndex, duration):
-    frames = int(r_config(LOG_CONFIG, "logaudioframes"))
-    device_info = p.get_device_info_by_index(deviceIndex)
-    is_input = device_info["maxInputChannels"] > 0
-    is_wasapi = (p.get_host_api_info_by_index(device_info["hostApi"])["name"]).find("WASAPI") != -1
-    useloopback = is_wasapi and not is_input
-    recorded_frames = []
-    # Open stream
-    channelcount = device_info["maxInputChannels"] if (device_info["maxOutputChannels"] < device_info["maxInputChannels"]) else device_info["maxOutputChannels"]
-    stream = p.open(format = pyaudio.paInt16,   
-                channels = channelcount,
-                rate = int(device_info["defaultSampleRate"]),
-                input = True,
-                frames_per_buffer = frames,
-                input_device_index = device_info["index"],
-                as_loopback = useloopback)
+def play_audio_from_file(filename):
+    """Play a wave file by writing it to a PyAudio output stream chunk by chunk.
+
+    filename: path handed to ``wave.open``; the file is closed again on exit.
+    """
+    print("gonna play some", filename)
+    # length of data to read.
+    chunk = 1024
+    # open the file for reading; closed again in the outer ``finally``.
+    wf = wave.open(filename, 'rb')
+    try:
+        # create an audio object
+        p = pyaudio.PyAudio()
+        try:
+            # open stream based on the wave object which has been input.
+            stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
+                            channels=wf.getnchannels(),
+                            rate=wf.getframerate(),
+                            output=True)
+            try:
+                # readframes returns b'' at end of file, which ends the loop.
+                data = wf.readframes(chunk)
+                while data != b'':
+                    stream.write(data)
+                    data = wf.readframes(chunk)
+            finally:
+                stream.close()
+        finally:
+            p.terminate()
+    finally:
+        wf.close()
 
-    # Start recording
-    print("started record")
-    for i in range(0, int(int(device_info["defaultSampleRate"]) / frames * duration)):
-        recorded_frames.append(stream.read(frames))
+# def record_audio_by_device_index(deviceIndex, duration):
+#     frames = int(r_config(LOG_CONFIG, "logaudioframes"))
+#     device_info = p.get_device_info_by_index(deviceIndex)
+#     is_input = device_info["maxInputChannels"] > 0
+#     is_wasapi = (p.get_host_api_info_by_index(device_info["hostApi"])["name"]).find("WASAPI") != -1
+#     useloopback = is_wasapi and not is_input
+#     recorded_frames = []
+#     # Open stream
+#     channelcount = device_info["maxInputChannels"] if (device_info["maxOutputChannels"] < device_info["maxInputChannels"]) else device_info["maxOutputChannels"]
+#     stream = p.open(format = pyaudio.paInt16,   
+#                 channels = channelcount,
+#                 rate = int(device_info["defaultSampleRate"]),
+#                 input = True,
+#                 frames_per_buffer = frames,
+#                 input_device_index = device_info["index"],
+#                 as_loopback = useloopback)
 
-    # Stop Recording
-    print("stopped record")
-    stream.stop_stream()
-    stream.close()
+#     # Start recording
+#     print("started record")
+#     for i in range(0, int(int(device_info["defaultSampleRate"]) / frames * duration)):
+#         recorded_frames.append(stream.read(frames))
 
-    filename = "out.wav"
-    waveFile = wave.open(filename, 'wb')
-    waveFile.setnchannels(channelcount)
-    waveFile.setsampwidth(p.get_sample_size(pyaudio.paInt16))
-    waveFile.setframerate(int(device_info["defaultSampleRate"]))
-    waveFile.writeframes(b''.join(recorded_frames))
-    waveFile.close()
+#     # Stop Recording
+#     print("stopped record")
+#     stream.stop_stream()
+#     stream.close()
 
+#     filename = "out.wav"
+#     waveFile = wave.open(filename, 'wb')
+#     waveFile.setnchannels(channelcount)
+#     waveFile.setsampwidth(p.get_sample_size(pyaudio.paInt16))
+#     waveFile.setframerate(int(device_info["defaultSampleRate"]))
+#     waveFile.writeframes(b''.join(recorded_frames))
+#     waveFile.close()
diff --git a/config.ini b/config.ini
@@ -24,6 +24,8 @@ target_lang = EN
 logimages = true
 logimagetype = jpg
 logimagequality = 1.0
+logaudio = true
+logaudioduration = 7.0
 logaudiohost = Windows WASAPI
 logaudioframes = 512
 

diff --git a/game2text.py b/game2text.py
@@ -4,11 +4,12 @@
 from translate import deepl_translate
 from hotkeys import refresh_ocr_hotkey, esc_hotkey
 from util import RepeatedTimer, open_folder_by_relative_path
-from audio import get_audio_objects, record_audio_by_device_index
+from audio import get_default_device_index, get_audio_objects
+from recordaudio import RecordThread
 from pynput import keyboard
 from clipboard import clipboard_to_output, text_to_clipboard
 from logger import get_time_string
-from config import r_config, w_config, WINDOWS_HOTKEYS_CONFIG, APP_CONFIG
+from config import r_config, w_config, WINDOWS_HOTKEYS_CONFIG, APP_CONFIG, LOG_CONFIG
 
 session_start_time = get_time_string()
 
@@ -22,8 +23,8 @@ def close(page, sockets):
       os._exit(0)
 
 @eel.expose                         # Expose this function to Javascript
-def recognize_image(engine, image, orientation, log_images):
-    return detect_and_log(engine, image, orientation, session_start_time, get_time_string(), log_images)
+def recognize_image(engine, image, orientation):
+    return detect_and_log(engine, image, orientation, session_start_time, get_time_string(), audio_recorder)
 
 @eel.expose                         # Expose this function to Javascript
 def translate(text):
@@ -36,6 +37,14 @@ def monitor_clipboard():
     else:
         clipboard_timer.start()
 
+@eel.expose
+def restart_audio_recording(device_index=get_default_device_index()):
+    global audio_recorder  # the module-level recorder is replaced by a fresh thread below
+    if audio_recorder.bRecord:  # still recording: stop it first; duration -1 skips the save
+        audio_recorder.stop_recording(None, -1)
+    audio_recorder = RecordThread(device_index, int(r_config(LOG_CONFIG, "logaudioframes")))
+    audio_recorder.start()
+
 @eel.expose
 def copy_text_to_clipboard(text):
     text_to_clipboard(text)
@@ -56,10 +65,6 @@ def open_folder(relative_path):
 def get_audio_sources():
     return get_audio_objects()
 
-@eel.expose
-def record_audio(device_index, duration):
-    return record_audio_by_device_index(device_index, duration)
-
 @eel.expose
 def open_new_window(html_file, height=800, width=600):
     eel.start(html_file, 
@@ -81,9 +86,15 @@ def run_eel():
 
 main_thread = threading.Thread(target=run_eel, args=())
 main_thread.start()
+
+# Thread to export clipboard text continuously
 clipboard_timer = RepeatedTimer(1, clipboard_to_output)
 clipboard_timer.stop() # stop the initial timer
 
+# Thread to record audio continuously
+audio_recorder = RecordThread(get_default_device_index(), int(r_config(LOG_CONFIG, "logaudioframes")))
+audio_recorder.start()
+
 refresh_hotkey_string = {
     "Linux" : "<ctrl>+q",
     "Darwin": "<cmd>+b",

diff --git a/logger.py b/logger.py
@@ -8,9 +8,11 @@
 from pathlib import Path
 from datetime import datetime
 from util import create_directory_if_not_exists
+from audio import play_audio_from_file
 
 SCRIPT_DIR = Path(__file__).parent 
 TEXT_LOG_PATH = Path(SCRIPT_DIR, "logs", "text")
+AUDIO_LOG_PATH = Path(SCRIPT_DIR, "logs", "audio")
 
 def get_time_string():
     return time.strftime("%Y%m%d-%H%M%S")
@@ -71,4 +73,15 @@ def show_logs():
             }
             output.append(log)
         f.close()
-    return output
+    return output
+
+@eel.expose
+def play_log_audio(log_id, folder_name):  # plays <AUDIO_LOG_PATH>/<folder_name>/<log_id>.<ext>; None if not found
+    path = Path(AUDIO_LOG_PATH, folder_name)
+    if not path.is_dir():
+        return None
+    file_name = next((f for f in os.listdir(path) if re.match(r'{}\.(?:wav|mp3|m4a|flac)$'.format(re.escape(str(log_id))), f)), None)  # id and dot matched literally
+    print("gonna play log file", file_name)
+    if not file_name:
+        return None
+    play_audio_from_file(str(Path(AUDIO_LOG_PATH, folder_name, file_name)))
diff --git a/logs/audio/.gitkeep b/logs/audio/.gitkeep
diff --git a/ocr.py b/ocr.py
@@ -32,16 +32,25 @@ def base64_to_image_path(base64string, path):
         fh.write(base64.b64decode(base64string))
     return path
 
-def detect_and_log(engine, cropped_image,  text_orientation, session_start_time, request_time, log_images=True):
+def detect_and_log(engine, cropped_image,  text_orientation, session_start_time, request_time, audio_recorder):
     result = recognize_japanese(engine, cropped_image, text_orientation)
+    is_log_images = r_config(LOG_CONFIG, "logimages").lower() == "true"
+    is_log_audio = r_config(LOG_CONFIG, "logaudio").lower() == "true"
+    audio_duration = float(r_config(LOG_CONFIG, "logaudioduration"))
     if result is not None:
         log_text(session_start_time, request_time, result)
-        if log_images:
+        if is_log_images:
             image_extension = r_config(LOG_CONFIG, "logimagetype")
             file_name = request_time + "." + image_extension
             full_image_path = str(Path(SCRIPT_DIR,"logs", "images", session_start_time, file_name))
             thread = threading.Thread(target = log_video_image,  args=[full_image_path])
             thread.start()
+        if is_log_audio:
+            file_name = request_time + ".wav"
+            audio_file_path = str(Path(SCRIPT_DIR,"logs", "audio", session_start_time, file_name))
+            create_directory_if_not_exists(audio_file_path)
+            audio_recorder.stop_recording(audio_file_path, audio_duration)
+            eel.restartAudioRecording()()
         return result
     else:
         return "Error: OCR Failed"

diff --git a/recordaudio.py b/recordaudio.py
@@ -0,0 +1,81 @@
+import threading
+import pyaudio
+import wave
+from config import r_config, LOG_CONFIG
+
+class RecordThread(threading.Thread):
+    """Thread that buffers audio from one device until told to stop.
+
+    While ``bRecord`` is true, ``run`` keeps appending raw buffers to
+    ``recorded_frames``. After ``stop_recording`` clears the flag, the newest
+    ``duration`` seconds are written to ``audiofile`` as a 16-bit WAV file; a
+    ``duration`` of zero or less discards the recording instead.
+    """
+
+    def __init__(self, deviceIndex=-1, frames=512):
+        """Remember the device index and the number of frames read per buffer."""
+        threading.Thread.__init__(self)
+        self.bRecord = True
+        self.deviceIndex = deviceIndex
+        self.recorded_frames = []
+        self.audiofile = "out.wav"
+        self.duration = 10
+        self.frames = frames
+
+    def run(self):
+        """Record until ``bRecord`` is cleared, then save the tail of the buffer."""
+        p = pyaudio.PyAudio()
+        try:
+            device_info = p.get_device_info_by_index(self.deviceIndex)
+            rate = int(device_info["defaultSampleRate"])
+            is_input = device_info["maxInputChannels"] > 0
+            host_name = p.get_host_api_info_by_index(device_info["hostApi"])["name"]
+            is_wasapi = host_name.find("WASAPI") != -1
+            # A WASAPI device without input channels is opened in loopback mode.
+            # NOTE(review): as_loopback is not a stock PyAudio argument; presumably
+            # a loopback-enabled PyAudio build is installed - confirm.
+            useloopback = is_wasapi and not is_input
+            # Open stream with whichever channel count (input/output) is larger.
+            channelcount = max(device_info["maxInputChannels"], device_info["maxOutputChannels"])
+            stream = p.open(format = pyaudio.paInt16,
+                        channels = channelcount,
+                        rate = rate,
+                        input = True,
+                        frames_per_buffer = self.frames,
+                        input_device_index = device_info["index"],
+                        as_loopback = useloopback)
+            try:
+                # Start recording
+                while self.bRecord:
+                    self.recorded_frames.append(stream.read(self.frames))
+            finally:
+                stream.stop_stream()
+                stream.close()
+
+            # Don't save file if duration is 0
+            if self.duration <= 0:
+                return
+
+            # Number of buffers that make up the requested duration.
+            wanted = int(rate / self.frames * self.duration)
+            # Clamp at 0: a negative start would be read as an offset from the
+            # end and cut a short recording down instead of keeping all of it.
+            start_frame = max(0, len(self.recorded_frames) - wanted)
+            with wave.open(self.audiofile, 'wb') as waveFile:
+                waveFile.setnchannels(channelcount)
+                waveFile.setsampwidth(p.get_sample_size(pyaudio.paInt16))
+                waveFile.setframerate(rate)
+                waveFile.writeframes(b''.join(self.recorded_frames[start_frame:]))
+        finally:
+            # Release the PyAudio instance on every exit path, including errors.
+            p.terminate()
+
+    def stop_recording(self, audiofile='out.wav', duration = 10):
+        """Stop the loop in ``run`` and set where and how much audio to save."""
+        self.audiofile = audiofile
+        self.duration = duration
+        self.bRecord = False
+
+    def restart_recording(self):
+        """Clear ``bRecord`` so ``run`` stops; no new recording is started here."""
+        self.bRecord = False
diff --git a/web/index.html b/web/index.html
@@ -253,10 +253,11 @@ <h4 class="mdl-dialog__title">Settings</h4>
                         </div>
 
                         <div style="padding-left: 16px">
-                          <button onclick="testRecord()">
-                            Test Record
-                          </button>
-
+                          <div class="mdl-textfield mdl-js-textfield mdl-textfield--floating-label">
+                            <input onchange="changeAudioDuration()" class="mdl-textfield__input" type="text" pattern="-?[0-9]*(\.[0-9]+)?" id="audio_duration_input">
+                            <label class="mdl-textfield__label" for="sample4">Audio Duration (seconds)</label>
+                            <span class="mdl-textfield__error">Input is not a number!</span>
+                          </div>
                         </div>
 
                     </ul>
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,7 @@ @@
     # Logs
     logs/text/*.txt
+    logs/audio/
     # Debian Build
     build/
@@ Expand Down @@