Skip to content

Commit

Permalink
log audio file after ocr
Browse files Browse the repository at this point in the history
  • Loading branch information
mathewthe2 committed Apr 14, 2021
1 parent 6451fd5 commit 0a4aee4
Show file tree
Hide file tree
Showing 12 changed files with 226 additions and 69 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

# Logs
logs/text/*.txt
logs/audio/

# Debian Build
build/
Expand Down
116 changes: 78 additions & 38 deletions audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,71 +2,111 @@
import wave
import os
from config import r_config, LOG_CONFIG
from recordaudio import RecordThread
from time import sleep

p = pyaudio.PyAudio()

#Set default to first in list or ask Windows
try:
default_device_index = p.get_default_input_device_info()
except IOError:
default_device_index = -1
def get_default_device_index():
    """Return the device index used when the caller does not pick one.

    Returns:
        -1 when PortAudio reports no default input device (the probe raises
        ``IOError``); otherwise the ``"index"`` field of the device at
        position 0.
    """
    p = pyaudio.PyAudio()
    try:
        # Set default to first in list or ask Windows.
        try:
            # Used purely as a probe: only whether it raises matters here.
            p.get_default_input_device_info()
        except IOError:
            return -1
        # NOTE(review): the probed default device is not what gets returned;
        # the first enumerated device is. Kept as-is - confirm this is intended.
        return p.get_device_info_by_index(0)["index"]
    finally:
        # Release PortAudio on every path, including the early -1 return.
        p.terminate()

#Select Device
# print ("Available devices:\n")
def get_audio_objects():
    """Group the selectable audio devices by host API name.

    Returns:
        A dict mapping each host API name to a list of single-entry dicts
        ``{device index: device name}``, one per device that
        ``valid_output_device`` accepts.
    """
    p = pyaudio.PyAudio()
    audio_objects = {}
    try:
        for i in range(p.get_device_count()):
            info = p.get_device_info_by_index(i)
            audio_host = p.get_host_api_info_by_index(info["hostApi"])["name"]
            if valid_output_device(info["index"]):
                audio_objects.setdefault(audio_host, []).append(
                    {info["index"]: info["name"]}
                )
    finally:
        # Release PortAudio even when a device query raises.
        p.terminate()
    return audio_objects

def valid_output_device(deviceIndex):
    """Tell whether a device index should be offered as an audio source.

    Args:
        deviceIndex: PortAudio device index; anything that is not an ``int``
            is rejected without touching PortAudio.

    Returns:
        ``False`` for a non-int index. Otherwise ``True`` when the device has
        no input channels, or when it has input channels and its host API
        name contains ``"WASAPI"``; ``False`` for input devices on any other
        host API.
    """
    if not isinstance(deviceIndex, int):
        return False
    p = pyaudio.PyAudio()
    try:
        device_info = p.get_device_info_by_index(deviceIndex)
        host_name = p.get_host_api_info_by_index(device_info["hostApi"])["name"]
    finally:
        # Release PortAudio even when the index is unknown and the query raises.
        p.terminate()
    is_input = device_info["maxInputChannels"] > 0
    is_wasapi = "WASAPI" in host_name
    # Input devices are only accepted on WASAPI; non-input devices always are.
    return is_wasapi or not is_input

def record_audio_by_device_index(deviceIndex, duration):
frames = int(r_config(LOG_CONFIG, "logaudioframes"))
device_info = p.get_device_info_by_index(deviceIndex)
is_input = device_info["maxInputChannels"] > 0
is_wasapi = (p.get_host_api_info_by_index(device_info["hostApi"])["name"]).find("WASAPI") != -1
useloopback = is_wasapi and not is_input
recorded_frames = []
# Open stream
channelcount = device_info["maxInputChannels"] if (device_info["maxOutputChannels"] < device_info["maxInputChannels"]) else device_info["maxOutputChannels"]
stream = p.open(format = pyaudio.paInt16,
channels = channelcount,
rate = int(device_info["defaultSampleRate"]),
input = True,
frames_per_buffer = frames,
input_device_index = device_info["index"],
as_loopback = useloopback)
def play_audio_from_file(filename):
    """Play a wave file to completion, blocking until playback ends.

    The file is read in 1024-frame chunks and written to a PyAudio output
    stream opened with the file's own sample width, channel count and frame
    rate. No output device is selected explicitly.

    Args:
        filename: path of the file, opened with ``wave.open(filename, 'rb')``.
    """
    print("gonna play some", filename)
    # Number of frames read from the file per write to the stream.
    chunk = 1024
    with wave.open(filename, 'rb') as wf:
        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                            channels=wf.getnchannels(),
                            rate=wf.getframerate(),
                            output=True)
            try:
                data = wf.readframes(chunk)
                # readframes() returns b'' at end of file, which is falsy.
                while data:
                    stream.write(data)
                    data = wf.readframes(chunk)
            finally:
                stream.close()
        finally:
            # Release PortAudio even when opening or writing the stream fails.
            p.terminate()

# Start recording
print("started record")
for i in range(0, int(int(device_info["defaultSampleRate"]) / frames * duration)):
recorded_frames.append(stream.read(frames))
# def record_audio_by_device_index(deviceIndex, duration):
# frames = int(r_config(LOG_CONFIG, "logaudioframes"))
# device_info = p.get_device_info_by_index(deviceIndex)
# is_input = device_info["maxInputChannels"] > 0
# is_wasapi = (p.get_host_api_info_by_index(device_info["hostApi"])["name"]).find("WASAPI") != -1
# useloopback = is_wasapi and not is_input
# recorded_frames = []
# # Open stream
# channelcount = device_info["maxInputChannels"] if (device_info["maxOutputChannels"] < device_info["maxInputChannels"]) else device_info["maxOutputChannels"]
# stream = p.open(format = pyaudio.paInt16,
# channels = channelcount,
# rate = int(device_info["defaultSampleRate"]),
# input = True,
# frames_per_buffer = frames,
# input_device_index = device_info["index"],
# as_loopback = useloopback)

# Stop Recording
print("stopped record")
stream.stop_stream()
stream.close()
# # Start recording
# print("started record")
# for i in range(0, int(int(device_info["defaultSampleRate"]) / frames * duration)):
# recorded_frames.append(stream.read(frames))

filename = "out.wav"
waveFile = wave.open(filename, 'wb')
waveFile.setnchannels(channelcount)
waveFile.setsampwidth(p.get_sample_size(pyaudio.paInt16))
waveFile.setframerate(int(device_info["defaultSampleRate"]))
waveFile.writeframes(b''.join(recorded_frames))
waveFile.close()
# # Stop Recording
# print("stopped record")
# stream.stop_stream()
# stream.close()

# filename = "out.wav"
# waveFile = wave.open(filename, 'wb')
# waveFile.setnchannels(channelcount)
# waveFile.setsampwidth(p.get_sample_size(pyaudio.paInt16))
# waveFile.setframerate(int(device_info["defaultSampleRate"]))
# waveFile.writeframes(b''.join(recorded_frames))
# waveFile.close()
2 changes: 2 additions & 0 deletions config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ target_lang = EN
logimages = true
logimagetype = jpg
logimagequality = 1.0
logaudio = true
logaudioduration = 7.0
logaudiohost = Windows WASAPI
logaudioframes = 512

Expand Down
27 changes: 19 additions & 8 deletions game2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
from translate import deepl_translate
from hotkeys import refresh_ocr_hotkey, esc_hotkey
from util import RepeatedTimer, open_folder_by_relative_path
from audio import get_audio_objects, record_audio_by_device_index
from audio import get_default_device_index, get_audio_objects
from recordaudio import RecordThread
from pynput import keyboard
from clipboard import clipboard_to_output, text_to_clipboard
from logger import get_time_string
from config import r_config, w_config, WINDOWS_HOTKEYS_CONFIG, APP_CONFIG
from config import r_config, w_config, WINDOWS_HOTKEYS_CONFIG, APP_CONFIG, LOG_CONFIG

session_start_time = get_time_string()

Expand All @@ -22,8 +23,8 @@ def close(page, sockets):
os._exit(0)

@eel.expose # Expose this function to Javascript
def recognize_image(engine, image, orientation, log_images):
return detect_and_log(engine, image, orientation, session_start_time, get_time_string(), log_images)
def recognize_image(engine, image, orientation):
return detect_and_log(engine, image, orientation, session_start_time, get_time_string(), audio_recorder)

@eel.expose # Expose this function to Javascript
def translate(text):
Expand All @@ -36,6 +37,14 @@ def monitor_clipboard():
else:
clipboard_timer.start()

@eel.expose
def restart_audio_recording(device_index=None):
    """Replace the global recorder thread with a fresh one and start it.

    Args:
        device_index: device to record from. When omitted (or ``None``) the
            index is looked up with ``get_default_device_index()`` at call
            time rather than once at import time.
    """
    global audio_recorder
    if device_index is None:
        device_index = get_default_device_index()
    # Only a recorder that is still running needs to be told to stop; the
    # (None, -1) arguments make it exit without writing a file. A recorder
    # that was already stopped must be left alone so that the file name and
    # duration it was given when it was stopped are not overwritten.
    if audio_recorder.bRecord:
        audio_recorder.stop_recording(None, -1)
    audio_recorder = RecordThread(device_index, int(r_config(LOG_CONFIG, "logaudioframes")))
    audio_recorder.start()

@eel.expose
def copy_text_to_clipboard(text):
text_to_clipboard(text)
Expand All @@ -56,10 +65,6 @@ def open_folder(relative_path):
def get_audio_sources():
return get_audio_objects()

@eel.expose
def record_audio(device_index, duration):
return record_audio_by_device_index(device_index, duration)

@eel.expose
def open_new_window(html_file, height=800, width=600):
eel.start(html_file,
Expand All @@ -81,9 +86,15 @@ def run_eel():

main_thread = threading.Thread(target=run_eel, args=())
main_thread.start()

# Thread to export clipboard text continuously
clipboard_timer = RepeatedTimer(1, clipboard_to_output)
clipboard_timer.stop() # stop the initial timer

# Thread to record audio continuously
audio_recorder = RecordThread(get_default_device_index(), int(r_config(LOG_CONFIG, "logaudioframes")))
audio_recorder.start()

refresh_hotkey_string = {
"Linux" : "<ctrl>+q",
"Darwin": "<cmd>+b",
Expand Down
15 changes: 14 additions & 1 deletion logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
from pathlib import Path
from datetime import datetime
from util import create_directory_if_not_exists
from audio import play_audio_from_file

SCRIPT_DIR = Path(__file__).parent
TEXT_LOG_PATH = Path(SCRIPT_DIR, "logs", "text")
AUDIO_LOG_PATH = Path(SCRIPT_DIR, "logs", "audio")

def get_time_string():
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    return timestamp
Expand Down Expand Up @@ -71,4 +73,15 @@ def show_logs():
}
output.append(log)
f.close()
return output
return output

@eel.expose
def play_log_audio(log_id, folder_name):
    """Play the audio file recorded for one log entry, if there is one.

    Looks in ``AUDIO_LOG_PATH / folder_name`` for a file named
    ``<log_id>.<ext>`` with ext in wav, mp3, m4a or flac and hands the first
    match to ``play_audio_from_file``.

    Args:
        log_id: base name of the audio file, without extension.
        folder_name: sub-folder of the audio log directory to search.

    Returns:
        Always ``None``; nothing is played when the folder or file is missing.
    """
    path = Path(AUDIO_LOG_PATH, folder_name)
    if not path.is_dir():
        return None
    # Escape log_id and the extension dot so both are matched literally
    # instead of being interpreted as regex syntax.
    # NOTE(review): mp3/m4a/flac are accepted here - confirm that
    # play_audio_from_file can actually decode them.
    pattern = re.compile(r'{}\.(?:wav|mp3|m4a|flac)$'.format(re.escape(str(log_id))))
    file_name = next((f for f in os.listdir(path) if pattern.match(f)), None)
    print("gonna play log file", file_name)
    if not file_name:
        return None
    play_audio_from_file(str(Path(path, file_name)))
Empty file added logs/audio/.gitkeep
Empty file.
13 changes: 11 additions & 2 deletions ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,25 @@ def base64_to_image_path(base64string, path):
fh.write(base64.b64decode(base64string))
return path

def detect_and_log(engine, cropped_image, text_orientation, session_start_time, request_time, log_images=True):
def detect_and_log(engine, cropped_image, text_orientation, session_start_time, request_time, audio_recorder):
result = recognize_japanese(engine, cropped_image, text_orientation)
is_log_images = r_config(LOG_CONFIG, "logimages").lower() == "true"
is_log_audio = r_config(LOG_CONFIG, "logaudio").lower() == "true"
audio_duration = float(r_config(LOG_CONFIG, "logaudioduration"))
if result is not None:
log_text(session_start_time, request_time, result)
if log_images:
if is_log_images:
image_extension = r_config(LOG_CONFIG, "logimagetype")
file_name = request_time + "." + image_extension
full_image_path = str(Path(SCRIPT_DIR,"logs", "images", session_start_time, file_name))
thread = threading.Thread(target = log_video_image, args=[full_image_path])
thread.start()
if is_log_audio:
file_name = request_time + ".wav"
audio_file_path = str(Path(SCRIPT_DIR,"logs", "audio", session_start_time, file_name))
create_directory_if_not_exists(audio_file_path)
audio_recorder.stop_recording(audio_file_path, audio_duration)
eel.restartAudioRecording()()
return result
else:
return "Error: OCR Failed"
Expand Down
61 changes: 61 additions & 0 deletions recordaudio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import threading
import pyaudio
import wave
from config import r_config, LOG_CONFIG

class RecordThread(threading.Thread):
    """Thread that records from one audio device until told to stop.

    ``run`` keeps appending captured buffers to ``recorded_frames`` while
    ``bRecord`` is true. Once stopped, it writes the last ``duration``
    seconds of what was captured to ``audiofile`` as 16-bit PCM, unless
    ``duration`` is zero or negative.

    Attributes are plain and public because callers read and set them
    directly (for example ``bRecord``).
    """

    def __init__(self, deviceIndex=-1, frames=512):
        """Prepare a recorder; capture begins when ``start()`` is called.

        Args:
            deviceIndex: PortAudio index of the device to record from.
            frames: number of frames per buffer and per ``stream.read``.
        """
        threading.Thread.__init__(self)
        # Loop flag polled by run(); cleared by stop_recording().
        self.bRecord = True
        self.deviceIndex = deviceIndex
        # Every buffer captured since the thread started, oldest first.
        # NOTE(review): this list grows for as long as the thread records.
        self.recorded_frames = []
        self.audiofile = "out.wav"
        self.duration = 10
        self.frames = frames

    @staticmethod
    def _tail_start(buffer_count, sample_rate, frames_per_buffer, duration):
        """Return the index of the first buffer belonging to the saved tail.

        Clamped at 0 so that asking for more audio than was captured keeps
        everything instead of producing a negative index, which would slice
        from the wrong end and drop audio.
        """
        wanted = int(sample_rate / frames_per_buffer * duration)
        return max(0, buffer_count - wanted)

    def run(self):
        """Capture audio until stopped, then save the requested tail."""
        p = pyaudio.PyAudio()
        try:
            device_info = p.get_device_info_by_index(self.deviceIndex)
            is_input = device_info["maxInputChannels"] > 0
            host_name = p.get_host_api_info_by_index(device_info["hostApi"])["name"]
            is_wasapi = "WASAPI" in host_name
            # NOTE(review): as_loopback is presumably provided by a PyAudio
            # build with WASAPI loopback support - confirm against the
            # installed package.
            useloopback = is_wasapi and not is_input
            sample_rate = int(device_info["defaultSampleRate"])
            # Use whichever direction of the device offers more channels.
            channelcount = max(device_info["maxInputChannels"],
                               device_info["maxOutputChannels"])

            # Open stream
            stream = p.open(format=pyaudio.paInt16,
                            channels=channelcount,
                            rate=sample_rate,
                            input=True,
                            frames_per_buffer=self.frames,
                            input_device_index=device_info["index"],
                            as_loopback=useloopback)
            try:
                # Start recording
                while self.bRecord:
                    self.recorded_frames.append(stream.read(self.frames))
            finally:
                stream.stop_stream()
                stream.close()

            # Don't save a file when no duration or no target was requested.
            if self.duration <= 0 or not self.audiofile:
                return

            start_frame = self._tail_start(len(self.recorded_frames),
                                           sample_rate, self.frames,
                                           self.duration)
            with wave.open(self.audiofile, 'wb') as wave_file:
                wave_file.setnchannels(channelcount)
                wave_file.setsampwidth(p.get_sample_size(pyaudio.paInt16))
                wave_file.setframerate(sample_rate)
                wave_file.writeframes(b''.join(self.recorded_frames[start_frame:]))
        finally:
            # Release PortAudio on every exit path, including errors.
            p.terminate()

    def stop_recording(self, audiofile='out.wav', duration=10):
        """Ask the thread to stop and say what to save.

        Args:
            audiofile: path of the wave file to write.
            duration: seconds of the most recent audio to keep; a value of
                zero or less makes the thread exit without writing a file.
        """
        # Set the save parameters before clearing the flag so run() sees
        # them once its loop exits.
        self.audiofile = audiofile
        self.duration = duration
        self.bRecord = False

    def restart_recording(self):
        """Clear the recording flag; run() then saves with the current settings."""
        self.bRecord = False
9 changes: 5 additions & 4 deletions web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -253,10 +253,11 @@ <h4 class="mdl-dialog__title">Settings</h4>
</div>

<div style="padding-left: 16px">
<button onclick="testRecord()">
Test Record
</button>

<div class="mdl-textfield mdl-js-textfield mdl-textfield--floating-label">
<input onchange="changeAudioDuration()" class="mdl-textfield__input" type="text" pattern="-?[0-9]*(\.[0-9]+)?" id="audio_duration_input">
<label class="mdl-textfield__label" for="audio_duration_input">Audio Duration (seconds)</label>
<span class="mdl-textfield__error">Input is not a number!</span>
</div>
</div>

</ul>
Expand Down
Loading

0 comments on commit 0a4aee4

Please sign in to comment.