Skip to content

Commit fc0fbe5

Browse files
committedMay 9, 2017
Break off hot word detection into a separate function
1 parent 4d6ec9c commit fc0fbe5

File tree

2 files changed

+98
-34
lines changed

2 files changed

+98
-34
lines changed
 

‎setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def run(self):
4949
description=speech_recognition.__doc__,
5050
long_description=open("README.rst").read(),
5151
license=speech_recognition.__license__,
52-
keywords="speech recognition voice sphinx google wit bing api houndify ibm",
52+
keywords="speech recognition voice sphinx google wit bing api houndify ibm snowboy",
5353
url="https://github.com/Uberi/speech_recognition#readme",
5454
classifiers=[
5555
"Development Status :: 5 - Production/Stable",

‎speech_recognition/__init__.py

+97-33
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
import hmac
2020
import time
2121
import uuid
22+
import sys
23+
import struct
2224

2325
__author__ = "Anthony Zhang (Uberi)"
2426
__version__ = "3.6.3"
@@ -515,7 +517,96 @@ def adjust_for_ambient_noise(self, source, duration=1):
515517
target_energy = energy * self.dynamic_energy_ratio
516518
self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)
517519

518-
def listen(self, source, timeout=None, phrase_time_limit=None, hot_words=[]]):
520+
def __wait_for_hot_word(self, snowboy_location, hot_words, source, timeout=None):
521+
"""
522+
Blocks until a hot word, sometimes referred to as a wake word, is found in an audio input.
523+
524+
Intended to be used as a means to limit network traffic and reduce cost of online speech-to-text services
525+
526+
Currently utilizes the SnowBoy service, which is free for hobbyists with a paid option for commercial use.
527+
528+
``snowboy_location`` is the local top level directory containing the compiled SnowBoy files.
529+
530+
``hot_words`` is an iterable containing the local file paths of models provided by the SnowBoy service, in either .pmdl or .umdl format.
531+
532+
``source`` is the actual audio input, an ``AudioSource`` instance that has already been entered (its ``stream`` must not be ``None``).
533+
"""
534+
assert isinstance(source, AudioSource), "Source must be an audio source"
535+
assert source.stream is not None, "Audio source must be entered before listening, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
536+
assert snowboy_location != None, "Need to specify snowboy_location argument if using hot words"
537+
assert os.path.isfile(snowboy_location + "/snowboydetect.py"), "Can not find snowboydetect.py. Make sure snowboy_location is pointed at the root directory"
538+
for f in hot_words: assert os.path.isfile(f), "Unable to locate file with given path: {}".format(f)
539+
540+
sys.path.append(snowboy_location)
541+
import snowboydetect
542+
543+
models = ",".join(hot_words)
544+
# get file path to needed resource file
545+
resource = snowboy_location + "/resources/common.res"
546+
detector = snowboydetect.SnowboyDetect(resource_filename=resource.encode(), model_str=models.encode())
547+
detector.SetAudioGain(1.0)
548+
sensitivity = [0.4]*len(hot_words)
549+
sensitivity_str = ",".join(str(t) for t in sensitivity)
550+
detector.SetSensitivity(sensitivity_str.encode())
551+
552+
# create one deque to store our raw mic input data and one to store the snowboy downsampled data, each intended to hold 5 sec of audio
553+
mic_buffer = collections.deque(maxlen=(source.SAMPLE_RATE * 5))
554+
sb_buffer = collections.deque(maxlen=(detector.SampleRate() * 5))
555+
556+
# snowboy requires a specific sample rate that it provides; to avoid a ripple of issues we will just downsample momentarily by this amount
557+
resample_ratio = float(source.SAMPLE_RATE) / float(detector.SampleRate())
558+
resample_count = 0
559+
560+
seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
561+
elapsed_time = 0
562+
563+
while True:
564+
# handle waiting too long for a hot word by giving up once ``timeout`` seconds of audio have been read
565+
elapsed_time += seconds_per_buffer
566+
if timeout and elapsed_time > timeout:
567+
break
568+
569+
buffer = source.stream.read(source.CHUNK)
570+
if len(buffer) == 0: break # reached end of the stream
571+
572+
# record mic data for use later
573+
mic_buffer.extend(buffer)
574+
575+
# convert bytes into ints so we can downsample
576+
int_data = struct.unpack('<' + ('h'*(len(buffer)/source.SAMPLE_WIDTH)), buffer)
577+
ds_data = []
578+
579+
# rough downsampling, can handle downsampling by non-integer values
580+
for i in range(len(int_data)):
581+
if resample_count <= 0:
582+
sample = int_data[i]
583+
584+
# grab the previous sample too, but make sure we have one to grab
585+
prev_sample = sample
586+
if i != 0:
587+
prev_sample = int_data[i-1]
588+
589+
# get a number between 0 and 1, this is used to linearly interpolate between the two samples we have
590+
ratio = 0.0 - resample_count
591+
fab_sample = int((1.0 - ratio) * sample + (ratio) * prev_sample + 0.5)
592+
ds_data.append(fab_sample)
593+
resample_count += resample_ratio
594+
595+
resample_count -= 1.0
596+
597+
# convert back into bytes so we can feed it into snowboy
598+
sb_buffer.extend(struct.pack('<' + ('h' * len(ds_data)), *ds_data))
599+
600+
# actually run the snowboy detector
601+
ans = detector.RunDetection(bytes(bytearray(sb_buffer)))
602+
assert ans != -1, "Error initializing streams or reading audio data"
603+
604+
# if ans is greater than 0, we found a wake word! return audio
605+
if ans > 0:
606+
print "FOUND WORD"
607+
return bytes(mic_buffer), elapsed_time
608+
609+
def listen(self, source, timeout=None, phrase_time_limit=None, hot_words=[], snowboy_location=None, wait_for_hot_word=False):
519610
"""
520611
Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns.
521612
@@ -535,13 +626,11 @@ def listen(self, source, timeout=None, phrase_time_limit=None, hot_words=[]]):
535626
if not hasattr(hot_words, '__iter__'):
536627
hot_words = [hot_words]
537628

538-
for f in hot_words: assert os.path.isfile(f), "Unable to locate file with given path: {}".format(f)
539-
540629
seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
541630
pause_buffer_count = int(math.ceil(self.pause_threshold / seconds_per_buffer)) # number of buffers of non-speaking audio during a phrase, before the phrase should be considered complete
542631
phrase_buffer_count = int(math.ceil(self.phrase_threshold / seconds_per_buffer)) # minimum number of buffers of speaking audio before we consider the speaking audio a phrase
543632
non_speaking_buffer_count = int(math.ceil(self.non_speaking_duration / seconds_per_buffer)) # maximum number of buffers of non-speaking audio to retain before and after a phrase
544-
search_for_hot_word = (len(hot_words) != 0)
633+
545634

546635
# read audio input for phrases until there is a phrase that is long enough
547636
elapsed_time = 0 # number of seconds of audio read
@@ -576,35 +665,10 @@ def listen(self, source, timeout=None, phrase_time_limit=None, hot_words=[]]):
576665
pause_count, phrase_count = 0, 0
577666
phrase_start_time = elapsed_time
578667

579-
# if chosen to do so, wait until we hear our hot word(s) to actually start
580-
detector = None
581-
if search_for_hot_word:
582-
models = ",".join(hot_words)
583-
# get file path to needed resource file
584-
resource = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources/common.res")
585-
detector = snowboydetect.SnowboyDetect(resource_filename=resource.encode(), model_str=models.encode())
586-
detector.SetAudioGain(1.0) # can this be removed?
587-
hot_word_buffer = collections.deque(maxlen=4096)
588-
589-
while True:
590-
# handle phrase being too long by cutting off the audio
591-
elapsed_time += seconds_per_buffer
592-
if phrase_time_limit and elapsed_time - phrase_start_time > phrase_time_limit:
593-
break
594-
595-
buffer = source.stream.read(source.CHUNK)
596-
if len(buffer) == 0: break # reached end of the stream
597-
598-
hot_word_buffer.extend(buffer)
599-
ans = detector.RunDetection(bytes(bytearray(hot_word_buffer)))
600-
assert ans >= 0, "Error initializing streams or reading audio data"
601-
602-
# if ans is greater than 0, we found a wake word!
603-
if ans > 0:
604-
frames.extend(hot_word_buffer)
605-
break
606-
607-
668+
if wait_for_hot_word:
669+
audio_data, delta_time = self.__wait_for_hot_word(snowboy_location, hot_words, source, timeout)
670+
elapsed_time += delta_time
671+
frames.append(audio_data)
608672
while True:
609673
# handle phrase being too long by cutting off the audio
610674
elapsed_time += seconds_per_buffer

0 commit comments

Comments
 (0)