Skip to content

Commit

Permalink
Various updates (thanks to @mailto1587 for bug reports!), fixing Uber…
Browse files Browse the repository at this point in the history
  • Loading branch information
Uberi committed Dec 16, 2015
1 parent a0024eb commit 5c82511
Showing 1 changed file with 31 additions and 23 deletions.
54 changes: 31 additions & 23 deletions speech_recognition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@

"""Library for performing speech recognition with support for Google Speech Recognition, Wit.ai, IBM Speech to Text, and AT&T Speech to Text."""

#wip: convert sample rate and widths as necessary for all API calls
#wip: get more models from http://sourceforge.net/projects/cmusphinx/files/Acoustic%20and%20Language%20Models/
#wip: provide binaries for PocketSphinx on Windows, or see if the 0.0.5 binaries will work

__author__ = "Anthony Zhang (Uberi)"
__version__ = "3.1.3"
Expand Down Expand Up @@ -154,8 +153,8 @@ def get_raw_data(self, convert_rate = None, convert_width = None):
Writing these bytes directly to a file results in a valid `RAW/PCM audio file <https://en.wikipedia.org/wiki/Raw_audio_format>`__.
"""
assert convert_rate > 0, "Sample rate to convert to must be a positive integer"
assert convert_width % 1 == 0 and 2 <= convert_width <= 4, "Sample width to convert to must be 2, 3, or 4"
assert convert_rate is None or convert_rate > 0, "Sample rate to convert to must be a positive integer"
assert convert_width is None or (convert_width % 1 == 0 and 2 <= convert_width <= 4), "Sample width to convert to must be 2, 3, or 4"

raw_data = self.frame_data

Expand Down Expand Up @@ -253,6 +252,7 @@ def record(self, source, duration = None, offset = None):
If ``duration`` is not specified, then it will record until there is no more audio input.
"""
assert isinstance(source, AudioSource), "Source must be an audio source"
assert source.stream is not None, "Audio source must be opened before recording - see documentation for `AudioSource`"

frames = io.BytesIO()
seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
Expand Down Expand Up @@ -287,6 +287,7 @@ def adjust_for_ambient_noise(self, source, duration = 1):
The ``duration`` parameter is the maximum number of seconds that it will dynamically adjust the threshold for before returning. This value should be at least 0.5 in order to get a representative sample of the ambient noise.
"""
assert isinstance(source, AudioSource), "Source must be an audio source"
assert source.stream is not None, "Audio source must be opened before recording - see documentation for `AudioSource`"
assert self.pause_threshold >= self.non_speaking_duration >= 0

seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
Expand All @@ -313,6 +314,7 @@ def listen(self, source, timeout = None):
The ``timeout`` parameter is the maximum number of seconds that it will wait for a phrase to start before giving up and throwing a ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, it will wait indefinitely.
"""
assert isinstance(source, AudioSource), "Source must be an audio source"
assert source.stream is not None, "Audio source must be opened before recording - see documentation for `AudioSource`"
assert self.pause_threshold >= self.non_speaking_duration >= 0

seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
Expand Down Expand Up @@ -407,34 +409,40 @@ def stopper():

def recognize_sphinx(self, audio_data, language = "en-US", show_all = False):
"""
;wip
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.
The recognition language is determined by ``language``, an IETF language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. A list of supported language codes can be found `here <http://stackoverflow.com/questions/14257598/>`__. Basically, language codes can be just the language (``en``), or a language with a dialect (``en-US``). ;wip
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Hypothesis`` object generated by Sphinx.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
"""
assert isinstance(audio_data, AudioData), "`audio_data` must be audio data"
assert isinstance(language, str), "`language` must be a string"
assert isinstance(language, str), "`language` must be a string" #wip: do this properly

# import the PocketSphinx speech recognition module
try:
from pocketsphinx import pocketsphinx
from sphinxbase import sphinxbase
except ImportError:
raise RequestError("missing PocketSphinx module: see the \"Setting up PocketSphinx\" section of the README.")

model_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pocketsphinx-data")
if not os.path.isdir(model_directory):
raise RequestError("missing PocketSphinx model directory: \"{}\"".format(model_directory))
model_parameters_directory = os.path.join(model_directory, "en-us", "en-us")
if not os.path.isdir(model_parameters_directory):
raise RequestError("missing PocketSphinx model parameters directory: \"{}\"".format(model_parameters_directory))
language_model_file = os.path.join(model_directory, "en-us", "en-us.lm.bin")
raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")

language_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pocketsphinx-data", language)
if not os.path.isdir(language_directory):
raise RequestError("missing PocketSphinx model directory: \"{}\"".format(language_directory))
acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
if not os.path.isdir(acoustic_parameters_directory):
raise RequestError("missing PocketSphinx model parameters directory: \"{}\"".format(acoustic_parameters_directory))
language_model_file = os.path.join(language_directory, "language-model.lm.bin")
if not os.path.isfile(language_model_file):
raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
phoneme_dictionary_file = os.path.join(model_directory, "en-us", "cmudict-en-us.dict")
phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
if not os.path.isfile(phoneme_dictionary_file):
raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))

# create decoder object
config = pocketsphinx.Decoder.default_config()
config.set_string("-hmm", model_parameters_directory)
config.set_string("-hmm", acoustic_parameters_directory)
config.set_string("-lm", language_model_file)
config.set_string("-dict", phoneme_dictionary_file)
config.set_string("-logfn", os.devnull) # disable logging (logging causes unwanted output in terminal)
Expand Down Expand Up @@ -462,7 +470,7 @@ def recognize_google(self, audio_data, key = None, language = "en-US", show_all
To obtain your own API key, simply follow the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".
The recognition language is determined by ``language``, an IETF language tag like `"en-US"` or ``"en-GB"``, defaulting to US English. A list of supported language codes can be found `here <http://stackoverflow.com/questions/14257598/>`__. Basically, language codes can be just the language (``en``), or a language with a dialect (``en-US``).
The recognition language is determined by ``language``, an IETF language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. A list of supported language codes can be found `here <http://stackoverflow.com/questions/14257598/>`__. Basically, language codes can be just the language (``en``), or a language with a dialect (``en-US``).
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
Expand All @@ -485,7 +493,7 @@ def recognize_google(self, audio_data, key = None, language = "en-US", show_all
except HTTPError as e:
raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
except URLError as e:
raise RequestError("recognition connection failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
raise RequestError("recognition connection failed: {0}".format(e.reason))
response_text = response.read().decode("utf-8")

# ignore any blank blocks
Expand Down Expand Up @@ -535,7 +543,7 @@ def recognize_wit(self, audio_data, key, show_all = False):
except HTTPError as e:
raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
except URLError as e:
raise RequestError("recognition connection failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
raise RequestError("recognition connection failed: {0}".format(e.reason))
response_text = response.read().decode("utf-8")
result = json.loads(response_text)

Expand Down Expand Up @@ -577,7 +585,7 @@ def recognize_ibm(self, audio_data, username, password, language = "en-US", show
except HTTPError as e:
raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
except URLError as e:
raise RequestError("recognition connection failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
raise RequestError("recognition connection failed: {0}".format(e.reason))
response_text = response.read().decode("utf-8")
result = json.loads(response_text)

Expand Down Expand Up @@ -616,7 +624,7 @@ def recognize_att(self, audio_data, app_key, app_secret, language = "en-US", sho
except HTTPError as e:
raise RequestError("credential request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
except URLError as e:
raise RequestError("credential connection failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
raise RequestError("credential connection failed: {0}".format(e.reason))
authorization_text = authorization_response.read().decode("utf-8")
authorization_bearer = json.loads(authorization_text).get("access_token")
if authorization_bearer is None: raise RequestError("missing OAuth access token in requested credentials")
Expand All @@ -632,7 +640,7 @@ def recognize_att(self, audio_data, app_key, app_secret, language = "en-US", sho
except HTTPError as e:
raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
except URLError as e:
raise RequestError("recognition connection failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
raise RequestError("recognition connection failed: {0}".format(e.reason))
response_text = response.read().decode("utf-8")
result = json.loads(response_text)

Expand Down

0 comments on commit 5c82511

Please sign in to comment.