Various updates (thanks to @mailto1587 for bug reports!), fixing Uberi#73.
techscientist · Dec 16, 2015 · 5c82511 · 5c82511
1 parent a0024eb
commit 5c82511
Showing 1 changed file with 31 additions and 23 deletions.
diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
@@ -2,8 +2,7 @@
 
 """Library for performing speech recognition with support for Google Speech Recognition, Wit.ai, IBM Speech to Text, and AT&T Speech to Text."""
 
-#wip: convert sample rate and widths as necessary for all API calls
-#wip: get more models from http://sourceforge.net/projects/cmusphinx/files/Acoustic%20and%20Language%20Models/
+#wip: provide binaries for PocketSphinx on Windows, or see if the 0.0.5 binaries will work
 
 __author__ = "Anthony Zhang (Uberi)"
 __version__ = "3.1.3"
@@ -154,8 +153,8 @@ def get_raw_data(self, convert_rate = None, convert_width = None):
 
         Writing these bytes directly to a file results in a valid `RAW/PCM audio file <https://en.wikipedia.org/wiki/Raw_audio_format>`__.
         """
-        assert convert_rate > 0, "Sample rate to convert to must be a positive integer"
-        assert convert_width % 1 == 0 and 2 <= convert_width <= 4, "Sample width to convert to must be 2, 3, or 4"
+        assert convert_rate is None or convert_rate > 0, "Sample rate to convert to must be a positive integer"
+        assert convert_width is None or (convert_width % 1 == 0 and 2 <= convert_width <= 4), "Sample width to convert to must be 2, 3, or 4"
 
         raw_data = self.frame_data
 
@@ -253,6 +252,7 @@ def record(self, source, duration = None, offset = None):
         If ``duration`` is not specified, then it will record until there is no more audio input.
         """
         assert isinstance(source, AudioSource), "Source must be an audio source"
+        assert source.stream is not None, "Audio source must be opened before recording - see documentation for `AudioSource`"
 
         frames = io.BytesIO()
         seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
@@ -287,6 +287,7 @@ def adjust_for_ambient_noise(self, source, duration = 1):
         The ``duration`` parameter is the maximum number of seconds that it will dynamically adjust the threshold for before returning. This value should be at least 0.5 in order to get a representative sample of the ambient noise.
         """
         assert isinstance(source, AudioSource), "Source must be an audio source"
+        assert source.stream is not None, "Audio source must be opened before recording - see documentation for `AudioSource`"
         assert self.pause_threshold >= self.non_speaking_duration >= 0
 
         seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
@@ -313,6 +314,7 @@ def listen(self, source, timeout = None):
         The ``timeout`` parameter is the maximum number of seconds that it will wait for a phrase to start before giving up and throwing a ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, it will wait indefinitely.
         """
         assert isinstance(source, AudioSource), "Source must be an audio source"
+        assert source.stream is not None, "Audio source must be opened before recording - see documentation for `AudioSource`"
         assert self.pause_threshold >= self.non_speaking_duration >= 0
 
         seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
@@ -407,34 +409,40 @@ def stopper():
 
     def recognize_sphinx(self, audio_data, language = "en-US", show_all = False):
         """
-        ;wip
+        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.
+
+        The recognition language is determined by ``language``, an IETF language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. A list of supported language codes can be found `here <http://stackoverflow.com/questions/14257598/>`__. Basically, language codes can be just the language (``en``), or a language with a dialect (``en-US``). ;wip
+
+        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Hypothesis`` object generated by Sphinx.
+
+        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
         """
         assert isinstance(audio_data, AudioData), "`audio_data` must be audio data"
-        assert isinstance(language, str), "`language` must be a string"
+        assert isinstance(language, str), "`language` must be a string" #wip: do this properly
 
         # import the PocketSphinx speech recognition module
         try:
             from pocketsphinx import pocketsphinx
             from sphinxbase import sphinxbase
         except ImportError:
-            raise RequestError("missing PocketSphinx module: see the \"Setting up PocketSphinx\" section of the README.")
-
-        model_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pocketsphinx-data")
-        if not os.path.isdir(model_directory):
-            raise RequestError("missing PocketSphinx model directory: \"{}\"".format(model_directory))
-        model_parameters_directory = os.path.join(model_directory, "en-us", "en-us")
-        if not os.path.isdir(model_parameters_directory):
-            raise RequestError("missing PocketSphinx model parameters directory: \"{}\"".format(model_parameters_directory))
-        language_model_file = os.path.join(model_directory, "en-us", "en-us.lm.bin")
+            raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
+
+        language_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pocketsphinx-data", language)
+        if not os.path.isdir(language_directory):
+            raise RequestError("missing PocketSphinx model directory: \"{}\"".format(language_directory))
+        acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
+        if not os.path.isdir(acoustic_parameters_directory):
+            raise RequestError("missing PocketSphinx model parameters directory: \"{}\"".format(acoustic_parameters_directory))
+        language_model_file = os.path.join(language_directory, "language-model.lm.bin")
         if not os.path.isfile(language_model_file):
             raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
-        phoneme_dictionary_file = os.path.join(model_directory, "en-us", "cmudict-en-us.dict")
+        phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
         if not os.path.isfile(phoneme_dictionary_file):
             raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))
 
         # create decoder object
         config = pocketsphinx.Decoder.default_config()
-        config.set_string("-hmm", model_parameters_directory)
+        config.set_string("-hmm", acoustic_parameters_directory)
         config.set_string("-lm", language_model_file)
         config.set_string("-dict", phoneme_dictionary_file)
         config.set_string("-logfn", os.devnull) # disable logging (logging causes unwanted output in terminal)
@@ -462,7 +470,7 @@ def recognize_google(self, audio_data, key = None, language = "en-US", show_all
 
         To obtain your own API key, simply follow the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".
 
-        The recognition language is determined by ``language``, an IETF language tag like `"en-US"` or ``"en-GB"``, defaulting to US English. A list of supported language codes can be found `here <http://stackoverflow.com/questions/14257598/>`__. Basically, language codes can be just the language (``en``), or a language with a dialect (``en-US``).
+        The recognition language is determined by ``language``, an IETF language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. A list of supported language codes can be found `here <http://stackoverflow.com/questions/14257598/>`__. Basically, language codes can be just the language (``en``), or a language with a dialect (``en-US``).
 
         Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
 
@@ -485,7 +493,7 @@ def recognize_google(self, audio_data, key = None, language = "en-US", show_all
         except HTTPError as e:
             raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
         except URLError as e:
-            raise RequestError("recognition connection failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
+            raise RequestError("recognition connection failed: {0}".format(e.reason))
         response_text = response.read().decode("utf-8")
 
         # ignore any blank blocks
@@ -535,7 +543,7 @@ def recognize_wit(self, audio_data, key, show_all = False):
         except HTTPError as e:
             raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
         except URLError as e:
-            raise RequestError("recognition connection failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
+            raise RequestError("recognition connection failed: {0}".format(e.reason))
         response_text = response.read().decode("utf-8")
         result = json.loads(response_text)
 
@@ -577,7 +585,7 @@ def recognize_ibm(self, audio_data, username, password, language = "en-US", show
         except HTTPError as e:
             raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
         except URLError as e:
-            raise RequestError("recognition connection failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
+            raise RequestError("recognition connection failed: {0}".format(e.reason))
         response_text = response.read().decode("utf-8")
         result = json.loads(response_text)
 
@@ -616,7 +624,7 @@ def recognize_att(self, audio_data, app_key, app_secret, language = "en-US", sho
         except HTTPError as e:
             raise RequestError("credential request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
         except URLError as e:
-            raise RequestError("credential connection failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
+            raise RequestError("credential connection failed: {0}".format(e.reason))
         authorization_text = authorization_response.read().decode("utf-8")
         authorization_bearer = json.loads(authorization_text).get("access_token")
         if authorization_bearer is None: raise RequestError("missing OAuth access token in requested credentials")
@@ -632,7 +640,7 @@ def recognize_att(self, audio_data, app_key, app_secret, language = "en-US", sho
         except HTTPError as e:
             raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
         except URLError as e:
-            raise RequestError("recognition connection failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
+            raise RequestError("recognition connection failed: {0}".format(e.reason))
         response_text = response.read().decode("utf-8")
         result = json.loads(response_text)