diff --git a/README.md b/README.md index f4e25dd..b1f8716 100644 --- a/README.md +++ b/README.md @@ -9,21 +9,20 @@ $ gcloud auth application-default login ### Install Dependencies ```bash -$ pip install google-cloud-speech +$ pip install -r requirements.txt ``` -## 테스트 음성 파일 생성 -공식 *API* 페이지에는 *wav format* 이 지원된다고 명시되어 있지만 정상적으로 인식되지 않음. -그래서 *wav* 파일을 raw 형태로 변환하고 사용해야 함. 아래는 *mac* 에서 *wav* 를 *raw* 로 변환하는 예제. -```bash -$ sox hello.wav --channels=1 --rate 16k --bits 16 test.raw -``` +# Usage -## Usage +## 음성파일 인식 +파일을 텍스트로 통째로 변환하는 방식은 아래와 같이 진행한다. ```bash $ python3 speech.py Transcript: 안녕 하세요 좋은 아침입니다 -$ +``` + +다음은 파일을 `streaming` 방식으로 변환하는 방식이다. +```bash $ python3 speech_streaming.py ==================== transcript: 안녕 하세요 좋은 아침입니다 @@ -41,4 +40,5 @@ https://stackoverflow.com/a/33821084/4599185 ```bash $ python3 transcribe_streaming_mic.py ``` + 이 코드는 `googlecloudplatform` 에 등록되어있는 예제 코드를 *language* 만 변경해서 사용한 것이므로 참고하도록 한다. diff --git a/speech.py b/speech.py index 4ad0fae..60eacd1 100644 --- a/speech.py +++ b/speech.py @@ -2,13 +2,26 @@ # -*- coding: utf-8 -*- import io import os +import sys +import argparse +from opts import add_basic_args + +parser = argparse.ArgumentParser(description='speech to text') +parser = add_basic_args(parser) +args = parser.parse_args() + +if args.language_code not in ('ko-KR', 'en-US'): + raise ValueError('Unknown language-code') + +if not os.path.isfile(args.audio_path): + raise ValueError('No such file: ', args.audio_path) # Imports the Google Cloud client library try: from google.cloud import speech except ImportError: - print("Error speech import error") - exit(255) + raise ImportError('Error import speech error') + from google.cloud.speech import enums from google.cloud.speech import types @@ -18,7 +31,7 @@ # The name of the audio file to transcribe file_name = os.path.join( os.path.dirname(__file__), - 'test.raw') + args.audio_path) # Loads the audio into memory with io.open(file_name, 'rb') as audio_file: @@ -28,11 +41,11 @@ config = types.RecognitionConfig( encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, - language_code='ko-KR') + language_code=args.language_code) # Detects speech in the audio file response = client.recognize(config, audio) alternatives = response.results[0].alternatives for alternative in alternatives: - print('Transcript: {}'.format(alternative.transcript)) + print('{} : {}'.format(args.audio_path, alternative.transcript)) diff --git a/speech_streaming.py b/speech_streaming.py index 2f0c010..7aeb8ed 100644 --- a/speech_streaming.py +++ b/speech_streaming.py @@ -2,24 +2,36 @@ # -*- coding: utf-8 -*- import io import os +import argparse +from opts import add_basic_args + +parser = argparse.ArgumentParser(description='speech to text') +parser = add_basic_args(parser) +args = parser.parse_args() + +if args.language_code not in ('ko-KR', 'en-US'): + raise ValueError('Unknown language-code') + +if not os.path.isfile(args.audio_path): + raise ValueError('No such file: ', args.audio_path) # Imports the Google Cloud client library try: from google.cloud import speech except ImportError: - print("Error speech import error") - exit(255) + raise ImportError('Error import speech error') + from google.cloud.speech import types # Instantiates a client client = speech.SpeechClient() config = types.RecognitionConfig( encoding='LINEAR16', - language_code='ko-KR', + language_code=args.language_code, sample_rate_hertz=16000) # Loads the audio into memory -with io.open("./test.raw", 'rb') as stream: +with io.open(args.audio_path, 'rb') as stream: requests = [types.StreamingRecognizeRequest( audio_content=stream.read(), )] diff --git a/transcribe_streaming_mic.py b/transcribe_streaming_mic.py index 9d5528a..33d9d41 100644 --- a/transcribe_streaming_mic.py +++ b/transcribe_streaming_mic.py @@ -36,12 +36,19 @@ from google.cloud.speech import types import pyaudio from six.moves import queue +import argparse # [END import_libraries] # Audio recording parameters RATE = 16000 CHUNK = int(RATE / 10) # 100ms +parser = argparse.ArgumentParser(description='speech to text') +parser.add_argument('--language-code', default='ko-KR', help='Language code. ( ko-KR, en-US, etc.. )') +args = parser.parse_args() + +if args.language_code not in ('ko-KR', 'en-US'): + raise ValueError('Unknown language-code') class MicrophoneStream(object): """Opens a recording stream as a generator yielding the audio chunks.""" @@ -165,7 +172,7 @@ def listen_print_loop(responses): def main(): # See http://g.co/cloud/speech/docs/languages # for a list of supported languages. - language_code = 'ko-KR' # a BCP-47 language tag + language_code = args.language_code # a BCP-47 language tag client = speech.SpeechClient() config = types.RecognitionConfig(