-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstt.py
135 lines (110 loc) · 4.54 KB
/
stt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""Support for Wyoming speech-to-text services."""
from collections.abc import AsyncIterable
import logging
from wyoming.asr import Transcribe, Transcript
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.client import AsyncTcpClient
from homeassistant.components import stt
from homeassistant.config_entries import ConfigEntry
from homeassistant.core import HomeAssistant
from homeassistant.helpers.entity_platform import AddEntitiesCallback
from .const import DOMAIN, SAMPLE_CHANNELS, SAMPLE_RATE, SAMPLE_WIDTH
from .data import WyomingService
from .error import WyomingError
from .models import DomainDataItem
_LOGGER = logging.getLogger(__name__)
async def async_setup_entry(
hass: HomeAssistant,
config_entry: ConfigEntry,
async_add_entities: AddEntitiesCallback,
) -> None:
"""Set up Wyoming speech-to-text."""
item: DomainDataItem = hass.data[DOMAIN][config_entry.entry_id]
async_add_entities(
[
WyomingSttProvider(config_entry, item.service),
]
)
class WyomingSttProvider(stt.SpeechToTextEntity):
"""Wyoming speech-to-text provider."""
def __init__(
self,
config_entry: ConfigEntry,
service: WyomingService,
) -> None:
"""Set up provider."""
self.service = service
asr_service = service.info.asr[0]
model_languages: set[str] = set()
for asr_model in asr_service.models:
if asr_model.installed:
model_languages.update(asr_model.languages)
self._supported_languages = list(model_languages)
self._attr_name = asr_service.name
self._attr_unique_id = f"{config_entry.entry_id}-stt"
@property
def supported_languages(self) -> list[str]:
"""Return a list of supported languages."""
return self._supported_languages
@property
def supported_formats(self) -> list[stt.AudioFormats]:
"""Return a list of supported formats."""
return [stt.AudioFormats.WAV]
@property
def supported_codecs(self) -> list[stt.AudioCodecs]:
"""Return a list of supported codecs."""
return [stt.AudioCodecs.PCM]
@property
def supported_bit_rates(self) -> list[stt.AudioBitRates]:
"""Return a list of supported bitrates."""
return [stt.AudioBitRates.BITRATE_16]
@property
def supported_sample_rates(self) -> list[stt.AudioSampleRates]:
"""Return a list of supported samplerates."""
return [stt.AudioSampleRates.SAMPLERATE_16000]
@property
def supported_channels(self) -> list[stt.AudioChannels]:
"""Return a list of supported channels."""
return [stt.AudioChannels.CHANNEL_MONO]
async def async_process_audio_stream(
self, metadata: stt.SpeechMetadata, stream: AsyncIterable[bytes]
) -> stt.SpeechResult:
"""Process an audio stream to STT service."""
try:
async with AsyncTcpClient(self.service.host, self.service.port) as client:
# Set transcription language
await client.write_event(Transcribe(language=metadata.language).event())
# Begin audio stream
await client.write_event(
AudioStart(
rate=SAMPLE_RATE,
width=SAMPLE_WIDTH,
channels=SAMPLE_CHANNELS,
).event(),
)
async for audio_bytes in stream:
chunk = AudioChunk(
rate=SAMPLE_RATE,
width=SAMPLE_WIDTH,
channels=SAMPLE_CHANNELS,
audio=audio_bytes,
)
await client.write_event(chunk.event())
# End audio stream
await client.write_event(AudioStop().event())
while True:
event = await client.read_event()
if event is None:
_LOGGER.debug("Connection lost")
return stt.SpeechResult(None, stt.SpeechResultState.ERROR)
if Transcript.is_type(event.type):
transcript = Transcript.from_event(event)
text = transcript.text
break
except (OSError, WyomingError) as err:
_LOGGER.exception("Error processing audio stream: %s", err)
return stt.SpeechResult(None, stt.SpeechResultState.ERROR)
return stt.SpeechResult(
text,
stt.SpeechResultState.SUCCESS,
)