Skip to content

Commit

Permalink
Feature/Add Azure Cognitive speech-to-text functionality (#3718)
Browse files Browse the repository at this point in the history
* feat: Add Azure Cognitive Services integration for speech-to-text functionality

- Introduced a new credential class for Azure Cognitive Services.
- Updated speech-to-text processing to support Azure Cognitive Services as a provider.
- Enhanced UI components to include Azure Cognitive Services options and inputs for configuration.
- Added necessary imports and error handling for Azure API requests.

* Update SpeechToText.jsx linting

* refactor: Update audio file handling in SpeechToText component

- Removed the dependency on 'form-data' and replaced it with a Blob for audio file uploads.
- Simplified the audio file appending process to the form data.
- Cleaned up the headers in the Axios request by removing unnecessary form data headers.

This change enhances the efficiency of audio file processing in the speech-to-text functionality.

---------

Co-authored-by: Henry Heng <[email protected]>
Co-authored-by: Henry <[email protected]>
  • Loading branch information
3 people authored Dec 18, 2024
1 parent fff6319 commit 2360f5f
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import { INodeParams, INodeCredential } from '../src/Interface'

/**
 * Credential definition for Azure Cognitive Services.
 *
 * Declares the three values a user supplies when creating this credential:
 * the subscription key (rendered as a password field), the service region
 * and the API version (pre-filled with '2024-05-15-preview').
 */
class AzureCognitiveServices implements INodeCredential {
    label: string
    name: string
    version: number
    inputs: INodeParams[]

    constructor() {
        // Secret used to authenticate against the service.
        const subscriptionKeyInput: INodeParams = {
            label: 'Azure Subscription Key',
            name: 'azureSubscriptionKey',
            type: 'password',
            description: 'Your Azure Cognitive Services subscription key'
        }

        // Region name, e.g. "westus".
        const serviceRegionInput: INodeParams = {
            label: 'Service Region',
            name: 'serviceRegion',
            type: 'string',
            description: 'The Azure service region (e.g., "westus", "eastus")',
            placeholder: 'westus'
        }

        // API version string; the only input that carries a default.
        const apiVersionInput: INodeParams = {
            label: 'API Version',
            name: 'apiVersion',
            type: 'string',
            description: 'The API version to use (e.g., "2024-05-15-preview")',
            placeholder: '2024-05-15-preview',
            default: '2024-05-15-preview'
        }

        this.label = 'Azure Cognitive Services'
        this.name = 'azureCognitiveServices'
        this.version = 1.0
        this.inputs = [subscriptionKeyInput, serviceRegionInput, apiVersionInput]
    }
}

module.exports = { credClass: AzureCognitiveServices }
36 changes: 36 additions & 0 deletions packages/components/src/speechToText.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@ import { getCredentialData } from './utils'
import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai'
import { AssemblyAI } from 'assemblyai'
import { getFileFromStorage } from './storageUtils'
import axios from 'axios'
import Groq from 'groq-sdk'

// Identifiers of the supported speech-to-text providers. The values are compared
// against in the `case` labels of the provider switch in this file (e.g.
// `case SpeechToTextType.AZURE_COGNITIVE`), and the UI component SpeechToText.jsx
// declares a constant with the same keys and values, so the two must stay in sync.
const SpeechToTextType = {
OPENAI_WHISPER: 'openAIWhisper',
ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
LOCALAI_STT: 'localAISTT',
AZURE_COGNITIVE: 'azureCognitive',
GROQ_WHISPER: 'groqWhisper'
}

Expand Down Expand Up @@ -72,6 +74,40 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
}
break
}
case SpeechToTextType.AZURE_COGNITIVE: {
    // Sends the uploaded audio to the Azure speech-to-text `transcriptions:transcribe`
    // endpoint of the configured region and returns the text of the first combined
    // phrase, or '' when the response contains none.
    try {
        const baseUrl = `https://${credentialData.serviceRegion}.cognitiveservices.azure.com/speechtotext/transcriptions:transcribe`
        const apiVersion = credentialData.apiVersion || '2024-05-15-preview'

        // Multipart body: the raw audio plus a JSON "definition" part describing the job.
        const formData = new FormData()
        const audioBlob = new Blob([audio_file], { type: upload.type })
        formData.append('audio', audioBlob, upload.name)

        // Parse the comma-separated channel list. With a bare `.map(Number)` a blank
        // segment (e.g. a trailing comma) becomes channel 0 and a non-numeric segment
        // becomes NaN, which JSON.stringify emits as null. Keep only non-negative
        // integers and fall back to the default channels when nothing valid remains.
        const parsedChannels = String(speechToTextConfig.channels || '0,1')
            .split(',')
            .map((segment) => segment.trim())
            .filter((segment) => segment !== '')
            .map(Number)
            .filter((channel) => Number.isInteger(channel) && channel >= 0)
        const channels = parsedChannels.length > 0 ? parsedChannels : [0, 1]

        const definition = {
            locales: [speechToTextConfig.language || 'en-US'],
            profanityFilterMode: speechToTextConfig.profanityFilterMode || 'Masked',
            channels
        }
        formData.append('definition', JSON.stringify(definition))

        const response = await axios.post(`${baseUrl}?api-version=${apiVersion}`, formData, {
            headers: {
                'Ocp-Apim-Subscription-Key': credentialData.azureSubscriptionKey,
                Accept: 'application/json'
            }
        })

        // `combinedPhrases` may be missing from the payload; optional chaining treats
        // that like an empty result instead of raising a TypeError on `.length`.
        return response.data?.combinedPhrases?.[0]?.text || ''
    } catch (error) {
        // Surface the service's error payload when the HTTP call failed; otherwise
        // rethrow the original error unchanged.
        throw error.response?.data || error
    }
}
case SpeechToTextType.GROQ_WHISPER: {
const groqClient = new Groq({
apiKey: credentialData.groqApiKey
Expand Down
54 changes: 54 additions & 0 deletions packages/ui/src/ui-component/extended/SpeechToText.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { Dropdown } from '@/ui-component/dropdown/Dropdown'
import openAISVG from '@/assets/images/openai.svg'
import assemblyAIPng from '@/assets/images/assemblyai.png'
import localAiPng from '@/assets/images/localai.png'
import azureSvg from '@/assets/images/azure_openai.svg'
import groqPng from '@/assets/images/groq.png'

// store
Expand All @@ -31,6 +32,7 @@ const SpeechToTextType = {
OPENAI_WHISPER: 'openAIWhisper',
ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
LOCALAI_STT: 'localAISTT',
AZURE_COGNITIVE: 'azureCognitive',
GROQ_WHISPER: 'groqWhisper'
}

Expand Down Expand Up @@ -142,6 +144,58 @@ const speechToTextProviders = {
}
]
},
// Provider entry for Azure Cognitive Services: display metadata (label, icon, docs url)
// plus the configuration inputs for this provider.
[SpeechToTextType.AZURE_COGNITIVE]: {
label: 'Azure Cognitive Services',
name: SpeechToTextType.AZURE_COGNITIVE,
icon: azureSvg,
url: 'https://azure.microsoft.com/en-us/products/cognitive-services/speech-services',
inputs: [
{
label: 'Connect Credential',
name: 'credential',
type: 'credential',
// Must match the `name` of the AzureCognitiveServices credential class.
credentialNames: ['azureCognitiveServices']
},
{
// Sent as the single entry of `locales`; the server-side handler in
// packages/components/src/speechToText.ts uses 'en-US' when this is left empty.
label: 'Language',
name: 'language',
type: 'string',
description: 'The recognition language (e.g., "en-US", "es-ES")',
placeholder: 'en-US',
optional: true
},
{
// Forwarded as `profanityFilterMode`; the default here matches the
// server-side fallback of 'Masked'.
label: 'Profanity Filter Mode',
name: 'profanityFilterMode',
type: 'options',
description: 'How to handle profanity in the transcription',
options: [
{
label: 'None',
name: 'None'
},
{
label: 'Masked',
name: 'Masked'
},
{
label: 'Removed',
name: 'Removed'
}
],
default: 'Masked',
optional: true
},
{
// The server-side handler splits this string on commas and converts each
// part to a number; it falls back to '0,1' when the value is empty.
// NOTE(review): unlike the two inputs above this one is not marked
// `optional: true` even though the handler has a fallback — confirm intent.
label: 'Audio Channels',
name: 'channels',
type: 'string',
description: 'Comma-separated list of audio channels to process (e.g., "0,1")',
placeholder: '0,1',
default: '0,1'
}
]
},
[SpeechToTextType.GROQ_WHISPER]: {
label: 'Groq Whisper',
name: SpeechToTextType.GROQ_WHISPER,
Expand Down

0 comments on commit 2360f5f

Please sign in to comment.