Merge pull request microsoft#199 from microsoft/speech
add Speech and code refactor
davidxw authored Oct 3, 2023
2 parents 9fdcfaf + 17d4be4 commit c473210
Showing 29 changed files with 867 additions and 302 deletions.
6 changes: 4 additions & 2 deletions docs/7-environment-variables.md
@@ -23,5 +23,7 @@ Below are the required environment variables, to be added to the Azure Portal or
| `AZURE_SEARCH_NAME` | `https://AZURE_SEARCH_NAME.search.windows.net` | The name of your Azure Cognitive Search service |
| `AZURE_SEARCH_INDEX_NAME` | | The index name with [vector search](https://learn.microsoft.com/en-us/azure/search/vector-search-overview) enabled |
| `AZURE_SEARCH_API_VERSION` | `2023-07-01-Preview` | An API version that supports vector search, such as `2023-07-01-Preview` |
| `AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT` | `https://REGION.api.cognitive.microsoft.com/` | Endpoint URL of your Azure Document Intelligence resource; REGION matches your resource's location |
| `AZURE_DOCUMENT_INTELLIGENCE_KEY` | | API key of your Azure Document Intelligence resource |
| `AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT` | `https://NAME.api.cognitive.microsoft.com/` | Endpoint URL of your Azure Document Intelligence resource; NAME is the custom subdomain of your resource |
| `AZURE_SPEECH_REGION` | `australiaeast` | Region of your Azure Speech service |
| `AZURE_SPEECH_KEY` | | API key of your Azure Speech service |
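For reference, a minimal sketch of how the two new speech variables might be consumed from application code, using `SpeechConfig.fromSubscription` from the `microsoft-cognitiveservices-speech-sdk` package. The `getSpeechConfig` helper name is illustrative and not part of this commit:

import { SpeechConfig } from "microsoft-cognitiveservices-speech-sdk";

// Illustrative helper (not in this commit): builds a SpeechConfig from the
// two environment variables documented above.
export const getSpeechConfig = (): SpeechConfig => {
  const key = process.env.AZURE_SPEECH_KEY;
  const region = process.env.AZURE_SPEECH_REGION;
  if (!key || !region) {
    throw new Error("AZURE_SPEECH_KEY and AZURE_SPEECH_REGION must be set");
  }
  return SpeechConfig.fromSubscription(key, region);
};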
34 changes: 33 additions & 1 deletion infra/resources.bicep
@@ -13,6 +13,7 @@ param embeddingDeploymentName string = 'text-embedding-ada-002'
param embeddingDeploymentCapacity int = 30
param embeddingModelName string = 'text-embedding-ada-002'

param speechServiceSkuName string = 'S0'
param formRecognizerSkuName string = 'S0'
param searchServiceSkuName string = 'standard'
param searchServiceIndexName string = 'azure-chat'
@@ -27,6 +28,7 @@ param tags object = {}

var openai_name = toLower('${name}ai${resourceToken}')
var form_recognizer_name = toLower('${name}-form-${resourceToken}')
var speech_service_name = toLower('${name}-speech-${resourceToken}')
var cosmos_name = toLower('${name}-cosmos-${resourceToken}')
var search_name = toLower('${name}search${resourceToken}')
var webapp_name = toLower('${name}-webapp-${resourceToken}')
@@ -127,7 +129,7 @@ resource webApp 'Microsoft.Web/sites@2020-06-01' = {
}
{
name: 'AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT'
value: 'https://${location}.api.cognitive.microsoft.com/'
value: 'https://${form_recognizer_name}.cognitiveservices.azure.com/'
}
{
name: 'SCM_DO_BUILD_DURING_DEPLOYMENT'
@@ -161,6 +163,14 @@ resource webApp 'Microsoft.Web/sites@2020-06-01' = {
name: 'NEXTAUTH_URL'
value: 'https://${webapp_name}.azurewebsites.net'
}
{
name: 'AZURE_SPEECH_REGION'
value: resourceGroup().location
}
{
name: 'AZURE_SPEECH_KEY'
value: '@Microsoft.KeyVault(VaultName=${kv.name};SecretName=${kv::AZURE_SPEECH_KEY.name})'
}
]
}
}
@@ -236,6 +246,15 @@ resource kv 'Microsoft.KeyVault/vaults@2021-06-01-preview' = {
}
}

resource AZURE_SPEECH_KEY 'secrets' = {
name: 'AZURE-SPEECH-KEY'
properties: {
contentType: 'text/plain'
value: speechService.listKeys().key1
}
}


resource AZURE_SEARCH_API_KEY 'secrets' = {
name: 'AZURE-SEARCH-API-KEY'
properties: {
@@ -351,5 +370,18 @@ resource deployment 'Microsoft.CognitiveServices/accounts/deployments@2023-05-01
}
}]

resource speechService 'Microsoft.CognitiveServices/accounts@2023-05-01' = {
name: speech_service_name
location: location
tags: tags
kind: 'SpeechServices'
properties: {
customSubDomainName: speech_service_name
publicNetworkAccess: 'Enabled'
}
sku: {
name: speechServiceSkuName
}
}

output url string = 'https://${webApp.properties.defaultHostName}'
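A note on the `AZURE_SPEECH_KEY` app setting added above: the `@Microsoft.KeyVault(VaultName=...;SecretName=...)` value is an App Service Key Vault reference. App Service resolves it at startup and injects the secret as a plain environment variable, so the application code never calls Key Vault directly. A minimal sketch of the consuming side (the guard is illustrative, not from this commit):

// In the deployed web app the value is already resolved by App Service;
// in local dev it comes from .env (see src/.env.example below).
const speechKey = process.env.AZURE_SPEECH_KEY;
if (!speechKey) {
  throw new Error("AZURE_SPEECH_KEY is not set");
}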
8 changes: 6 additions & 2 deletions src/.env.example
@@ -42,5 +42,9 @@ AZURE_SEARCH_INDEX_NAME=
AZURE_SEARCH_API_VERSION="2023-07-01-Preview"

# Azure AI Document Intelligence to extract content from your data
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT="https://REGION.api.cognitive.microsoft.com/"
AZURE_DOCUMENT_INTELLIGENCE_KEY=
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT="https://NAME.api.cognitive.microsoft.com/"
AZURE_DOCUMENT_INTELLIGENCE_KEY=

# Azure Speech to Text to convert audio to text
AZURE_SPEECH_REGION=""
AZURE_SPEECH_KEY=""
7 changes: 6 additions & 1 deletion src/app/chat/[id]/page.tsx
@@ -1,5 +1,6 @@
import { FindAllChats } from "@/features/chat/chat-services/chat-service";
import { FindChatThreadByID } from "@/features/chat/chat-services/chat-thread-service";
import { ChatProvider } from "@/features/chat/chat-ui/chat-context";
import { ChatUI } from "@/features/chat/chat-ui/chat-ui";
import { notFound } from "next/navigation";

@@ -15,5 +16,9 @@ export default async function Home({ params }: { params: { id: string } }) {
notFound();
}

return <ChatUI chats={items} chatThread={thread[0]} />;
return (
<ChatProvider id={params.id} chats={items} chatThread={thread[0]}>
<ChatUI />
</ChatProvider>
);
}
42 changes: 27 additions & 15 deletions src/components/chat/chat-input.tsx
@@ -1,21 +1,25 @@
import { Microphone } from "@/features/chat/chat-speech/microphone";
import { useSpeechContext } from "@/features/chat/chat-speech/speech-context";
import { useChatContext } from "@/features/chat/chat-ui/chat-context";
import { Loader, Send } from "lucide-react";
import { FC, FormEvent, useRef, useState } from "react";
import { FC, FormEvent, useEffect, useRef, useState } from "react";
import { Button } from "../ui/button";
import { Textarea } from "../ui/textarea";

interface Props {
value: string;
handleSubmit: (e: FormEvent<HTMLFormElement>) => void;
handleInputChange: (e: any) => void;
isLoading: boolean;
}
interface Props {}

const ChatInput: FC<Props> = (props) => {
const { setInput, handleSubmit, isLoading } = useChatContext();

const buttonRef = useRef<HTMLButtonElement>(null);
const [rows, setRows] = useState(1);

const maxRows = 6;

const [keysPressed, setKeysPressed] = useState(new Set());

const { speech, setSpeechText } = useSpeechContext();

const onKeyDown = (event: React.KeyboardEvent<HTMLTextAreaElement>) => {
setKeysPressed(keysPressed.add(event.key));

Expand All @@ -34,10 +38,11 @@ const ChatInput: FC<Props> = (props) => {
}
};

const handleSubmit = (e: FormEvent<HTMLFormElement>) => {
const submit = (e: FormEvent<HTMLFormElement>) => {
e.preventDefault();
props.handleSubmit(e);
handleSubmit(e);
setRows(1);
setSpeechText("");
};

const onKeyUp = (event: React.KeyboardEvent<HTMLTextAreaElement>) => {
Expand All @@ -47,7 +52,8 @@ const ChatInput: FC<Props> = (props) => {

const onChange = (event: React.ChangeEvent<HTMLTextAreaElement>) => {
setRowsToMax(event.target.value.split("\n").length - 1);
props.handleInputChange(event);
setInput(event.target.value);
setSpeechText(event.target.value);
};

const setRowsToMax = (rows: number) => {
Expand All @@ -56,30 +62,36 @@ const ChatInput: FC<Props> = (props) => {
}
};

// TODO: this is a temp fix. Move the useChat into a context and reuse that context here
useEffect(() => {
setInput(speech);
}, [speech]);

return (
<form
onSubmit={handleSubmit}
onSubmit={submit}
className="absolute bottom-0 w-full flex items-center"
>
<div className="container mx-auto max-w-4xl relative py-2 flex gap-2 items-end">
<Textarea
rows={rows}
value={speech}
placeholder="Send a message"
className="min-h-fit bg-background shadow-sm resize-none py-4"
value={props.value}
className="min-h-fit bg-background shadow-sm resize-none py-4 pr-[80px]"
onKeyUp={onKeyUp}
onKeyDown={onKeyDown}
onChange={onChange}
></Textarea>
<div className="absolute right-0 bottom-0 px-8 flex items-end h-full mr-2 mb-4">
<Microphone disabled={isLoading} />
<Button
size="icon"
type="submit"
variant={"ghost"}
ref={buttonRef}
disabled={props.isLoading}
disabled={isLoading}
>
{props.isLoading ? (
{isLoading ? (
<Loader className="animate-spin" size={16} />
) : (
<Send size={16} />
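The `useChatContext` hook consumed above lives in the new `chat-ui/chat-context` module, which this diff does not show. Based on the imports here and the `ChatProvider` usage in `page.tsx`, a minimal sketch of the implied shape; every name beyond those imported above is an assumption:

import { useChat } from "ai/react";
import React, { FormEvent, createContext, useContext } from "react";

// Assumed shape, inferred from how chat-input.tsx and page.tsx use it.
interface ChatContextProps {
  setInput: (input: string) => void;
  handleSubmit: (e: FormEvent<HTMLFormElement>) => void;
  isLoading: boolean;
}

const ChatContext = createContext<ChatContextProps | null>(null);

export const ChatProvider = (props: {
  id: string;
  children: React.ReactNode;
}) => {
  // The real provider also receives chats and chatThread; elided here.
  const { setInput, handleSubmit, isLoading } = useChat({ id: props.id });
  return (
    <ChatContext.Provider value={{ setInput, handleSubmit, isLoading }}>
      {props.children}
    </ChatContext.Provider>
  );
};

export const useChatContext = () => {
  const context = useContext(ChatContext);
  if (!context) throw new Error("ChatContext is null");
  return context;
};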
15 changes: 4 additions & 11 deletions src/features/chat/chat-services/chat-document-service.ts
@@ -7,20 +7,18 @@ import {
AzureKeyCredential,
DocumentAnalysisClient,
} from "@azure/ai-form-recognizer";
import { SqlQuerySpec } from "@azure/cosmos";
import { Document } from "langchain/document";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { nanoid } from "nanoid";
import {
CHAT_DOCUMENT_ATTRIBUTE,
ChatDocumentModel,
ChatMessageModel,
FaqDocumentIndex,
MESSAGE_ATTRIBUTE,
ServerActionResponse,
} from "./models";
import { isNotNullOrEmpty } from "./utils";
import { SqlQuerySpec } from "@azure/cosmos";

const MAX_DOCUMENT_SIZE = 20000000;

@@ -105,12 +103,9 @@ const SplitDocuments = async (docs: Array<Document>) => {

export const DeleteDocuments = async (chatThreadId: string) => {
try {

const vectorStore = initAzureSearchVectorStore();
await vectorStore.deleteDocuments(chatThreadId);

} catch (e) {
console.log("************");
return {
success: false,
error: (e as Error).message,
@@ -126,6 +121,7 @@ export const IndexDocuments = async (
): Promise<ServerActionResponse<FaqDocumentIndex[]>> => {
try {
const vectorStore = initAzureSearchVectorStore();

const documentsToIndex: FaqDocumentIndex[] = [];
let index = 0;
for (const doc of docs) {
@@ -143,14 +139,14 @@ }
}

await vectorStore.addDocuments(documentsToIndex);

await UpsertChatDocument(fileName, chatThreadId);
return {
success: true,
error: "",
response: documentsToIndex,
};
} catch (e) {
console.log("************");
return {
success: false,
error: (e as Error).message,
Expand All @@ -175,10 +171,7 @@ export const initAzureSearchVectorStore = () => {
export const initDocumentIntelligence = () => {
const client = new DocumentAnalysisClient(
process.env.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT,
new AzureKeyCredential(process.env.AZURE_DOCUMENT_INTELLIGENCE_KEY),
{
apiVersion: "2022-08-31",
}
new AzureKeyCredential(process.env.AZURE_DOCUMENT_INTELLIGENCE_KEY)
);

return client;
21 changes: 21 additions & 0 deletions src/features/chat/chat-speech/microphone.tsx
@@ -0,0 +1,21 @@
import { FC } from "react";
import { RecordSpeech } from "./record-speech";
import { useSpeechContext } from "./speech-context";
import { StopSpeech } from "./stop-speech";

interface MicrophoneProps {
disabled: boolean;
}

export const Microphone: FC<MicrophoneProps> = (props) => {
const { isPlaying } = useSpeechContext();
return (
<>
{isPlaying ? (
<StopSpeech disabled={props.disabled} />
) : (
<RecordSpeech disabled={props.disabled} />
)}
</>
);
};
39 changes: 39 additions & 0 deletions src/features/chat/chat-speech/record-speech.tsx
@@ -0,0 +1,39 @@
import { Button } from "@/components/ui/button";
import { Mic } from "lucide-react";
import { FC, useState } from "react";
import { useSpeechContext } from "./speech-context";

interface Prop {
disabled: boolean;
}

export const RecordSpeech: FC<Prop> = (props) => {
const [isPressed, setIsPressed] = useState(false);

const { startRecognition, stopRecognition } = useSpeechContext();

const handleMouseDown = async () => {
await startRecognition();
setIsPressed(true);
};

const handleMouseUp = () => {
stopRecognition();
setIsPressed(false);
};

return (
<Button
type="button"
size="icon"
variant={"ghost"}
disabled={props.disabled}
onMouseDown={handleMouseDown}
onMouseUp={handleMouseUp}
onMouseLeave={handleMouseUp}
className={isPressed ? "bg-red-400 hover:bg-red-400" : ""}
>
<Mic size={18} />
</Button>
);
};
42 changes: 42 additions & 0 deletions src/features/chat/chat-speech/speech-context.tsx
@@ -0,0 +1,42 @@
import React, { createContext } from "react";
import { useSpeechRecognizer } from "./use-speech-recognizer";
import { useSpeechSynthesizer } from "./use-speech-synthesizer";

interface SpeechContextProps {
textToSpeech: (textToSpeak: string) => Promise<void>;
stopPlaying: () => void;
isPlaying: boolean;
startRecognition: () => void;
stopRecognition: () => void;
speech: string;
setSpeechText: (text: string) => void;
resetMicrophoneUsed: () => void;
isMicrophoneUsed: boolean;
}

const SpeechContext = createContext<SpeechContextProps | null>(null);

export const SpeechProvider = ({ children }: { children: React.ReactNode }) => {
const speechSynthesizer = useSpeechSynthesizer();
const speechRecognizer = useSpeechRecognizer();

return (
<SpeechContext.Provider
value={{
...speechSynthesizer,
...speechRecognizer,
}}
>
{children}
</SpeechContext.Provider>
);
};

export const useSpeechContext = () => {
const context = React.useContext(SpeechContext);
if (!context) {
throw new Error("SpeechContext is null");
}

return context;
};
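The `useSpeechRecognizer` and `useSpeechSynthesizer` hooks spread into the provider above are not included in this diff. A minimal sketch of the recognizer side, assuming the key and region are exposed to the client via `NEXT_PUBLIC_` variables (the real implementation may instead fetch a short-lived token from a server action); the file name and internals are assumptions:

import {
  AudioConfig,
  SpeechConfig,
  SpeechRecognizer,
} from "microsoft-cognitiveservices-speech-sdk";
import { useRef, useState } from "react";

// Assumed sketch of use-speech-recognizer.ts (not shown in this diff).
export const useSpeechRecognizer = () => {
  const [speech, setSpeech] = useState("");
  const [isMicrophoneUsed, setIsMicrophoneUsed] = useState(false);
  const recognizerRef = useRef<SpeechRecognizer | null>(null);

  const startRecognition = () => {
    const speechConfig = SpeechConfig.fromSubscription(
      process.env.NEXT_PUBLIC_AZURE_SPEECH_KEY!, // assumption: client-side key
      process.env.NEXT_PUBLIC_AZURE_SPEECH_REGION!
    );
    const audioConfig = AudioConfig.fromDefaultMicrophoneInput();
    const recognizer = new SpeechRecognizer(speechConfig, audioConfig);

    // Append each recognized phrase to the accumulated transcript.
    recognizer.recognized = (_sender, event) => {
      if (event.result.text) {
        setSpeech((prev) => (prev ? prev + " " : "") + event.result.text);
      }
    };

    recognizer.startContinuousRecognitionAsync();
    recognizerRef.current = recognizer;
    setIsMicrophoneUsed(true);
  };

  const stopRecognition = () => {
    recognizerRef.current?.stopContinuousRecognitionAsync();
  };

  const setSpeechText = (text: string) => setSpeech(text);
  const resetMicrophoneUsed = () => setIsMicrophoneUsed(false);

  return {
    startRecognition,
    stopRecognition,
    speech,
    setSpeechText,
    resetMicrophoneUsed,
    isMicrophoneUsed,
  };
};

The synthesizer side (`textToSpeech`, `stopPlaying`, `isPlaying`) would follow the same pattern around the SDK's `SpeechSynthesizer.speakTextAsync`.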