Merge pull request microsoft#199 from microsoft/speech
add Speech and code refactor
davidxw authored Oct 3, 2023
2 parents 9fdcfaf + 17d4be4 commit c473210
Showing 29 changed files with 867 additions and 302 deletions.
6 changes: 4 additions & 2 deletions docs/7-environment-variables.md
@@ -23,5 +23,7 @@ Below are the required environment variables, to be added to the Azure Portal or
| `AZURE_SEARCH_NAME` | `https://AZURE_SEARCH_NAME.search.windows.net` | The name of your Azure Cognitive Search service |
| `AZURE_SEARCH_INDEX_NAME` | | The index name with [vector search](https://learn.microsoft.com/en-us/azure/search/vector-search-overview) enabled |
| `AZURE_SEARCH_API_VERSION` | `2023-07-01-Preview` | An API version that supports vector search, such as `2023-07-01-Preview` |
| `AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT` | `https://REGION.api.cognitive.microsoft.com/` | Endpoint URL of your Azure Document Intelligence resource; REGION matches your resource's location |
| `AZURE_DOCUMENT_INTELLIGENCE_KEY` | | API key of your Azure Document Intelligence resource |
| `AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT` | `https://NAME.api.cognitive.microsoft.com/` | Endpoint URL of your Azure Document Intelligence resource; NAME is the custom subdomain of your resource |
| `AZURE_SPEECH_REGION` | `australiaeast` | Region of your Azure Speech service |
| `AZURE_SPEECH_KEY` | | API key of your Azure Speech service |
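For reference, a minimal sketch of how the two new speech variables might be consumed from application code, using `SpeechConfig.fromSubscription` from the `microsoft-cognitiveservices-speech-sdk` package. The `getSpeechConfig` helper name is illustrative and not part of this commit:

import { SpeechConfig } from "microsoft-cognitiveservices-speech-sdk";

// Illustrative helper (not in this commit): builds a SpeechConfig from the
// two environment variables documented above.
export const getSpeechConfig = (): SpeechConfig => {
  const key = process.env.AZURE_SPEECH_KEY;
  const region = process.env.AZURE_SPEECH_REGION;
  if (!key || !region) {
    throw new Error("AZURE_SPEECH_KEY and AZURE_SPEECH_REGION must be set");
  }
  return SpeechConfig.fromSubscription(key, region);
};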
34 changes: 33 additions & 1 deletion infra/resources.bicep
@@ -13,6 +13,7 @@ param embeddingDeploymentName string = 'text-embedding-ada-002'
param embeddingDeploymentCapacity int = 30
param embeddingModelName string = 'text-embedding-ada-002'

param speechServiceSkuName string = 'S0'
param formRecognizerSkuName string = 'S0'
param searchServiceSkuName string = 'standard'
param searchServiceIndexName string = 'azure-chat'
@@ -27,6 +28,7 @@ param tags object = {}

var openai_name = toLower('${name}ai${resourceToken}')
var form_recognizer_name = toLower('${name}-form-${resourceToken}')
var speech_service_name = toLower('${name}-speech-${resourceToken}')
var cosmos_name = toLower('${name}-cosmos-${resourceToken}')
var search_name = toLower('${name}search${resourceToken}')
var webapp_name = toLower('${name}-webapp-${resourceToken}')
@@ -127,7 +129,7 @@ resource webApp 'Microsoft.Web/sites@2020-06-01' = {
}
{
name: 'AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT'
value: 'https://${location}.api.cognitive.microsoft.com/'
value: 'https://${form_recognizer_name}.cognitiveservices.azure.com/'
}
{
name: 'SCM_DO_BUILD_DURING_DEPLOYMENT'
@@ -161,6 +163,14 @@ resource webApp 'Microsoft.Web/sites@2020-06-01' = {
name: 'NEXTAUTH_URL'
value: 'https://${webapp_name}.azurewebsites.net'
}
{
name: 'AZURE_SPEECH_REGION'
value: resourceGroup().location
}
{
name: 'AZURE_SPEECH_KEY'
value: '@Microsoft.KeyVault(VaultName=${kv.name};SecretName=${kv::AZURE_SPEECH_KEY.name})'
}
]
}
}
@@ -236,6 +246,15 @@ resource kv 'Microsoft.KeyVault/vaults@2021-06-01-preview' = {
}
}

resource AZURE_SPEECH_KEY 'secrets' = {
name: 'AZURE-SPEECH-KEY'
properties: {
contentType: 'text/plain'
value: speechService.listKeys().key1
}
}


resource AZURE_SEARCH_API_KEY 'secrets' = {
name: 'AZURE-SEARCH-API-KEY'
properties: {
@@ -351,5 +370,18 @@ resource deployment 'Microsoft.CognitiveServices/accounts/deployments@2023-05-01
}
}]

resource speechService 'Microsoft.CognitiveServices/accounts@2023-05-01' = {
name: speech_service_name
location: location
tags: tags
kind: 'SpeechServices'
properties: {
customSubDomainName: speech_service_name
publicNetworkAccess: 'Enabled'
}
sku: {
name: speechServiceSkuName
}
}

output url string = 'https://${webApp.properties.defaultHostName}'
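A note on the `AZURE_SPEECH_KEY` app setting added above: the `@Microsoft.KeyVault(VaultName=...;SecretName=...)` value is an App Service Key Vault reference. App Service resolves it at startup and injects the secret as a plain environment variable, so the application code never calls Key Vault directly. A minimal sketch of the consuming side (the guard is illustrative, not from this commit):

// In the deployed web app the value is already resolved by App Service;
// in local dev it comes from .env (see src/.env.example below).
const speechKey = process.env.AZURE_SPEECH_KEY;
if (!speechKey) {
  throw new Error("AZURE_SPEECH_KEY is not set");
}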
8 changes: 6 additions & 2 deletions src/.env.example
@@ -42,5 +42,9 @@ AZURE_SEARCH_INDEX_NAME=
AZURE_SEARCH_API_VERSION="2023-07-01-Preview"

# Azure AI Document Intelligence to extract content from your data
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT="https://REGION.api.cognitive.microsoft.com/"
AZURE_DOCUMENT_INTELLIGENCE_KEY=
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT="https://NAME.api.cognitive.microsoft.com/"
AZURE_DOCUMENT_INTELLIGENCE_KEY=

# Azure Speech to Text to convert audio to text
AZURE_SPEECH_REGION=""
AZURE_SPEECH_KEY=""
7 changes: 6 additions & 1 deletion src/app/chat/[id]/page.tsx
@@ -1,5 +1,6 @@
import { FindAllChats } from "@/features/chat/chat-services/chat-service";
import { FindChatThreadByID } from "@/features/chat/chat-services/chat-thread-service";
import { ChatProvider } from "@/features/chat/chat-ui/chat-context";
import { ChatUI } from "@/features/chat/chat-ui/chat-ui";
import { notFound } from "next/navigation";

@@ -15,5 +16,9 @@ export default async function Home({ params }: { params: { id: string } }) {
notFound();
}

return <ChatUI chats={items} chatThread={thread[0]} />;
return (
<ChatProvider id={params.id} chats={items} chatThread={thread[0]}>
<ChatUI />
</ChatProvider>
);
}
42 changes: 27 additions & 15 deletions src/components/chat/chat-input.tsx
@@ -1,21 +1,25 @@
import { Microphone } from "@/features/chat/chat-speech/microphone";
import { useSpeechContext } from "@/features/chat/chat-speech/speech-context";
import { useChatContext } from "@/features/chat/chat-ui/chat-context";
import { Loader, Send } from "lucide-react";
import { FC, FormEvent, useRef, useState } from "react";
import { FC, FormEvent, useEffect, useRef, useState } from "react";
import { Button } from "../ui/button";
import { Textarea } from "../ui/textarea";

interface Props {
value: string;
handleSubmit: (e: FormEvent<HTMLFormElement>) => void;
handleInputChange: (e: any) => void;
isLoading: boolean;
}
interface Props {}

const ChatInput: FC<Props> = (props) => {
const { setInput, handleSubmit, isLoading } = useChatContext();

const buttonRef = useRef<HTMLButtonElement>(null);
const [rows, setRows] = useState(1);

const maxRows = 6;

const [keysPressed, setKeysPressed] = useState(new Set());

const { speech, setSpeechText } = useSpeechContext();

const onKeyDown = (event: React.KeyboardEvent<HTMLTextAreaElement>) => {
setKeysPressed(keysPressed.add(event.key));

Expand All @@ -34,10 +38,11 @@ const ChatInput: FC<Props> = (props) => {
}
};

const handleSubmit = (e: FormEvent<HTMLFormElement>) => {
const submit = (e: FormEvent<HTMLFormElement>) => {
e.preventDefault();
props.handleSubmit(e);
handleSubmit(e);
setRows(1);
setSpeechText("");
};

const onKeyUp = (event: React.KeyboardEvent<HTMLTextAreaElement>) => {
Expand All @@ -47,7 +52,8 @@ const ChatInput: FC<Props> = (props) => {

const onChange = (event: React.ChangeEvent<HTMLTextAreaElement>) => {
setRowsToMax(event.target.value.split("\n").length - 1);
props.handleInputChange(event);
setInput(event.target.value);
setSpeechText(event.target.value);
};

const setRowsToMax = (rows: number) => {
Expand All @@ -56,30 +62,36 @@ const ChatInput: FC<Props> = (props) => {
}
};

// TODO: this is a temp fix. Move the useChat into a context and reuse that context here
useEffect(() => {
setInput(speech);
}, [speech]);

return (
<form
onSubmit={handleSubmit}
onSubmit={submit}
className="absolute bottom-0 w-full flex items-center"
>
<div className="container mx-auto max-w-4xl relative py-2 flex gap-2 items-end">
<Textarea
rows={rows}
value={speech}
placeholder="Send a message"
className="min-h-fit bg-background shadow-sm resize-none py-4"
value={props.value}
className="min-h-fit bg-background shadow-sm resize-none py-4 pr-[80px]"
onKeyUp={onKeyUp}
onKeyDown={onKeyDown}
onChange={onChange}
></Textarea>
<div className="absolute right-0 bottom-0 px-8 flex items-end h-full mr-2 mb-4">
<Microphone disabled={isLoading} />
<Button
size="icon"
type="submit"
variant={"ghost"}
ref={buttonRef}
disabled={props.isLoading}
disabled={isLoading}
>
{props.isLoading ? (
{isLoading ? (
<Loader className="animate-spin" size={16} />
) : (
<Send size={16} />
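The `useChatContext` hook consumed above lives in the new `chat-ui/chat-context` module, which this diff does not show. Based on the imports here and the `ChatProvider` usage in `page.tsx`, a minimal sketch of the implied shape; every name beyond those imported above is an assumption:

import { useChat } from "ai/react";
import React, { FormEvent, createContext, useContext } from "react";

// Assumed shape, inferred from how chat-input.tsx and page.tsx use it.
interface ChatContextProps {
  setInput: (input: string) => void;
  handleSubmit: (e: FormEvent<HTMLFormElement>) => void;
  isLoading: boolean;
}

const ChatContext = createContext<ChatContextProps | null>(null);

export const ChatProvider = (props: {
  id: string;
  children: React.ReactNode;
}) => {
  // The real provider also receives chats and chatThread; elided here.
  const { setInput, handleSubmit, isLoading } = useChat({ id: props.id });
  return (
    <ChatContext.Provider value={{ setInput, handleSubmit, isLoading }}>
      {props.children}
    </ChatContext.Provider>
  );
};

export const useChatContext = () => {
  const context = useContext(ChatContext);
  if (!context) throw new Error("ChatContext is null");
  return context;
};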
15 changes: 4 additions & 11 deletions src/features/chat/chat-services/chat-document-service.ts
@@ -7,20 +7,18 @@ import {
AzureKeyCredential,
DocumentAnalysisClient,
} from "@azure/ai-form-recognizer";
import { SqlQuerySpec } from "@azure/cosmos";
import { Document } from "langchain/document";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { nanoid } from "nanoid";
import {
CHAT_DOCUMENT_ATTRIBUTE,
ChatDocumentModel,
ChatMessageModel,
FaqDocumentIndex,
MESSAGE_ATTRIBUTE,
ServerActionResponse,
} from "./models";
import { isNotNullOrEmpty } from "./utils";
import { SqlQuerySpec } from "@azure/cosmos";

const MAX_DOCUMENT_SIZE = 20000000;

@@ -105,12 +103,9 @@ const SplitDocuments = async (docs: Array<Document>) => {

export const DeleteDocuments = async (chatThreadId: string) => {
try {

const vectorStore = initAzureSearchVectorStore();
await vectorStore.deleteDocuments(chatThreadId);

} catch (e) {
console.log("************");
return {
success: false,
error: (e as Error).message,
@@ -126,6 +121,7 @@ export const IndexDocuments = async (
): Promise<ServerActionResponse<FaqDocumentIndex[]>> => {
try {
const vectorStore = initAzureSearchVectorStore();

const documentsToIndex: FaqDocumentIndex[] = [];
let index = 0;
for (const doc of docs) {
@@ -143,14 +139,14 @@ }
}

await vectorStore.addDocuments(documentsToIndex);

await UpsertChatDocument(fileName, chatThreadId);
return {
success: true,
error: "",
response: documentsToIndex,
};
} catch (e) {
console.log("************");
return {
success: false,
error: (e as Error).message,
Expand All @@ -175,10 +171,7 @@ export const initAzureSearchVectorStore = () => {
export const initDocumentIntelligence = () => {
const client = new DocumentAnalysisClient(
process.env.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT,
new AzureKeyCredential(process.env.AZURE_DOCUMENT_INTELLIGENCE_KEY),
{
apiVersion: "2022-08-31",
}
new AzureKeyCredential(process.env.AZURE_DOCUMENT_INTELLIGENCE_KEY)
);

return client;
21 changes: 21 additions & 0 deletions src/features/chat/chat-speech/microphone.tsx
@@ -0,0 +1,21 @@
import { FC } from "react";
import { RecordSpeech } from "./record-speech";
import { useSpeechContext } from "./speech-context";
import { StopSpeech } from "./stop-speech";

interface MicrophoneProps {
disabled: boolean;
}

export const Microphone: FC<MicrophoneProps> = (props) => {
const { isPlaying } = useSpeechContext();
return (
<>
{isPlaying ? (
<StopSpeech disabled={props.disabled} />
) : (
<RecordSpeech disabled={props.disabled} />
)}
</>
);
};
39 changes: 39 additions & 0 deletions src/features/chat/chat-speech/record-speech.tsx
@@ -0,0 +1,39 @@
import { Button } from "@/components/ui/button";
import { Mic } from "lucide-react";
import { FC, useState } from "react";
import { useSpeechContext } from "./speech-context";

interface Prop {
disabled: boolean;
}

export const RecordSpeech: FC<Prop> = (props) => {
const [isPressed, setIsPressed] = useState(false);

const { startRecognition, stopRecognition } = useSpeechContext();

const handleMouseDown = async () => {
await startRecognition();
setIsPressed(true);
};

const handleMouseUp = () => {
stopRecognition();
setIsPressed(false);
};

return (
<Button
type="button"
size="icon"
variant={"ghost"}
disabled={props.disabled}
onMouseDown={handleMouseDown}
onMouseUp={handleMouseUp}
onMouseLeave={handleMouseUp}
className={isPressed ? "bg-red-400 hover:bg-red-400" : ""}
>
<Mic size={18} />
</Button>
);
};
42 changes: 42 additions & 0 deletions src/features/chat/chat-speech/speech-context.tsx
@@ -0,0 +1,42 @@
import React, { createContext } from "react";
import { useSpeechRecognizer } from "./use-speech-recognizer";
import { useSpeechSynthesizer } from "./use-speech-synthesizer";

interface SpeechContextProps {
textToSpeech: (textToSpeak: string) => Promise<void>;
stopPlaying: () => void;
isPlaying: boolean;
startRecognition: () => void;
stopRecognition: () => void;
speech: string;
setSpeechText: (text: string) => void;
resetMicrophoneUsed: () => void;
isMicrophoneUsed: boolean;
}

const SpeechContext = createContext<SpeechContextProps | null>(null);

export const SpeechProvider = ({ children }: { children: React.ReactNode }) => {
const speechSynthesizer = useSpeechSynthesizer();
const speechRecognizer = useSpeechRecognizer();

return (
<SpeechContext.Provider
value={{
...speechSynthesizer,
...speechRecognizer,
}}
>
{children}
</SpeechContext.Provider>
);
};

export const useSpeechContext = () => {
const context = React.useContext(SpeechContext);
if (!context) {
throw new Error("SpeechContext is null");
}

return context;
};
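The `useSpeechRecognizer` and `useSpeechSynthesizer` hooks spread into the provider above are not included in this diff. A minimal sketch of the recognizer side, assuming the key and region are exposed to the client via `NEXT_PUBLIC_` variables (the real implementation may instead fetch a short-lived token from a server action); the file name and internals are assumptions:

import {
  AudioConfig,
  SpeechConfig,
  SpeechRecognizer,
} from "microsoft-cognitiveservices-speech-sdk";
import { useRef, useState } from "react";

// Assumed sketch of use-speech-recognizer.ts (not shown in this diff).
export const useSpeechRecognizer = () => {
  const [speech, setSpeech] = useState("");
  const [isMicrophoneUsed, setIsMicrophoneUsed] = useState(false);
  const recognizerRef = useRef<SpeechRecognizer | null>(null);

  const startRecognition = () => {
    const speechConfig = SpeechConfig.fromSubscription(
      process.env.NEXT_PUBLIC_AZURE_SPEECH_KEY!, // assumption: client-side key
      process.env.NEXT_PUBLIC_AZURE_SPEECH_REGION!
    );
    const audioConfig = AudioConfig.fromDefaultMicrophoneInput();
    const recognizer = new SpeechRecognizer(speechConfig, audioConfig);

    // Append each recognized phrase to the accumulated transcript.
    recognizer.recognized = (_sender, event) => {
      if (event.result.text) {
        setSpeech((prev) => (prev ? prev + " " : "") + event.result.text);
      }
    };

    recognizer.startContinuousRecognitionAsync();
    recognizerRef.current = recognizer;
    setIsMicrophoneUsed(true);
  };

  const stopRecognition = () => {
    recognizerRef.current?.stopContinuousRecognitionAsync();
  };

  const setSpeechText = (text: string) => setSpeech(text);
  const resetMicrophoneUsed = () => setIsMicrophoneUsed(false);

  return {
    startRecognition,
    stopRecognition,
    speech,
    setSpeechText,
    resetMicrophoneUsed,
    isMicrophoneUsed,
  };
};

The synthesizer side (`textToSpeech`, `stopPlaying`, `isPlaying`) would follow the same pattern around the SDK's `SpeechSynthesizer.speakTextAsync`.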