Commit 24e7a94: integrated LangChain

ruoccofabrizio committed Mar 30, 2023
1 parent 37f01a0 commit 24e7a94
Showing 24 changed files with 720 additions and 721 deletions.
8 changes: 6 additions & 2 deletions .env.template
```diff
@@ -1,17 +1,21 @@
-OPENAI_ENGINES=text-davinci-003
+OPENAI_ENGINE=text-davinci-003
-OPENAI_EMBEDDINGS_ENGINE=text-embedding-ada-002
+OPENAI_EMBEDDINGS_ENGINE_DOC=text-embedding-ada-002
+OPENAI_EMBEDDINGS_ENGINE_QUERY=text-embedding-ada-002
 OPENAI_API_BASE=https://YOUR_AZURE_OPENAI_RESOURCE.openai.azure.com/
 OPENAI_API_KEY=YOUR_AZURE_OPENAI_API_KEY
 REDIS_ADDRESS=api
 REDIS_PORT=6379
 REDIS_PASSWORD=redis-stack-password
 REDIS_ARGS=--requirepass $REDIS_PASSWORD
 BLOB_ACCOUNT_NAME=YOUR_AZURE_BLOB_STORAGE_ACCOUNT_NAME
 BLOB_ACCOUNT_KEY=YOUR_AZURE_BLOB_STORAGE_ACCOUNT_KEY
 BLOB_CONTAINER_NAME=YOUR_AZURE_BLOB_STORAGE_CONTAINER_NAME
 QUEUE_NAME=doc-processing
 FORM_RECOGNIZER_ENDPOINT=YOUR_AZURE_FORM_RECOGNIZER_ENDPOINT
 FORM_RECOGNIZER_KEY=YOUR_AZURE_FORM_RECOGNIZER_KEY
+PAGES_PER_EMBEDDINGS=2
+CHUNK_SIZE=500
+CHUNK_OVERLAP=100
 TRANSLATE_ENDPOINT=YOUR_AZURE_TRANSLATE_ENDPOINT
 TRANSLATE_KEY=YOUR_AZURE_TRANSLATE_KEY
 TRANSLATE_REGION=YOUR_AZURE_TRANSLATE_REGION
```
23 changes: 18 additions & 5 deletions README.md
@@ -4,6 +4,16 @@ A simple web application for an OpenAI-enabled document search. This repo uses Az

![Architecture](docs/architecture.png)

# IMPORTANT NOTE (OpenAI generated)
The latest update of this repo changes the stored data format.
<br>The new format is more efficient and compatible with current standards and libraries. However, existing applications that rely on the previous format may not be able to migrate immediately.

To keep using the previous format in a running application, set your web application image tag to `fruocco/oai-embeddings:2023-03-27_25`. This pins your application to the data format that was available on March 27, 2023 (see the example below). We strongly recommend updating to the new format as soon as possible.

If you want to move to the new format, please go to:
- "Add Document" -> "Add documents in Batch" and click on "Convert all files and add embeddings" to reprocess your documents.

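For example, with the Docker option described under [Running this repo](#running-this-repo), pinning the tag looks like this (ports and env file as in the rest of this README):

```console
docker run --env-file .env -p 8080:80 fruocco/oai-embeddings:2023-03-27_25
```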

# Running this repo
You have multiple options to run the code:
- [Deploy on Azure (WebApp + Redis Stack + Batch Processing)](#deploy-on-azure-webapp--redis-stack--batch-processing)
@@ -125,32 +135,35 @@ Configure your `.env` as described in [Environment variables](#environment-variables)
Then run:

```diff
-docker run -e .env -p 8080:80 fruocco/oai-embeddings:latest
+docker run --env-file .env -p 8080:80 fruocco/oai-embeddings:latest
```

### Option 2 - Build the Docker image yourself

Configure your `.env` as described in [Environment variables](#environment-variables)

```diff
-docker build . -t your_docker_registry/your_docker_image:your_tag
-docker run -e .env -p 8080:80 your_docker_registry/your_docker_image:your_tag
+docker build . -f Dockerfile -t your_docker_registry/your_docker_image:your_tag
+docker run --env-file .env -p 8080:80 your_docker_registry/your_docker_image:your_tag
```


Note: As shown in the example builds below, you can use
- WebApp.Dockerfile to build the Web Application
- BatchProcess.Dockerfile to build the Azure Function for Batch Processing
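
For example (registry, image names, and tags are placeholders):

```console
docker build . -f WebApp.Dockerfile -t your_docker_registry/your_web_image:your_tag
docker build . -f BatchProcess.Dockerfile -t your_docker_registry/your_batch_image:your_tag
```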

## Environment variables

Here is the explanation of the parameters:

| App Setting | Value | Note |
| --- | --- | ------------- |
|OPENAI_ENGINES|text-davinci-003|Instruction engines deployed in your Azure OpenAI resource|
|OPENAI_ENGINE|text-davinci-003|Instruction engine deployed in your Azure OpenAI resource|
|OPENAI_EMBEDDINGS_ENGINE_DOC | text-embedding-ada-002 | Embedding engine for documents deployed in your Azure OpenAI resource|
|OPENAI_EMBEDDINGS_ENGINE_QUERY | text-embedding-ada-002 | Embedding engine for query deployed in your Azure OpenAI resource|
|OPENAI_API_BASE | https://YOUR_AZURE_OPENAI_RESOURCE.openai.azure.com/ | The endpoint of your Azure OpenAI resource. Get it in the [Azure Portal](https://portal.azure.com)|
|OPENAI_API_KEY| YOUR_AZURE_OPENAI_KEY | Your Azure OpenAI API Key. Get it in the [Azure Portal](https://portal.azure.com)|
|REDIS_ADDRESS| api | URL for Redis Stack: "api" for docker compose|
|REDIS_PORT | 6379 | Port for Redis |
|REDIS_PASSWORD| redis-stack-password | OPTIONAL - Password for your Redis Stack|
|REDIS_ARGS | --requirepass redis-stack-password | OPTIONAL - Password for your Redis Stack|
|CONVERT_ADD_EMBEDDINGS_URL| http://batch/api/BatchStartProcessing | URL for Batch processing Function: "http://batch/api/BatchStartProcessing" for docker compose |
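
For an Azure Web App deployment, one way to apply these settings is with the Azure CLI (a sketch, not repo-specific tooling; resource group, app name, and values are placeholders):

```console
az webapp config appsettings set -g <resource-group> -n <app-name> --settings \
  OPENAI_ENGINE="text-davinci-003" \
  OPENAI_EMBEDDINGS_ENGINE_DOC="text-embedding-ada-002" \
  OPENAI_EMBEDDINGS_ENGINE_QUERY="text-embedding-ada-002" \
  OPENAI_API_BASE="https://YOUR_AZURE_OPENAI_RESOURCE.openai.azure.com/" \
  OPENAI_API_KEY="YOUR_AZURE_OPENAI_KEY"
```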
39 changes: 11 additions & 28 deletions code/BatchPushResults/__init__.py
```diff
@@ -1,41 +1,24 @@
-import logging, json, os, io
+import logging, json
 import azure.functions as func
-from azure.storage.blob import BlobServiceClient, generate_blob_sas
-from datetime import datetime, timedelta
-from utilities.formrecognizer import analyze_read
-from utilities.azureblobstorage import upload_file, upsert_blob_metadata
-from utilities.redisembeddings import set_document
-from utilities.utils import chunk_and_embed
-from utilities.utils import add_embeddings, convert_file_and_add_embeddings, initialize
-
-account_name = os.environ['BLOB_ACCOUNT_NAME']
-account_key = os.environ['BLOB_ACCOUNT_KEY']
-connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
-container_name = os.environ['BLOB_CONTAINER_NAME']
+from utilities.helper import LLMHelper
 
 def main(msg: func.QueueMessage) -> None:
     logging.info('Python queue trigger function processed a queue item: %s',
                 msg.get_body().decode('utf-8'))
 
-    # Set up Azure OpenAI connection
-    initialize()
-
+    # Set up LLM Helper
+    llm_helper = LLMHelper()
     # Get the file name from the message
     file_name = json.loads(msg.get_body().decode('utf-8'))['filename']
+    # Generate the SAS URL for the file
+    file_sas = llm_helper.blob_client.get_blob_sas(file_name)
 
     # Check the file extension
     if file_name.endswith('.txt'):
-        # Read the file from Blob Storage
-        blob_client = BlobServiceClient.from_connection_string(connect_str).get_blob_client(container=container_name, blob=file_name)
-        file_content = blob_client.download_blob().readall().decode('utf-8')
-
-        # Embed the file
-        data = chunk_and_embed(file_content, file_name)
-
-        # Set the document in Redis
-        set_document(data)
+        # Add the text to the embeddings
+        llm_helper.add_embeddings_lc(file_sas)
     else:
-        file_sas = generate_blob_sas(account_name, container_name, file_name, account_key=account_key, permission='r', expiry=datetime.utcnow() + timedelta(hours=1))
-        convert_file_and_add_embeddings(f"https://{account_name}.blob.core.windows.net/{container_name}/{file_name}?{file_sas}", file_name)
+        # Get OCR with Layout API and then add embeddings
+        llm_helper.convert_file_and_add_embeddings(file_sas, file_name)
 
-    upsert_blob_metadata(file_name, {'embeddings_added': 'true'})
+    llm_helper.blob_client.upsert_blob_metadata(file_name, {'embeddings_added': 'true'})
```
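
The diff calls `llm_helper.add_embeddings_lc`, whose implementation is not shown in this commit. Below is a minimal sketch of what a LangChain-based embedding step along these lines could look like, assuming the `CHUNK_SIZE`/`CHUNK_OVERLAP` settings from `.env.template` and a Redis vector store; every name in it is illustrative, not the repo's actual helper:

```python
# Hypothetical sketch of a LangChain-based "add embeddings" step.
# None of these names come from the repo; they only illustrate the flow.
import os
import requests
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.redis import Redis

def add_embeddings_lc_sketch(file_sas_url: str) -> None:
    # Download the raw text through its SAS URL
    text = requests.get(file_sas_url).text
    # Split into overlapping chunks (CHUNK_SIZE / CHUNK_OVERLAP from .env)
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=int(os.getenv("CHUNK_SIZE", 500)),
        chunk_overlap=int(os.getenv("CHUNK_OVERLAP", 100)),
    )
    docs = [Document(page_content=chunk, metadata={"source": file_sas_url})
            for chunk in splitter.split_text(text)]
    # Embed each chunk and upsert it into the Redis vector store
    Redis.from_documents(
        docs,
        OpenAIEmbeddings(),
        redis_url=f"redis://{os.getenv('REDIS_ADDRESS')}:{os.getenv('REDIS_PORT')}",
    )
```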
16 changes: 8 additions & 8 deletions code/BatchStartProcessing/__init__.py
```diff
@@ -1,21 +1,21 @@
 import logging, json, os
 import azure.functions as func
 from azure.storage.queue import QueueClient, BinaryBase64EncodePolicy
-from utilities.azureblobstorage import get_all_files
+from utilities.helper import LLMHelper
 
-account_name = os.environ['BLOB_ACCOUNT_NAME']
-account_key = os.environ['BLOB_ACCOUNT_KEY']
-connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
-container_name = os.environ['BLOB_CONTAINER_NAME']
 queue_name = os.environ['QUEUE_NAME']
 
 def main(req: func.HttpRequest) -> func.HttpResponse:
     logging.info('Requested to start processing all documents received')
-    files_data = get_all_files()
-    files_data = list(filter(lambda x : not x['embeddings_added'], files_data))
+    # Set up LLM Helper
+    llm_helper = LLMHelper()
+    # Get all files from Blob Storage
+    files_data = llm_helper.blob_client.get_all_files()
+    # Filter out files that have already been processed
+    files_data = list(filter(lambda x : not x['embeddings_added'], files_data)) if req.params.get('process_all') != 'true' else files_data
     files_data = list(map(lambda x: {'filename': x['filename']}, files_data))
     # Create the QueueClient object
-    queue_client = QueueClient.from_connection_string(connect_str, queue_name, message_encode_policy=BinaryBase64EncodePolicy())
+    queue_client = QueueClient.from_connection_string(llm_helper.blob_client.connect_str, queue_name, message_encode_policy=BinaryBase64EncodePolicy())
     # Send a message to the queue for each file
     for fd in files_data:
         queue_client.send_message(json.dumps(fd).encode('utf-8'))
```
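
`BatchStartProcessing` is HTTP-triggered, and the new `process_all` query parameter bypasses the already-processed filter. Assuming the docker-compose URL from the environment-variables table above (substitute your deployed Function endpoint), a full reprocess could presumably be triggered like:

```console
curl "http://batch/api/BatchStartProcessing?process_all=true"
```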
134 changes: 87 additions & 47 deletions code/OpenAI_Queries.py
```diff
@@ -2,16 +2,70 @@
 load_dotenv()
 
 import streamlit as st
-from urllib.error import URLError
-import pandas as pd
-from utilities import utils, translator
 import os
 import traceback
+from utilities.helper import LLMHelper
 
-df = utils.initialize(engine='davinci')
+import logging
+logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
 
+@st.cache(suppress_st_warning=True)
+def check_deployment():
+    # Check if the deployment is working
+    # 1. Check if the llm is working
+    try:
+        llm_helper = LLMHelper()
+        llm_helper.llm("Generate a joke!")
+        st.success("LLM is working!")
+    except Exception as e:
+        st.error(f"""LLM is not working.
+            Please check you have a deployment name {llm_helper.deployment_name} in your Azure OpenAI resource {llm_helper.api_base}.
+            Then restart your application.
+            """)
+        st.error(traceback.format_exc())
+    # 2. Check if the embedding is working
+    try:
+        llm_helper = LLMHelper()
+        llm_helper.embeddings.embed_documents(texts=["This is a test"])
+        st.success("Embedding is working!")
+    except Exception as e:
+        st.error(f"""Embedding model is not working.
+            Please check you have a deployment name {llm_helper.model} in your Azure OpenAI resource {llm_helper.api_base}.
+            Then restart your application.
+            """)
+        st.error(traceback.format_exc())
+    # 3. Check if the translation is working
+    try:
+        llm_helper = LLMHelper()
+        llm_helper.translator.translate("This is a test", "it")
+        st.success("Translation is working!")
+    except Exception as e:
+        st.error(f"""Translation model is not working.
+            Please check your Azure Translator key in the App Settings.
+            Then restart your application.
+            """)
+        st.error(traceback.format_exc())
+    # 4. Check if Redis is working with a previous version of the data
+    try:
+        llm_helper = LLMHelper()
+        if llm_helper.vector_store.check_existing_index("embeddings-index"):
+            st.warning("""Seems like you're using a Redis with an old data structure.
+                If you want to use the new data structure, you can start using the app and go to "Add Document" -> "Add documents in Batch" and click on "Convert all files and add embeddings" to reprocess your documents.
+                To remove this warning, please delete the index "embeddings-index" from your Redis.
+                If you prefer to use the old data structure, please change your Web App container image to point to the docker image: fruocco/oai-embeddings:2023-03-27_25.
+                """)
+        else:
+            st.success("Redis is working!")
+    except Exception as e:
+        st.error(f"""Redis is not working.
+            Please check your Redis connection string in the App Settings.
+            Then restart your application.
+            """)
+        st.error(traceback.format_exc())
 
 
 @st.cache_data()
 def get_languages():
-    return translator.get_available_languages()
+    return llm_helper.translator.get_available_languages()
 
 try:
 
@@ -21,18 +75,12 @@ def get_languages():
 
     if 'question' not in st.session_state:
         st.session_state['question'] = default_question
-    if 'prompt' not in st.session_state:
-        st.session_state['prompt'] = os.getenv("QUESTION_PROMPT", "Please reply to the question using only the information present in the text above. If you can't find it, reply 'Not in the text'.\nQuestion: _QUESTION_\nAnswer:").replace(r'\n', '\n')
+    # if 'prompt' not in st.session_state:
+    #     st.session_state['prompt'] = os.getenv("QUESTION_PROMPT", "Please reply to the question using only the information present in the text above. If you can't find it, reply 'Not in the text'.\nQuestion: _QUESTION_\nAnswer:").replace(r'\n', '\n')
     if 'response' not in st.session_state:
-        st.session_state['response'] = {
-            "choices" :[{
-                "text" : default_answer
-            }]
-        }
-    if 'limit_response' not in st.session_state:
-        st.session_state['limit_response'] = True
-    if 'full_prompt' not in st.session_state:
-        st.session_state['full_prompt'] = ""
+        st.session_state['response'] = default_answer
+    if 'context' not in st.session_state:
+        st.session_state['context'] = ""
 
     # Set page layout to wide screen and menu item
     menu_items = {
@@ -45,6 +93,8 @@
     }
     st.set_page_config(layout="wide", menu_items=menu_items)
 
+    llm_helper = LLMHelper()
+
     # Get available languages for translation
    available_languages = get_languages()
 
@@ -53,43 +103,33 @@
     st.image(os.path.join('images','microsoft.png'))
 
     col1, col2, col3 = st.columns([2,2,2])
+    with col1:
+        st.button("Check deployment", on_click=check_deployment)
     with col3:
         with st.expander("Settings"):
-            model = st.selectbox(
-                "OpenAI GPT-3 Model",
-                (os.environ['OPENAI_ENGINES'].split(','))
-            )
-            st.text_area("Prompt",height=100, key='prompt')
-            st.tokens_response = st.slider("Tokens response length", 100, 500, 400)
-            st.temperature = st.slider("Temperature", 0.0, 1.0, 0.1)
+            # model = st.selectbox(
+            #     "OpenAI GPT-3 Model",
+            #     [os.environ['OPENAI_ENGINE']]
+            # )
+            # st.text_area("Prompt",height=100, key='prompt')
+            # st.tokens_response = st.slider("Tokens response length", 100, 500, 400)
+            # st.temperature = st.slider("Temperature", 0.0, 1.0, 0.1)
             st.selectbox("Language", [None] + list(available_languages.keys()), key='translation_language')
 
 
     question = st.text_input("OpenAI Semantic Answer", default_question)
 
     if question != '':
-        if question != st.session_state['question']:
-            st.session_state['question'] = question
-            st.session_state['full_prompt'], st.session_state['response'] = utils.get_semantic_answer(df, question, st.session_state['prompt'] ,model=model, engine='davinci', limit_response=st.session_state['limit_response'], tokens_response=st.tokens_response, temperature=st.temperature)
-            st.write(f"Q: {question}")
-            st.write(st.session_state['response']['choices'][0]['text'])
-            with st.expander("Question and Answer Context"):
-                st.text(st.session_state['full_prompt'].replace('$', '\$'))
-        else:
-            st.write(f"Q: {st.session_state['question']}")
-            st.write(f"{st.session_state['response']['choices'][0]['text']}")
-            with st.expander("Question and Answer Context"):
-                st.text(st.session_state['full_prompt'].encode().decode())
+        st.session_state['question'] = question
+        st.session_state['question'], st.session_state['response'], st.session_state['context'], sources = llm_helper.get_semantic_answer_lang_chain(question, [])
+        st.markdown("Answer:" + st.session_state['response'])
+        st.markdown(f'\n\nSources: {sources}')
+        with st.expander("Question and Answer Context"):
+            st.markdown(st.session_state['context'].replace('$', '\$'))
+            st.markdown(f"SOURCES: {sources}")
 
-    if st.session_state['translation_language'] is not None:
+    if st.session_state['translation_language'] and st.session_state['translation_language'] != '':
         st.write(f"Translation to other languages, 翻译成其他语言, النص باللغة العربية")
-        st.write(f"{translator.translate(st.session_state['response']['choices'][0]['text'], available_languages[st.session_state['translation_language']])}")
+        st.write(f"{llm_helper.translator.translate(st.session_state['response'], available_languages[st.session_state['translation_language']])}")
 
-except URLError as e:
-    st.error(
-        """
-        **This demo requires internet access.**
-        Connection error: %s
-        """
-        % e.reason
-    )
+except Exception:
+    st.error(traceback.format_exc())
```
27 changes: 27 additions & 0 deletions code/pages/00_Chat.py
```diff
@@ -0,0 +1,27 @@
+import streamlit as st
+from streamlit_chat import message
+from utilities.helper import LLMHelper
+
+# Initialize chat history
+if 'chat_history' not in st.session_state:
+    st.session_state['chat_history'] = []
+if 'source_documents' not in st.session_state:
+    st.session_state['source_documents'] = []
+
+llm_helper = LLMHelper()
+
+# Chat
+input_text = st.text_input("You: ", placeholder="type your question", key="input")
+
+if input_text:
+    question = input_text
+    input_text = ""
+    question, result, _, sources = llm_helper.get_semantic_answer_lang_chain(question, st.session_state['chat_history'])
+    st.session_state['chat_history'].append((question, result))
+    st.session_state['source_documents'].append(sources)
+
+if st.session_state['chat_history']:
+    for i in range(len(st.session_state['chat_history'])-1, -1, -1):
+        message(st.session_state['chat_history'][i][1], key=str(i))
+        st.markdown(f'\n\nSources: {st.session_state["source_documents"][i]}')
+        message(st.session_state['chat_history'][i][0], is_user=True, key=str(i) + '_user')
```
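
`get_semantic_answer_lang_chain` takes the question plus the accumulated `chat_history` and returns the (possibly rephrased) question, the answer, the context, and the sources. Its implementation is not part of this diff; a minimal sketch of how such a method is commonly built on LangChain's `ConversationalRetrievalChain` might look like this (illustrative only; the repo's actual `LLMHelper` internals may differ):

```python
# Hypothetical sketch of an LLMHelper-style method; names and wiring are
# assumptions, not the repo's actual implementation.
from langchain.chains import ConversationalRetrievalChain

def get_semantic_answer_lang_chain_sketch(self, question, chat_history):
    # Condense chat history + new question into a standalone question,
    # retrieve matching chunks from the vector store, and answer from them.
    chain = ConversationalRetrievalChain.from_llm(
        llm=self.llm,
        retriever=self.vector_store.as_retriever(),
        return_source_documents=True,
    )
    result = chain({"question": question, "chat_history": chat_history})
    answer = result["answer"]
    # Collect the source documents that grounded the answer
    sources = ", ".join(doc.metadata.get("source", "") for doc in result["source_documents"])
    context = "\n".join(doc.page_content for doc in result["source_documents"])
    return question, answer, context, sources
```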