add streaming example code (langchain-ai#11)

* add streaming example code * cleanup * add gif to readme * update readme * update readme * update readme * consolidate * consolidate * fix readme * address comments * format * update requirements
kwaku · Feb 15, 2023 · 748ada7 · 748ada7
1 parent c2b10c3
commit 748ada7
Show file tree

Hide file tree

Showing 18 changed files with 635 additions and 106 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,139 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# JetBrains
+.idea
+
+*.db
+
+.DS_Store
+
+vectorstore.pkl
+langchain.readthedocs.io/
diff --git a/Makefile b/Makefile
@@ -0,0 +1,8 @@
+# Run the web app: serve `app` from main.py with uvicorn on port 9000,
+# reloading automatically when source files change.
+.PHONY: start
+start:
+	uvicorn main:app --reload --port 9000
+
+# Auto-format the whole repository in place with black, then isort.
+.PHONY: format
+format:
+	black .
+	isort .
diff --git a/README.md b/README.md
@@ -1,14 +1,26 @@
-# ChatLangChain
+# 🦜️🔗 ChatLangChain
 
-This repo is an implementation of a chatbot specifically focused on question answering over the [LangChain documentation](https://langchain.readthedocs.io/en/latest/).
+This repo is an implementation of a locally hosted chatbot specifically focused on question answering over the [LangChain documentation](https://langchain.readthedocs.io/en/latest/).
+Built with [LangChain](https://github.com/hwchase17/langchain/) and [FastAPI](https://fastapi.tiangolo.com/).
+
+The app leverages LangChain's streaming support and async API to update the page in real time for multiple users.
+
+## ✅ To run:
+1. Install dependencies: `pip install -r requirements.txt`
+1. Run `ingest.sh` to ingest LangChain docs data into the vectorstore (only needs to be done once).
+   1. You can use other [Document Loaders](https://langchain.readthedocs.io/en/latest/modules/document_loaders.html) to load your own data into the vectorstore.
+1. Run the app: `make start`
+1. (Optional) To enable tracing, make sure `langchain-server` is running locally and pass `tracing=True` to `get_chain` in `main.py` before starting the app.
+1. Open [localhost:9000](http://localhost:9000) in your browser.
 
 ## 🚀 Important Links
 
-Website: [chat.langchain.dev](https://chat.langchain.dev)
+Deployed version (to be updated soon): [chat.langchain.dev](https://chat.langchain.dev)
 
-Hugging Face Space: [huggingface.co/spaces/hwchase17/chat-langchain](https://huggingface.co/spaces/hwchase17/chat-langchain)
+Hugging Face Space (to be updated soon): [huggingface.co/spaces/hwchase17/chat-langchain](https://huggingface.co/spaces/hwchase17/chat-langchain)
 
-Blog Post: [blog.langchain.dev/langchain-chat/](https://blog.langchain.dev/langchain-chat/)
+Blog Posts: 
+* [blog.langchain.dev/langchain-chat/](https://blog.langchain.dev/langchain-chat/)
 
 ## 📚 Technical description
 
@@ -21,12 +33,8 @@ Ingestion has the following steps:
 3. Split documents with LangChain's [TextSplitter](https://langchain.readthedocs.io/en/latest/modules/utils/combine_docs_examples/textsplitter.html)
 4. Create a vectorstore of embeddings, using LangChain's [vectorstore wrapper](https://langchain.readthedocs.io/en/latest/modules/utils/combine_docs_examples/vectorstores.html) (with OpenAI's embeddings and Weaviate's vectorstore).
 
-Question-Answering has the following steps:
+Question-Answering has the following steps, all handled by [ChatVectorDBChain](https://langchain.readthedocs.io/en/latest/modules/chains/combine_docs_examples/chat_vector_db.html):
 
 1. Given the chat history and new user input, determine what a standalone question would be (using GPT-3).
 2. Given that standalone question, look up relevant documents from the vectorstore.
 3. Pass the standalone question and relevant documents to GPT-3 to generate a final answer.
-
-## 🧠 How to Extend to your documentation?
-
-Coming soon.
diff --git a/app.py → archive/app.py b/app.py → archive/app.py
@@ -4,9 +4,8 @@
 import gradio as gr
 import langchain
 import weaviate
-from langchain.vectorstores import Weaviate
-
 from chain import get_new_chain1
+from langchain.vectorstores import Weaviate
 
 WEAVIATE_URL = os.environ["WEAVIATE_URL"]
 

diff --git a/chain.py → archive/chain.py b/chain.py → archive/chain.py
@@ -19,7 +19,6 @@
 
 
 class CustomChain(Chain, BaseModel):
-
     vstore: Weaviate
     chain: BaseCombineDocumentsChain
     key_word_extractor: Chain

diff --git a/archive/ingest.py b/archive/ingest.py
@@ -0,0 +1,92 @@
+"""Load html from files, clean up, split, ingest into Weaviate."""
+import os
+from pathlib import Path
+
+import weaviate
+from bs4 import BeautifulSoup
+from langchain.text_splitter import CharacterTextSplitter
+
+
+def clean_data(data):
+    """Return the readable text of one documentation page.
+
+    Parses ``data`` as HTML, takes the text of the first
+    ``<main id="main-content">`` element and drops empty lines.
+
+    Args:
+        data: Raw HTML of the page, as a string.
+
+    Returns:
+        The non-empty lines of the main content, joined with newlines.
+
+    Raises:
+        IndexError: If the page has no ``<main id="main-content">`` element.
+    """
+    # Name the parser explicitly: without it BeautifulSoup picks whichever
+    # parser happens to be installed (and warns), so output can differ
+    # between machines.
+    soup = BeautifulSoup(data, "html.parser")
+    text = soup.find_all("main", {"id": "main-content"})[0].get_text()
+    return "\n".join([t for t in text.split("\n") if t])
+
+
+# Walk the mirrored docs site and collect the cleaned text of every file,
+# together with the path it came from (kept as the "source" metadata).
+docs = []
+metadatas = []
+for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"):
+    if p.is_dir():
+        continue
+    # Read as UTF-8 explicitly so the result does not depend on the
+    # platform's default encoding.
+    with open(p, encoding="utf-8") as f:
+        docs.append(clean_data(f.read()))
+        metadatas.append({"source": p})
+
+
+# Split each page on newlines into chunks of at most ~1000 characters,
+# with 200 characters of overlap between neighbouring chunks.
+text_splitter = CharacterTextSplitter(
+    separator="\n",
+    chunk_size=1000,
+    chunk_overlap=200,
+    length_function=len,
+)
+
+documents = text_splitter.create_documents(docs, metadatas=metadatas)
+
+
+# Connection settings come from the environment; a missing WEAVIATE_URL or
+# OPENAI_API_KEY raises KeyError here.
+WEAVIATE_URL = os.environ["WEAVIATE_URL"]
+client = weaviate.Client(
+    url=WEAVIATE_URL,
+    # The OpenAI key is forwarded to Weaviate as a request header.
+    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
+)
+
+# Start from a clean slate: drop the existing "Paragraph" class before it is
+# recreated below, so every run re-ingests from scratch.
+client.schema.delete_class("Paragraph")
+# NOTE(review): the result of this call is discarded — presumably left over
+# from debugging or used as a connectivity check; confirm before removing.
+client.schema.get()
+# Schema with a single "Paragraph" class vectorized by the text2vec-openai
+# module. It has two text properties: "content" and "source".
+schema = {
+    "classes": [
+        {
+            "class": "Paragraph",
+            "description": "A written paragraph",
+            "vectorizer": "text2vec-openai",
+            "moduleConfig": {
+                "text2vec-openai": {
+                    "model": "ada",
+                    "modelVersion": "002",
+                    "type": "text",
+                }
+            },
+            "properties": [
+                {
+                    "dataType": ["text"],
+                    "description": "The content of the paragraph",
+                    "moduleConfig": {
+                        "text2vec-openai": {
+                            "skip": False,
+                            "vectorizePropertyName": False,
+                        }
+                    },
+                    "name": "content",
+                },
+                {
+                    "dataType": ["text"],
+                    "description": "The link",
+                    "moduleConfig": {
+                        "text2vec-openai": {
+                            # skip=True presumably keeps the source link out
+                            # of the embedding — verify against Weaviate docs.
+                            "skip": True,
+                            "vectorizePropertyName": False,
+                        }
+                    },
+                    "name": "source",
+                },
+            ],
+        },
+    ]
+}
+
+client.schema.create(schema)
+
+# Upload every chunk as a "Paragraph" object through the client's batch
+# context manager; the source path is stored as a plain string.
+with client.batch as batch:
+    for chunk in documents:
+        paragraph = {
+            "content": chunk.page_content,
+            "source": str(chunk.metadata["source"]),
+        }
+        batch.add_data_object(paragraph, "Paragraph")
diff --git a/archive/ingest.sh b/archive/ingest.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Bash script to ingest data.
+# This involves scraping the docs from the web, then cleaning them up and
+# loading them into Weaviate.
+#
+# Abort on the first failing command (-e) and on use of an unset variable (-u).
+# The earlier `!set -eu` was not the `set` builtin: in a script it is looked up
+# as a command literally named "!set", so strict mode was never enabled.
+# NOTE: with -e active, a non-zero exit from wget (e.g. a failed fetch) now
+# stops the script instead of continuing with a partial mirror.
+set -eu
+# Mirror the docs recursively, keeping only .html files. wget writes them under
+# ./langchain.readthedocs.io/, which is the directory ingest.py walks.
+wget -r -A.html https://langchain.readthedocs.io/en/latest/
+python3 ingest.py
+python3 ingest_examples.py
diff --git a/ingest_examples.py → archive/ingest_examples.py b/ingest_examples.py → archive/ingest_examples.py
diff --git a/archive/requirements.txt b/archive/requirements.txt
@@ -0,0 +1,9 @@
+langchain==0.0.64
+beautifulsoup4
+weaviate-client
+openai
+black
+isort
+Flask
+transformers
+gradio
diff --git a/assets/images/Chat_Your_Data.gif b/assets/images/Chat_Your_Data.gif
diff --git a/callback.py b/callback.py
@@ -0,0 +1,33 @@
+"""Callback handlers used in the app."""
+from typing import Any, Dict, List
+
+from langchain.callbacks.base import AsyncCallbackHandler
+
+from schemas import ChatResponse
+
+
+class StreamingLLMCallbackHandler(AsyncCallbackHandler):
+    """Forward each newly generated LLM token to the client over a websocket."""
+
+    def __init__(self, websocket):
+        """Remember the websocket that tokens are sent on."""
+        self.websocket = websocket
+
+    async def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+        """Send ``token`` to the client as a bot message of type "stream"."""
+        payload = ChatResponse(sender="bot", message=token, type="stream").dict()
+        await self.websocket.send_json(payload)
+
+
+class QuestionGenCallbackHandler(AsyncCallbackHandler):
+    """Tell the client when the question-generation LLM starts running."""
+
+    def __init__(self, websocket):
+        """Remember the websocket that status messages are sent on."""
+        self.websocket = websocket
+
+    async def on_llm_start(
+        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
+    ) -> None:
+        """Send a bot "info" message announcing that the LLM has started.
+
+        ``serialized`` and ``prompts`` are accepted to match the callback
+        signature but are not used.
+        """
+        notice = ChatResponse(
+            sender="bot",
+            message="Synthesizing question...",
+            type="info",
+        )
+        payload = notice.dict()
+        await self.websocket.send_json(payload)