Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
hwchase17 committed Jan 16, 2023
0 parents commit 638f0ee
Show file tree
Hide file tree
Showing 6 changed files with 554 additions and 0 deletions.
101 changes: 101 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import datetime
import os

import gradio as gr
import langchain
import weaviate
from langchain.vectorstores import Weaviate

from chain import get_new_chain1

# URL handed to weaviate.Client below. Read at import time, so a missing
# WEAVIATE_URL environment variable raises KeyError as soon as this module loads.
WEAVIATE_URL = os.environ["WEAVIATE_URL"]


def get_weaviate_store():
    """Create the LangChain ``Weaviate`` store backed by ``WEAVIATE_URL``.

    The Weaviate client sends the current ``OPENAI_API_KEY`` environment
    variable as the ``X-OpenAI-Api-Key`` header.  The store is constructed
    with ``"Paragraph"``, ``"content"`` and ``attributes=["source"]``.

    Raises:
        KeyError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    openai_headers = {"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}
    weaviate_client = weaviate.Client(
        url=WEAVIATE_URL,
        additional_headers=openai_headers,
    )
    store = Weaviate(weaviate_client, "Paragraph", "content", attributes=["source"])
    return store


# Module-level store, built once at import time and passed to get_new_chain1
# whenever set_openai_api_key builds a chain.
vectorstore = get_weaviate_store()


def set_openai_api_key(api_key, agent):
    """Build a question-answering chain with the OpenAI key the user pasted.

    The key is placed in the ``OPENAI_API_KEY`` environment variable only
    while the chain is being constructed and is blanked again afterwards.

    Args:
        api_key: Contents of the API-key textbox; may be empty or ``None``.
        agent: Current value of the agent state.  Unused; accepted because
            the UI passes the agent state as an input.

    Returns:
        The chain returned by ``get_new_chain1(vectorstore)``, or ``None``
        when no key was supplied.
    """
    if not api_key:
        return None
    os.environ["OPENAI_API_KEY"] = api_key
    try:
        return get_new_chain1(vectorstore)
    finally:
        # Blank the key even when chain construction raises, so the user's
        # secret never lingers in the process environment.
        os.environ["OPENAI_API_KEY"] = ""


def chat(inp, history, agent):
    """Answer one user message and record the exchange in the chat history.

    Args:
        inp: The user's message.
        history: List of ``(message, answer)`` tuples from earlier turns;
            ``None`` or empty on the first turn.  Mutated in place when it
            is a non-empty list.
        agent: Callable that receives ``{"question", "chat_history"}`` and
            returns a mapping with an ``"answer"`` key, or ``None`` when no
            chain has been built yet.

    Returns:
        A 2-tuple holding the updated history twice (the same list object),
        matching the two outputs the UI wires this function to.
    """
    history = history or []
    if agent is None:
        history.append((inp, "Please paste your OpenAI key to use"))
        return history, history
    print(f"\n==== date/time: {datetime.datetime.now()} ====")
    print(f"inp: {inp}")
    # The history passed to the agent holds only the previous turns; the
    # current exchange is appended after the answer comes back.
    output = agent({"question": inp, "chat_history": history})
    history.append((inp, output["answer"]))
    print(history)
    return history, history


# Gradio UI definition: components are created inside the `with block:`
# context, event handlers are wired up, then the app is launched.
# NOTE(review): the nesting of the statements below under the `with`
# statements is not visible in this view -- confirm against the real file.
block = gr.Blocks(css=".gradio-container {background-color: lightgray}")

with block:
with gr.Row():
gr.Markdown("<h3><center>LangChain AI</center></h3>")

# Password-type textbox for the user's own OpenAI API key; its change
# event is wired to set_openai_api_key further down.
openai_api_key_textbox = gr.Textbox(
placeholder="Paste your OpenAI API key (sk-...)",
show_label=False,
lines=1,
type="password",
)

# Chat component; receives the first value returned by chat().
chatbot = gr.Chatbot()

with gr.Row():
# Question input plus a Send button; both trigger chat() below.
message = gr.Textbox(
label="What's your question?",
placeholder="What's the answer to life, the universe, and everything?",
lines=1,
)
submit = gr.Button(value="Send", variant="secondary").style(full_width=False)

# Example questions bound to the message textbox.
gr.Examples(
examples=[
"What are agents?",
"How do I summarize a long document?",
"What types of memory exist?",
],
inputs=message,
)

gr.HTML(
"""
This simple application is an implementation of ChatGPT but over an external dataset (in this case, the LangChain documentation)."""
)

gr.HTML(
"<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
)

# `state` carries the chat history passed to and returned from chat();
# `agent_state` carries the value returned by set_openai_api_key.
state = gr.State()
agent_state = gr.State()

# Both the Send button and the message textbox's submit event call chat().
submit.click(chat, inputs=[message, state, agent_state], outputs=[chatbot, state])
message.submit(chat, inputs=[message, state, agent_state], outputs=[chatbot, state])

# Changing the key textbox calls set_openai_api_key and stores its result
# in agent_state.
openai_api_key_textbox.change(
set_openai_api_key,
inputs=[openai_api_key_textbox, agent_state],
outputs=[agent_state],
)

block.launch(debug=True)
127 changes: 127 additions & 0 deletions chain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import json
import os
import pathlib
from typing import Dict, List, Tuple

import weaviate
from langchain import OpenAI, PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.base import Chain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.prompts.example_selector import \
SemanticSimilarityExampleSelector
from langchain.vectorstores import FAISS, Weaviate
from pydantic import BaseModel

# Module-level Weaviate client, created at import time. Both environment
# variables must be set before this module is imported, otherwise the
# lookups below raise KeyError.
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
client = weaviate.Client(
url=WEAVIATE_URL,
# The OpenAI key present in the environment at import time is sent as a header.
additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)

# Template used to render one few-shot example: a chat history, a follow-up
# input and the standalone question it was rewritten into.
_eg_template = """## Example:
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question: {answer}"""
_eg_prompt = PromptTemplate(
template=_eg_template,
input_variables=["chat_history", "question", "answer"],
)


# Instruction placed before the examples, and the trailing section that
# holds the actual chat history and follow-up question to be rephrased.
_prefix = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question. You should assume that the question is related to LangChain."""
_suffix = """## Example:
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
# Vector store over the "Rephrase" class, used as the source of few-shot examples.
eg_store = Weaviate(
client,
"Rephrase",
"content",
attributes=["question", "answer", "chat_history"],
)
# Selector configured with k=4 over the example store.
example_selector = SemanticSimilarityExampleSelector(vectorstore=eg_store, k=4)
prompt = FewShotPromptTemplate(
prefix=_prefix,
suffix=_suffix,
example_selector=example_selector,
example_prompt=_eg_prompt,
input_variables=["question", "chat_history"],
)
# Created at import time, independently of any key a user later pastes in the UI.
llm = OpenAI(temperature=0, model_name="text-davinci-003")
# Despite its name, this chain is run by CustomChain._call to rewrite a
# follow-up question into a standalone question.
key_word_extractor = LLMChain(llm=llm, prompt=prompt)


class CustomChain(Chain, BaseModel):
    """Chain that rephrases a follow-up question, retrieves documents and answers.

    When a chat history is supplied, the question is first rewritten by the
    module-level ``key_word_extractor`` chain.  The (possibly rewritten)
    question is used to fetch documents from ``vstore``, and ``chain``
    combines those documents into the answer.
    """

    # Vector store queried with ``similarity_search`` for the question.
    vstore: Weaviate
    # Chain whose ``combine_docs`` produces the answer from the documents.
    chain: BaseCombineDocumentsChain

    @property
    def input_keys(self) -> List[str]:
        """Return the required input names; ``chat_history`` is optional."""
        return ["question"]

    @property
    def output_keys(self) -> List[str]:
        """Return the names of the values produced by ``_call``."""
        return ["answer"]

    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        """Answer ``inputs["question"]`` using retrieved documents.

        Args:
            inputs: Mapping with a ``"question"`` entry and, optionally, a
                ``"chat_history"`` list of ``(human, assistant)`` tuples.

        Returns:
            A dict with a single ``"answer"`` entry.
        """
        question = inputs["question"]
        # ``chat_history`` is not declared in ``input_keys``, so a caller may
        # legitimately omit it; treat a missing or empty history the same.
        chat_history_str = _get_chat_history(inputs.get("chat_history") or [])
        if chat_history_str:
            new_question = key_word_extractor.run(
                question=question, chat_history=chat_history_str
            )
        else:
            # Nothing to resolve against: use the question unchanged.
            new_question = question
        print(new_question)
        docs = self.vstore.similarity_search(new_question, k=4)
        new_inputs = inputs.copy()
        new_inputs["question"] = new_question
        new_inputs["chat_history"] = chat_history_str
        answer, _ = self.chain.combine_docs(docs, **new_inputs)
        return {"answer": answer}


def get_new_chain1(vectorstore) -> Chain:
    """Assemble a ``CustomChain`` that answers questions from ``vectorstore``.

    A "stuff" question-answering chain is loaded with a LangChain-specific
    answer prompt and a per-document prompt that renders each document's
    ``page_content`` and ``source``.

    Args:
        vectorstore: Store handed to ``CustomChain`` as ``vstore``.

    Returns:
        The configured ``CustomChain``.
    """
    per_document_prompt = PromptTemplate(
        input_variables=["page_content", "source"],
        template=">Example:\nContent:\n---------\n{page_content}\n----------\nSource: {source}",
    )
    answer_template = """You are an AI assistant for the open source library LangChain. The documentation is located at https://langchain.readthedocs.io.
You are given the following extracted parts of a long document and a question. Provide a conversational answer with a hyperlink to the documentation.
You should only use hyperlinks that are explicitly listed as a source in the context. Do NOT make up a hyperlink that is not listed.
If the question includes a request for code, provide a code block directly from the documentation.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
If the question is not about LangChain, politely inform them that you are tuned to only answer questions about LangChain.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
    answer_prompt = PromptTemplate(
        input_variables=["question", "context"],
        template=answer_template,
    )
    completion_llm = OpenAI(
        temperature=0, model_name="text-davinci-003", max_tokens=-1
    )
    stuff_chain = load_qa_chain(
        completion_llm,
        chain_type="stuff",
        prompt=answer_prompt,
        document_prompt=per_document_prompt,
    )
    return CustomChain(chain=stuff_chain, vstore=vectorstore)


def _get_chat_history(chat_history: List[Tuple[str, str]]):
buffer = ""
for human_s, ai_s in chat_history:
human = f"Human: " + human_s
ai = f"Assistant: " + ai_s
buffer += "\n" + "\n".join([human, ai])
return buffer
92 changes: 92 additions & 0 deletions ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Load html from files, clean up, split, ingest into Weaviate."""
import os
from pathlib import Path

import weaviate
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter


def clean_data(data):
    """Extract the text of a documentation page's main content area.

    Args:
        data: Raw HTML of one page.

    Returns:
        The text of the first ``<main id="main-content">`` element with
        empty lines dropped.

    Raises:
        IndexError: If the page has no ``<main id="main-content">`` element.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    soup = BeautifulSoup(data, "html.parser")
    text = soup.find_all("main", {"id": "main-content"})[0].get_text()
    return "\n".join(line for line in text.split("\n") if line)


# Collect the cleaned text of every file under the mirrored documentation
# tree, with a parallel list recording which file each text came from.
docs = []
metadatas = []
for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"):
    if p.is_dir():
        continue
    # Decode explicitly as UTF-8 rather than with the platform default
    # encoding, so ingestion behaves the same on every machine.
    with open(p, encoding="utf-8") as f:
        docs.append(clean_data(f.read()))
    metadatas.append({"source": p})


# Split on newlines with chunk_size=1000 and chunk_overlap=200, measuring
# length with len().
text_splitter = CharacterTextSplitter(
separator="\n",
chunk_size=1000,
chunk_overlap=200,
length_function=len,
)

# docs and metadatas are parallel lists, passed together.
documents = text_splitter.create_documents(docs, metadatas=metadatas)


# Connect to Weaviate; both environment variables must be set or the
# lookups below raise KeyError.
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
client = weaviate.Client(
url=WEAVIATE_URL,
additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)

# Destructive: the existing "Paragraph" class is deleted before the schema
# is recreated below.
client.schema.delete_class("Paragraph")
# NOTE(review): the return value of this call is discarded.
client.schema.get()
# One class, "Paragraph", configured with the text2vec-openai vectorizer.
schema = {
"classes": [
{
"class": "Paragraph",
"description": "A written paragraph",
"vectorizer": "text2vec-openai",
"moduleConfig": {
"text2vec-openai": {
"model": "ada",
"modelVersion": "002",
"type": "text",
}
},
"properties": [
# "content": the paragraph text; "skip" is False for the vectorizer.
{
"dataType": ["text"],
"description": "The content of the paragraph",
"moduleConfig": {
"text2vec-openai": {
"skip": False,
"vectorizePropertyName": False,
}
},
"name": "content",
},
# "source": where the paragraph came from; "skip" is True for the vectorizer.
{
"dataType": ["text"],
"description": "The link",
"moduleConfig": {
"text2vec-openai": {
"skip": True,
"vectorizePropertyName": False,
}
},
"name": "source",
},
],
},
]
}

client.schema.create(schema)

# Add every split document to the "Paragraph" class through the client's
# batch context manager.
with client.batch as uploader:
    for chunk in documents:
        paragraph = {
            "content": chunk.page_content,
            # The source was stored as a Path object; store it as a string.
            "source": str(chunk.metadata["source"]),
        }
        uploader.add_data_object(paragraph, "Paragraph")
6 changes: 6 additions & 0 deletions ingest.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Bash script to ingest data.
# This involves scraping the data from the web, then cleaning it up and
# putting it in Weaviate.
#
# Abort on the first failing command (-e) and on use of an unset variable
# (-u). The previous `!set -eu` was not the `set` builtin, so neither
# option was ever enabled.
# NOTE(review): with -e active, a non-zero exit from wget now stops the
# script before the ingest steps run.
set -eu
wget -r -A.html https://langchain.readthedocs.io/en/latest/
python3 ingest.py
python3 ingest_examples.py
Loading

0 comments on commit 638f0ee

Please sign in to comment.