
Commit

Feature/bot: Add Gradio UI (iusztinpaul#28)
* feat: Add Gradio UI boilerplate

* feat: Finish Gradio UI

* docs: Update README

* docs: Update README

* fix: Linting issues
iusztinpaul authored Oct 19, 2023
1 parent a868775 commit 00d3498
Showing 15 changed files with 1,131 additions and 66 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -185,3 +185,4 @@ user_data.sh
set_env_variables.sh
logs/
.DS_Store
gradio_cached_examples
10 changes: 10 additions & 0 deletions modules/financial_bot/Makefile
@@ -38,6 +38,16 @@ run_dev:
--question "Should I consider investing in stocks from the Tech Sector?" \
--debug True

run_ui:
@echo "Running financial_bot UI..."

poetry run python -m tools.ui

run_ui_dev:
@echo "Running financial_bot UI [Dev Mode]..."

poetry run gradio tools/ui.py


# === Beam ===

31 changes: 23 additions & 8 deletions modules/financial_bot/README.md
@@ -17,8 +17,9 @@ Inference pipeline that uses [LangChain](https://github.com/langchain-ai/langcha
- [2.3. Beam](#23-beam)
- [3. Usage](#3-usage)
- [3.1. Local](#31-local)
- [3.2. Local](#32-deploy-to-beam)
- [3.3. Linting & Formatting](#34-linting--formatting)
- [3.2. Deploy to Beam as a RESTful API](#32-deploy-to-beam)
- [3.3. Gradio UI](#33-gradio-ui)
- [3.4. Linting & Formatting](#34-linting--formatting)

# 1. Motivation

@@ -35,7 +36,7 @@ Also, the final step is to put the financial assistant to good use and deploy it

# 2. Install

# 2.1. Dependencies
## 2.1. Dependencies

Main dependencies you have to install yourself:
* Python 3.10
@@ -58,14 +59,14 @@ cp .env.example .env
```
--> and complete the `.env` file with your credentials.

### 2.2. Qdrant
## 2.2. Qdrant

You must create a FREE account in Qdrant and generate the `QDRANT_API_KEY` and `QDRANT_URL` environment variables. Afterward, be sure to add them to your `.env` file.

-> [Check out this document to see how.](https://qdrant.tech/documentation/cloud/authentication/?utm_source=thepauls&utm_medium=partner&utm_content=github)
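
As a reference, here is a minimal sketch of what those two `.env` entries might look like (placeholder values only — use the cluster URL and API key from your own Qdrant Cloud dashboard):
```shell
QDRANT_API_KEY=<your-qdrant-api-key>
QDRANT_URL=https://<your-cluster-id>.<region>.cloud.qdrant.io:6333
```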


### 2.3. Beam
## 2.3. Beam
`optional step in case you want to use Beam`

Create and configure a free Beam account to deploy it as a serverless RESTful API and show it to your friends. You will pay only for what you use.
@@ -75,7 +76,7 @@ Create and configure a free Beam account to deploy it as a serverless RESTful AP

# 3. Usage

# 3.1. Local
## 3.1. Local

Run bot locally:
```shell
@@ -87,7 +88,7 @@ Run bot locally in dev mode:
make run_dev
```

# 3.2. Deploy to Beam
## 3.2. Deploy to Beam as a RESTful API

Deploy the bot under a RESTful API using Beam:
```shell
@@ -107,7 +108,21 @@ export BEAM_AUTH_TOKEN=<BEAM_AUTH_TOKEN>
make call_restful_api DEPLOYMENT_ID=${BEAM_DEPLOYMENT_ID} TOKEN=${BEAM_AUTH_TOKEN}
```

# 3.3. Linting & Formatting
## 3.3. Gradio UI

Start the Gradio UI:
```shell
make run_ui
```

Start the Gradio UI in dev mode:
```shell
make run_ui_dev
```

**NOTE:** Running the commands above will host the UI on your computer. To run them, **you need an Nvidia GPU with enough resources** (e.g., running inference with Falcon 7B requires ~8 GB of VRAM). If you don't have that available, you can deploy the UI to `Gradio Spaces` on HuggingFace; it is pretty straightforward to do so. [Here are some docs to get you started](https://huggingface.co/docs/hub/spaces-sdks-gradio).

## 3.4. Linting & Formatting

**Check** the code for **linting** issues:
```shell
7 changes: 0 additions & 7 deletions modules/financial_bot/financial_bot/chains.py
@@ -32,7 +32,6 @@ def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
_, quest_key = self.input_keys
question_str = inputs[quest_key]

# TODO: maybe async embed?
embeddings = self.embedding_model(question_str)

# TODO: Using the metadata filter the news from the latest week (or other timeline).
@@ -77,10 +76,4 @@ def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
)["prompt"]
response = self.hf_pipeline(prompt)

import logging

logger = logging.getLogger(__name__)
logger.info("HISTORY")
logger.info(inputs["chat_history"])

return {self.output_key: response}
7 changes: 5 additions & 2 deletions modules/financial_bot/financial_bot/constants.py
@@ -11,11 +11,14 @@
# == LLM Model ==
LLM_MODEL_ID = "tiiuae/falcon-7b-instruct"
LLM_QLORA_CHECKPOINT = "joywalker/financial-assistant-falcon-7b:1.0.0"
CACHE_DIR = Path.home() / ".cache" / "hands-on-llms"

LLM_INFERNECE_MAX_NEW_TOKENS = 250
LLM_INFERENCE_TEMPERATURE = 1.0


# == Prompt Template ==
TEMPLATE_NAME = "falcon"
SYSTEM_MESSAGE = "You are a financial expert. Based on the context I provide, respond in a helpful manner"

# === Misc ===
DEBUG = True
CACHE_DIR = Path.home() / ".cache" / "hands-on-llms"
33 changes: 29 additions & 4 deletions modules/financial_bot/financial_bot/langchain_bot.py
@@ -1,5 +1,6 @@
import logging
from pathlib import Path
from typing import Iterable

from langchain import chains
from langchain.memory import ConversationBufferMemory
@@ -19,17 +20,27 @@ def __init__(
self,
llm_model_id: str = constants.LLM_MODEL_ID,
llm_lora_model_id: str = constants.LLM_QLORA_CHECKPOINT,
llm_template_name: str = constants.TEMPLATE_NAME,
vector_collection_name: str = constants.VECTOR_DB_OUTPUT_COLLECTION_NAME,
vector_db_search_topk: int = constants.VECTOR_DB_SEARCH_TOPK,
model_cache_dir: Path = constants.CACHE_DIR,
embedding_model_device: str = "cuda:0",
debug: bool = False,
):
self._llm_template_name = llm_template_name
self._llm_template = get_llm_template(name=self._llm_template_name)

self._vector_collection_name = vector_collection_name
self._vector_db_search_topk = vector_db_search_topk
self._qdrant_client = build_qdrant_client()

self._embd_model = EmbeddingModelSingleton(
cache_dir=model_cache_dir, device=embedding_model_device
)
self._llm_agent = build_huggingface_pipeline(
self._llm_agent, self._streamer = build_huggingface_pipeline(
llm_model_id=llm_model_id,
llm_lora_model_id=llm_lora_model_id,
use_streamer=True,
cache_dir=model_cache_dir,
debug=debug,
)
@@ -69,14 +80,14 @@ def build_chain(self) -> chains.SequentialChain:
context_retrieval_chain = ContextExtractorChain(
embedding_model=self._embd_model,
vector_store=self._qdrant_client,
vector_collection=constants.VECTOR_DB_OUTPUT_COLLECTION_NAME,
top_k=constants.VECTOR_DB_SEARCH_TOPK,
vector_collection=self._vector_collection_name,
top_k=self._vector_db_search_topk,
)

logger.info("Building 2/3 - FinancialBotQAChain")
llm_generator_chain = FinancialBotQAChain(
hf_pipeline=self._llm_agent,
template=get_llm_template(name=constants.TEMPLATE_NAME),
template=self._llm_template,
)

logger.info("Building 3/3 - Connecting chains into SequentialChain")
@@ -125,3 +136,17 @@ def answer(self, about_me: str, question: str) -> str:
response = self.finbot_chain.run(inputs)

return response

def stream_answer(self) -> Iterable[str]:
"""Stream the answer from the LLM after each token is generated after calling `answer()`."""

assert (
self._streamer
), "Stream answer not available. Build the bot with `use_streamer=True`."

partial_answer = ""
for new_token in self._streamer:
if new_token != self._llm_template.eos:
partial_answer += new_token

yield partial_answer
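
The new `stream_answer()` generator only drains tokens that arrive in the `TextIteratorStreamer` while `answer()` is running, so the two calls must execute concurrently. Below is a minimal sketch of how a Gradio handler might wire them together; the class name `FinancialBot`, the import path, and the UI layout are assumptions for illustration, since the new `tools/ui.py` is not shown in this excerpt.

```python
# Hypothetical sketch — not part of this diff. Assumes the bot class in
# financial_bot/langchain_bot.py is called `FinancialBot` and is built with
# the streamer enabled, as in the __init__ shown above.
from threading import Thread

import gradio as gr

from financial_bot.langchain_bot import FinancialBot  # assumed import path

bot = FinancialBot()


def predict(about_me: str, question: str):
    # `answer()` blocks until generation finishes, so run it in a background
    # thread and consume the streamer from this generator function in parallel.
    thread = Thread(target=bot.answer, kwargs={"about_me": about_me, "question": question})
    thread.start()

    # Each iteration yields the partial answer accumulated so far.
    yield from bot.stream_answer()

    thread.join()


demo = gr.Interface(
    fn=predict,
    inputs=[gr.Textbox(label="About me"), gr.Textbox(label="Question")],
    outputs=gr.Textbox(label="Answer"),
)

if __name__ == "__main__":
    demo.queue().launch()
```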
25 changes: 22 additions & 3 deletions modules/financial_bot/financial_bot/models.py
@@ -11,6 +11,7 @@
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TextIteratorStreamer,
pipeline,
)

@@ -49,7 +50,10 @@ def download_from_model_registry(model_id: str, cache_dir: Optional[Path] = None
def build_huggingface_pipeline(
llm_model_id: str,
llm_lora_model_id: str,
max_new_tokens: int = constants.LLM_INFERNECE_MAX_NEW_TOKENS,
temperature: float = constants.LLM_INFERENCE_TEMPERATURE,
gradient_checkpointing: bool = False,
use_streamer: bool = False,
cache_dir: Optional[Path] = None,
debug: bool = False,
):
@@ -68,11 +72,26 @@
)
model.eval()

if use_streamer:
streamer = TextIteratorStreamer(
tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
)
else:
streamer = None

pipe = pipeline(
"text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100
"text-generation",
model=model,
tokenizer=tokenizer,
max_new_tokens=max_new_tokens,
temperature=temperature,
streamer=streamer,
)
hf = HuggingFacePipeline(pipeline=pipe)

if use_streamer:
return hf, streamer

return hf
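
Because the return type of `build_huggingface_pipeline` now depends on `use_streamer`, callers must unpack it accordingly. A short illustrative sketch (import paths assumed from the package layout):

```python
from financial_bot import constants
from financial_bot.models import build_huggingface_pipeline  # assumed import path

# Without a streamer: returns only the HuggingFacePipeline wrapper.
hf = build_huggingface_pipeline(
    llm_model_id=constants.LLM_MODEL_ID,
    llm_lora_model_id=constants.LLM_QLORA_CHECKPOINT,
)

# With a streamer: returns (HuggingFacePipeline, TextIteratorStreamer);
# iterate over the streamer to consume tokens as they are generated.
hf, streamer = build_huggingface_pipeline(
    llm_model_id=constants.LLM_MODEL_ID,
    llm_lora_model_id=constants.LLM_QLORA_CHECKPOINT,
    use_streamer=True,
)
```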


@@ -103,7 +122,7 @@ def build_qlora_model(
quantization_config=bnb_config,
load_in_4bit=True,
device_map="auto",
trust_remote_code=True,
trust_remote_code=False,
cache_dir=str(cache_dir) if cache_dir else None,
)

@@ -113,7 +132,7 @@

tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path,
trust_remote_code=True,
trust_remote_code=False,
truncation=True,
cache_dir=str(cache_dir) if cache_dir else None,
)

