Merge pull request PromtEngineer#131 from LeafmanZ/initial_AutoGPT01
Adding Support for Quantized Models.

- The default model is changed to TheBloke/WizardLM-7B-uncensored-GPTQ.
- Reduces the VRAM requirement to around 8GB when the quantized model is used.
- Issues addressed: PromtEngineer#129 PromtEngineer#92 PromtEngineer#51 PromtEngineer#21 PromtEngineer#30 PromtEngineer#45 PromtEngineer#73
PromtEngineer authored Jun 11, 2023
2 parents 72919e4 + d9ec5eb commit b4f7f7c
Showing 2 changed files with 97 additions and 16 deletions.
32 changes: 32 additions & 0 deletions README.md
@@ -17,6 +17,14 @@ In order to set your environment up to run the code here, first install all requ
```shell
pip install -r requirements.txt
```
Then, if you want to run quantized models on GPU, install AutoGPTQ:
```shell
git clone https://github.com/PanQiWei/AutoGPTQ.git
cd AutoGPTQ
git checkout v0.2.2
pip install .
```
For more details, see the [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) repository.
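
To verify the install, a quick check like the one below can help. This is a minimal sketch, not part of the repo: it only confirms that `auto_gptq` imports and that a CUDA device is visible.

```python
# Minimal sanity check: confirm AutoGPTQ is importable and CUDA is visible.
import torch
from auto_gptq import AutoGPTQForCausalLM  # raises ImportError if the install failed

print("auto_gptq import OK")
print("CUDA available:", torch.cuda.is_available())
```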

## Test dataset
This repo uses the [Constitution of the USA](https://constitutioncenter.org/media/files/constitution.pdf) as an example.
@@ -94,6 +102,30 @@ Selecting the right local models and the power of `LangChain` you can run the en
- `run_localGPT.py` uses a local LLM (Vicuna-7B in this case) to understand questions and create answers. The context for the answers is extracted from the local vector store using a similarity search to locate the right pieces of context from the docs (a minimal sketch of this retrieval step follows below).
- You can replace this local LLM with any other LLM from HuggingFace. Make sure the LLM you select is in the HF format.
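
The snippet below is a minimal, illustrative sketch of that retrieval step, not a separate script in the repo. It assumes the persisted Chroma index created by `ingest.py`, the `langchain` API used elsewhere in this codebase, and an instructor embedding model (the exact model name and the query string are placeholders).

```python
# Minimal sketch: reopen the persisted vector store and run a similarity search.
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma

from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY

# Embedding model is assumed here; the repo may use a different instructor variant.
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")

# Reopen the index that ingest.py persisted to disk.
db = Chroma(
    persist_directory=PERSIST_DIRECTORY,
    embedding_function=embeddings,
    client_settings=CHROMA_SETTINGS,
)

# Retrieve the chunks most similar to the question; the QA chain uses the same store via a retriever.
query = "What is the first amendment?"  # placeholder question
docs = db.similarity_search(query, k=4)
retriever = db.as_retriever()
```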

# How to select different LLM models?
The following steps describe how to select a different LLM model to generate your responses:
1. Open up `run_localGPT.py`
2. Go to `def main(device_type, show_sources)`
3. Go to the comment where it says `# load the LLM for generating Natural Language responses`
4. Below it you will find a number of example models from HuggingFace that have already been tested: original trained models (names ending in HF, or with a .bin file in their "Files and versions") and quantized models (names ending in GPTQ, or with a .no-act-order or .safetensors file in their "Files and versions").
5. For models that end with HF or have a .bin file in their "Files and versions" on their HuggingFace page:
* Make sure you have a `model_id` selected. For example: `model_id = "TheBloke/guanaco-7B-HF"`
* If you go to its HuggingFace [page](https://huggingface.co/TheBloke/guanaco-7B-HF) and open "Files and versions", you will notice model files that end with a .bin extension.
* Any model with .bin files is loaded with the following code, placed where the `# load the LLM for generating Natural Language responses` comment is found:
* `model_id = "TheBloke/guanaco-7B-HF"`

`llm = load_model(device_type, model_id=model_id)`
6. For models that contain GPTQ in their name and/or have a .no-act-order or .safetensors file in their "Files and versions" on their HuggingFace page:
* Make sure you have a `model_id` selected. For example: `model_id = "TheBloke/wizardLM-7B-GPTQ"`
* You will also need to select its model basename file. For example: `model_basename = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"`
* If you go to its HuggingFace [page](https://huggingface.co/TheBloke/wizardLM-7B-GPTQ) and open "Files and versions", you will notice a model file that ends with a .safetensors extension.
* Any model with .no-act-order or .safetensors files is loaded with the following code, placed where the `# load the LLM for generating Natural Language responses` comment is found:
* `model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"`

`model_basename = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"`

`llm = load_model(device_type, model_id=model_id, model_basename = model_basename)`
7. Comment out all other instances of `model_id="other model names"`, `model_basename="other model basenames"`, and `llm = load_model(args*)`. A combined sketch of both call patterns is shown below.
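
For reference, the sketch below shows the two call patterns from steps 5 and 6 side by side, using the example model IDs above. It mirrors the code around the `# load the LLM for generating Natural Language responses` comment in `run_localGPT.py`; keep exactly one set of settings uncommented.

```python
# Sketch of the two load_model() call patterns in run_localGPT.py.

# Option A: full HF model (repo contains .bin files)
# model_id = "TheBloke/guanaco-7B-HF"
# llm = load_model(device_type, model_id=model_id)

# Option B: quantized GPTQ model (repo contains .no-act-order / .safetensors files)
model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_basename = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
llm = load_model(device_type, model_id=model_id, model_basename=model_basename)
```
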
# System Requirements

## Python Version
81 changes: 65 additions & 16 deletions run_localGPT.py
@@ -9,26 +9,57 @@
from langchain.vectorstores import Chroma
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY, ROOT_DIRECTORY
from transformers import GenerationConfig


def load_model(device_type):
def load_model(device_type, model_id, model_basename=None):
"""
Select a model on huggingface.
Select a model for text generation using the HuggingFace library.
If you are running this for the first time, it will download a model for you.
subsequent runs will use the model from the disk.
Args:
device_type (str): Type of device to use, e.g., "cuda" for GPU or "cpu" for CPU.
model_id (str): Identifier of the model to load from HuggingFace's model hub.
model_basename (str, optional): Basename of the model if using quantized models.
Defaults to None.
Returns:
HuggingFacePipeline: A pipeline object for text generation using the loaded model.
Raises:
ValueError: If an unsupported model or device type is provided.
"""
# The code supports all huggingface models that ends with -HF or which have a .bin file in their HF repo.
model_id = "TheBloke/vicuna-7B-1.1-HF"
# model_id = "TheBloke/guanaco-7B-HF"
# model_id = 'NousResearch/Nous-Hermes-13b'
logging.info(f'Loading Model: {model_id}, on : {device_type}')
logging.info(f'This action can take a few minutes!')

logging.info(f'Loading Model: {model_id}, on: {device_type}')
logging.info('This action can take a few minutes!')

if model_basename is not None:
# The code supports all huggingface models that ends with GPTQ and have some variation of .no-act.order or .safetensors in their HF repo.
print('Using AutoGPTQForCausalLM for quantized models')

if '.safetensors' in model_basename:
# Remove the ".safetensors" ending if present
model_basename = model_basename.replace('.safetensors', "")

if device_type.lower() == 'cuda':
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
logging.info('Tokenizer loaded')

model = AutoGPTQForCausalLM.from_quantized(
model_id,
model_basename=model_basename,
use_safetensors=True,
trust_remote_code=True,
device="cuda:0",
use_triton=False,
quantize_config=None
)
elif device_type.lower() == 'cuda': # The code supports all huggingface models that ends with -HF or which have a .bin file in their HF repo.
print('Using AutoModelForCausalLM for full models')
tokenizer = AutoTokenizer.from_pretrained(model_id)
logging.info(f'Tokenizer loaded')
logging.info('Tokenizer loaded')

model = AutoModelForCausalLM.from_pretrained(
model_id,
@@ -39,14 +70,15 @@ def load_model(device_type):
)
model.tie_weights()
else:
print('Using LlamaTokenizer')
tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(model_id, )
model = LlamaForCausalLM.from_pretrained(model_id)

# load configuration from the model to avoid warnings.
# Load configuration from the model to avoid warnings
generation_config = GenerationConfig.from_pretrained(model_id)
# see here for details: https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.from_pretrained.returns

# create pipeline for text generation
# Create a pipeline for text generation
pipe = pipeline(
"text-generation",
model=model,
Expand All @@ -59,7 +91,7 @@ def load_model(device_type):
)

local_llm = HuggingFacePipeline(pipeline=pipe)
logging.info(f'Local LLM Loaded')
logging.info('Local LLM Loaded')

return local_llm

@@ -118,8 +150,25 @@ def main(device_type, show_sources):
)
retriever = db.as_retriever()

# load the LLM for generating Natural Language responses.
llm = load_model(device_type)
# load the LLM for generating Natural Language responses

# for HF models
# model_id = "TheBloke/vicuna-7B-1.1-HF"
# model_id = "TheBloke/Wizard-Vicuna-7B-Uncensored-HF"
# model_id = "TheBloke/guanaco-7B-HF"
# model_id = 'NousResearch/Nous-Hermes-13b' # Requires ~ 23GB VRAM. Using STransformers alongside will 100% create OOM on 24GB cards.
# llm = load_model(device_type, model_id=model_id)

# for GPTQ (quantized) models
# model_id = "TheBloke/Nous-Hermes-13B-GPTQ"
# model_basename = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"
# model_id = "TheBloke/WizardLM-30B-Uncensored-GPTQ"
# model_basename = "WizardLM-30B-Uncensored-GPTQ-4bit.act-order.safetensors" # Requires ~21GB VRAM. Using STransformers alongside can potentially create OOM on 24GB cards.
# model_id = "TheBloke/wizardLM-7B-GPTQ"
# model_basename = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"
model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_basename = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
llm = load_model(device_type, model_id=model_id, model_basename = model_basename)

qa = RetrievalQA.from_chain_type(
llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
