
Commit

Merge branch 'main' into patch-1
PromtEngineer authored Jul 26, 2023
2 parents c0c59c3 + 99b105b commit dfa6f65
Showing 2 changed files with 66 additions and 24 deletions.
64 changes: 52 additions & 12 deletions README.md
@@ -30,6 +30,25 @@ In order to set your environment up to run the code here, first install all requirements
```shell
pip install -r requirements.txt
```


If you want to use BLAS or Metal with [llama-cpp](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal), you can set the appropriate flags:

```shell
# Example: cuBLAS
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install -r requirements.txt
```
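
For Metal on Apple Silicon, the llama-cpp-python installation docs describe an analogous build flag; a minimal sketch, assuming the same requirements-based install flow as above:

```shell
# Example: Metal (Apple Silicon)
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -r requirements.txt
```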

Then install AutoGPTQ if you want to run quantized models on the GPU:

```shell
git clone https://github.com/PanQiWei/AutoGPTQ.git
cd AutoGPTQ
git checkout v0.2.2
pip install .
```
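
As an optional sanity check (not part of the original steps), you can confirm that the package imports cleanly from the environment you installed it into:

```shell
python -c "import auto_gptq; print('AutoGPTQ import OK')"
```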

For more support, refer to [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ).

## Test dataset

This repo uses the [Constitution of the USA](https://constitutioncenter.org/media/files/constitution.pdf) as an example.
@@ -249,9 +268,30 @@ Follow this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-

This is a test project to validate the feasibility of a fully local solution for question answering using LLMs and vector embeddings. It is not production ready, and it is not meant to be used in production. Vicuna-7B is based on the Llama model, so it carries the original Llama license.

# Common Errors

- [Torch not compatible with cuda enabled](https://github.com/pytorch/pytorch/issues/30664)

  - Get the CUDA version

    ```shell
    nvcc --version
    ```

    ```shell
    nvidia-smi
    ```

  - Try installing PyTorch depending on your CUDA version

    ```shell
    conda install -c pytorch torchvision cudatoolkit=10.1 pytorch
    ```

  - If it doesn't work, try reinstalling

    ```shell
    pip uninstall torch
    pip cache purge
    pip install torch -f https://download.pytorch.org/whl/torch_stable.html
    ```
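
  One quick way to check which CUDA build PyTorch is using and whether it can see a GPU:

    ```shell
    python -c "import torch; print(torch.version.cuda, torch.cuda.is_available())"
    ```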

- [ERROR: pip's dependency resolver does not currently take into account all the packages that are installed](https://stackoverflow.com/questions/72672196/error-pips-dependency-resolver-does-not-currently-take-into-account-all-the-pa/76604141#76604141)
```shell
pip install h5py
pip install typing-extensions
pip install wheel
```
- [Failed to import transformers](https://github.com/huggingface/transformers/issues/11262)
  - Try re-installing

    ```shell
    conda uninstall tokenizers transformers
    pip install transformers
    ```
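
  To confirm the reinstall worked, one quick check is to print the installed version:

    ```shell
    python -c "import transformers; print(transformers.__version__)"
    ```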

26 changes: 14 additions & 12 deletions run_localGPT.py
@@ -44,19 +44,21 @@ def load_model(device_type, model_id, model_basename=None):
logging.info("This action can take a few minutes!")

if model_basename is not None:
    if ".ggml" in model_basename:
        logging.info("Using Llamacpp for GGML quantized models")
        model_path = hf_hub_download(repo_id=model_id, filename=model_basename)
        max_ctx_size = 2048
        kwargs = {
            "model_path": model_path,
            "n_ctx": max_ctx_size,
            "max_tokens": max_ctx_size,
        }
        if device_type.lower() == "mps":
            # offload layers to the Apple Metal GPU
            kwargs["n_gpu_layers"] = 1000
        if device_type.lower() == "cuda":
            # offload layers to the CUDA GPU and batch prompt processing up to the context size
            kwargs["n_gpu_layers"] = 1000
            kwargs["n_batch"] = max_ctx_size
        return LlamaCpp(**kwargs)

    else:
        # The code supports all huggingface models that end with GPTQ and have some variation
@@ -219,7 +221,7 @@ def main(device_type, show_sources):
# model_id = "TheBloke/orca_mini_3B-GGML"
# model_basename = "orca-mini-3b.ggmlv3.q4_0.bin"

model_id="TheBloke/Llama-2-7B-Chat-GGML"
model_id = "TheBloke/Llama-2-7B-Chat-GGML"
model_basename = "llama-2-7b-chat.ggmlv3.q4_0.bin"

llm = load_model(device_type, model_id=model_id, model_basename=model_basename)
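
For reference, with the GGML defaults above, the script is typically launched from the repository root roughly as follows. This is a sketch: it assumes the click options mirror the main(device_type, show_sources) parameters and that the documents have already been ingested into the local vector store.

```shell
# assumes the vector store was already built (e.g. via ingest.py)
python run_localGPT.py --device_type cuda
```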
