Commit 22a7994

cleaner benchmark
Andrei Panferov committed Jan 30, 2024
1 parent 823db17 commit 22a7994
Showing 1 changed file with 20 additions and 71 deletions.
91 changes: 20 additions & 71 deletions notebooks/generate_benchmark.ipynb
@@ -10,37 +10,17 @@
"output_type": "stream",
"text": [
"env: CUDA_DEVICE_ORDER=PCI_BUS_ID\n",
"env: CUDA_VISIBLE_DEVICES=0\n",
"env: TORCH_USE_CUDA_DSA=True\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"I0130 00:36:03.164406 594052 utils.py:145] Note: detected 255 virtual cores but NumExpr set to maximum of 64, check \"NUMEXPR_MAX_THREADS\" environment variable.\n",
"I0130 00:36:03.166201 594052 utils.py:148] Note: NumExpr detected 255 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
"I0130 00:36:03.166588 594052 utils.py:160] NumExpr defaulting to 8 threads.\n",
"I0130 00:36:03.450076 594052 config.py:58] PyTorch version 2.1.2 available.\n",
"Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n",
"Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n",
"Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n"
"env: CUDA_VISIBLE_DEVICES=3\n"
]
}
],
"source": [
"%env CUDA_DEVICE_ORDER=PCI_BUS_ID\n",
"%env CUDA_VISIBLE_DEVICES=0\n",
"%env TORCH_USE_CUDA_DSA=True\n",
"%env CUDA_VISIBLE_DEVICES=3\n",
"\n",
"import torch\n",
"\n",
"from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM\n",
"\n",
"import sys\n",
"sys.path.append(\"/home/blacksamorez/quip-sharp\")\n",
"from lib.utils.unsafe_import import model_from_hf_path\n",
"from model.llama import LlamaForCausalLM as QuipSharpLlamaForCausalLM"
"from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM"
]
},
{
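The trimmed-down setup cell now only pins the GPU (CUDA_DEVICE_ORDER plus CUDA_VISIBLE_DEVICES=3) and imports torch and transformers. For running the same benchmark outside Jupyter, a plain-Python sketch of that cell (the %env magics map to os.environ and should be set before torch first touches CUDA; the device index is just the notebook's machine-specific choice):

    import os

    # Mirror the notebook's %env lines; must happen before CUDA is initialized.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"

    import torch
    from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM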
@@ -52,17 +32,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
"W0130 00:36:54.963166 594052 warnings.py:109] /home/blacksamorez/quip-sharp/.conda/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
" return self.fget.__get__(instance, owner)()\n",
"\n",
"The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation=\"flash_attention_2\"` instead.\n",
"I0130 00:37:13.597410 594052 modeling.py:920] We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).\n"
"/home/blacksamorez/quip-sharp/.conda/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
" return self.fget.__get__(instance, owner)()\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "31a2f10a11f34892a7fdad042390b9a4",
"model_id": "3c602aaf8e6c486d8ecbd77aa22892cd",
"version_major": 2,
"version_minor": 0
},
@@ -75,12 +52,7 @@
}
],
"source": [
"aqlm_model = AutoModelForCausalLM.from_pretrained(\"BlackSamorez/Llama-2-7b-AQLM-6288ppl-hf\", trust_remote_code=True, low_cpu_mem_usage=True, torch_dtype=\"auto\").cuda()\n",
"quip_sharp_model = model_from_hf_path(\n",
" \"relaxml/Llama-2-7b-E8P-2Bit\",\n",
" use_flash_attn=True,\n",
" use_cuda_graph=False,\n",
")[0].cuda()\n",
"aqlm_model = AutoModelForCausalLM.from_pretrained(\"BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf\", trust_remote_code=True, torch_dtype=\"auto\").cuda()\n",
"fp16_model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Llama-2-7b-hf\", torch_dtype=\"auto\").cuda()\n",
"tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Llama-2-7b-hf\")"
]
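The cell above now loads the 2-bit AQLM checkpoint (BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf, via trust_remote_code) next to the FP16 Llama-2-7b baseline. A quick footprint check makes the size difference concrete; this is a sketch, not part of the notebook, and the helper name is made up:

    import torch

    def cuda_weight_gib(model: torch.nn.Module) -> float:
        # Sum the bytes of all CUDA-resident parameters and buffers, in GiB.
        tensors = list(model.parameters()) + list(model.buffers())
        return sum(t.numel() * t.element_size() for t in tensors if t.is_cuda) / 2**30

    print(f"AQLM weights on GPU: {cuda_weight_gib(aqlm_model):.2f} GiB")
    print(f"FP16 weights on GPU: {cuda_weight_gib(fp16_model):.2f} GiB")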
@@ -95,25 +67,21 @@
"output_type": "stream",
"text": [
"AQLM:\n",
" <s> Hi, I'm a newbie here. I'm looking for a good place to learn about the basics of programming.\n",
"I'm \n",
"\n",
"QUIP#:\n",
" <s> Hi! I'm looking for a new and exciting adventure in my life. hopefully in a few days I will have the chance to join you \n",
" <s> 1999-2000, 2001-02, 2002-03, \n",
"\n",
"FP16:\n",
" <s> Hi everyone, I'm new to the forum.\n",
"I've been playing with a few different synths lately and I've been having \n",
" <s> \\title{A new method to derive the effective mass of the $^3$H resonance}\n",
"\n",
"\\begin{abstract}\n",
"The \n",
"\n"
]
}
],
"source": [
"output = aqlm_model.generate(tokenizer(\"Hi\", return_tensors=\"pt\")[\"input_ids\"].cuda(), max_new_tokens=30)\n",
"output = aqlm_model.generate(tokenizer(\"\", return_tensors=\"pt\")[\"input_ids\"].cuda(), max_new_tokens=30)\n",
"print(\"AQLM:\\n\", tokenizer.decode(output[0]), \"\\n\")\n",
"output = quip_sharp_model.generate(tokenizer(\"Hi\", return_tensors=\"pt\")[\"input_ids\"].cuda(), max_new_tokens=30)\n",
"print(\"QUIP#:\\n\", tokenizer.decode(output[0]), \"\\n\")\n",
"output = fp16_model.generate(tokenizer(\"Hi\", return_tensors=\"pt\")[\"input_ids\"].cuda(), max_new_tokens=30)\n",
"output = fp16_model.generate(tokenizer(\"\", return_tensors=\"pt\")[\"input_ids\"].cuda(), max_new_tokens=30)\n",
"print(\"FP16:\\n\", tokenizer.decode(output[0]), \"\\n\")"
]
},
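With the default Llama-2 tokenizer settings, the empty prompt encodes to just the BOS token, so the generations above are unconditional continuations: a sanity check that the quantized weights still decode to plausible text, not a quality comparison. A small sketch of that fact (same tokenizer as above):

    prompt = tokenizer("", return_tensors="pt")["input_ids"]
    print(prompt)        # expected: tensor([[1]]), a single <s> (BOS) token
    print(prompt.shape)  # expected: torch.Size([1, 1])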
@@ -126,14 +94,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8.64 s, sys: 22.9 ms, total: 8.66 s\n",
"Wall time: 8.65 s\n"
"CPU times: user 5.43 s, sys: 21.9 ms, total: 5.45 s\n",
"Wall time: 5.52 s\n"
]
}
],
"source": [
"%%time\n",
"output = aqlm_model.generate(tokenizer(\"Hi\", return_tensors=\"pt\")[\"input_ids\"].cuda(), min_new_tokens=200, max_new_tokens=200)"
"output = aqlm_model.generate(tokenizer(\"\", return_tensors=\"pt\")[\"input_ids\"].cuda(), min_new_tokens=128, max_new_tokens=128)"
]
},
{
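The %%time cell above reports one end-to-end run of 128 forced tokens, which is enough for a coarse AQLM-vs-FP16 comparison. A slightly more careful variant (a sketch, not part of the commit) would synchronize the GPU around the measurement and average over a few runs:

    import time
    import torch

    def bench_generate(model, input_ids, new_tokens=128, runs=3):
        # Average wall time of `runs` generations of exactly `new_tokens` tokens.
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(runs):
            model.generate(input_ids, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
        torch.cuda.synchronize()
        return (time.perf_counter() - start) / runs

    ids = tokenizer("", return_tensors="pt")["input_ids"].cuda()
    print(f"AQLM: {bench_generate(aqlm_model, ids):.2f} s per 128 tokens")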
@@ -145,33 +113,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 15.2 s, sys: 29.2 s, total: 44.4 s\n",
"Wall time: 45.9 s\n"
]
}
],
"source": [
"%%time\n",
"output = quip_sharp_model.generate(tokenizer(\"Hi\", return_tensors=\"pt\")[\"input_ids\"].cuda(), min_new_tokens=200, max_new_tokens=200)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.94 s, sys: 5.65 ms, total: 5.95 s\n",
"Wall time: 5.94 s\n"
"CPU times: user 3.68 s, sys: 2.83 ms, total: 3.69 s\n",
"Wall time: 3.68 s\n"
]
}
],
"source": [
"%%time\n",
"output = fp16_model.generate(tokenizer(\"Hi\", return_tensors=\"pt\")[\"input_ids\"].cuda(), min_new_tokens=200, max_new_tokens=200)"
"output = fp16_model.generate(tokenizer(\"\", return_tensors=\"pt\")[\"input_ids\"].cuda(), min_new_tokens=128, max_new_tokens=128)"
]
}
],
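From the wall times above, the implied decoding throughput on this GPU is roughly 128 / 5.52 ≈ 23 tokens/s for the 2-bit AQLM model and 128 / 3.68 ≈ 35 tokens/s for FP16 (simple arithmetic; the notebook itself only prints the %%time output):

    new_tokens = 128
    for name, wall_s in [("AQLM", 5.52), ("FP16", 3.68)]:
        print(f"{name}: {new_tokens / wall_s:.1f} tokens/s")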