Add notebooks for HuggingFace demos: T5 and GPT-2
Signed-off-by: Rajeev Rao <[email protected]>
Commit c922539 (1 parent: 7f269a7)
Showing 5 changed files with 1,733 additions and 0 deletions.
@@ -0,0 +1,11 @@
# TensorRT Demo with HuggingFace Models

To run the demo Jupyter notebooks in this folder, follow the instructions in the [TRT setup guide](../../../README.md) to build and launch the Docker container. Then, use your browser to open the JupyterLab interface at <host_name>:8888/lab using the password provided in the terminal.


Notebook list:

- [gpt2.ipynb](gpt2.ipynb): Step-by-step walkthrough for building the GPT-2 TensorRT engine.
- [gpt2-playground.ipynb](gpt2-playground.ipynb): GUI for benchmarking GPT-2 TensorRT engines.
- [t5.ipynb](t5.ipynb): Step-by-step walkthrough for building the T5 TensorRT engine.
- [t5-playground.ipynb](t5-playground.ipynb): GUI for benchmarking T5 TensorRT engines.
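All four notebooks follow the same pattern: build (or load) a TensorRT engine, wrap it in a decoder class that exposes the HuggingFace `generate()` interface, and compare it against the PyTorch baseline. The sketch below condenses that pattern from the gpt2-playground notebook further down in this commit; it assumes the engine has already been built by gpt2.ipynb and that the demo's own `GPT2`/`NNDF` helper packages are importable.

```python
# Condensed sketch of the playground pattern: load the engine written by gpt2.ipynb
# and call generate() on the TRT decoder exactly as on the PyTorch model.
import os
import sys
sys.path.append(os.path.abspath("../"))  # demo root, so the GPT2/ and NNDF/ packages are importable

from transformers import GPT2Tokenizer, GPT2Config

from GPT2.trt import GPT2TRTDecoder, GPT2TRTEngine
from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig
from NNDF.networks import NetworkMetadata, Precision

GPT2_VARIANT = 'gpt2'
ENGINE_PATH = './models/gpt2/tensorrt/gpt2.onnx.engine'  # produced by gpt2.ipynb

tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)
config = GPT2Config.from_pretrained(GPT2_VARIANT)
metadata = NetworkMetadata(GPT2_VARIANT, Precision('fp16'), None)

# The TRT decoder exposes the same generate() interface as the PyTorch model.
gpt2_engine = GPT2TRTEngine(ENGINE_PATH, metadata)
gpt2_trt = GPT2TRTDecoder(gpt2_engine, metadata, config)

inputs = tokenizer("TensorRT is", return_tensors="pt")
output_ids = gpt2_trt.generate(
    inputs.input_ids.to('cuda'),
    max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT],
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```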
@@ -0,0 +1,338 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "64974d33-d028-440c-86fa-1a0633b3d31d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n", | ||
"#\n", | ||
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n", | ||
"# you may not use this file except in compliance with the License.\n", | ||
"# You may obtain a copy of the License at\n", | ||
"#\n", | ||
"# http://www.apache.org/licenses/LICENSE-2.0\n", | ||
"#\n", | ||
"# Unless required by applicable law or agreed to in writing, software\n", | ||
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n", | ||
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", | ||
"# See the License for the specific language governing permissions and\n", | ||
"# limitations under the License.\n", | ||
"# ==============================================================================" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "c3f0ff46-9958-4d57-9067-a64be34e75da", | ||
"metadata": {}, | ||
"source": [ | ||
"<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n", | ||
"\n", | ||
"# GPT-2 Playground\n", | ||
"\n", | ||
"This notebook demonstrates the GPT-2 model for open-end text generation.\n", | ||
"\n", | ||
"The TensorRT HuggingFace GPT-2 model is a plug-in replacement for the original PyTorch HuggingFace GPT-2 model.\n", | ||
"\n", | ||
"\n", | ||
"**Notes**: \n", | ||
" - For \"CPU - PyTorch\" and \"GPU - PyTorch\", a GPT-2 small model from HuggingFace model repository is employed. Inference is carried out with PyTorch in FP32 precision. All models run with batch size 1.\n", | ||
"Average run time across 5 runs is reported.\n", | ||
" - Prior to running this notebook, run [gpt2.ipynb](gpt2.ipynb) to download the GPT-2 model and generate the TensorRT engine." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "3530e767-7050-4329-a4bc-e2221b9eb578", | ||
"metadata": { | ||
"jupyter": { | ||
"source_hidden": true | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"[2021-10-18 00:51:18,099][OSS][INFO] Reading and loading engine file ./models/gpt2/tensorrt/gpt2.onnx.engine using trt native runner.\n", | ||
"[2021-10-18 00:51:22,581][OSS][DEBUG] Number of profiles detected in engine: 2\n", | ||
"[2021-10-18 00:51:22,585][OSS][DEBUG] Selected profile: [(1, 1), (1, 32), (1, 64)]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import os\n", | ||
"import sys\n", | ||
"ROOT_DIR = os.path.abspath(\"../\")\n", | ||
"sys.path.append(ROOT_DIR)\n", | ||
"\n", | ||
"import warnings\n", | ||
"warnings.filterwarnings('ignore')\n", | ||
"\n", | ||
"import torch \n", | ||
"\n", | ||
"# huggingface\n", | ||
"from transformers import (\n", | ||
" GPT2LMHeadModel,\n", | ||
" GPT2Tokenizer,\n", | ||
" GPT2Config,\n", | ||
")\n", | ||
"\n", | ||
"from GPT2.trt import GPT2TRTDecoder, GPT2TRTEngine\n", | ||
"from NNDF.networks import NetworkMetadata, Precision\n", | ||
"from collections import namedtuple \n", | ||
"from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig\n", | ||
"\n", | ||
"# download HuggingFace model and tokernizer\n", | ||
"GPT2_VARIANT = 'gpt2' # choices: gpt2 | gpt2-large\n", | ||
"model = GPT2LMHeadModel.from_pretrained(GPT2_VARIANT)\n", | ||
"config = GPT2Config(GPT2_VARIANT)\n", | ||
"tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)\n", | ||
"\n", | ||
"# load TensorRT engine\n", | ||
"metadata=NetworkMetadata(GPT2_VARIANT, Precision('fp16'), None)\n", | ||
"from os.path import exists\n", | ||
"if not exists('./models/gpt2/tensorrt/gpt2.onnx.engine'):\n", | ||
" print(\"Error: TensorRT engine not found at ./models/gpt2/tensorrt/gpt2.onnx.engine. Please run gpt2.ipynb to generate the TensorRT engine first!\")\n", | ||
"else:\n", | ||
" gpt2_engine = GPT2TRTEngine('./models/gpt2/tensorrt/gpt2.onnx.engine', metadata)\n", | ||
" gpt2_trt = GPT2TRTDecoder(gpt2_engine, metadata, config)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "766b8c94-ba8e-47c8-8624-57da462a0496", | ||
"metadata": { | ||
"jupyter": { | ||
"source_hidden": true | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "f4971e06250e4ff78206e05b67a0f1d6", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"Textarea(value='TensorRT is a high performance deep learning inference platform that delivers low latency and …" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "bd35da4869764703b4ae4495af4fdbc2", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"Textarea(value='...', description='GPT-2:', layout=Layout(width='auto'), placeholder='GPT-2 generated text', r…" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "6bd69c0458a8426f904a6672a1d885c0", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"RadioButtons(description='Device:', options=('CPU - PyTorch', 'GPU - PyTorch', 'GPU - TensorRT'), value='CPU -…" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "01dabe5d2c4340d3b163d6b680776755", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"HBox(children=(Button(description='Generate', style=ButtonStyle()),), layout=Layout(align_items='center', disp…" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "865124339ace4e0986b07f516bdac937", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"IntProgress(value=0, description='Progress:', layout=Layout(height='50px', width='100%'), max=6, style=Progres…" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "975e84c56f1f43578abaf8260a38aee5", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"Output()" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
} | ||
], | ||
"source": [ | ||
"import ipywidgets as widgets\n", | ||
"import numpy as np\n", | ||
"import time\n", | ||
"\n", | ||
"device = widgets.RadioButtons(\n", | ||
" options=['CPU - PyTorch', \n", | ||
" 'GPU - PyTorch', \n", | ||
" 'GPU - TensorRT'],\n", | ||
" description='Device:',\n", | ||
" disabled=False\n", | ||
")\n", | ||
"\n", | ||
"paragraph_text = widgets.Textarea(\n", | ||
" value='TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps '\\\n", | ||
"'such as recommenders, speech and image/video on NVIDIA GPUs. ',\n", | ||
" placeholder='Type something',\n", | ||
" description='Context:',\n", | ||
" disabled=False,\n", | ||
" layout=widgets.Layout(width=\"auto\"),\n", | ||
" rows=5, \n", | ||
")\n", | ||
"\n", | ||
"generated_text = widgets.Textarea(\n", | ||
" value='...',\n", | ||
" placeholder='GPT-2 generated text',\n", | ||
" description='GPT-2:',\n", | ||
" disabled=False,\n", | ||
" layout=widgets.Layout(width=\"auto\"),\n", | ||
" rows=5,\n", | ||
")\n", | ||
"button = widgets.Button(description=\"Generate\")\n", | ||
"\n", | ||
"display(paragraph_text)\n", | ||
"display(generated_text)\n", | ||
"display(device)\n", | ||
"\n", | ||
"from IPython.display import display\n", | ||
"box_layout = widgets.Layout(display='flex',\n", | ||
" flex_flow='column',\n", | ||
" align_items='center',\n", | ||
" width='100%')\n", | ||
"N_RUN = 6\n", | ||
"progress_bar = widgets.IntProgress(\n", | ||
" value=0,\n", | ||
" min=0,\n", | ||
" max=N_RUN,\n", | ||
" description='Progress:',\n", | ||
" bar_style='', # 'success', 'info', 'warning', 'danger' or ''\n", | ||
" style={'bar_color': 'green'},\n", | ||
" orientation='horizontal', \n", | ||
" layout=widgets.Layout(width='100%', height='50px')\n", | ||
")\n", | ||
"\n", | ||
"box = widgets.HBox(children=[button],layout=box_layout)\n", | ||
"output = widgets.Output()\n", | ||
"display(box)\n", | ||
"display(progress_bar)\n", | ||
"display(output)\n", | ||
"\n", | ||
"def generate(b):\n", | ||
" progress_bar.value = 0\n", | ||
" inference_time_arr = []\n", | ||
" with output:\n", | ||
" if device.value == 'GPU - TensorRT':\n", | ||
" inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n", | ||
" for _ in range(N_RUN):\n", | ||
" start_time = time.time()\n", | ||
" sample_output = gpt2_trt.generate(inputs.input_ids.to('cuda'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n", | ||
" inference_time_arr.append(time.time()-start_time)\n", | ||
" progress_bar.value += 1\n", | ||
"\n", | ||
" # de-tokenize model output to raw text\n", | ||
" text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n", | ||
" generated_text.value = text\n", | ||
" print(\"GPU - TensorRT - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:]))) \n", | ||
" \n", | ||
" elif device.value == 'CPU - PyTorch':\n", | ||
" inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n", | ||
" for _ in range(N_RUN):\n", | ||
" start_time = time.time()\n", | ||
" sample_output = model.to('cpu').generate(inputs.input_ids.to('cpu'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n", | ||
" inference_time_arr.append(time.time()-start_time)\n", | ||
" progress_bar.value += 1\n", | ||
"\n", | ||
" # de-tokenize model output to raw text\n", | ||
" text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n", | ||
" generated_text.value = text\n", | ||
" print(\"CPU - PyTorch - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:])))\n", | ||
" \n", | ||
" elif device.value == 'GPU - PyTorch': \n", | ||
" inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n", | ||
" for _ in range(N_RUN):\n", | ||
" start_time = time.time()\n", | ||
" sample_output = model.to('cuda').generate(inputs.input_ids.to('cuda'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n", | ||
" inference_time_arr.append(time.time()-start_time)\n", | ||
" progress_bar.value += 1\n", | ||
"\n", | ||
" # de-tokenize model output to raw text\n", | ||
" text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n", | ||
" generated_text.value = text\n", | ||
" print(\"GPU - PyTorch - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:]))) \n", | ||
" \n", | ||
"button.on_click(generate)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "58f473c0-6682-41af-8040-72f0a9472b0f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
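For reference, the playground's "Generate" button measures latency by timing `generate()` over `N_RUN = 6` runs and averaging all but the first (warm-up) run, which is why the notes above speak of 5 runs. Below is a minimal, standalone sketch of that timing loop, using the plain PyTorch GPT-2 model (the TensorRT path simply swaps in `gpt2_trt`) and an illustrative `max_length` of 64, matching the engine profile logged above.

```python
# Minimal sketch of the playground's timing pattern: run generate() N_RUN times,
# discard the first (warm-up) run, and report the mean of the remaining runs.
import time
import numpy as np
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

N_RUN = 6
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

inputs = tokenizer(
    "TensorRT is a high performance deep learning inference platform",
    return_tensors="pt",
)

times = []
for _ in range(N_RUN):
    start = time.time()
    output_ids = model.generate(inputs.input_ids.to(device), max_length=64)
    times.append(time.time() - start)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
print("Average inference time: %.2f (ms)" % (1000 * np.mean(times[1:])))
```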