Add notebooks for HuggingFace demos: T5 and GPT-2
Signed-off-by: Rajeev Rao <[email protected]>
vinhngx authored and rajeevsrao committed Oct 21, 2021
1 parent 7f269a7 commit c922539
Showing 5 changed files with 1,733 additions and 0 deletions.
11 changes: 11 additions & 0 deletions demo/HuggingFace/notebooks/README.md
@@ -0,0 +1,11 @@
# TensorRT Demo with HuggingFace Models

To run the demo Jupyter notebooks in this folder, follow the instructions in the [TRT setup guide](../../../README.md) to build and launch the Docker container. Then, open the JupyterLab interface in your browser at `<host_name>:8888/lab`, using the password provided in the terminal.


Notebook list:

- [gpt2.ipynb](gpt2.ipynb): Step-by-step walkthrough for building the GPT-2 TensorRT engine.
- [gpt2-playground.ipynb](gpt2-playground.ipynb): GUI for benchmarking GPT-2 TensorRT engines (a minimal usage sketch follows this list).
- [t5.ipynb](t5.ipynb): Step-by-step walkthrough for building the T5 TensorRT engine.
- [t5-playground.ipynb](t5-playground.ipynb): GUI for benchmarking T5 TensorRT engines.
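
Both playground notebooks follow the same pattern: the TensorRT engine built by the corresponding walkthrough notebook is wrapped in a decoder class that exposes the familiar HuggingFace `generate()` API. The snippet below is a minimal sketch of that pattern for GPT-2, assuming the engine has already been built by [gpt2.ipynb](gpt2.ipynb) and that the code runs from this `notebooks` folder; the `ENGINE_PATH` name is illustrative only.

```python
import os
import sys

# Make the repo's demo/HuggingFace modules importable (same approach the notebooks use).
sys.path.append(os.path.abspath("../"))

from transformers import GPT2Config, GPT2Tokenizer

from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig
from GPT2.trt import GPT2TRTDecoder, GPT2TRTEngine
from NNDF.networks import NetworkMetadata, Precision

GPT2_VARIANT = "gpt2"  # choices: gpt2 | gpt2-large
ENGINE_PATH = "./models/gpt2/tensorrt/gpt2.onnx.engine"  # produced by gpt2.ipynb

tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)
config = GPT2Config.from_pretrained(GPT2_VARIANT)
metadata = NetworkMetadata(GPT2_VARIANT, Precision("fp16"), None)

# Wrap the serialized engine so it can be driven like a HuggingFace model.
gpt2_trt = GPT2TRTDecoder(GPT2TRTEngine(ENGINE_PATH, metadata), metadata, config)

inputs = tokenizer("TensorRT is", return_tensors="pt")
output_ids = gpt2_trt.generate(
    inputs.input_ids.to("cuda"),
    max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT],
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

The playground notebooks wrap this flow in ipywidgets so the same context can be timed on CPU PyTorch, GPU PyTorch, and the TensorRT engine.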
338 changes: 338 additions & 0 deletions demo/HuggingFace/notebooks/gpt2-playground.ipynb
@@ -0,0 +1,338 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "64974d33-d028-440c-86fa-1a0633b3d31d",
"metadata": {},
"outputs": [],
"source": [
"# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# =============================================================================="
]
},
{
"cell_type": "markdown",
"id": "c3f0ff46-9958-4d57-9067-a64be34e75da",
"metadata": {},
"source": [
"<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
"\n",
"# GPT-2 Playground\n",
"\n",
"This notebook demonstrates the GPT-2 model for open-end text generation.\n",
"\n",
"The TensorRT HuggingFace GPT-2 model is a plug-in replacement for the original PyTorch HuggingFace GPT-2 model.\n",
"\n",
"\n",
"**Notes**: \n",
" - For \"CPU - PyTorch\" and \"GPU - PyTorch\", a GPT-2 small model from HuggingFace model repository is employed. Inference is carried out with PyTorch in FP32 precision. All models run with batch size 1.\n",
"Average run time across 5 runs is reported.\n",
" - Prior to running this notebook, run [gpt2.ipynb](gpt2.ipynb) to download the GPT-2 model and generate the TensorRT engine."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3530e767-7050-4329-a4bc-e2221b9eb578",
"metadata": {
"jupyter": {
"source_hidden": true
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2021-10-18 00:51:18,099][OSS][INFO] Reading and loading engine file ./models/gpt2/tensorrt/gpt2.onnx.engine using trt native runner.\n",
"[2021-10-18 00:51:22,581][OSS][DEBUG] Number of profiles detected in engine: 2\n",
"[2021-10-18 00:51:22,585][OSS][DEBUG] Selected profile: [(1, 1), (1, 32), (1, 64)]\n"
]
}
],
"source": [
"import os\n",
"import sys\n",
"ROOT_DIR = os.path.abspath(\"../\")\n",
"sys.path.append(ROOT_DIR)\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"import torch \n",
"\n",
"# huggingface\n",
"from transformers import (\n",
" GPT2LMHeadModel,\n",
" GPT2Tokenizer,\n",
" GPT2Config,\n",
")\n",
"\n",
"from GPT2.trt import GPT2TRTDecoder, GPT2TRTEngine\n",
"from NNDF.networks import NetworkMetadata, Precision\n",
"from collections import namedtuple \n",
"from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig\n",
"\n",
"# download HuggingFace model and tokernizer\n",
"GPT2_VARIANT = 'gpt2' # choices: gpt2 | gpt2-large\n",
"model = GPT2LMHeadModel.from_pretrained(GPT2_VARIANT)\n",
"config = GPT2Config(GPT2_VARIANT)\n",
"tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)\n",
"\n",
"# load TensorRT engine\n",
"metadata=NetworkMetadata(GPT2_VARIANT, Precision('fp16'), None)\n",
"from os.path import exists\n",
"if not exists('./models/gpt2/tensorrt/gpt2.onnx.engine'):\n",
" print(\"Error: TensorRT engine not found at ./models/gpt2/tensorrt/gpt2.onnx.engine. Please run gpt2.ipynb to generate the TensorRT engine first!\")\n",
"else:\n",
" gpt2_engine = GPT2TRTEngine('./models/gpt2/tensorrt/gpt2.onnx.engine', metadata)\n",
" gpt2_trt = GPT2TRTDecoder(gpt2_engine, metadata, config)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "766b8c94-ba8e-47c8-8624-57da462a0496",
"metadata": {
"jupyter": {
"source_hidden": true
},
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f4971e06250e4ff78206e05b67a0f1d6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Textarea(value='TensorRT is a high performance deep learning inference platform that delivers low latency and …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bd35da4869764703b4ae4495af4fdbc2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Textarea(value='...', description='GPT-2:', layout=Layout(width='auto'), placeholder='GPT-2 generated text', r…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6bd69c0458a8426f904a6672a1d885c0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"RadioButtons(description='Device:', options=('CPU - PyTorch', 'GPU - PyTorch', 'GPU - TensorRT'), value='CPU -…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "01dabe5d2c4340d3b163d6b680776755",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(Button(description='Generate', style=ButtonStyle()),), layout=Layout(align_items='center', disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "865124339ace4e0986b07f516bdac937",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Progress:', layout=Layout(height='50px', width='100%'), max=6, style=Progres…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "975e84c56f1f43578abaf8260a38aee5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Output()"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import ipywidgets as widgets\n",
"import numpy as np\n",
"import time\n",
"\n",
"device = widgets.RadioButtons(\n",
" options=['CPU - PyTorch', \n",
" 'GPU - PyTorch', \n",
" 'GPU - TensorRT'],\n",
" description='Device:',\n",
" disabled=False\n",
")\n",
"\n",
"paragraph_text = widgets.Textarea(\n",
" value='TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps '\\\n",
"'such as recommenders, speech and image/video on NVIDIA GPUs. ',\n",
" placeholder='Type something',\n",
" description='Context:',\n",
" disabled=False,\n",
" layout=widgets.Layout(width=\"auto\"),\n",
" rows=5, \n",
")\n",
"\n",
"generated_text = widgets.Textarea(\n",
" value='...',\n",
" placeholder='GPT-2 generated text',\n",
" description='GPT-2:',\n",
" disabled=False,\n",
" layout=widgets.Layout(width=\"auto\"),\n",
" rows=5,\n",
")\n",
"button = widgets.Button(description=\"Generate\")\n",
"\n",
"display(paragraph_text)\n",
"display(generated_text)\n",
"display(device)\n",
"\n",
"from IPython.display import display\n",
"box_layout = widgets.Layout(display='flex',\n",
" flex_flow='column',\n",
" align_items='center',\n",
" width='100%')\n",
"N_RUN = 6\n",
"progress_bar = widgets.IntProgress(\n",
" value=0,\n",
" min=0,\n",
" max=N_RUN,\n",
" description='Progress:',\n",
" bar_style='', # 'success', 'info', 'warning', 'danger' or ''\n",
" style={'bar_color': 'green'},\n",
" orientation='horizontal', \n",
" layout=widgets.Layout(width='100%', height='50px')\n",
")\n",
"\n",
"box = widgets.HBox(children=[button],layout=box_layout)\n",
"output = widgets.Output()\n",
"display(box)\n",
"display(progress_bar)\n",
"display(output)\n",
"\n",
"def generate(b):\n",
" progress_bar.value = 0\n",
" inference_time_arr = []\n",
" with output:\n",
" if device.value == 'GPU - TensorRT':\n",
" inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n",
" for _ in range(N_RUN):\n",
" start_time = time.time()\n",
" sample_output = gpt2_trt.generate(inputs.input_ids.to('cuda'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n",
" inference_time_arr.append(time.time()-start_time)\n",
" progress_bar.value += 1\n",
"\n",
" # de-tokenize model output to raw text\n",
" text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n",
" generated_text.value = text\n",
" print(\"GPU - TensorRT - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:]))) \n",
" \n",
" elif device.value == 'CPU - PyTorch':\n",
" inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n",
" for _ in range(N_RUN):\n",
" start_time = time.time()\n",
" sample_output = model.to('cpu').generate(inputs.input_ids.to('cpu'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n",
" inference_time_arr.append(time.time()-start_time)\n",
" progress_bar.value += 1\n",
"\n",
" # de-tokenize model output to raw text\n",
" text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n",
" generated_text.value = text\n",
" print(\"CPU - PyTorch - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:])))\n",
" \n",
" elif device.value == 'GPU - PyTorch': \n",
" inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n",
" for _ in range(N_RUN):\n",
" start_time = time.time()\n",
" sample_output = model.to('cuda').generate(inputs.input_ids.to('cuda'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n",
" inference_time_arr.append(time.time()-start_time)\n",
" progress_bar.value += 1\n",
"\n",
" # de-tokenize model output to raw text\n",
" text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n",
" generated_text.value = text\n",
" print(\"GPU - PyTorch - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:]))) \n",
" \n",
"button.on_click(generate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58f473c0-6682-41af-8040-72f0a9472b0f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}