Add notebooks for HuggingFace demos: T5 and GPT-2
Signed-off-by: Rajeev Rao <[email protected]>
Commit c922539 (1 parent: 7f269a7)
Showing 5 changed files with 1,733 additions and 0 deletions.
@@ -0,0 +1,11 @@
# TensorRT Demo with HuggingFace Models

To run the demo Jupyter notebooks in this folder, follow the instructions in the [TRT setup guide](../../../README.md) to build and launch the Docker container. Then, use your browser to open the JupyterLab interface at <host_name>:8888/lab using the password provided in the terminal.


Notebook list:

- [gpt2.ipynb](gpt2.ipynb): Step-by-step walkthrough for building the GPT-2 TensorRT engine.
- [gpt2-playground.ipynb](gpt2-playground.ipynb): GUI for benchmarking GPT-2 TensorRT engines.
- [t5.ipynb](t5.ipynb): Step-by-step walkthrough for building the T5 TensorRT engine.
- [t5-playground.ipynb](t5-playground.ipynb): GUI for benchmarking T5 TensorRT engines.
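All four notebooks follow the same pattern: build (or load) a TensorRT engine, wrap it in a decoder class that exposes the HuggingFace `generate()` interface, and compare it against the PyTorch baseline. The sketch below condenses that pattern from the gpt2-playground notebook further down in this commit; it assumes the engine has already been built by gpt2.ipynb and that the demo's own `GPT2`/`NNDF` helper packages are importable.

```python
# Condensed sketch of the playground pattern: load the engine written by gpt2.ipynb
# and call generate() on the TRT decoder exactly as on the PyTorch model.
import os
import sys
sys.path.append(os.path.abspath("../"))  # demo root, so the GPT2/ and NNDF/ packages are importable

from transformers import GPT2Tokenizer, GPT2Config

from GPT2.trt import GPT2TRTDecoder, GPT2TRTEngine
from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig
from NNDF.networks import NetworkMetadata, Precision

GPT2_VARIANT = 'gpt2'
ENGINE_PATH = './models/gpt2/tensorrt/gpt2.onnx.engine'  # produced by gpt2.ipynb

tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)
config = GPT2Config.from_pretrained(GPT2_VARIANT)
metadata = NetworkMetadata(GPT2_VARIANT, Precision('fp16'), None)

# The TRT decoder exposes the same generate() interface as the PyTorch model.
gpt2_engine = GPT2TRTEngine(ENGINE_PATH, metadata)
gpt2_trt = GPT2TRTDecoder(gpt2_engine, metadata, config)

inputs = tokenizer("TensorRT is", return_tensors="pt")
output_ids = gpt2_trt.generate(
    inputs.input_ids.to('cuda'),
    max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT],
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```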
@@ -0,0 +1,338 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "64974d33-d028-440c-86fa-1a0633b3d31d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n", | ||
"#\n", | ||
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n", | ||
"# you may not use this file except in compliance with the License.\n", | ||
"# You may obtain a copy of the License at\n", | ||
"#\n", | ||
"# http://www.apache.org/licenses/LICENSE-2.0\n", | ||
"#\n", | ||
"# Unless required by applicable law or agreed to in writing, software\n", | ||
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n", | ||
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", | ||
"# See the License for the specific language governing permissions and\n", | ||
"# limitations under the License.\n", | ||
"# ==============================================================================" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "c3f0ff46-9958-4d57-9067-a64be34e75da", | ||
"metadata": {}, | ||
"source": [ | ||
"<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n", | ||
"\n", | ||
"# GPT-2 Playground\n", | ||
"\n", | ||
"This notebook demonstrates the GPT-2 model for open-end text generation.\n", | ||
"\n", | ||
"The TensorRT HuggingFace GPT-2 model is a plug-in replacement for the original PyTorch HuggingFace GPT-2 model.\n", | ||
"\n", | ||
"\n", | ||
"**Notes**: \n", | ||
" - For \"CPU - PyTorch\" and \"GPU - PyTorch\", a GPT-2 small model from HuggingFace model repository is employed. Inference is carried out with PyTorch in FP32 precision. All models run with batch size 1.\n", | ||
"Average run time across 5 runs is reported.\n", | ||
" - Prior to running this notebook, run [gpt2.ipynb](gpt2.ipynb) to download the GPT-2 model and generate the TensorRT engine." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "3530e767-7050-4329-a4bc-e2221b9eb578", | ||
"metadata": { | ||
"jupyter": { | ||
"source_hidden": true | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"[2021-10-18 00:51:18,099][OSS][INFO] Reading and loading engine file ./models/gpt2/tensorrt/gpt2.onnx.engine using trt native runner.\n", | ||
"[2021-10-18 00:51:22,581][OSS][DEBUG] Number of profiles detected in engine: 2\n", | ||
"[2021-10-18 00:51:22,585][OSS][DEBUG] Selected profile: [(1, 1), (1, 32), (1, 64)]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import os\n", | ||
"import sys\n", | ||
"ROOT_DIR = os.path.abspath(\"../\")\n", | ||
"sys.path.append(ROOT_DIR)\n", | ||
"\n", | ||
"import warnings\n", | ||
"warnings.filterwarnings('ignore')\n", | ||
"\n", | ||
"import torch \n", | ||
"\n", | ||
"# huggingface\n", | ||
"from transformers import (\n", | ||
" GPT2LMHeadModel,\n", | ||
" GPT2Tokenizer,\n", | ||
" GPT2Config,\n", | ||
")\n", | ||
"\n", | ||
"from GPT2.trt import GPT2TRTDecoder, GPT2TRTEngine\n", | ||
"from NNDF.networks import NetworkMetadata, Precision\n", | ||
"from collections import namedtuple \n", | ||
"from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig\n", | ||
"\n", | ||
"# download HuggingFace model and tokernizer\n", | ||
"GPT2_VARIANT = 'gpt2' # choices: gpt2 | gpt2-large\n", | ||
"model = GPT2LMHeadModel.from_pretrained(GPT2_VARIANT)\n", | ||
"config = GPT2Config(GPT2_VARIANT)\n", | ||
"tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)\n", | ||
"\n", | ||
"# load TensorRT engine\n", | ||
"metadata=NetworkMetadata(GPT2_VARIANT, Precision('fp16'), None)\n", | ||
"from os.path import exists\n", | ||
"if not exists('./models/gpt2/tensorrt/gpt2.onnx.engine'):\n", | ||
" print(\"Error: TensorRT engine not found at ./models/gpt2/tensorrt/gpt2.onnx.engine. Please run gpt2.ipynb to generate the TensorRT engine first!\")\n", | ||
"else:\n", | ||
" gpt2_engine = GPT2TRTEngine('./models/gpt2/tensorrt/gpt2.onnx.engine', metadata)\n", | ||
" gpt2_trt = GPT2TRTDecoder(gpt2_engine, metadata, config)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "766b8c94-ba8e-47c8-8624-57da462a0496", | ||
"metadata": { | ||
"jupyter": { | ||
"source_hidden": true | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "f4971e06250e4ff78206e05b67a0f1d6", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"Textarea(value='TensorRT is a high performance deep learning inference platform that delivers low latency and …" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "bd35da4869764703b4ae4495af4fdbc2", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"Textarea(value='...', description='GPT-2:', layout=Layout(width='auto'), placeholder='GPT-2 generated text', r…" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "6bd69c0458a8426f904a6672a1d885c0", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"RadioButtons(description='Device:', options=('CPU - PyTorch', 'GPU - PyTorch', 'GPU - TensorRT'), value='CPU -…" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "01dabe5d2c4340d3b163d6b680776755", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"HBox(children=(Button(description='Generate', style=ButtonStyle()),), layout=Layout(align_items='center', disp…" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "865124339ace4e0986b07f516bdac937", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"IntProgress(value=0, description='Progress:', layout=Layout(height='50px', width='100%'), max=6, style=Progres…" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "975e84c56f1f43578abaf8260a38aee5", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"Output()" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
} | ||
], | ||
"source": [ | ||
"import ipywidgets as widgets\n", | ||
"import numpy as np\n", | ||
"import time\n", | ||
"\n", | ||
"device = widgets.RadioButtons(\n", | ||
" options=['CPU - PyTorch', \n", | ||
" 'GPU - PyTorch', \n", | ||
" 'GPU - TensorRT'],\n", | ||
" description='Device:',\n", | ||
" disabled=False\n", | ||
")\n", | ||
"\n", | ||
"paragraph_text = widgets.Textarea(\n", | ||
" value='TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps '\\\n", | ||
"'such as recommenders, speech and image/video on NVIDIA GPUs. ',\n", | ||
" placeholder='Type something',\n", | ||
" description='Context:',\n", | ||
" disabled=False,\n", | ||
" layout=widgets.Layout(width=\"auto\"),\n", | ||
" rows=5, \n", | ||
")\n", | ||
"\n", | ||
"generated_text = widgets.Textarea(\n", | ||
" value='...',\n", | ||
" placeholder='GPT-2 generated text',\n", | ||
" description='GPT-2:',\n", | ||
" disabled=False,\n", | ||
" layout=widgets.Layout(width=\"auto\"),\n", | ||
" rows=5,\n", | ||
")\n", | ||
"button = widgets.Button(description=\"Generate\")\n", | ||
"\n", | ||
"display(paragraph_text)\n", | ||
"display(generated_text)\n", | ||
"display(device)\n", | ||
"\n", | ||
"from IPython.display import display\n", | ||
"box_layout = widgets.Layout(display='flex',\n", | ||
" flex_flow='column',\n", | ||
" align_items='center',\n", | ||
" width='100%')\n", | ||
"N_RUN = 6\n", | ||
"progress_bar = widgets.IntProgress(\n", | ||
" value=0,\n", | ||
" min=0,\n", | ||
" max=N_RUN,\n", | ||
" description='Progress:',\n", | ||
" bar_style='', # 'success', 'info', 'warning', 'danger' or ''\n", | ||
" style={'bar_color': 'green'},\n", | ||
" orientation='horizontal', \n", | ||
" layout=widgets.Layout(width='100%', height='50px')\n", | ||
")\n", | ||
"\n", | ||
"box = widgets.HBox(children=[button],layout=box_layout)\n", | ||
"output = widgets.Output()\n", | ||
"display(box)\n", | ||
"display(progress_bar)\n", | ||
"display(output)\n", | ||
"\n", | ||
"def generate(b):\n", | ||
" progress_bar.value = 0\n", | ||
" inference_time_arr = []\n", | ||
" with output:\n", | ||
" if device.value == 'GPU - TensorRT':\n", | ||
" inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n", | ||
" for _ in range(N_RUN):\n", | ||
" start_time = time.time()\n", | ||
" sample_output = gpt2_trt.generate(inputs.input_ids.to('cuda'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n", | ||
" inference_time_arr.append(time.time()-start_time)\n", | ||
" progress_bar.value += 1\n", | ||
"\n", | ||
" # de-tokenize model output to raw text\n", | ||
" text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n", | ||
" generated_text.value = text\n", | ||
" print(\"GPU - TensorRT - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:]))) \n", | ||
" \n", | ||
" elif device.value == 'CPU - PyTorch':\n", | ||
" inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n", | ||
" for _ in range(N_RUN):\n", | ||
" start_time = time.time()\n", | ||
" sample_output = model.to('cpu').generate(inputs.input_ids.to('cpu'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n", | ||
" inference_time_arr.append(time.time()-start_time)\n", | ||
" progress_bar.value += 1\n", | ||
"\n", | ||
" # de-tokenize model output to raw text\n", | ||
" text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n", | ||
" generated_text.value = text\n", | ||
" print(\"CPU - PyTorch - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:])))\n", | ||
" \n", | ||
" elif device.value == 'GPU - PyTorch': \n", | ||
" inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n", | ||
" for _ in range(N_RUN):\n", | ||
" start_time = time.time()\n", | ||
" sample_output = model.to('cuda').generate(inputs.input_ids.to('cuda'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n", | ||
" inference_time_arr.append(time.time()-start_time)\n", | ||
" progress_bar.value += 1\n", | ||
"\n", | ||
" # de-tokenize model output to raw text\n", | ||
" text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n", | ||
" generated_text.value = text\n", | ||
" print(\"GPU - PyTorch - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:]))) \n", | ||
" \n", | ||
"button.on_click(generate)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "58f473c0-6682-41af-8040-72f0a9472b0f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
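For reference, the playground's "Generate" button measures latency by timing `generate()` over `N_RUN = 6` runs and averaging all but the first (warm-up) run, which is why the notes above speak of 5 runs. Below is a minimal, standalone sketch of that timing loop, using the plain PyTorch GPT-2 model (the TensorRT path simply swaps in `gpt2_trt`) and an illustrative `max_length` of 64, matching the engine profile logged above.

```python
# Minimal sketch of the playground's timing pattern: run generate() N_RUN times,
# discard the first (warm-up) run, and report the mean of the remaining runs.
import time
import numpy as np
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

N_RUN = 6
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

inputs = tokenizer(
    "TensorRT is a high performance deep learning inference platform",
    return_tensors="pt",
)

times = []
for _ in range(N_RUN):
    start = time.time()
    output_ids = model.generate(inputs.input_ids.to(device), max_length=64)
    times.append(time.time() - start)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
print("Average inference time: %.2f (ms)" % (1000 * np.mean(times[1:])))
```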