From c922539e98298da8636a01f1fde7475b71fe9ef0 Mon Sep 17 00:00:00 2001 From: Vinh Nguyen Date: Fri, 15 Oct 2021 04:41:09 -0700 Subject: [PATCH] Add notebooks for HuggingFace demos: T5 and GPT-2 Signed-off-by: Rajeev Rao --- demo/HuggingFace/notebooks/README.md | 11 + .../notebooks/gpt2-playground.ipynb | 338 ++++++++++ demo/HuggingFace/notebooks/gpt2.ipynb | 479 +++++++++++++ .../HuggingFace/notebooks/t5-playground.ipynb | 278 ++++++++ demo/HuggingFace/notebooks/t5.ipynb | 627 ++++++++++++++++++ 5 files changed, 1733 insertions(+) create mode 100644 demo/HuggingFace/notebooks/README.md create mode 100644 demo/HuggingFace/notebooks/gpt2-playground.ipynb create mode 100644 demo/HuggingFace/notebooks/gpt2.ipynb create mode 100644 demo/HuggingFace/notebooks/t5-playground.ipynb create mode 100644 demo/HuggingFace/notebooks/t5.ipynb diff --git a/demo/HuggingFace/notebooks/README.md b/demo/HuggingFace/notebooks/README.md new file mode 100644 index 00000000..21cc42ee --- /dev/null +++ b/demo/HuggingFace/notebooks/README.md @@ -0,0 +1,11 @@ +# TensorRT Demo with HuggingFace Models + +To run the demo Jupyter notebooks in this folder, follow the instructions in the [TRT setup guide](../../../README.md) to build and launch the docker container. Then, use your browswer to open the Jupyter lab interface at :8888/lab using the password provided in the terminal. + + +Notebook list: + +- [gpt2.ipynb](gpt2.ipynb): Step by step walkthrough for building the GPT-2 TensorRT engine. +- [gpt2-playground.ipynb](gpt2-playground.ipynb): GUI for benchmarking GPT-2 TensorRT engines. +- [t5.ipynb](t5.ipynb): Step by step walkthrough for building the T5 TensorRT engine. +- [t5-playground.ipynb](t5-playground.ipynb): GUI for benchmarking T5 TensorRT engines. \ No newline at end of file diff --git a/demo/HuggingFace/notebooks/gpt2-playground.ipynb b/demo/HuggingFace/notebooks/gpt2-playground.ipynb new file mode 100644 index 00000000..60a7dfec --- /dev/null +++ b/demo/HuggingFace/notebooks/gpt2-playground.ipynb @@ -0,0 +1,338 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "64974d33-d028-440c-86fa-1a0633b3d31d", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ==============================================================================" + ] + }, + { + "cell_type": "markdown", + "id": "c3f0ff46-9958-4d57-9067-a64be34e75da", + "metadata": {}, + "source": [ + "\n", + "\n", + "# GPT-2 Playground\n", + "\n", + "This notebook demonstrates the GPT-2 model for open-end text generation.\n", + "\n", + "The TensorRT HuggingFace GPT-2 model is a plug-in replacement for the original PyTorch HuggingFace GPT-2 model.\n", + "\n", + "\n", + "**Notes**: \n", + " - For \"CPU - PyTorch\" and \"GPU - PyTorch\", a GPT-2 small model from HuggingFace model repository is employed. 
Inference is carried out with PyTorch in FP32 precision. All models run with batch size 1.\n", + "Average run time across 5 runs is reported.\n", + " - Prior to running this notebook, run [gpt2.ipynb](gpt2.ipynb) to download the GPT-2 model and generate the TensorRT engine." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3530e767-7050-4329-a4bc-e2221b9eb578", + "metadata": { + "jupyter": { + "source_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-10-18 00:51:18,099][OSS][INFO] Reading and loading engine file ./models/gpt2/tensorrt/gpt2.onnx.engine using trt native runner.\n", + "[2021-10-18 00:51:22,581][OSS][DEBUG] Number of profiles detected in engine: 2\n", + "[2021-10-18 00:51:22,585][OSS][DEBUG] Selected profile: [(1, 1), (1, 32), (1, 64)]\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "ROOT_DIR = os.path.abspath(\"../\")\n", + "sys.path.append(ROOT_DIR)\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import torch \n", + "\n", + "# huggingface\n", + "from transformers import (\n", + " GPT2LMHeadModel,\n", + " GPT2Tokenizer,\n", + " GPT2Config,\n", + ")\n", + "\n", + "from GPT2.trt import GPT2TRTDecoder, GPT2TRTEngine\n", + "from NNDF.networks import NetworkMetadata, Precision\n", + "from collections import namedtuple \n", + "from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig\n", + "\n", + "# download HuggingFace model and tokernizer\n", + "GPT2_VARIANT = 'gpt2' # choices: gpt2 | gpt2-large\n", + "model = GPT2LMHeadModel.from_pretrained(GPT2_VARIANT)\n", + "config = GPT2Config(GPT2_VARIANT)\n", + "tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)\n", + "\n", + "# load TensorRT engine\n", + "metadata=NetworkMetadata(GPT2_VARIANT, Precision('fp16'), None)\n", + "from os.path import exists\n", + "if not exists('./models/gpt2/tensorrt/gpt2.onnx.engine'):\n", + " print(\"Error: TensorRT engine not found at ./models/gpt2/tensorrt/gpt2.onnx.engine. 
Please run gpt2.ipynb to generate the TensorRT engine first!\")\n", + "else:\n", + " gpt2_engine = GPT2TRTEngine('./models/gpt2/tensorrt/gpt2.onnx.engine', metadata)\n", + " gpt2_trt = GPT2TRTDecoder(gpt2_engine, metadata, config)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "766b8c94-ba8e-47c8-8624-57da462a0496", + "metadata": { + "jupyter": { + "source_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f4971e06250e4ff78206e05b67a0f1d6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Textarea(value='TensorRT is a high performance deep learning inference platform that delivers low latency and …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bd35da4869764703b4ae4495af4fdbc2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Textarea(value='...', description='GPT-2:', layout=Layout(width='auto'), placeholder='GPT-2 generated text', r…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6bd69c0458a8426f904a6672a1d885c0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(description='Device:', options=('CPU - PyTorch', 'GPU - PyTorch', 'GPU - TensorRT'), value='CPU -…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "01dabe5d2c4340d3b163d6b680776755", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(Button(description='Generate', style=ButtonStyle()),), layout=Layout(align_items='center', disp…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "865124339ace4e0986b07f516bdac937", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "IntProgress(value=0, description='Progress:', layout=Layout(height='50px', width='100%'), max=6, style=Progres…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "975e84c56f1f43578abaf8260a38aee5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import ipywidgets as widgets\n", + "import numpy as np\n", + "import time\n", + "\n", + "device = widgets.RadioButtons(\n", + " options=['CPU - PyTorch', \n", + " 'GPU - PyTorch', \n", + " 'GPU - TensorRT'],\n", + " description='Device:',\n", + " disabled=False\n", + ")\n", + "\n", + "paragraph_text = widgets.Textarea(\n", + " value='TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps '\\\n", + "'such as recommenders, speech and image/video on NVIDIA GPUs. 
',\n", + " placeholder='Type something',\n", + " description='Context:',\n", + " disabled=False,\n", + " layout=widgets.Layout(width=\"auto\"),\n", + " rows=5, \n", + ")\n", + "\n", + "generated_text = widgets.Textarea(\n", + " value='...',\n", + " placeholder='GPT-2 generated text',\n", + " description='GPT-2:',\n", + " disabled=False,\n", + " layout=widgets.Layout(width=\"auto\"),\n", + " rows=5,\n", + ")\n", + "button = widgets.Button(description=\"Generate\")\n", + "\n", + "display(paragraph_text)\n", + "display(generated_text)\n", + "display(device)\n", + "\n", + "from IPython.display import display\n", + "box_layout = widgets.Layout(display='flex',\n", + " flex_flow='column',\n", + " align_items='center',\n", + " width='100%')\n", + "N_RUN = 6\n", + "progress_bar = widgets.IntProgress(\n", + " value=0,\n", + " min=0,\n", + " max=N_RUN,\n", + " description='Progress:',\n", + " bar_style='', # 'success', 'info', 'warning', 'danger' or ''\n", + " style={'bar_color': 'green'},\n", + " orientation='horizontal', \n", + " layout=widgets.Layout(width='100%', height='50px')\n", + ")\n", + "\n", + "box = widgets.HBox(children=[button],layout=box_layout)\n", + "output = widgets.Output()\n", + "display(box)\n", + "display(progress_bar)\n", + "display(output)\n", + "\n", + "def generate(b):\n", + " progress_bar.value = 0\n", + " inference_time_arr = []\n", + " with output:\n", + " if device.value == 'GPU - TensorRT':\n", + " inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n", + " for _ in range(N_RUN):\n", + " start_time = time.time()\n", + " sample_output = gpt2_trt.generate(inputs.input_ids.to('cuda'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n", + " inference_time_arr.append(time.time()-start_time)\n", + " progress_bar.value += 1\n", + "\n", + " # de-tokenize model output to raw text\n", + " text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n", + " generated_text.value = text\n", + " print(\"GPU - TensorRT - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:]))) \n", + " \n", + " elif device.value == 'CPU - PyTorch':\n", + " inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n", + " for _ in range(N_RUN):\n", + " start_time = time.time()\n", + " sample_output = model.to('cpu').generate(inputs.input_ids.to('cpu'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n", + " inference_time_arr.append(time.time()-start_time)\n", + " progress_bar.value += 1\n", + "\n", + " # de-tokenize model output to raw text\n", + " text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n", + " generated_text.value = text\n", + " print(\"CPU - PyTorch - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:])))\n", + " \n", + " elif device.value == 'GPU - PyTorch': \n", + " inputs = tokenizer(paragraph_text.value, return_tensors=\"pt\")\n", + " for _ in range(N_RUN):\n", + " start_time = time.time()\n", + " sample_output = model.to('cuda').generate(inputs.input_ids.to('cuda'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT])\n", + " inference_time_arr.append(time.time()-start_time)\n", + " progress_bar.value += 1\n", + "\n", + " # de-tokenize model output to raw text\n", + " text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n", + " generated_text.value = text\n", + " print(\"GPU - PyTorch - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:]))) \n", + " \n", + "button.on_click(generate)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "58f473c0-6682-41af-8040-72f0a9472b0f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/demo/HuggingFace/notebooks/gpt2.ipynb b/demo/HuggingFace/notebooks/gpt2.ipynb new file mode 100644 index 00000000..d38f1cd6 --- /dev/null +++ b/demo/HuggingFace/notebooks/gpt2.ipynb @@ -0,0 +1,479 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "28e6e614-e360-4292-965e-0d255027e9b9", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ==============================================================================" + ] + }, + { + "cell_type": "markdown", + "id": "84d86fae-c008-44c1-a9e9-43145094a333", + "metadata": {}, + "source": [ + "\n", + "\n", + "# Accelerating HuggingFace GPT-2 Inference with TensorRT\n", + "\n", + "GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. The model was pretrained on the raw texts to predict the next word in sentences. As no human labeling was required, GPT-2 pretraining can use lots of publicly available data with an automatic process to generate inputs and labels from those data.\n", + "\n", + "This notebook shows 3 easy steps to convert a [HuggingFace PyTorch GPT-2 model](https://huggingface.co/gpt2) to a TensorRT engine for high-performance inference.\n", + "\n", + "1. [Download HuggingFace GPT-2 model ](#1)\n", + "1. [Convert to ONNX format](#2)\n", + "1. [Convert to TensorRT engine](#3)\n", + "\n", + "## Prerequisite\n", + "\n", + "Follow the instruction at https://github.com/NVIDIA/TensorRT to build the TensorRT-OSS docker container required to run this notebook.\n", + "\n", + "Next, we install some extra dependencies and restart the Jupyter kernel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a29dac08-e043-4310-9eb1-2ab989fa285a", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip3 install -r ../requirements.txt\n", + "\n", + "# install Pytorch with A100 support\n", + "!pip3 install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio===0.9.1 -f https://download.pytorch.org/whl/torch_stable.html\n", + "\n", + "import IPython\n", + "import time\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)\n", + "\n", + "time.sleep(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "235d2f1b-439e-4cd0-8286-1d63a13f2cf3", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "ROOT_DIR = os.path.abspath(\"../\")\n", + "sys.path.append(ROOT_DIR)\n", + "\n", + "import torch \n", + "\n", + "# huggingface\n", + "from transformers import (\n", + " GPT2LMHeadModel,\n", + " GPT2Tokenizer,\n", + " GPT2Config,\n", + ")\n", + "\n", + "# to display detailed TensorRT conversion process\n", + "from NNDF.logger import G_LOGGER\n", + "G_LOGGER.setLevel(level=G_LOGGER.DEBUG)" + ] + }, + { + "cell_type": "markdown", + "id": "af4254e2-11fd-4bc7-ac0b-60b1a9e07c4e", + "metadata": {}, + "source": [ + "\n", + "\n", + "## 1. Download HuggingFace GPT-2 model \n", + "\n", + "First, we download the original HuggingFace PyTorch GPT-2 model from HuggingFace model hubs, together with its associated tokernizer.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fae66d58-f994-4987-8f1d-1fa8ac2ec8b4", + "metadata": {}, + "outputs": [], + "source": [ + "# download model and tokernizer\n", + "# The GPT-2 variants supported by TensorRT 8.2 are: gpt2 (117M), gpt2-large (774M). However, as the conversion process takes long time with\n", + "# gpt2-large, we recommend using the ../run.py script. 
See ../README.md for more details.\n", + "GPT2_VARIANT = 'gpt2'\n", + "\n", + "model = GPT2LMHeadModel.from_pretrained(GPT2_VARIANT)\n", + "\n", + "config = GPT2Config(GPT2_VARIANT)\n", + "tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7252ca90-1104-40dc-8e72-f51c07a4cd11", + "metadata": {}, + "outputs": [], + "source": [ + "# save model locally\n", + "pytorch_model_dir = './models/{}/pytorch'.format(GPT2_VARIANT)\n", + "!mkdir -p $pytorch_model_dir\n", + "\n", + "model.save_pretrained(pytorch_model_dir)\n", + "print(\"Pytorch Model saved to {}\".format(pytorch_model_dir))" + ] + }, + { + "cell_type": "markdown", + "id": "a84c5766-97ed-4d04-bab5-7fa18e89dee8", + "metadata": {}, + "source": [ + "### Inference with PyTorch model\n", + "\n", + "#### Single example inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e5c5fe7-7733-49b5-89c5-c8278ff54fea", + "metadata": {}, + "outputs": [], + "source": [ + "# carry out inference with a single sample\n", + "inputs = tokenizer(\"TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for appssuch as recommenders, speech and image/video on NVIDIA GPUs.\", return_tensors=\"pt\")\n", + "\n", + "model.eval()\n", + "with torch.no_grad():\n", + " outputs = model(**inputs, labels=inputs[\"input_ids\"])\n", + "\n", + "logits = outputs.logits" + ] + }, + { + "cell_type": "markdown", + "id": "a6c0468b-976a-4a08-98d3-e87578ec067f", + "metadata": {}, + "source": [ + "For benchmarking purposes, we will employ a helper function `gpt2_inference` which executes the inference on a single batch repeatedly and measures end to end execution time. Let's take note of this execution time for later comparison with TensorRT. \n", + " \n", + "`TimingProfile` is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecdf8f00-0562-482b-9bec-b0b7596aec48", + "metadata": {}, + "outputs": [], + "source": [ + "from GPT2.measurements import gpt2_inference\n", + "from NNDF.networks import TimingProfile\n", + "\n", + "# Benchmarking TensorRT performance on single batch\n", + "output, decoder_e2e_median_time = gpt2_inference(\n", + " model.to('cuda:0'), inputs.input_ids.to('cuda:0'), TimingProfile(iterations=10, number=1, warmup=1)\n", + " )\n", + "decoder_e2e_median_time" + ] + }, + { + "cell_type": "markdown", + "id": "4805756f-81f9-43cf-88f6-b205ecd23034", + "metadata": {}, + "source": [ + "#### Open-end text generation\n", + "Next, we will employ the PyTorch model for the open-end text generation task, which GPT-2 is particularly good at. 
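The `generate` call in the next cell uses greedy decoding, since no sampling options are set. For more varied generations, the HuggingFace `generate` API also supports sampling; a small illustrative variant is sketched below (the sampling parameter values are arbitrary choices, not tuned for this demo):

```python
# Alternative decoding: multinomial sampling instead of greedy search (illustrative settings).
sample_output = model.to('cuda:0').generate(
    inputs.input_ids.to('cuda:0'),
    max_length=128,
    do_sample=True,   # enable sampling
    top_k=50,         # sample only from the 50 most likely tokens
    top_p=0.95,       # nucleus sampling threshold
)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
```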
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c3d01fc-9928-486b-9d15-de84d46528e5", + "metadata": {}, + "outputs": [], + "source": [ + "from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig\n", + "\n", + "sample_output = model.to('cuda:0').generate(inputs.input_ids.to('cuda:0'), max_length=128)\n", + "\n", + "# de-tokenize model output to raw text\n", + "tokenizer.decode(sample_output[0], skip_special_tokens=True)" + ] + }, + { + "cell_type": "markdown", + "id": "0b016c2f-7982-44ac-81e5-d3854391a8b6", + "metadata": {}, + "source": [ + "For benchmarking purposes, we will employ a helper function `full_inference_greedy` which executes the inference repeatedly and measures end to end execution time. Let's take note of this execution time for later comparison with TensorRT. \n", + " \n", + "TimingProfile is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93aea249-529e-4b5e-9759-e0c8370391a3", + "metadata": {}, + "outputs": [], + "source": [ + "from GPT2.measurements import full_inference_greedy\n", + "\n", + "# get complete decoder inference result and its timing profile\n", + "sample_output, full_e2e_median_runtime = full_inference_greedy(\n", + " model.to('cuda:0'), inputs.input_ids, TimingProfile(iterations=10, number=1, warmup=1),\n", + " max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT]\n", + ")\n", + "full_e2e_median_runtime" + ] + }, + { + "cell_type": "markdown", + "id": "0d662701-e430-4fdc-ad46-1f296defcf8f", + "metadata": {}, + "source": [ + "\n", + "\n", + "## 2. Convert to ONNX format\n", + "\n", + "Prior to converting the model to a TensorRT engine, we will first convert the PyTorch model to an intermediate universal format: ONNX.\n", + "\n", + "ONNX is an open format for machine learning and deep learning models. It allows you to convert deep learning and machine learning models from different frameworks such as TensorFlow, PyTorch, MATLAB, Caffe, and Keras to a single format.\n", + "\n", + "At a high level, the steps to convert a PyTorch model to TensorRT are as follows:\n", + "- Convert the pretrained image segmentation PyTorch model into ONNX.\n", + "- Import the ONNX model into TensorRT.\n", + "- Apply optimizations and generate an engine.\n", + "- Perform inference on the GPU with the TensorRT engine. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2b2be1a-021c-4f6c-957d-2ff7d1b95976", + "metadata": {}, + "outputs": [], + "source": [ + "from GPT2.export import GPT2TorchFile\n", + "from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig\n", + "from GPT2.GPT2ModelConfig import GPT2Metadata\n", + "from NNDF.networks import NetworkMetadata, Precision" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7144d206-c690-4d4c-b590-3eb25e31d106", + "metadata": {}, + "outputs": [], + "source": [ + "metadata=NetworkMetadata(variant=GPT2_VARIANT, precision=Precision(fp16=True), other=GPT2Metadata(kv_cache=False))\n", + "gpt2 = GPT2TorchFile(model.to('cpu'), metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbaa89e4-e83d-4380-a6f8-932fcfeb64d3", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p ./models/$GPT2_VARIANT/ONNX\n", + "\n", + "onnx_path = ('./models/{}/ONNX/model.onnx'.format(GPT2_VARIANT))\n", + "gpt2.as_onnx_model(onnx_path, force_overwrite=False)\n", + "\n", + "del model" + ] + }, + { + "cell_type": "markdown", + "id": "7baf007e-5508-485c-a87f-9bfe16260452", + "metadata": {}, + "source": [ + "\n", + "\n", + "## 3. Convert to TensorRT engine\n", + "\n", + "Now we are ready to parse the ONNX model and convert it to an optimized TensorRT engine.\n", + "\n", + "**Note:** As TensorRT carries out many optimizations, this conversion process might take a while." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "037ac958-2627-439c-9db5-27640e3f7967", + "metadata": {}, + "outputs": [], + "source": [ + "from GPT2.export import GPT2ONNXFile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bd6e3fc-6797-46b0-a211-ce42d3769105", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p ./models/$GPT2_VARIANT/tensorrt\n", + "trt_path = './models/{}/tensorrt/{}.onnx.engine'.format(GPT2_VARIANT, GPT2_VARIANT)\n", + "gpt2_engine = GPT2ONNXFile(onnx_path, metadata).as_trt_engine(trt_path)" + ] + }, + { + "cell_type": "markdown", + "id": "74f7f6fc-1e6a-4ddc-8e9b-543d9e8dab4d", + "metadata": {}, + "source": [ + "### Inference with TensorRT engine\n", + "\n", + "Great, if you have reached this stage, it means we now have an optimized TensorRT engine for the GPT-2 model, ready for us to carry out inference. \n", + "\n", + "The GPT-2 model with TensorRT backend can now be employed in place of the original HuggingFace GPT-2 model." 
+ ] + }, + { + "cell_type": "markdown", + "id": "54ae13aa-bf6f-4eb7-a453-389865562ae4", + "metadata": {}, + "source": [ + "#### Single batch inference\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "343b58f1-3d9f-4844-85c9-73058bd36a83", + "metadata": {}, + "outputs": [], + "source": [ + "from GPT2.trt import GPT2TRTDecoder\n", + "\n", + "gpt2_trt = GPT2TRTDecoder(gpt2_engine, metadata, config)\n", + "\n", + "outputs = gpt2_trt(inputs.input_ids)\n", + "logits = outputs.logits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28fc60ad-73a7-46df-85d7-a292a8abbd80", + "metadata": {}, + "outputs": [], + "source": [ + "# Benchmarking TensorRT performance on single batch\n", + "output, decoder_e2e_median_time = gpt2_inference(\n", + " gpt2_trt, inputs.input_ids, TimingProfile(iterations=10, number=1, warmup=1)\n", + " )\n", + "decoder_e2e_median_time" + ] + }, + { + "cell_type": "markdown", + "id": "22122064-5a17-4990-bd6b-073fca5a3e9b", + "metadata": {}, + "source": [ + "#### Open-end text generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "848bffb8-a7a4-4fcb-91c9-f4e9f7263e6c", + "metadata": {}, + "outputs": [], + "source": [ + "sample_output = gpt2_trt.generate(inputs.input_ids.to('cuda:0'), max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH['gpt2'])\n", + "\n", + "# de-tokenize model output to raw text\n", + "tokenizer.decode(sample_output[0], skip_special_tokens=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4c8bc4c-bf3e-4cb5-afc6-c0bd7d8655cb", + "metadata": {}, + "outputs": [], + "source": [ + "# get complete decoder inference result and its timing profile\n", + "sample_output, full_e2e_median_runtime = full_inference_greedy(\n", + " gpt2_trt, inputs.input_ids, TimingProfile(iterations=10, number=1, warmup=1),\n", + " max_length=GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH['gpt2']\n", + ")\n", + "full_e2e_median_runtime" + ] + }, + { + "cell_type": "markdown", + "id": "6b68a915-2c32-49e5-b1f6-e93d7618f637", + "metadata": {}, + "source": [ + "You can now compare the output of the original PyTorch model and the TensorRT engine. Notice the speed difference." + ] + }, + { + "cell_type": "markdown", + "id": "cbfc6c04-ca47-4fc6-9a12-ed500722bb4a", + "metadata": {}, + "source": [ + "## Conclusion and where-to next?\n", + "\n", + "This notebook has walked you through the process of converting a HuggingFace PyTorch GPT-2 model to an optimized TensorRT engine for inference in 3 easy steps. The TensorRT inference engine can be conviniently used as a drop-in replacement for the orginial HuggingFace GPT-2 model while providing significant speed up. 
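As a rough way to quantify the speed difference noted above, you can compute a speed-up ratio from the two median runtimes. This sketch assumes you saved the PyTorch measurement from section 1 under a name of your own (for example `pytorch_e2e_median_runtime`) before it was overwritten by the TensorRT run:

```python
# Rough end-to-end speed-up estimate, assuming the earlier PyTorch measurement was saved
# as pytorch_e2e_median_runtime before being overwritten by the TensorRT measurement.
speedup = pytorch_e2e_median_runtime / full_e2e_median_runtime
print("Approximate end-to-end speed-up with TensorRT: %.1fx" % speedup)
```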
\n", + "\n", + "Launch [gpt2-playground.ipynb](gpt2-playground.ipynb) for a text generation playground with GPT-2.\n", + "\n", + "If you are interested in further details of the conversion process, check out [GPT2/trt.py](../GPT2/trt.py)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a101274-cde3-4c7e-affc-a7424eda7b0f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/demo/HuggingFace/notebooks/t5-playground.ipynb b/demo/HuggingFace/notebooks/t5-playground.ipynb new file mode 100644 index 00000000..261b7df7 --- /dev/null +++ b/demo/HuggingFace/notebooks/t5-playground.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "64974d33-d028-440c-86fa-1a0633b3d31d", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ==============================================================================" + ] + }, + { + "cell_type": "markdown", + "id": "c3f0ff46-9958-4d57-9067-a64be34e75da", + "metadata": {}, + "source": [ + "\n", + "\n", + "# T5 Playground\n", + "\n", + "This notebook demonstrates T5 model on the task of translation and text summarization.\n", + "\n", + "The TensorRT HuggingFace T5 model is a plug-in replacement for the original PyTorch HuggingFace T5 model.\n", + "\n", + "\n", + "\n", + "**Notes**: \n", + " - For \"CPU - PyTorch\" and \"GPU - PyTorch\", a T5 small model from HuggingFace model repository is employed. Inference is carried out with PyTorch in FP32 precision. All models run with batch size 1.\n", + "Average run time across 5 runs is reported.\n", + " - Prior to running this notebook, run [t5.ipynb](t5.ipynb) to download the T5 model and generate the TensorRT engine." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3530e767-7050-4329-a4bc-e2221b9eb578", + "metadata": { + "jupyter": { + "source_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "ROOT_DIR = os.path.abspath(\"../\")\n", + "sys.path.append(ROOT_DIR)\n", + "\n", + "import torch \n", + "\n", + "# huggingface\n", + "from transformers import (\n", + " T5ForConditionalGeneration,\n", + " T5Tokenizer,\n", + " T5Config,\n", + ")\n", + "\n", + "# download HuggingFace model and tokernizer\n", + "T5_VARIANT = 't5-small'\n", + "\n", + "t5_model = T5ForConditionalGeneration.from_pretrained(T5_VARIANT)\n", + "tokenizer = T5Tokenizer.from_pretrained(T5_VARIANT)\n", + "config = T5Config(T5_VARIANT)\n", + "\n", + "# load TensorRT engine\n", + "from T5.trt import T5TRTEncoder, T5TRTDecoder, TRTHFRunner\n", + "from T5.T5ModelConfig import T5ModelTRTConfig\n", + "from T5.export import T5DecoderTRTEngine, T5EncoderTRTEngine\n", + "from NNDF.networks import NetworkMetadata, Precision\n", + "\n", + "from transformers.generation_stopping_criteria import (\n", + " MaxLengthCriteria,\n", + " StoppingCriteriaList,\n", + ")\n", + "\n", + "tfm_config = T5Config(\n", + " use_cache=True,\n", + " num_layers=T5ModelTRTConfig.NUMBER_OF_LAYERS[T5_VARIANT],\n", + ")\n", + "metadata=NetworkMetadata(T5_VARIANT, Precision('fp16'), None)\n", + "\n", + "from os.path import exists\n", + "encoder_path = './models/{}/tensorrt/{}-encoder.onnx.engine'.format(T5_VARIANT,T5_VARIANT)\n", + "if not exists(encoder_path):\n", + " print(\"Error: TensorRT engine not found at {}. Please run t5.ipynb to generate the TensorRT engine first!\".format(encoder_path))\n", + "else:\n", + " encoder_engine = T5DecoderTRTEngine('./models/{}/tensorrt/{}-encoder.onnx.engine'.format(T5_VARIANT,T5_VARIANT), metadata)\n", + " decoder_engine = T5DecoderTRTEngine('./models/{}/tensorrt/{}-decoder-with-lm-head.onnx.engine'.format(T5_VARIANT,T5_VARIANT), metadata)\n", + "\n", + "t5_trt_encoder = T5TRTEncoder(encoder_engine, metadata, tfm_config)\n", + "t5_trt_decoder = T5TRTDecoder(decoder_engine, metadata, tfm_config)\n", + "\n", + "decoder_input_ids = torch.full(\n", + " (1, 1), tokenizer.convert_tokens_to_ids(tokenizer.pad_token), dtype=torch.int32\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "766b8c94-ba8e-47c8-8624-57da462a0496", + "metadata": { + "jupyter": { + "source_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import ipywidgets as widgets\n", + "import numpy as np\n", + "import time\n", + "\n", + "device = widgets.RadioButtons(\n", + " options=['CPU - PyTorch', \n", + " 'GPU - PyTorch', \n", + " 'GPU - TensorRT'],\n", + " description='Device:',\n", + " disabled=False\n", + ")\n", + "\n", + "task = widgets.RadioButtons(\n", + " options=['En -> German', \n", + " 'Summarize', \n", + " ],\n", + " description='Task:',\n", + " disabled=False\n", + ")\n", + "\n", + "paragraph_text = widgets.Textarea(\n", + " value='TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps'\\\n", + " 'such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops'\\\n", + " 'and layers before applying optimizations for inference. 
Today NVIDIA is open-sourcing parsers and plugins in TensorRT so that the deep'\\\n", + " 'learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps.',\n", + " placeholder='Type something',\n", + " description='Context:',\n", + " disabled=False,\n", + " layout=widgets.Layout(width=\"auto\"),\n", + " rows=5, \n", + ")\n", + "\n", + "\n", + "generated_text = widgets.Textarea(\n", + " value='...',\n", + " placeholder='Context',\n", + " description='T5 output:',\n", + " disabled=False,\n", + " layout=widgets.Layout(width=\"auto\"),\n", + " rows=5,\n", + ")\n", + "button = widgets.Button(description=\"Generate\")\n", + "\n", + "display(paragraph_text)\n", + "display(generated_text)\n", + "display(device)\n", + "display(task)\n", + "\n", + "from IPython.display import display\n", + "box_layout = widgets.Layout(display='flex',\n", + " flex_flow='column',\n", + " align_items='center',\n", + " width='100%')\n", + "N_RUN = 6\n", + "progress_bar = widgets.IntProgress(\n", + " value=0,\n", + " min=0,\n", + " max=N_RUN,\n", + " description='Progress:',\n", + " bar_style='', # 'success', 'info', 'warning', 'danger' or ''\n", + " style={'bar_color': 'green'},\n", + " orientation='horizontal', \n", + " layout=widgets.Layout(width='100%', height='50px')\n", + ")\n", + "\n", + "box = widgets.HBox(children=[button],layout=box_layout)\n", + "output = widgets.Output()\n", + "display(box)\n", + "display(progress_bar)\n", + "display(output)\n", + "\n", + "MAX_LENGTH = 256\n", + "\n", + "def generate(b):\n", + " progress_bar.value = 0\n", + " inference_time_arr = []\n", + " prefix = 'translate English to German' if task.value=='En -> German' else 'summarize'\n", + " inputs = tokenizer(\"{}: {}\".format(prefix, paragraph_text.value), return_tensors=\"pt\")\n", + " with output:\n", + " if device.value == 'GPU - TensorRT':\n", + " for _ in range(N_RUN):\n", + " start_time = time.time()\n", + " encoder_last_hidden_state = t5_trt_encoder(input_ids=inputs.input_ids)\n", + " outputs = t5_trt_decoder.greedy_search(\n", + " input_ids=decoder_input_ids,\n", + " encoder_hidden_states=encoder_last_hidden_state,\n", + " stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(MAX_LENGTH)])\n", + " )\n", + " inference_time_arr.append(time.time()-start_time)\n", + " progress_bar.value += 1\n", + "\n", + " # de-tokenize model output to raw text\n", + " text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + " generated_text.value = text\n", + " print(\"GPU - TensorRT - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:]))) \n", + " \n", + " elif device.value == 'CPU - PyTorch':\n", + " for _ in range(N_RUN):\n", + " start_time = time.time()\n", + " outputs = t5_model.to('cpu').generate(inputs.input_ids.to('cpu'), max_length=MAX_LENGTH)\n", + " inference_time_arr.append(time.time()-start_time)\n", + " progress_bar.value += 1\n", + "\n", + " # de-tokenize model output to raw text\n", + " text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + " generated_text.value = text\n", + " print(\"CPU - PyTorch - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:])))\n", + " \n", + " elif device.value == 'GPU - PyTorch': \n", + " for _ in range(N_RUN):\n", + " start_time = time.time()\n", + " outputs = t5_model.to('cuda:0').generate(inputs.input_ids.to('cuda:0'), max_length=MAX_LENGTH)\n", + " inference_time_arr.append(time.time()-start_time)\n", + " progress_bar.value += 1\n", + "\n", + " # de-tokenize 
model output to raw text\n", + " text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + " generated_text.value = text\n", + " print(\"GPU - PyTorch - Average inference time: %.2f (ms)\"%(1000*np.mean(inference_time_arr[1:]))) \n", + " \n", + "button.on_click(generate)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58f473c0-6682-41af-8040-72f0a9472b0f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/demo/HuggingFace/notebooks/t5.ipynb b/demo/HuggingFace/notebooks/t5.ipynb new file mode 100644 index 00000000..0c6668e7 --- /dev/null +++ b/demo/HuggingFace/notebooks/t5.ipynb @@ -0,0 +1,627 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "28e6e614-e360-4292-965e-0d255027e9b9", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ==============================================================================" + ] + }, + { + "cell_type": "markdown", + "id": "235fd3dc-5c53-4092-a789-72793b41f8c5", + "metadata": {}, + "source": [ + "\n", + "\n", + "# Accelerating HuggingFace T5 Inference with TensorRT\n", + "\n", + "T5 is an encoder-decoder model that converts all NLP problems into a text-to-text format. More specifically, it does so by encoding different tasks as text directives in the input stream. This enables a single model to be trained supervised on a wide variety of NLP tasks such as translation, classification, Q&A and summarization.\n", + "\n", + "This notebook shows 3 easy steps to convert a [HuggingFace PyTorch T5 model](https://huggingface.co/transformers/model_doc/t5.html) to a TensorRT engine for high-performance inference.\n", + "\n", + "1. [Download HuggingFace T5 model](#1)\n", + "1. [Convert to ONNX format](#2)\n", + "1. [Convert to TensorRT engine](#3)\n", + "\n", + "## Prerequisite\n", + "\n", + "Follow the instruction at https://github.com/NVIDIA/TensorRT to build the TensorRT-OSS docker container required to run this notebook.\n", + "\n", + "Next, we install some extra dependencies, then restart the kernel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "886a08cc-5f39-434d-a5ae-91059836fc37", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip3 install -r ../requirements.txt\n", + "\n", + "# install Pytorch with A100 support\n", + "!pip3 install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio===0.9.1 -f https://download.pytorch.org/whl/torch_stable.html\n", + "\n", + "import IPython\n", + "import time\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)\n", + "\n", + "time.sleep(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "235d2f1b-439e-4cd0-8286-1d63a13f2cf3", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "ROOT_DIR = os.path.abspath(\"../\")\n", + "sys.path.append(ROOT_DIR)\n", + "\n", + "import torch\n", + "import tensorrt as trt\n", + "\n", + "# huggingface\n", + "from transformers import (\n", + " T5ForConditionalGeneration,\n", + " T5Tokenizer,\n", + " T5Config,\n", + ")\n", + "\n", + "# to display detailed TensorRT conversion process\n", + "from NNDF.logger import G_LOGGER\n", + "G_LOGGER.setLevel(level=G_LOGGER.DEBUG)" + ] + }, + { + "cell_type": "markdown", + "id": "af4254e2-11fd-4bc7-ac0b-60b1a9e07c4e", + "metadata": {}, + "source": [ + "\n", + "\n", + "## 1. Download HuggingFace T5 model\n", + "\n", + "First, we download the original HuggingFace PyTorch T5 model from HuggingFace model hubs, together with its associated tokernizer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fae66d58-f994-4987-8f1d-1fa8ac2ec8b4", + "metadata": {}, + "outputs": [], + "source": [ + "# The T5 variants that are suported by TensorRT 8 are: t5-small (60M), t5-base (220M), t5-large (770M). However, as the conversion process takes long time with\n", + "# the base and large models, we recommend using the ../run.py script. 
See ../README.md for more details.\n", + "T5_VARIANT = 't5-small'\n", + "\n", + "t5_model = T5ForConditionalGeneration.from_pretrained(T5_VARIANT)\n", + "tokenizer = T5Tokenizer.from_pretrained(T5_VARIANT)\n", + "config = T5Config(T5_VARIANT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7252ca90-1104-40dc-8e72-f51c07a4cd11", + "metadata": {}, + "outputs": [], + "source": [ + "# save model locally\n", + "pytorch_model_dir = './models/{}/pytorch'.format(T5_VARIANT)\n", + "!mkdir -p $pytorch_model_dir\n", + "\n", + "t5_model.save_pretrained(pytorch_model_dir)\n", + "print(\"Pytorch Model saved to {}\".format(pytorch_model_dir))" + ] + }, + { + "cell_type": "markdown", + "id": "11ea023d-c4d4-43bb-9d77-c76684e0b06f", + "metadata": {}, + "source": [ + "### Inference with PyTorch model\n", + "\n", + "Next, we will carry out inference with the PyTorch model.\n", + "\n", + "#### Single example inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc45d9bc-b6ef-485e-8832-6628c292e315", + "metadata": {}, + "outputs": [], + "source": [ + "inputs = tokenizer(\"translate English to German: That is good.\", return_tensors=\"pt\")\n", + "\n", + "# inference on a single example\n", + "t5_model.eval()\n", + "with torch.no_grad():\n", + " outputs = t5_model(**inputs, labels=inputs[\"input_ids\"])\n", + "\n", + "logits = outputs.logits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98f7fd8b-2ee3-4d25-9204-7713eb7e90b3", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate sequence for an input\n", + "outputs = t5_model.to('cuda:0').generate(inputs.input_ids.to('cuda:0'))\n", + "print(tokenizer.decode(outputs[0], skip_special_tokens=True))" + ] + }, + { + "cell_type": "markdown", + "id": "667fcacc-02cb-415d-a9ff-2d2ec44ef225", + "metadata": {}, + "source": [ + "#### Model inference benchmark: encoder and decoder stacks\n", + "\n", + "For benchmarking purposes, we will employ a helper functions `encoder_inference` and `decoder_inference` which execute the inference repeatedly for the T5 encoder and decoder stacks separately, and measure end to end execution time. Let's take note of this execution time for comparison with TensorRT. \n", + " \n", + "`TimingProfile` is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here)." 
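Conceptually, the helpers report a median over `iterations` measurements, each of which calls the function `number` times after a few warm-up calls, which is essentially the `timeit` measurement pattern. A generic sketch of that pattern (an illustration only, not the demo's actual implementation):

```python
# Generic median-of-N timing pattern, similar in spirit to the TimingProfile-driven helpers.
import timeit
import numpy as np

def median_runtime(fn, iterations=10, number=1, warmup=1):
    for _ in range(warmup):
        fn()                                    # warm-up calls are not measured
    times = timeit.repeat(fn, repeat=iterations, number=number)
    return np.median(times) / number            # seconds per single call

# Example usage (hypothetical): median_runtime(lambda: t5_torch_encoder(input_ids=input_ids))
```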
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "596ea542-d9e5-4367-b643-d60027fa05e6", + "metadata": {}, + "outputs": [], + "source": [ + "from T5.measurements import decoder_inference, encoder_inference, full_inference_greedy\n", + "from T5.export import T5EncoderTorchFile, T5DecoderTorchFile\n", + "from NNDF.networks import TimingProfile\n", + "\n", + "t5_torch_encoder = T5EncoderTorchFile.TorchModule(t5_model.encoder)\n", + "t5_torch_decoder = T5DecoderTorchFile.TorchModule(\n", + " t5_model.decoder, t5_model.lm_head, t5_model.config\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be755fbc-c53e-4f8d-a9c2-4817167cf93a", + "metadata": {}, + "outputs": [], + "source": [ + "input_ids = inputs.input_ids\n", + "\n", + "encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(\n", + " t5_torch_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1)\n", + ")\n", + "encoder_e2e_median_time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "960f05fc-f572-4832-ad82-8a75823866b1", + "metadata": {}, + "outputs": [], + "source": [ + "_, decoder_e2e_median_time = decoder_inference(\n", + " t5_torch_decoder, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1)\n", + ")\n", + "decoder_e2e_median_time" + ] + }, + { + "cell_type": "markdown", + "id": "a99d5a06-a8f5-4ce7-a34c-bc42f07ac706", + "metadata": {}, + "source": [ + "#### Full model inference and benchmark\n", + "\n", + "Next, we will try the T5 model for the task of translation from English to German.\n", + "\n", + "For benchmarking purposes, we will employ a helper function `full_inference_greedy` which executes the inference repeatedly and measures end to end execution time. Let's take note of this execution time for comparison with TensorRT. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39d511cf-d963-4629-be54-22e9a258716d", + "metadata": {}, + "outputs": [], + "source": [ + "from T5.T5ModelConfig import T5ModelTRTConfig\n", + "\n", + "decoder_output_greedy, full_e2e_median_runtime = full_inference_greedy(\n", + " t5_torch_encoder,\n", + " t5_torch_decoder,\n", + " input_ids,\n", + " tokenizer,\n", + " TimingProfile(iterations=10, number=1, warmup=1),\n", + " max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[T5_VARIANT],\n", + ")\n", + "full_e2e_median_runtime" + ] + }, + { + "cell_type": "markdown", + "id": "8cff48fc-b792-4852-b638-6e2c54099cb2", + "metadata": {}, + "source": [ + "Let us decode the model's output back into text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "839bc6bc-65dc-499d-ac26-81456dbc1748", + "metadata": {}, + "outputs": [], + "source": [ + "# De-tokenize output to raw text\n", + "print(tokenizer.decode(decoder_output_greedy[0], skip_special_tokens=True))" + ] + }, + { + "cell_type": "markdown", + "id": "0d662701-e430-4fdc-ad46-1f296defcf8f", + "metadata": {}, + "source": [ + "\n", + "\n", + "## 2. Convert to ONNX\n", + "\n", + "Prior to converting the model to a TensorRT engine, we will first convert the PyTorch model to an intermediate universal format.\n", + "\n", + "ONNX is an open format for machine learning and deep learning models. 
It allows you to convert deep learning and machine learning models from different frameworks such as TensorFlow, PyTorch, MATLAB, Caffe, and Keras to a single format.\n", + "\n", + "The steps to convert a PyTorch model to TensorRT are as follows:\n", + "- Convert the pretrained image segmentation PyTorch model into ONNX.\n", + "- Import the ONNX model into TensorRT.\n", + "- Apply optimizations and generate an engine.\n", + "- Perform inference on the GPU. \n", + "\n", + "For the T5 model, we will convert the encoder and decoder seperately." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2b2be1a-021c-4f6c-957d-2ff7d1b95976", + "metadata": {}, + "outputs": [], + "source": [ + "# helpers\n", + "from NNDF.networks import NetworkMetadata, Precision\n", + "from T5.T5ModelConfig import T5Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c50346f7-6c2c-4e4b-ba70-875688947b75", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = './models/{}/ONNX'.format(T5_VARIANT)\n", + "!mkdir -p $onnx_model_path\n", + "\n", + "metadata=NetworkMetadata(T5_VARIANT, precision=Precision(fp16=False), other=T5Metadata(kv_cache=False))\n", + "\n", + "encoder_onnx_model_fpath = T5_VARIANT + \"-encoder.onnx\"\n", + "decoder_onnx_model_fpath = T5_VARIANT + \"-decoder-with-lm-head.onnx\"\n", + "\n", + "t5_encoder = T5EncoderTorchFile(t5_model.to('cpu'), metadata)\n", + "t5_decoder = T5DecoderTorchFile(t5_model.to('cpu'), metadata)\n", + "\n", + "onnx_t5_encoder = t5_encoder.as_onnx_model(\n", + " os.path.join(onnx_model_path, encoder_onnx_model_fpath), force_overwrite=False\n", + ")\n", + "onnx_t5_decoder = t5_decoder.as_onnx_model(\n", + " os.path.join(onnx_model_path, decoder_onnx_model_fpath), force_overwrite=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0440bdc7-3d1e-4eeb-b57d-0101d35c3a81", + "metadata": {}, + "outputs": [], + "source": [ + "del t5_model, onnx_t5_encoder, onnx_t5_decoder" + ] + }, + { + "cell_type": "markdown", + "id": "7baf007e-5508-485c-a87f-9bfe16260452", + "metadata": {}, + "source": [ + "\n", + "\n", + "## 3. Convert to TensorRT\n", + "\n", + "Now we are ready to parse the ONNX encoder and decoder models and convert them to optimized TensorRT engines.\n", + "\n", + "**Note:** As TensorRT carries out many optimizations, this conversion process might take a while." 
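The `as_trt_engine` helpers used below drive the standard ONNX-to-TensorRT build for you. For reference, an equivalent build can also be done directly with the TensorRT Python builder API; a rough sketch with placeholder file paths:

```python
# Stand-alone ONNX -> TensorRT build sketch using the native builder API
# (this notebook itself uses the T5*ONNXFile.as_trt_engine helpers instead).
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

with open("model.onnx", "rb") as f:              # placeholder ONNX path
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))

config = builder.create_builder_config()
config.max_workspace_size = 1 << 30              # 1 GiB of builder scratch space
serialized_engine = builder.build_serialized_network(network, config)
with open("model.engine", "wb") as f:            # placeholder engine path
    f.write(serialized_engine)
```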
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "037ac958-2627-439c-9db5-27640e3f7967", + "metadata": {}, + "outputs": [], + "source": [ + "from T5.export import T5DecoderONNXFile, T5EncoderONNXFile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bd6e3fc-6797-46b0-a211-ce42d3769105", + "metadata": {}, + "outputs": [], + "source": [ + "tensorrt_model_path = './models/{}/tensorrt'.format(T5_VARIANT)\n", + "!rm -rf $tensorrt_model_path\n", + "!mkdir -p $tensorrt_model_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfb64120-9012-40c8-b1e2-4a6366b71294", + "metadata": {}, + "outputs": [], + "source": [ + "t5_trt_encoder_engine = T5EncoderONNXFile(\n", + " os.path.join(onnx_model_path, encoder_onnx_model_fpath), metadata\n", + " ).as_trt_engine(os.path.join(tensorrt_model_path, encoder_onnx_model_fpath) + \".engine\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b402b7cf-9c8a-4db1-9ca1-3a776adb67c4", + "metadata": {}, + "outputs": [], + "source": [ + "t5_trt_decoder_engine = T5DecoderONNXFile(\n", + " os.path.join(onnx_model_path, decoder_onnx_model_fpath), metadata\n", + " ).as_trt_engine(os.path.join(tensorrt_model_path, decoder_onnx_model_fpath) + \".engine\")" + ] + }, + { + "cell_type": "markdown", + "id": "74f7f6fc-1e6a-4ddc-8e9b-543d9e8dab4d", + "metadata": {}, + "source": [ + "### Inference with TensorRT engine\n", + "\n", + "Great, if you have reached this stage, it means we now have an optimized TensorRT engine for the T5 model, ready for us to carry out inference. \n", + "\n", + "#### Single example inference\n", + "The T5 model with TensorRT backend can now be employed in place of the original HuggingFace T5 model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3954f2f4-c393-463b-a44b-3e5335032b57", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize TensorRT engines\n", + "from T5.trt import T5TRTEncoder, T5TRTDecoder\n", + "\n", + "tfm_config = T5Config(\n", + " use_cache=True,\n", + " num_layers=T5ModelTRTConfig.NUMBER_OF_LAYERS[T5_VARIANT],\n", + ")\n", + " \n", + "t5_trt_encoder = T5TRTEncoder(\n", + " t5_trt_encoder_engine, metadata, tfm_config\n", + " )\n", + "t5_trt_decoder = T5TRTDecoder(\n", + " t5_trt_decoder_engine, metadata, tfm_config\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9544ecb-2671-4b53-a544-08f13424cefe", + "metadata": {}, + "outputs": [], + "source": [ + "# Inference on a single sample\n", + "encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)\n", + "outputs = t5_trt_decoder(input_ids, encoder_last_hidden_state)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d71a327-546f-4b5b-bd42-caaffcceafc7", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate sequence for an input\n", + "from transformers.generation_stopping_criteria import (\n", + " MaxLengthCriteria,\n", + " StoppingCriteriaList,\n", + ")\n", + "\n", + "max_length = 64\n", + "\n", + "decoder_input_ids = torch.full(\n", + " (1, 1), tokenizer.convert_tokens_to_ids(tokenizer.pad_token), dtype=torch.int32\n", + ")\n", + "\n", + "encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)\n", + "\n", + "outputs = t5_trt_decoder.greedy_search(\n", + " input_ids=decoder_input_ids,\n", + " encoder_hidden_states=encoder_last_hidden_state,\n", + " stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length)])\n", + " )\n", + "print(tokenizer.decode(outputs[0], 
skip_special_tokens=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc79e58f-3e8d-4d71-abeb-8f955654f9fb", + "metadata": {}, + "outputs": [], + "source": [ + "outputs" + ] + }, + { + "cell_type": "markdown", + "id": "ed9d4a98-b034-470e-a9f8-096d4100b8d4", + "metadata": {}, + "source": [ + "#### TRT engine inference benchmark: encoder and decoder stacks\n", + "First, we will bechmark the encoder and decoder stacks as before." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70b37591-4398-40ff-8a39-5f75347192dc", + "metadata": {}, + "outputs": [], + "source": [ + "encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(\n", + " t5_trt_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1),\n", + ")\n", + "encoder_e2e_median_time\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e5459da-a01b-4894-88dc-01b3637ded53", + "metadata": {}, + "outputs": [], + "source": [ + "_, decoder_e2e_median_time = decoder_inference(\n", + " t5_trt_decoder, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1),\n", + ")\n", + "decoder_e2e_median_time" + ] + }, + { + "cell_type": "markdown", + "id": "62ebfe03-7a60-4dd0-ad32-4e53d6012b07", + "metadata": {}, + "source": [ + "### Full model inference benchmark\n", + "\n", + "Next, we will try the full TensorRT T5 engine for the task of translation. As before, note the time difference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f31cb550-24b9-48cd-a4ec-0bf18ac5e40c", + "metadata": {}, + "outputs": [], + "source": [ + "decoder_output_greedy, full_e2e_median_runtime = full_inference_greedy(\n", + " t5_trt_encoder,\n", + " t5_trt_decoder,\n", + " input_ids,\n", + " tokenizer,\n", + " TimingProfile(iterations=10, number=1, warmup=1),\n", + " max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[metadata.variant],\n", + " use_cuda=False,\n", + ")\n", + "\n", + "print(tokenizer.decode(decoder_output_greedy[0], skip_special_tokens=True))\n", + "full_e2e_median_runtime\n" + ] + }, + { + "cell_type": "markdown", + "id": "92031643-8ee8-4d50-864b-a08e4d551dc6", + "metadata": {}, + "source": [ + "You can now compare the output of the original PyTorch model and the TensorRT engine. Notice the speed difference." + ] + }, + { + "cell_type": "markdown", + "id": "2a1f5dca-397c-4c8c-9200-61b30cdba824", + "metadata": { + "tags": [] + }, + "source": [ + "## Conclusion and where-to next?\n", + "\n", + "This notebook has walked you through the process of converting a HuggingFace PyTorch T5 model to an optimized TensorRT engine for inference in 3 easy steps. The TensorRT inference engine can be conviniently used as a drop-in replacement for the orginial HuggingFace T5 model while providing significant speed up. 
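Beyond the timing numbers, it is worth confirming that the TensorRT engine reproduces the PyTorch translation. A small sketch, assuming you kept the PyTorch result from the earlier section under a name of your own such as `decoder_output_greedy_torch`:

```python
# Sanity check: the decoded text from TensorRT should match the earlier PyTorch result.
# Assumes the PyTorch output from section 1 was saved, e.g. as decoder_output_greedy_torch.
text_trt   = tokenizer.decode(decoder_output_greedy[0], skip_special_tokens=True)
text_torch = tokenizer.decode(decoder_output_greedy_torch[0], skip_special_tokens=True)
print("TensorRT :", text_trt)
print("PyTorch  :", text_torch)
print("Match    :", text_trt == text_torch)
```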
\n", + "\n", + "Launch [t5-playground.ipynb](t5-playground.ipynb) for a text translation and summarization playground with T5.\n", + "\n", + "If you are interested in further details of the conversion process, check out [T5/trt.py](../T5/trt.py)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e35ddf10-74a4-4845-a357-a2f07e8a7e08", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}