# -*- mode: python -*-
# =============================================================================
# @@-COPYRIGHT-START-@@
#
# Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# SPDX-License-Identifier: BSD-3-Clause
#
# @@-COPYRIGHT-END-@@
# =============================================================================
"""
.. _tutorials-quickstart-guide:
Quickstart Guide
================
In this tutorial, we will go through the end-to-end process of using AIMET and PyTorch to create, calibrate, and export
a simple quantized model. Note that this is intended to show the most basic workflow in AIMET. It is *not* meant to
demonstrate the most state-of-the-art techniques available in AIMET.
Overall flow
------------
1. Define the basic floating-point PyTorch model, training, and eval loops
2. Prepare the trained model for quantization
3. Create quantization simulation (quantsim) model in AIMET to simulate the effects of quantization
4. Calibrate the quantsim model on training data and evaluate the quantized accuracy
5. Fine-tune the quantized model to improve the quantized accuracy
6. Export the quantized model
PyTorch prerequisites
---------------------
To see clearly what happens inside AIMET, let's first start with some simple PyTorch code for defining, training, and
evaluating a model. The code below is adapted from PyTorch's
`basic optimization tutorial <https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html>`_.
Note that AIMET does not have any special requirement on what these training/eval loops look like.
"""
import torch
import torchvision
import torch.nn.functional as F
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# 1) Start with some data loaders to train, evaluate, and calibrate the model
fashion_mnist_train_data = torchvision.datasets.FashionMNIST('/tmp/fashion_mnist', train=True, download=True, transform=torchvision.transforms.ToTensor())
fashion_mnist_test_data = torchvision.datasets.FashionMNIST('/tmp/fashion_mnist', train=False, download=True, transform=torchvision.transforms.ToTensor())
train_loader = torch.utils.data.DataLoader(fashion_mnist_train_data, batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(fashion_mnist_test_data, batch_size=128, shuffle=False)
# 2) Define a simple model to train on this dataset
class Network(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(in_channels=1, out_channels=128, kernel_size=3, padding=1, stride=2)
        self.bn_1 = torch.nn.BatchNorm2d(128)
        self.conv2 = torch.nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2)
        self.bn_2 = torch.nn.BatchNorm2d(256)
        self.linear = torch.nn.Linear(in_features=7*7*256, out_features=10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn_1(x))
        x = self.conv2(x)
        x = F.relu(self.bn_2(x))
        x = self.linear(x.view(x.shape[0], -1))
        return F.softmax(x, dim=-1)
# 3) Define an evaluation loop for the model
def evaluate(model, data_loader):
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in data_loader:
            x, y = x.to(device), y.to(device)
            output = model(x)
            correct += (torch.argmax(output, dim=1) == y).sum().item()
            total += x.shape[0]
    accuracy = correct / total * 100.
    return accuracy
###############################################################################
# Now, let's instantiate a network and train it for a few epochs on our dataset to establish a baseline floating-point model.
# Create a model
model = Network()
# Send the model to the desired device (optional)
model.to(device)
# Define some loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Train for 4 epochs
model.train()
for epoch in range(4):
    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        output = model(x)
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
# Evaluate the floating-point model
model.eval()
fp_accuracy = evaluate(model, test_loader)
print(f"Floating point accuracy: {fp_accuracy}")
###############################################################################
# Prepare the floating point model for quantization
# -------------------------------------------------
#
# Before we can (accurately) simulate quantization, there are a couple of important steps to take care of:
#
#
# 1) Model preparation
# ^^^^^^^^^^^^^^^^^^^^
# AIMET's quantization simulation tool (:class:`QuantizationSimModel`) expects the floating point model to conform to some
# specific guidelines. For example, :class:`QuantizationSimModel` is only able to quantize math operations performed by
# :class:`torch.nn.Module` objects, whereas :mod:`torch.nn.functional` calls will be (incorrectly) ignored.
#
# If we look back at our previous model definition, we see it calls :func:`F.relu` and :func:`F.softmax` in the forward
# function. Does this mean we need to completely redefine our model to use AIMET? Thankfully, no. AIMET provides the
# :mod:`model_preparer` API to transform our incompatible model into a new fully-compatible model.
from aimet_torch import model_preparer
prepared_model = model_preparer.prepare_model(model)
print(prepared_model)
# Note: This transformation should not change the model's outputs; the prepared model is functionally identical
fp_accuracy_prepared = evaluate(prepared_model, test_loader)
assert fp_accuracy_prepared == fp_accuracy
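# As a quick sanity check, the functional relu/softmax calls from the original forward() should now
# appear as dedicated torch.nn modules in the prepared model. The exact submodule names that
# prepare_model assigns are an implementation detail, so we only look at the module types here.
functional_replacements = [name for name, module in prepared_model.named_modules()
                           if isinstance(module, (torch.nn.ReLU, torch.nn.Softmax))]
print(functional_replacements)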
###############################################################################
# Note how the prepared model now contains distinct modules for the :func:`relu` and :func:`softmax` operations.
#
# 2) BatchNorm fold
# ^^^^^^^^^^^^^^^^^
#
# When models are executed in a quantized runtime, batchnorm layers are typically folded into the weight and bias of
# an adjacent convolution layer whenever possible in order to remove unnecessary computations. To accurately simulate
# inference in these runtimes, it is generally a good idea to perform this batchnorm folding on the floating point model
# before applying quantization. AIMET provides the :mod:`batch_norm_fold` tool to do this.
from aimet_torch import batch_norm_fold
sample_input, _ = next(iter(train_loader))
batch_norm_fold.fold_all_batch_norms(prepared_model, input_shapes=sample_input.shape)
print(prepared_model)
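# For intuition, batchnorm folding absorbs the BN arithmetic into the adjacent convolution's weight
# and bias. Below is a minimal per-output-channel sketch of that rewrite for a conv followed by a
# BN layer; it is only illustrative, and AIMET's batch_norm_fold handles the general cases
# (e.g. BN layers preceding a layer, layers without a bias) internally.
def _fold_bn_into_conv_sketch(weight, bias, gamma, beta, running_mean, running_var, eps=1e-5):
    scale = gamma / torch.sqrt(running_var + eps)         # per-output-channel scale
    folded_weight = weight * scale.reshape(-1, 1, 1, 1)   # scale each output channel's filters
    folded_bias = (bias - running_mean) * scale + beta    # fold the mean/shift into the bias
    return folded_weight, folded_bias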
###############################################################################
# Note that the model now has :class:`Identity` (passthrough) layers where it previously had :class:`BatchNorm2d` layers. Like the
# :mod:`model_preparer` step, this operation should not impact the model's accuracy.
#
# Quantize the model
# ------------------
#
# Now, we are ready to use AIMET's :class:`QuantizationSimModel` to simulate quantizing the floating point model. This
# involves two steps:
#
# 1) Add quantizers to simulate quantization noise during the model's forward pass
# 2) Calibrate the quantizer encodings (e.g., min/max ranges) on some sample inputs
#
# Calibration is necessary to determine the range of values each activation quantizer is likely to encounter in the
# model's forward pass, and should therefore be able to represent. Theoretically, we could pass the entire training
# dataset through the model for calibration, but in practice we usually only need about 500-1000 representative samples
# to accurately estimate the ranges.
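# To make "encodings" concrete: for an 8-bit affine quantizer, a calibrated (min, max) range is turned
# into a scale and offset, and tensors are rounded onto the resulting integer grid during the forward
# pass. The helper below is only a rough sketch of that idea; AIMET's exact grid and offset
# conventions may differ.
def _fake_quantize_sketch(x, x_min, x_max, bitwidth=8):
    scale = (x_max - x_min) / (2 ** bitwidth - 1)
    offset = round(-x_min / scale)
    x_int = torch.clamp(torch.round(x / scale) + offset, 0, 2 ** bitwidth - 1)
    return (x_int - offset) * scale   # quantize-dequantize ("fake quantization")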
import aimet_torch.v2 as aimet
from aimet_torch.v2 import quantsim
# QuantizationSimModel will convert each nn.Module in prepared_model into a quantized equivalent module and configure the module's quantizers
# In this case, we will quantize all parameters to 4 bits and all activations to 8 bits.
sim = quantsim.QuantizationSimModel(prepared_model,
                                    dummy_input=sample_input.to(device),
                                    default_output_bw=8,  # Simulate 8-bit activations
                                    default_param_bw=4)   # Simulate 4-bit weights
# Inside the compute_encodings context, quantizers will observe the statistics of the activations passing through them. These statistics will be used
# to compute properly calibrated encodings upon exiting the context.
with aimet.nn.compute_encodings(sim.model):
    for idx, (x, _) in enumerate(train_loader):
        x = x.to(device)
        sim.model(x)
        if idx >= 10:
            break
# Compare the accuracy before and after quantization:
quantized_accuracy = evaluate(sim.model, test_loader)
print(sim.model)
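# Inspect a single quantized layer; its printed representation should show the parameter and output
# quantizers attached to it (the exact formatting is an AIMET implementation detail).
print(sim.model.conv2)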
print(f"Floating point model accuracy: {fp_accuracy} %\n"
f"Quantized model accuracy: {quantized_accuracy} %")
###############################################################################
# Here, we can see that ``sim.model`` is nothing more than the ``prepared_model`` with every layer replaced with a
# quantized version of the layer. The quantization behavior of each module is determined by how its attached
# quantizers are configured.
#
# For example, we can see that ``sim.model.conv2`` has a 4-bit weight quantizer and an 8-bit output quantizer as specified
# during construction. We will discuss more advanced ways to configure these quantizers to optimize performance and
# accuracy in a later tutorial.
#
# Fine-tune the model with quantization aware training
# ----------------------------------------------------
#
# If we're not satisfied with our accuracy after applying quantization, there are some steps we can take to further
# optimize the quantized accuracy. One such step is quantization aware training (QAT), during which the model is trained
# with the fake-quantization ops present.
#
# Let's repeat our floating-point training loop for one more epoch, but this time use the quantized model.
# Define some loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(sim.model.parameters(), lr=1e-4)
# Train for one more epoch on the quantsim model
for epoch in range(1):
    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        output = sim.model(x)
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
# Compare the accuracy before and after QAT:
post_QAT_accuracy = evaluate(sim.model, test_loader)
print(f"Original quantized model accuracy: {quantized_accuracy} %\n"
f"Post-QAT model accuracy: {post_QAT_accuracy} %")
###############################################################################
# Export the quantsim model
# -------------------------
#
# Now that we are happy with our quantized model's accuracy, we are ready to export the model with its quantization parameters.
export_path = "/tmp/"
model_name = "fashion_mnist_model"
sample_input, _ = next(iter(train_loader))
sim.export(export_path, model_name, dummy_input=sample_input)
###############################################################################
# This export method will save the model with quantization nodes removed, along with an encodings file containing
# quantization parameters for each activation and weight tensor in the model. These artifacts can then be sent to a
# quantized runtime such as Qualcomm\ |reg| Neural Processing SDK.
#
# .. |reg| unicode:: U+000AE .. REGISTERED SIGN
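import os
# The exact set of files produced depends on the export options, but it typically includes a model
# file along with a .encodings file holding the quantization parameters. Listing the output
# directory is a simple way to see what was written.
print([f for f in os.listdir(export_path) if f.startswith(model_name)])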