Adding quantized::linear function for pytorch mobile in c10 (#26135)
Summary:
Pull Request resolved: pytorch/pytorch#26135

This change adds support for calling QNNPACK through the refactored API for Linear (fully connected) operators.
It also includes CMake changes to enable building and using pytorch_qnnpack inside ATen.
I have disabled USE_QNNPACK in CMakeLists.txt; enabling it causes kernels from third_party/QNNPACK to be picked up at runtime, since the function names are the same.

Test Plan:
python test/test_quantized.py TestQNNPackOps.test_qlinear_qnnpack
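
For context, a minimal sketch (not part of this commit) of the call path the test exercises, using the op names registered in the files below. The engine switch shown is an assumption — the Python-side spelling of the preferred-engine setting has changed across releases — and the scales/zero points are arbitrary.

import torch

# Assumption: a Python-side switch mirroring Context::preferredQuantizedEngine();
# in later releases it is spelled torch.backends.quantized.engine = 'qnnpack'.
torch.backends.quantized.engine = 'qnnpack'

X = torch.rand(2, 4)
W = torch.rand(3, 4)
b = torch.rand(3)

# quint8 activations and weights; the bias is quantized to qint32 with
# scale = input_scale * weight_scale, as the QNNPACK packing path expects.
qX = torch.quantize_linear(X, 0.1, 128, torch.quint8)
qW = torch.quantize_linear(W, 0.05, 120, torch.quint8)
qb = torch.quantize_linear(b, 0.1 * 0.05, 0, torch.qint32)

packed = torch.ops.quantized.linear_prepack(qW, qb)     # qlinear_prepack.cpp
out = torch.ops.quantized.linear(qX, packed, 0.2, 100)  # qlinear.cpp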

Imported from OSS

Differential Revision: D17434885

fbshipit-source-id: 084698026938f4529f61d12e86dfe82534ec73dd
supriyar authored and facebook-github-bot committed Sep 17, 2019
1 parent 59002bb commit bb1efb3
Showing 9 changed files with 325 additions and 228 deletions.
112 changes: 100 additions & 12 deletions aten/src/ATen/native/quantized/cpu/qlinear.cpp
@@ -2,6 +2,7 @@
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/cpp_custom_type_hack.h>
#include <ATen/native/quantized/cpu/fbgemm_utils.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>

#include <algorithm>
#include <string>
@@ -14,7 +15,7 @@ template <bool ReluFused>
class QLinearInt8 final : public torch::OperatorKernel {
public:
#ifdef USE_FBGEMM
at::Tensor fbgemm_linear(
at::Tensor input,
at::Tensor packed_weight,
double output_scale,
@@ -207,19 +208,106 @@ class QLinearInt8 final : public torch::OperatorKernel {
}
return output;
}
#endif
#ifdef USE_PYTORCH_QNNPACK
at::Tensor qnnpack_linear(
at::Tensor input,
at::Tensor packed_weight,
double output_scale,
int64_t output_zero_point) {
TORCH_CHECK(
input.dim() >= 2,
"quantized::linear(): Input tensor rank should be >= 2");
auto input_contig = input.contiguous();

auto& pack_ptr =
cpp_custom_type_hack::cast<PackedLinearWeightsQnnp>(packed_weight);
auto packB = pack_ptr.w.get();
auto kernel_zp = pack_ptr.w_zp;
auto kernel_scale = pack_ptr.w_scale;

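// Flatten all leading dimensions into the batch (row) count; the last
// dimension is the number of input features (columns) fed to QNNPACK's GEMM.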
size_t rows_input = 1;
size_t cols_input = input_contig.size(input_contig.dim() - 1);
for (size_t i = 0; i < input_contig.dim() - 1; ++i) {
rows_input *= input_contig.size(i);
}

size_t rows_w = packB->getOutputChannels();
size_t cols_w = packB->getInputChannels();

TORCH_CHECK(
    cols_input == cols_w,
    "quantized::linear(): input size does not match weight dimension 1 size: \
got ",
    cols_input,
    " but expected ",
    cols_w);

// Allocate output Tensor and a buffer for QNNPACK to use
Tensor output = at::_empty_affine_quantized(
{static_cast<long>(rows_input), static_cast<long>(rows_w)},
input.options(),
output_scale,
output_zero_point);

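// When ReLU is fused, the lower clamp bound is tightened (via activationLimits)
// so that negative real values saturate at the output zero point; otherwise the
// full quint8 range is used.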
auto output_min = ReluFused
? activationLimits(output_scale, output_zero_point, Activation::RELU)
.first
: std::numeric_limits<uint8_t>::min();
auto output_max = ReluFused
? activationLimits(output_scale, output_zero_point, Activation::RELU)
.second
: std::numeric_limits<uint8_t>::max();
const pytorch_qnnp_status runStatus = qnnpack::qnnpackLinear(
rows_input /* batch_size */,
cols_input /* input_channels */,
rows_w /* output_channels */,
input_contig.q_zero_point(),
input_contig.q_scale(),
kernel_zp,
kernel_scale,
output_zero_point,
output_scale,
output_min,
output_max,
(uint8_t*)input_contig.data_ptr<c10::quint8>(),
cols_input /* input_stride */,
packB->getPackedWeights(),
(uint8_t*)output.data_ptr<c10::quint8>(),
rows_w /* output_stride */,
nullptr /* threadpool */);

TORCH_INTERNAL_ASSERT(
runStatus == pytorch_qnnp_status_success,
"failed to run QNNPACK Linear operator");

return output;
}
#endif
at::Tensor operator()(
at::Tensor input,
at::Tensor packed_weight,
double output_scale,
int64_t output_zero_point) {
auto& ctx = at::globalContext();

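// Pick the backend that matches the engine preference stored in the global
// context; each branch is compiled in only when its build flag is defined.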
#ifdef USE_FBGEMM
if (ctx.preferredQuantizedEngine() == at::QEngine::FBGEMM) {
return fbgemm_linear(
input, packed_weight, output_scale, output_zero_point);
}
#endif
#ifdef USE_PYTORCH_QNNPACK
if (ctx.preferredQuantizedEngine() == at::QEngine::QNNPACK) {
return qnnpack_linear(
input, packed_weight, output_scale, output_zero_point);
}
#endif
TORCH_INTERNAL_ASSERT(
    false,
    "Didn't find engine for operation quantized::linear ",
    toString(ctx.preferredQuantizedEngine()));
return at::Tensor();
}
};

static auto registry =
89 changes: 76 additions & 13 deletions aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
@@ -2,8 +2,9 @@
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/cpp_custom_type_hack.h>
#include <ATen/native/quantized/cpu/fbgemm_utils.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <ATen/quantized/Quantizer.h>

#include <algorithm>
#include <vector>

@@ -12,6 +13,10 @@ namespace caffe2 {
// Required for cpp_custom_type_hack to work
CAFFE_KNOWN_TYPE(PackedLinearWeight);
#endif // USE_FBGEMM
#ifdef USE_PYTORCH_QNNPACK
// Required for cpp_custom_type_hack to work
CAFFE_KNOWN_TYPE(PackedLinearWeightsQnnp);
#endif // USE_PYTORCH_QNNPACK
} // namespace caffe2

namespace at {
@@ -44,8 +49,9 @@ class QLinearPackWeightInt8 final : public c10::OperatorKernel {
}
}
}

at::Tensor fbgemm_linear_prepack(
at::Tensor weight,
c10::optional<Tensor> bias) {
TORCH_CHECK(
weight.dim() == 2,
"The weight tensor for quantized::linear_prepack (fbgemm) should"
@@ -117,18 +123,75 @@ class QLinearPackWeightInt8 final : public c10::OperatorKernel {
// point.
return cpp_custom_type_hack::create(std::move(ret_ptr), weight.options());
}
#endif
#ifdef USE_PYTORCH_QNNPACK
at::Tensor qnnpack_linear_prepack(
at::Tensor weight,
c10::optional<Tensor> bias_in) {
TORCH_CHECK(
weight.dim() == 2,
"quantized::linear_prepack (qnnpack): Weight tensor rank should be == 2");
TORCH_CHECK(
    weight.qscheme() == kPerTensorAffine,
    "quantized::linear_prepack (qnnpack) only supports Per Tensor Quantization Scheme");

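// rows_w is the number of output channels. QNNPACK packs the bias together
// with the weights and expects it as qint32, so a quantized zero bias is
// synthesized when the caller does not supply one.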
int64_t rows_w = weight.size(0);
int64_t cols_w = weight.size(1);
Tensor bias;
if (bias_in.has_value()) {
bias = bias_in.value();
} else {
bias = at::zeros(rows_w, at::kFloat);
bias = at::quantize_linear(bias, 1.0, 0, kQInt32);
}
TORCH_CHECK(
!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == rows_w),
"quantized::linear_prepack (qnnpack): Given weight of size ",
weight.sizes(),
", expected bias to be 1-dimensional with ",
rows_w,
" elements",
", but got bias of size ",
bias.sizes(),
" instead");

Tensor weight_contig = weight.contiguous();
Tensor bias_contig = bias.contiguous();

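// Make sure the QNNPACK runtime is initialized before the first packing call.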
initQNNPACK();

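// Pack the weight matrix and quantized bias into QNNPACK's internal layout and
// hand ownership back to ATen through a type-erased tensor (cpp_custom_type_hack).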
auto wt_ptr =
guts::make_unique<PackedLinearWeightsQnnp>(PackedLinearWeightsQnnp{
guts::make_unique<qnnpack::PackBMatrix>(
cols_w /* input_channels */,
rows_w /* output_channels */,
weight.q_zero_point(),
weight.q_scale(),
(uint8_t*)weight_contig.data_ptr<c10::quint8>(),
(int32_t*)bias_contig.data_ptr<c10::qint32>()),
weight.q_scale(),
weight.q_zero_point()});
return cpp_custom_type_hack::create(std::move(wt_ptr), weight.options());
}
#endif
at::Tensor operator()(at::Tensor weight, c10::optional<Tensor> bias) {
auto& ctx = at::globalContext();

#ifdef USE_FBGEMM
if (ctx.preferredQuantizedEngine() == at::QEngine::FBGEMM) {
return fbgemm_linear_prepack(weight, bias);
}
#endif
#ifdef USE_PYTORCH_QNNPACK
if (ctx.preferredQuantizedEngine() == at::QEngine::QNNPACK) {
return qnnpack_linear_prepack(weight, bias);
}
#endif
TORCH_INTERNAL_ASSERT(
    false,
    "Didn't find engine for operation quantized::linear_prepack ",
    toString(ctx.preferredQuantizedEngine()));
return at::Tensor();
}
};

static auto registry = c10::RegisterOperators().op(
6 changes: 4 additions & 2 deletions aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt
@@ -276,8 +276,10 @@ IF(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
SET_PROPERTY(SOURCE ${PYTORCH_QNNPACK_OPERATOR_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -O2 ")
ENDIF()
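# Expose the internal src/ include directory and the operator-level headers so
# that ATen's quantized CPU kernels can build against pytorch_qnnpack directly.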
TARGET_INCLUDE_DIRECTORIES(pytorch_qnnpack PUBLIC include)
TARGET_INCLUDE_DIRECTORIES(pytorch_qnnpack PUBLIC src)
SET_TARGET_PROPERTIES(pytorch_qnnpack PROPERTIES PUBLIC_HEADER include/pytorch_qnnpack.h)
SET_TARGET_PROPERTIES(pytorch_qnnpack PROPERTIES PUBLIC_HEADER include/conv_utils.h)
SET_TARGET_PROPERTIES(pytorch_qnnpack PROPERTIES PUBLIC_HEADER include/qnnpack_func.h)

# ---[ Configure clog
IF(NOT TARGET clog)
@@ -289,7 +291,7 @@ IF(NOT TARGET clog)
# We build static version of clog but a dynamic library may indirectly depend on it
SET_PROPERTY(TARGET clog PROPERTY POSITION_INDEPENDENT_CODE ON)
ENDIF()
TARGET_LINK_LIBRARIES(pytorch_qnnpack PUBLIC clog)

# ---[ Configure cpuinfo
IF(NOT TARGET cpuinfo)
3 changes: 2 additions & 1 deletion aten/src/ATen/native/quantized/cpu/qnnpack/src/fc-prepack.cc
@@ -2,6 +2,7 @@
#include <qnnpack/pack.h>
#include <qnnpack_func.h>
#include <cstring>
#include <cstdlib>

namespace qnnpack {
PackBMatrix::PackBMatrix(
@@ -28,7 +29,7 @@ PackBMatrix::PackBMatrix(
input_channels_ = input_channels;
output_channels_ = output_channels;
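// A single contiguous buffer holds, per padded output channel, k_stride uint8
// weight bytes plus one int32 bias value.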
packed_weights_ =
    malloc(n_stride * (k_stride * sizeof(uint8_t) + sizeof(int32_t)));
if (packed_weights_ == NULL) {
pytorch_qnnp_log_error(
"failed to allocate %zu bytes for packed weights",