Adding quantized::linear function for pytorch mobile in c10 (#26135)
Summary:
Pull Request resolved: pytorch/pytorch#26135

This change adds support for calling QNNPACK through the refactored API for Linear (fully connected) operators.
It also includes CMake changes to enable building and using pytorch_qnnpack inside ATen.
I have disabled USE_QNNPACK in CMakeLists.txt; enabling it causes kernels from third_party/QNNPACK to be picked up at runtime, since the function names are the same.

Test Plan:
python test/test_quantized.py TestQNNPackOps.test_qlinear_qnnpack
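
For context, a minimal sketch (not part of this commit) of the call path the test exercises, using the op names registered in the files below. The engine switch shown is an assumption — the Python-side spelling of the preferred-engine setting has changed across releases — and the scales/zero points are arbitrary.

import torch

# Assumption: a Python-side switch mirroring Context::preferredQuantizedEngine();
# in later releases it is spelled torch.backends.quantized.engine = 'qnnpack'.
torch.backends.quantized.engine = 'qnnpack'

X = torch.rand(2, 4)
W = torch.rand(3, 4)
b = torch.rand(3)

# quint8 activations and weights; the bias is quantized to qint32 with
# scale = input_scale * weight_scale, as the QNNPACK packing path expects.
qX = torch.quantize_linear(X, 0.1, 128, torch.quint8)
qW = torch.quantize_linear(W, 0.05, 120, torch.quint8)
qb = torch.quantize_linear(b, 0.1 * 0.05, 0, torch.qint32)

packed = torch.ops.quantized.linear_prepack(qW, qb)     # qlinear_prepack.cpp
out = torch.ops.quantized.linear(qX, packed, 0.2, 100)  # qlinear.cpp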

Imported from OSS

Differential Revision: D17434885

fbshipit-source-id: 084698026938f4529f61d12e86dfe82534ec73dd
supriyar authored and facebook-github-bot committed Sep 17, 2019
1 parent 59002bb commit bb1efb3
Showing 9 changed files with 325 additions and 228 deletions.
112 changes: 100 additions & 12 deletions aten/src/ATen/native/quantized/cpu/qlinear.cpp
@@ -2,6 +2,7 @@
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/cpp_custom_type_hack.h>
#include <ATen/native/quantized/cpu/fbgemm_utils.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>

#include <algorithm>
#include <string>
@@ -14,7 +15,7 @@ template <bool ReluFused>
class QLinearInt8 final : public torch::OperatorKernel {
public:
#ifdef USE_FBGEMM
at::Tensor fbgemm_linear(
at::Tensor input,
at::Tensor packed_weight,
double output_scale,
@@ -207,19 +208,106 @@ class QLinearInt8 final : public torch::OperatorKernel {
}
return output;
}
#endif
#ifdef USE_PYTORCH_QNNPACK
at::Tensor qnnpack_linear(
at::Tensor input,
at::Tensor packed_weight,
double output_scale,
int64_t output_zero_point) {
TORCH_CHECK(
input.dim() >= 2,
"quantized::linear(): Input tensor rank should be >= 2");
auto input_contig = input.contiguous();

auto& pack_ptr =
cpp_custom_type_hack::cast<PackedLinearWeightsQnnp>(packed_weight);
auto packB = pack_ptr.w.get();
auto kernel_zp = pack_ptr.w_zp;
auto kernel_scale = pack_ptr.w_scale;

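// Flatten all leading dimensions into the batch (row) count; the last
// dimension is the number of input features (columns) fed to QNNPACK's GEMM.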
size_t rows_input = 1;
size_t cols_input = input_contig.size(input_contig.dim() - 1);
for (size_t i = 0; i < input_contig.dim() - 1; ++i) {
rows_input *= input_contig.size(i);
}

size_t rows_w = packB->getOutputChannels();
size_t cols_w = packB->getInputChannels();

TORCH_CHECK(
    cols_input == cols_w,
    "quantized::linear(): input size does not match weight dimension 1 size: \
got ",
    cols_input,
    " but expected ",
    cols_w);

// Allocate output Tensor and a buffer for QNNPACK to use
Tensor output = at::_empty_affine_quantized(
{static_cast<long>(rows_input), static_cast<long>(rows_w)},
input.options(),
output_scale,
output_zero_point);

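// When ReLU is fused, the lower clamp bound is tightened (via activationLimits)
// so that negative real values saturate at the output zero point; otherwise the
// full quint8 range is used.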
auto output_min = ReluFused
? activationLimits(output_scale, output_zero_point, Activation::RELU)
.first
: std::numeric_limits<uint8_t>::min();
auto output_max = ReluFused
? activationLimits(output_scale, output_zero_point, Activation::RELU)
.second
: std::numeric_limits<uint8_t>::max();
const pytorch_qnnp_status runStatus = qnnpack::qnnpackLinear(
rows_input /* batch_size */,
cols_input /* input_channels */,
rows_w /* output_channels */,
input_contig.q_zero_point(),
input_contig.q_scale(),
kernel_zp,
kernel_scale,
output_zero_point,
output_scale,
output_min,
output_max,
(uint8_t*)input_contig.data_ptr<c10::quint8>(),
cols_input /* input_stride */,
packB->getPackedWeights(),
(uint8_t*)output.data_ptr<c10::quint8>(),
rows_w /* output_stride */,
nullptr /* threadpool */);

TORCH_INTERNAL_ASSERT(
runStatus == pytorch_qnnp_status_success,
"failed to run QNNPACK Linear operator");

return output;
}
#endif
at::Tensor operator()(
at::Tensor input,
at::Tensor packed_weight,
double output_scale,
int64_t output_zero_point) {
auto& ctx = at::globalContext();

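// Pick the backend that matches the engine preference stored in the global
// context; each branch is compiled in only when its build flag is defined.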
#ifdef USE_FBGEMM
if (ctx.preferredQuantizedEngine() == at::QEngine::FBGEMM) {
return fbgemm_linear(
input, packed_weight, output_scale, output_zero_point);
}
#endif
#ifdef USE_PYTORCH_QNNPACK
if (ctx.preferredQuantizedEngine() == at::QEngine::QNNPACK) {
return qnnpack_linear(
input, packed_weight, output_scale, output_zero_point);
}
#endif
TORCH_INTERNAL_ASSERT(
    false,
    "Didn't find engine for operation quantized::linear ",
    toString(ctx.preferredQuantizedEngine()));
return at::Tensor();
}
};

static auto registry =
89 changes: 76 additions & 13 deletions aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
@@ -2,8 +2,9 @@
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/cpp_custom_type_hack.h>
#include <ATen/native/quantized/cpu/fbgemm_utils.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <ATen/quantized/Quantizer.h>

#include <algorithm>
#include <vector>

@@ -12,6 +13,10 @@ namespace caffe2 {
// Required for cpp_custom_type_hack to work
CAFFE_KNOWN_TYPE(PackedLinearWeight);
#endif // USE_FBGEMM
#ifdef USE_PYTORCH_QNNPACK
// Required for cpp_custom_type_hack to work
CAFFE_KNOWN_TYPE(PackedLinearWeightsQnnp);
#endif // USE_PYTORCH_QNNPACK
} // namespace caffe2

namespace at {
@@ -44,8 +49,9 @@ class QLinearPackWeightInt8 final : public c10::OperatorKernel {
}
}
}

at::Tensor fbgemm_linear_prepack(
at::Tensor weight,
c10::optional<Tensor> bias) {
TORCH_CHECK(
weight.dim() == 2,
"The weight tensor for quantized::linear_prepack (fbgemm) should"
@@ -117,18 +123,75 @@ class QLinearPackWeightInt8 final : public c10::OperatorKernel {
// point.
return cpp_custom_type_hack::create(std::move(ret_ptr), weight.options());
}
#endif
#ifdef USE_PYTORCH_QNNPACK
at::Tensor qnnpack_linear_prepack(
at::Tensor weight,
c10::optional<Tensor> bias_in) {
TORCH_CHECK(
weight.dim() == 2,
"quantized::linear_prepack (qnnpack): Weight tensor rank should be == 2");
TORCH_CHECK(
    weight.qscheme() == kPerTensorAffine,
    "quantized::linear_prepack (qnnpack) only supports Per Tensor Quantization Scheme");

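// rows_w is the number of output channels. QNNPACK packs the bias together
// with the weights and expects it as qint32, so a quantized zero bias is
// synthesized when the caller does not supply one.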
int64_t rows_w = weight.size(0);
int64_t cols_w = weight.size(1);
Tensor bias;
if (bias_in.has_value()) {
bias = bias_in.value();
} else {
bias = at::zeros(rows_w, at::kFloat);
bias = at::quantize_linear(bias, 1.0, 0, kQInt32);
}
TORCH_CHECK(
!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == rows_w),
"quantized::linear_prepack (qnnpack): Given weight of size ",
weight.sizes(),
", expected bias to be 1-dimensional with ",
rows_w,
" elements",
", but got bias of size ",
bias.sizes(),
" instead");

Tensor weight_contig = weight.contiguous();
Tensor bias_contig = bias.contiguous();

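// Make sure the QNNPACK runtime is initialized before the first packing call.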
initQNNPACK();

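// Pack the weight matrix and quantized bias into QNNPACK's internal layout and
// hand ownership back to ATen through a type-erased tensor (cpp_custom_type_hack).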
auto wt_ptr =
guts::make_unique<PackedLinearWeightsQnnp>(PackedLinearWeightsQnnp{
guts::make_unique<qnnpack::PackBMatrix>(
cols_w /* input_channels */,
rows_w /* output_channels */,
weight.q_zero_point(),
weight.q_scale(),
(uint8_t*)weight_contig.data_ptr<c10::quint8>(),
(int32_t*)bias_contig.data_ptr<c10::qint32>()),
weight.q_scale(),
weight.q_zero_point()});
return cpp_custom_type_hack::create(std::move(wt_ptr), weight.options());
}
#endif
at::Tensor operator()(at::Tensor weight, c10::optional<Tensor> bias) {
auto& ctx = at::globalContext();

#ifdef USE_FBGEMM
if (ctx.preferredQuantizedEngine() == at::QEngine::FBGEMM) {
return fbgemm_linear_prepack(weight, bias);
}
#endif
#ifdef USE_PYTORCH_QNNPACK
if (ctx.preferredQuantizedEngine() == at::QEngine::QNNPACK) {
return qnnpack_linear_prepack(weight, bias);
}
#endif
TORCH_INTERNAL_ASSERT(
    false,
    "Didn't find engine for operation quantized::linear_prepack ",
    toString(ctx.preferredQuantizedEngine()));
return at::Tensor();
}
};

static auto registry = c10::RegisterOperators().op(
6 changes: 4 additions & 2 deletions aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt
@@ -276,8 +276,10 @@ IF(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
SET_PROPERTY(SOURCE ${PYTORCH_QNNPACK_OPERATOR_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -O2 ")
ENDIF()
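# Expose the internal src/ include directory and the operator-level headers so
# that ATen's quantized CPU kernels can build against pytorch_qnnpack directly.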
TARGET_INCLUDE_DIRECTORIES(pytorch_qnnpack PUBLIC include)
TARGET_INCLUDE_DIRECTORIES(pytorch_qnnpack PUBLIC src)
SET_TARGET_PROPERTIES(pytorch_qnnpack PROPERTIES PUBLIC_HEADER include/pytorch_qnnpack.h)
SET_TARGET_PROPERTIES(pytorch_qnnpack PROPERTIES PUBLIC_HEADER include/conv_utils.h)
SET_TARGET_PROPERTIES(pytorch_qnnpack PROPERTIES PUBLIC_HEADER include/qnnpack_func.h)

# ---[ Configure clog
IF(NOT TARGET clog)
@@ -289,7 +291,7 @@ IF(NOT TARGET clog)
# We build static version of clog but a dynamic library may indirectly depend on it
SET_PROPERTY(TARGET clog PROPERTY POSITION_INDEPENDENT_CODE ON)
ENDIF()
TARGET_LINK_LIBRARIES(pytorch_qnnpack PUBLIC clog)

# ---[ Configure cpuinfo
IF(NOT TARGET cpuinfo)
3 changes: 2 additions & 1 deletion aten/src/ATen/native/quantized/cpu/qnnpack/src/fc-prepack.cc
@@ -2,6 +2,7 @@
#include <qnnpack/pack.h>
#include <qnnpack_func.h>
#include <cstring>
#include <cstdlib>

namespace qnnpack {
PackBMatrix::PackBMatrix(
@@ -28,7 +29,7 @@ PackBMatrix::PackBMatrix(
input_channels_ = input_channels;
output_channels_ = output_channels;
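// A single contiguous buffer holds, per padded output channel, k_stride uint8
// weight bytes plus one int32 bias value.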
packed_weights_ =
    malloc(n_stride * (k_stride * sizeof(uint8_t) + sizeof(int32_t)));
if (packed_weights_ == NULL) {
pytorch_qnnp_log_error(
"failed to allocate %zu bytes for packed weights",