[coreml] Introducing Quantization (pytorch#78108)
Summary: Adds a quantization mode to `preprocess`, which allows us to run quantization when lowering models to Core ML.
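
For context, a minimal sketch of how a caller might opt into quantization through the new `CompileSpec` argument. The model and shapes here are hypothetical, and `torch._C._jit_to_backend` is assumed to be the lowering entry point (it requires a PyTorch build with the Core ML delegate registered):

```python
import torch
from torch.backends._coreml.preprocess import (
    CompileSpec,
    TensorSpec,
    CoreMLComputeUnit,
    CoreMLQuantizationMode,
)

# Hypothetical example model; any scriptable nn.Module works.
model = torch.jit.script(torch.nn.Linear(224, 1000).eval())

compile_spec = {
    "forward": CompileSpec(
        inputs=(TensorSpec(shape=[1, 224]),),
        outputs=(TensorSpec(shape=[1, 1000]),),
        backend=CoreMLComputeUnit.ALL,
        allow_low_precision=True,
        # New in this change: request 8-bit weight quantization.
        quantization_mode=CoreMLQuantizationMode.LINEAR,
    ),
}

# Lower the scripted module to the Core ML delegate; preprocess() below
# runs at this point and applies the quantization pass.
coreml_model = torch._C._jit_to_backend("coreml", model, compile_spec)
```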

Test Plan:
https://fburl.com/anp/r0ntsbq0

Notebook running through the quantization workflow:

Created a custom bento kernel to run it through Core ML:

```
bento_kernel(
    name = "coreml",
    deps = [
        "fbsource//third-party/pypi/coremltools:coremltools",
        "//caffe2:coreml_backend",
        "//caffe2:coreml_backend_cpp",
        "//caffe2:torch",
        "//caffe2/torch/fb/mobile/model_exporter:model_exporter",
    ],
)
```

Initial benchmarks on iPhone 11:

FP32 Core ML Model:
https://our.intern.facebook.com/intern/aibench/details/203998485252700

Quantized Core ML Model:
https://our.intern.facebook.com/intern/aibench/details/927584023592505

High End Quantized Model:
https://our.intern.facebook.com/intern/aibench/details/396271714697929

Summarized Results:

| Backend | Quantization | p50 net latency (ms) | Model Size |
|---------|--------------|----------------------|------------|
| Core ML | No           | 1.2200               | 1.2 MB     |
| Core ML | Yes          | 1.2135               | 385 KB     |
| CPU     | Yes          | 3.1720               | 426 KB     |

Quantization cuts the Core ML model to roughly a third of its size (consistent with storing weights in 8 bits instead of 32) with no latency regression, while the quantized CPU baseline is about 2.6x slower than quantized Core ML.

Reviewed By: SS-JIA

Differential Revision: D36559966

Pull Request resolved: pytorch#78108
Approved by: https://github.com/jmdetloff
mcr229 authored and pytorchmergebot committed Jun 1, 2022
1 parent 2d5eac4 commit 93d5a72
Showing 1 changed file with 19 additions and 3 deletions.

torch/backends/_coreml/preprocess.py:
```diff
@@ -6,6 +6,7 @@
 import torch
 from coremltools.converters.mil.input_types import TensorType  # type: ignore[import]
 from coremltools.converters.mil.mil import types  # type: ignore[import]
+from coremltools.models.neural_network import quantization_utils  # type: ignore[import]
 
 CT_METADATA_VERSION = "com.github.apple.coremltools.version"
 CT_METADATA_SOURCE = "com.github.apple.coremltools.source"
@@ -33,13 +34,23 @@ class CoreMLComputeUnit:
     CPUAndGPU = "cpuAndGPU"
     ALL = "all"
 
+
+class CoreMLQuantizationMode:
+    LINEAR = "linear"
+    LINEAR_SYMMETRIC = "linear_symmetric"
+    NONE = "none"
+
 
 def TensorSpec(shape, dtype=ScalarType.Float):
     return (shape, dtype)
 
 
-def CompileSpec(inputs, outputs, backend=CoreMLComputeUnit.CPU, allow_low_precision=True):
-    return (inputs, outputs, backend, allow_low_precision)
+def CompileSpec(inputs,
+                outputs,
+                backend=CoreMLComputeUnit.CPU,
+                allow_low_precision=True,
+                quantization_mode=CoreMLQuantizationMode.NONE):
+    return (inputs, outputs, backend, allow_low_precision, quantization_mode)
 
 
 def _check_enumerated_shape(shape):
@@ -60,7 +71,7 @@ def _convert_to_mil_type(shape, dtype, name: str):
 
 def preprocess(script_module: torch._C.ScriptObject, compile_spec: Dict[str, Tuple]):
     spec = compile_spec["forward"]
-    input_specs, output_specs, backend, allow_low_precision = spec
+    input_specs, output_specs, backend, allow_low_precision, quantization_mode = spec
     mil_inputs = []
     inputs = []
     for index, input in enumerate(input_specs):
@@ -71,6 +82,11 @@ def preprocess(script_module: torch._C.ScriptObject, compile_spec: Dict[str, Tuple]):
         mil_inputs.append(ml_type)
     model = torch.jit.RecursiveScriptModule._construct(script_module, lambda x: None)
     mlmodel = ct.convert(model, inputs=mil_inputs)
+
+    if quantization_mode != CoreMLQuantizationMode.NONE:
+        quant_model_spec = quantization_utils.quantize_weights(mlmodel, nbits=8, quantization_mode=quantization_mode)
+        mlmodel = ct.models.MLModel(quant_model_spec)
+
     spec = mlmodel.get_spec()
     assert len(spec.description.output) == len(output_specs)  # type: ignore[attr-defined]
     outputs = []
```
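
For readers unfamiliar with the coremltools side of this change, the sketch below shows the new quantization step in isolation, assuming an already-converted `mlmodel` (the output of `ct.convert` in `preprocess`). `quantize_mlmodel` is a hypothetical helper, not part of this commit; per the coremltools docs, `quantize_weights` returns an `MLModel` on macOS but a model spec elsewhere, which is why the result is rewrapped:

```python
import coremltools as ct  # type: ignore[import]
from coremltools.models.neural_network import quantization_utils  # type: ignore[import]


def quantize_mlmodel(mlmodel, mode="linear"):
    """Hypothetical helper: 8-bit weight quantization, mirroring preprocess().

    `mode` corresponds to CoreMLQuantizationMode.LINEAR / LINEAR_SYMMETRIC.
    """
    result = quantization_utils.quantize_weights(
        mlmodel, nbits=8, quantization_mode=mode
    )
    # quantize_weights returns a protobuf spec on non-macOS hosts, so wrap
    # it back into an MLModel the same way the new preprocess() code does.
    if not isinstance(result, ct.models.MLModel):
        result = ct.models.MLModel(result)
    return result
```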
