Add shape inference for LpPool, RoiPool, and fix MaxPool, AveragePool, and Conv (onnx#928)

* Make the coefficient non-optional for LinearClassifier

* Fix the build issue by updating the changelog and the operators-ml.md files

* Add shape inference for LpPool, RoiPool, and fix MaxPool, AveragePool, and Conv

* Fix the bug in the Conv shape inference

* Fix review comments

* Fix the Python style issues
jaliyae authored and bddppq committed May 13, 2018
1 parent 490c4c6 commit 0bd3f78
Showing 2 changed files with 215 additions and 55 deletions.
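For orientation, here is a minimal sketch of exercising the new inference paths from Python once this change is in place; the two-node model and all names are illustrative, not part of the commit.

```python
import onnx
from onnx import TensorProto, helper, shape_inference

# Tiny two-node model; the trailing Relu makes the pool output an
# intermediate value, so inference records its shape in value_info.
nodes = [
    helper.make_node("LpPool", ["X"], ["P"], kernel_shape=[2, 2]),
    helper.make_node("Relu", ["P"], ["Y"]),
]
graph = helper.make_graph(
    nodes, "lppool_example",
    [helper.make_tensor_value_info("X", TensorProto.FLOAT, (5, 3, 4, 4))],
    [helper.make_tensor_value_info("Y", TensorProto.FLOAT, None)])
inferred = shape_inference.infer_shapes(helper.make_model(graph))
# Expect P inferred as (5, 3, 3, 3): per spatial axis, 1 + (4 - 2) // 1 = 3.
print(inferred.graph.value_info)
```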
178 changes: 123 additions & 55 deletions onnx/defs/nn/defs.cc
@@ -24,10 +24,19 @@ static std::string auto_pad_doc =

namespace ONNX_NAMESPACE {

void convPoolTypeAndShapeInference(InferenceContext& ctx, bool use_dilation, bool require_kernel_shape) {
void convPoolTypeAndShapeInference(
InferenceContext& ctx,
bool use_dilation,
bool require_kernel_shape) {
propagateElemTypeFromInputToOutput(ctx, 0, 0);

if (!hasNInputShapes(ctx, 2)) {
// we need at least one input to have a shape for this inference.
if (!hasNInputShapes(ctx, 1)) {
return;
}

// if no kernel shape is required, then we need two inputs.
if (!require_kernel_shape && !hasNInputShapes(ctx, 2)) {
return;
}

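The reworked input-shape guard above distinguishes two cases; paraphrased as a small Python sketch (illustrative, not the C++ API):

```python
def can_run_inference(num_shaped_inputs, require_kernel_shape):
    # One shaped input is always required; when the kernel shape may come
    # from the weight tensor instead of an attribute (Conv), the second
    # input's shape is required too.
    if num_shaped_inputs < 1:
        return False
    if not require_kernel_shape and num_shaped_inputs < 2:
        return False
    return True

assert can_run_inference(1, True)        # pooling ops: attribute suffices
assert not can_run_inference(1, False)   # Conv without W's shape: bail out
```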
@@ -36,34 +45,31 @@ void convPoolTypeAndShapeInference(InferenceContext& ctx, bool use_dilation, bool require_kernel_shape) {
return;
}

size_t n_input_dims = (size_t) (ctx.getInputType(0)->tensor_type().shape().dim_size() - 2);
auto input_shape = ctx.getInputType(0)->tensor_type().shape();
if (input_shape.dim_size() < 2) {
return; // The input shape is not properly set.
}

// first dim is the batch axis and the next is the number of channels.
size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);

// Pooling operations don't support dilation, only Conv. For
// simplicity of the code, we just treat them as having all-1s
// dilation.
std::vector<int64_t> dilations;
bool nodilations = false;
if (use_dilation && getRepeatedAttribute(ctx, "dilations", dilations)) {
if (dilations.size() != n_input_dims) {
return;
}
} else {
nodilations = true;
dilations.assign(n_input_dims, 1);
}

std::vector<int64_t> kernel_shape;
if (getRepeatedAttribute(ctx, "kernel_shape", kernel_shape)) {
if (kernel_shape.size() != static_cast<size_t>(ctx.getInputType(0)->tensor_type().shape().dim_size() - 2)) {
return;
}
} else if (require_kernel_shape) {
return;
} else {
for (int i = 2; i < ctx.getInputType(1)->tensor_type().shape().dim_size(); ++i) {
if (!ctx.getInputType(1)->tensor_type().shape().dim(i).has_dim_value()) {
return;
}
kernel_shape.push_back(ctx.getInputType(1)->tensor_type().shape().dim(i).dim_value());
}
int64_t groups = getAttribute(ctx, "group", 1);
if (groups != 1) {
return; // we don't handle the group case.
}

std::vector<int64_t> pads;
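The dilation handling above boils down to the kernel-extent arithmetic; a short sketch:

```python
def effective_kernel_size(k, dilation=1):
    # A kernel of size k with dilation d covers (k - 1) * d + 1 input positions.
    return (k - 1) * dilation + 1

assert effective_kernel_size(3) == 3     # pooling: dilation fixed at 1
assert effective_kernel_size(3, 2) == 5  # Conv with dilation 2
```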
@@ -84,32 +90,60 @@ void convPoolTypeAndShapeInference(InferenceContext& ctx, bool use_dilation, bool require_kernel_shape) {
strides.assign(n_input_dims, 1);
}

*ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->add_dim() =
ctx.getInputType(0)->tensor_type().shape().dim(0);
*ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->add_dim() =
ctx.getInputType(1)->tensor_type().shape().dim(0);
std::vector<int64_t> kernel_shape;
if (getRepeatedAttribute(ctx, "kernel_shape", kernel_shape)) {
if (kernel_shape.size() != n_input_dims) {
return;
}
} else if (require_kernel_shape) {
return;
} else {
auto second_input_shape = ctx.getInputType(1)->tensor_type().shape();
for (int i = 2; i < second_input_shape.dim_size(); ++i) {
if (!second_input_shape.dim(i).has_dim_value()) {
return;
}
kernel_shape.push_back(second_input_shape.dim(i).dim_value());
}
}

auto output_shape =
ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();

if (require_kernel_shape) {
// add the first two dimensions from the input.
*output_shape->add_dim() = input_shape.dim(0);
*output_shape->add_dim() = input_shape.dim(1);
} else {
*output_shape->add_dim() = input_shape.dim(0);
*output_shape->add_dim() =
ctx.getInputType(1)->tensor_type().shape().dim(0);
}

for (int i = 0; i < static_cast<int>(kernel_shape.size()); ++i) {
auto newdim = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->add_dim();
if (!ctx.getInputType(0)->tensor_type().shape().dim(2 + i).has_dim_value()) {
int kernel_shape_size = static_cast<int>(kernel_shape.size());
for (int i = 0; i < kernel_shape_size; ++i) {
auto newdim = output_shape->add_dim();
if (!input_shape.dim(2 + i).has_dim_value()) {
continue;
}
// how big is the input, including padding
int64_t effective_input_size = ctx.getInputType(0)->tensor_type().shape().dim(2 + i).dim_value();
int64_t effective_input_size = input_shape.dim(2 + i).dim_value();
effective_input_size += pads[i];
effective_input_size += pads[i + static_cast<int>(kernel_shape.size())];
effective_input_size += pads[i + kernel_shape_size];

// accounting for dilation, how big is the kernel in this dimension
int64_t effective_kernel_size = kernel_shape[i];
effective_kernel_size = (effective_kernel_size - 1) * dilations[i] + 1;
if (!nodilations) {
// how big is the kernel in this dimension
effective_kernel_size = (effective_kernel_size - 1) * dilations[i] + 1;
}

// how many times we can move the kernel from its initial position, based on the stride
int64_t strided_kernel_positions = (effective_input_size - effective_kernel_size) / strides[i];
// how many times we can move the kernel from its initial position, based
// on the stride
int64_t strided_kernel_positions =
(effective_input_size - effective_kernel_size) / strides[i];

// add in the initial position
int64_t total_kernel_positions = 1 + strided_kernel_positions;

newdim->set_dim_value(total_kernel_positions);
newdim->set_dim_value(1 + strided_kernel_positions);
}
}

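Putting the loop above together: the inferred spatial size is one plus the number of strided kernel positions that fit in the padded input. A Python sketch mirroring the integer arithmetic (the asserts correspond to the pooling tests added below):

```python
def pool_output_dim(in_size, kernel, stride=1, pad_begin=0, pad_end=0, dilation=1):
    # Pad the input, dilate the kernel, count how many strided positions
    # fit, then add one for the kernel's initial placement.
    effective_input = in_size + pad_begin + pad_end
    effective_kernel = (kernel - 1) * dilation + 1
    return 1 + (effective_input - effective_kernel) // stride

assert pool_output_dim(4, 2) == 3                          # kernel 2 on size 4
assert pool_output_dim(4, 2, pad_begin=1, pad_end=2) == 6  # pads=[1, 1, 2, 2]
assert pool_output_dim(4, 2, stride=2, pad_begin=1, pad_end=2) == 3
```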
@@ -187,9 +221,11 @@ std::function<void(OpSchema&)> PoolOpSchemaGenerator(
"T",
{"tensor(float16)", "tensor(float)", "tensor(double)"},
"Constrain input and output types to float tensors.");
schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { convPoolTypeAndShapeInference(ctx, false, true); });
schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
convPoolTypeAndShapeInference(ctx, false, true);
});
};
}
} // namespace ONNX_NAMESPACE

ONNX_OPERATOR_SCHEMA(AveragePool)
.FillUsing(PoolOpSchemaGenerator(
@@ -271,13 +307,50 @@ std::function<void(OpSchema&)> LpPoolOpSchemaGenerator(const char* name) {
"T",
{"tensor(float16)", "tensor(float)", "tensor(double)"},
"Constrain input and output types to float tensors.");
schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
convPoolTypeAndShapeInference(ctx, false, true);
});
};
}

ONNX_OPERATOR_SCHEMA(LpPool).FillUsing(LpPoolOpSchemaGenerator("LpPool"));

} // namespace ONNX_NAMESPACE

// For ROI pool operations.
void roiPoolTypeShapeInference(InferenceContext& ctx) {
propagateElemTypeFromInputToOutput(ctx, 0, 0);

// rois is the second input.
if (!hasNInputShapes(ctx, 2)) {
return;
}

auto input_shape = ctx.getInputType(0)->tensor_type().shape();
auto rois_shape = ctx.getInputType(1)->tensor_type().shape();

// first dim is the batch axis and the next is the number of channels.
size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);

std::vector<int64_t> pooled_shape;
if (getRepeatedAttribute(ctx, "pooled_shape", pooled_shape)) {
if (pooled_shape.size() != n_input_dims) {
return;
}
} else {
return; // cannot produce output shape.
}

// (num_rois, channels, pooled_shape[0], pooled_shape[1])
auto output_shape =
ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();

*output_shape->add_dim() = rois_shape.dim(0);
*output_shape->add_dim() = input_shape.dim(1);
output_shape->add_dim()->set_dim_value(pooled_shape[0]);
output_shape->add_dim()->set_dim_value(pooled_shape[1]);
}

namespace ONNX_NAMESPACE {
std::function<void(OpSchema&)> RoiPoolOpSchemaGenerator(const char* name) {
return [=](OpSchema& schema) {
@@ -321,6 +394,8 @@ std::function<void(OpSchema&)> RoiPoolOpSchemaGenerator(const char* name) {
"T",
{"tensor(float16)", "tensor(float)", "tensor(double)"},
"Constrain input and output types to float tensors.");
schema.TypeAndShapeInferenceFunction(
[](InferenceContext& ctx) { roiPoolTypeShapeInference(ctx); });
};
}

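roiPoolTypeShapeInference above assembles the output as (num_rois, channels, pooled_shape[0], pooled_shape[1]); the same computation as a sketch, checked against the test_roipool case added below:

```python
def roipool_output_shape(x_shape, rois_shape, pooled_shape):
    # Output is (num_rois, channels, pooled_shape[0], pooled_shape[1]).
    return (rois_shape[0], x_shape[1], pooled_shape[0], pooled_shape[1])

# X is (5, 3, 4, 4), rois is (2, 5), pooled_shape is [2, 2].
assert roipool_output_shape((5, 3, 4, 4), (2, 5), [2, 2]) == (2, 3, 2, 2)
```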
@@ -406,7 +481,9 @@ computes the output.)DOC";
"number of groups input channels and output channels are divided into, default is 1.",
AttributeProto::INT,
static_cast<int64_t>(1));
schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { convPoolTypeAndShapeInference(ctx, true, false); });
schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
convPoolTypeAndShapeInference(ctx, true, false);
});
};
}

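Conv calls convPoolTypeAndShapeInference with use_dilation=true and require_kernel_shape=false, so a missing kernel_shape attribute falls back to the spatial dims of the weight tensor W, and the output channel count comes from W's first dim. A sketch with illustrative shapes:

```python
def conv_inferred_dims(x_shape, w_shape, kernel_shape_attr=None):
    # W is laid out (M, C/group, k1, k2, ...); fall back to its trailing
    # (spatial) dims when the kernel_shape attribute is missing.
    kernel = list(kernel_shape_attr) if kernel_shape_attr else list(w_shape[2:])
    return (x_shape[0], w_shape[0]), kernel  # (batch, out channels), kernel

assert conv_inferred_dims((1, 3, 32, 32), (8, 3, 3, 3)) == ((1, 8), [3, 3])
```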
@@ -666,11 +743,7 @@ Output case #2: Y (test mode)
"The running variance (training) or the estimated "
"variance (testing) as a 1-dimensional tensor of size C.",
"T")
.Output(
0,
"Y",
"The output tensor of the same shape as X.",
"T")
.Output(0, "Y", "The output tensor of the same shape as X.", "T")
.Output(
1,
"mean",
@@ -704,11 +777,10 @@ Output case #2: Y (test mode)
{"tensor(float16)", "tensor(float)", "tensor(double)"},
"Constrain input and output types to float tensors.")
.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
propagateShapeAndTypeFromFirstInput(ctx);
// TODO in training mode, it may be possible to infer some of
// the other outputs as well.
});

propagateShapeAndTypeFromFirstInput(ctx);
// TODO in training mode, it may be possible to infer some of
// the other outputs as well.
});

ONNX_OPERATOR_SCHEMA(InstanceNormalization)
.SinceVersion(6)
@@ -739,18 +811,14 @@ where mean and variance are computed per instance per channel.
"T")
.Input(1, "scale", "The input 1-dimensional scale tensor of size C.", "T")
.Input(2, "B", "The input 1-dimensional bias tensor of size C.", "T")
.Output(
0,
"output",
"The output tensor of the same shape as input.",
"T")
.Output(0, "output", "The output tensor of the same shape as input.", "T")
.TypeConstraint(
"T",
{"tensor(float16)", "tensor(float)", "tensor(double)"},
"Constrain input and output types to float tensors.")
.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
propagateShapeAndTypeFromFirstInput(ctx);
});
propagateShapeAndTypeFromFirstInput(ctx);
});

ONNX_OPERATOR_SCHEMA(LpNormalization)
.Input(0, "input", "Input matrix", "T")
@@ -773,8 +841,8 @@ Given a matrix, apply Lp-normalization along the provided axis.
AttributeProto::INT,
static_cast<int64_t>(2))
.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
propagateShapeAndTypeFromFirstInput(ctx);
});
propagateShapeAndTypeFromFirstInput(ctx);
});

ONNX_OPERATOR_SCHEMA(Dropout)
.SinceVersion(6)
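The BatchNormalization, InstanceNormalization, and LpNormalization schemas above all rely on propagateShapeAndTypeFromFirstInput, i.e. the primary output simply takes X's shape. A sketch analogous to the first example (names illustrative):

```python
import onnx
from onnx import TensorProto, helper, shape_inference

def vi(name, shape):
    return helper.make_tensor_value_info(name, TensorProto.FLOAT, shape)

nodes = [
    helper.make_node("BatchNormalization",
                     ["X", "scale", "B", "mean", "var"], ["BN"]),
    helper.make_node("Relu", ["BN"], ["Y"]),
]
graph = helper.make_graph(
    nodes, "bn_example",
    [vi("X", (2, 3, 4, 5)), vi("scale", (3,)), vi("B", (3,)),
     vi("mean", (3,)), vi("var", (3,))],
    [vi("Y", None)])
inferred = shape_inference.infer_shapes(helper.make_model(graph))
# BN is inferred with X's shape, (2, 3, 4, 5).
```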
92 changes: 92 additions & 0 deletions onnx/test/shape_inference_test.py
@@ -672,6 +672,98 @@ def test_softmax(self):
[])
self._assert_inferred(graph, [make_tensor_value_info('z', TensorProto.FLOAT, (4, 5))])

def test_maxpool(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4))],
[make_node("MaxPool", ["X"], ["Y"], kernel_shape=[2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 3, 3))])

def test_maxpool_3D(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4, 4))],
[make_node("MaxPool", ["X"], ["Y"], kernel_shape=[2, 2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 3, 3, 3))])

def test_maxpool_with_padding(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4))],
[make_node("MaxPool", ["X"], ["Y"], kernel_shape=[2, 2], pads=[1, 1, 2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 6, 6))])

def test_maxpool_with_padding_and_stride(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4))],
[make_node("MaxPool", ["X"], ["Y"], kernel_shape=[2, 2], pads=[1, 1, 2, 2], strides=[2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 3, 3))])

def test_averagepool(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4))],
[make_node("AveragePool", ["X"], ["Y"], kernel_shape=[2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 3, 3))])

def test_averagepool_3D(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4, 4))],
[make_node("AveragePool", ["X"], ["Y"], kernel_shape=[2, 2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 3, 3, 3))])

def test_averagepool_with_padding(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4))],
[make_node("AveragePool", ["X"], ["Y"], kernel_shape=[2, 2], pads=[1, 1, 2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 6, 6))])

def test_averagepool_with_padding_and_stride(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4))],
[make_node("AveragePool", ["X"], ["Y"], kernel_shape=[2, 2], pads=[1, 1, 2, 2], strides=[2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 3, 3))])

def test_lppool(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4))],
[make_node("LpPool", ["X"], ["Y"], kernel_shape=[2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 3, 3))])

def test_lppool_3D(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4, 4))],
[make_node("LpPool", ["X"], ["Y"], kernel_shape=[2, 2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 3, 3, 3))])

def test_lppool_with_padding(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4))],
[make_node("LpPool", ["X"], ["Y"], kernel_shape=[2, 2], pads=[1, 1, 2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 6, 6))])

def test_lppool_with_padding_and_stride(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4))],
[make_node("LpPool", ["X"], ["Y"], kernel_shape=[2, 2], pads=[1, 1, 2, 2], strides=[2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (5, 3, 3, 3))])

def test_roipool(self):
graph = self._make_graph(
[("X", TensorProto.FLOAT, (5, 3, 4, 4)),
("rois", TensorProto.INT64, (2, 5))],
[make_node("MaxRoiPool", ["X", "rois"], ["Y"], pooled_shape=[2, 2])],
[])
self._assert_inferred(graph, [make_tensor_value_info("Y", TensorProto.FLOAT, (2, 3, 2, 2))])

def test_lp_norm(self):
graph = self._make_graph(
[('x', TensorProto.FLOAT, (3, 4, 5, 6, 7))],
