
Commit 84318e2

[ET-VK] Adding all tensor packing support to split op.
This diff updates the ExecuTorch Vulkan backend's `split` operation to support width-, height-, and channel-packed tensors. It also updates op_registry.py to indicate that `split` supports all packed dimensions, and adds new test cases to cases.py to exercise the operation.

Differential Revision: [D71345589](https://our.internmc.facebook.com/intern/diff/D71345589/)

ghstack-source-id: 272306677
Pull Request resolved: #9345
1 parent 85b341e commit 84318e2
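To make the scope concrete, here is a minimal eager-mode sketch of the two ATen ops this diff touches (plain PyTorch, not backend code; the shapes are illustrative only):

```python
import torch

# (N, C, H, W) input; the Vulkan backend may store it width-, height-, or
# channel-packed, which is the packing support this diff adds.
x = torch.randn(2, 13, 4, 8)

# aten.split.Tensor: equal-sized chunks (the last chunk may be smaller).
w_parts = torch.split(x, 3, dim=3)
assert [p.size(3) for p in w_parts] == [3, 3, 2]

# aten.split_with_sizes_copy.default: caller-specified chunk sizes.
c_parts = torch.split_with_sizes(x, [3, 5, 2, 3], dim=1)
assert [p.size(1) for p in c_parts] == [3, 5, 2, 3]
```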

File tree: 3 files changed (+111, -98)

backends/vulkan/op_registry.py (+2, -2)

@@ -528,8 +528,6 @@ def register_view_op(features: OpFeatures):
         exir_ops.edge.aten.index_select.default,
         exir_ops.edge.aten.select_copy.int,
         # Tensor combination
-        exir_ops.edge.aten.split_with_sizes_copy.default,
-        exir_ops.edge.aten.split.Tensor,
         exir_ops.edge.aten.repeat.default,
         # Tensor creation
         exir_ops.edge.aten.arange.start_step,
@@ -563,6 +561,8 @@ def register_ported_op(features: OpFeatures):
         exir_ops.edge.aten.permute_copy.default,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
+        exir_ops.edge.aten.split_with_sizes_copy.default,
+        exir_ops.edge.aten.split.Tensor,
     ]
 )
 def register_ported_op_all_packed_dims(features: OpFeatures):
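The move matters because, in op_registry.py, every op listed in an `@update_features([...])` block receives the feature set configured by the function below it. A self-contained toy sketch of that pattern (simplified names, not the real ExecuTorch API):

```python
from dataclasses import dataclass, field

OP_REGISTRY: dict = {}

@dataclass
class OpFeatures:
    supported_packed_dims: set = field(default_factory=set)

def update_features(op_names):
    # Toy stand-in: run the decorated configurator once per listed op name.
    def decorator(configure):
        for name in op_names:
            features = OpFeatures()
            configure(features)
            OP_REGISTRY[name] = features
        return configure
    return decorator

@update_features(
    [
        "aten.cat.default",
        "aten.split_with_sizes_copy.default",  # moved into this list by the diff
        "aten.split.Tensor",
    ]
)
def register_ported_op_all_packed_dims(features: OpFeatures):
    features.supported_packed_dims = {"width", "height", "channels"}

assert OP_REGISTRY["aten.split.Tensor"].supported_packed_dims == {
    "width", "height", "channels"
}
```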

backends/vulkan/runtime/graph/ops/impl/Split.cpp (+43, -47)

@@ -25,8 +25,6 @@ void add_split_with_sizes_default_node(
     ValueRef out_list_ref) {
   vTensorPtr t_in = graph.get_tensor(in);
 
-  VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
-
   ValueListPtr out_list = graph.get_value_list(out_list_ref);
 
   DimIndex dim_index = normalize_to_dim_index(*t_in, dim);
@@ -38,62 +36,60 @@ void add_split_with_sizes_default_node(
     ValueRef out_ref = (*out_list)[split_idx];
 
     vTensorPtr t_out = graph.get_tensor(out_ref);
-    VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim));
     VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size);
   }
 
-  if (dim_index == kWidth4D) {
-    utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
-    utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  const auto packed_dim = t_in->packed_dim();
+  const auto packed_dim_index = static_cast<DimIndex>(kWidth4D - packed_dim);
 
-    for (ValueRef out_ref : *out_list) {
-      // Doesn't need to use split_size since we have already verified that the
-      // output tensor's size matches with the split_size.
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      utils::ivec3 range = t_out->logical_limits();
-      add_copy_offset_node(
-          graph, in, range, src_offset, dst_offset, out_ref, false, true);
+  // Index of dimension to be concatenated in (w, h, c * b) coordinate system
+  const auto dim_xyz_index = std::min(2, -dim_index - 1);
 
-      src_offset[0] += range[0];
-    }
-  } else if (dim_index == kHeight4D) {
-    utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
-    utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
 
-    for (ValueRef out_ref : *out_list) {
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      utils::ivec3 range = t_out->logical_limits();
-      add_copy_offset_node(
-          graph, in, range, src_offset, dst_offset, out_ref, false, true);
+  const bool is_splitting_channel = (dim_index == kChannel4D);
 
-      src_offset[1] += range[1];
-    }
-  } else if (dim_index == kBatch4D) {
-    utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
-    utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  // if splitting channels
+  if (is_splitting_channel) {
+    // set source offset w as channel size of the input tensor
+    src_offset[3] = dim_at(t_in->sizes(), kChannel4D);
+  }
 
-    for (ValueRef out_ref : *out_list) {
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      utils::ivec3 range = t_out->logical_limits();
+  for (ValueRef out_ref : *out_list) {
+    // Doesn't need to use split_size since we have already verified that the
+    // output tensor's size matches with the split_size.
+    vTensorPtr t_out = graph.get_tensor(out_ref);
+    const auto out_channel_size = dim_at(t_out->sizes(), kChannel4D);
+    utils::ivec3 range = t_out->logical_limits();
+
+    if (dim_index == packed_dim_index) {
+      // if splitting channels, use add_copy_channel_offset_node function as
+      // add_copy_packed_dim_offset_node does not support channel packing
+      if (is_splitting_channel) {
+        add_copy_channel_offset_node(
+            graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref);
+        src_offset[dim_xyz_index] += out_channel_size;
+      } else {
+        // dst_offset[3] is not used now but will be used in the future when
+        // add_copy_packed_dim_offset_node will support channel packing
+        //
+        // set destination offset w as channel size of the output tensor if
+        // splitting channel
+        dst_offset[3] = is_splitting_channel ? out_channel_size : 0;
+        add_copy_packed_dim_offset_node(
+            graph, in, range, src_offset, dst_offset, out_ref);
+        src_offset[dim_xyz_index] += dim_at(t_out->sizes(), packed_dim_index);
+      }
+    } else {
+      // set destination offset w as channel size of the output tensor if
+      // splitting channels
+      dst_offset[3] = is_splitting_channel ? out_channel_size : 0;
       add_copy_offset_node(
           graph, in, range, src_offset, dst_offset, out_ref, false, true);
-
-      src_offset[2] += range[2];
-    }
-  } else if (dim_index == kChannel4D) {
-    int32_t src_offset = 0;
-    int32_t dst_offset = 0;
-
-    for (ValueRef out_ref : *out_list) {
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      int32_t range = dim_at<kChannel4D>(t_out->sizes());
-      add_copy_channel_offset_node(
-          graph, in, range, src_offset, dst_offset, out_ref);
-      src_offset += range;
+      src_offset[dim_xyz_index] +=
+          is_splitting_channel ? out_channel_size : range[dim_xyz_index];
     }
-
-  } else {
-    VK_THROW("not ipmlemented");
   }
 }

backends/vulkan/test/op_tests/cases.py (+66, -49)

@@ -922,86 +922,103 @@ def get_split_with_sizes_inputs():
     Test = namedtuple("VkSliceTest", ["self", "sizes", "dim"])
     test_cases = [
         # Split on Width
+        Test(self=(S1, 7, 10, 11), sizes=[1, 3, 3, 5], dim=3),
         Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=3),
+        Test(self=(7, 10, 11), sizes=[1, 3, 3, 5], dim=2),
         Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=2),
+        Test(self=(7, 10, 11), sizes=[3, 8], dim=2),
         Test(self=(7, 10, 10), sizes=[1, 9], dim=2),
         Test(self=(10, 10), sizes=[1, 9], dim=1),
         Test(self=(10,), sizes=[1, 9], dim=0),
         # Split on Height
+        Test(self=(S1, 7, 11, 10), sizes=[1, 3, 3, 5], dim=2),
         Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=2),
+        Test(self=(7, 11, 10), sizes=[1, 3, 3, 5], dim=1),
         Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=1),
+        Test(self=(7, 11, 11), sizes=[3, 8], dim=1),
         Test(self=(7, 10, 10), sizes=[10], dim=1),
         Test(self=(7, 6, 10), sizes=[1, 1, 1, 1, 1, 1], dim=1),
         Test(self=(10, 10), sizes=[1, 2, 3, 4], dim=0),
         # Split on Batch
         Test(self=(10, 7, 10, 10), sizes=[3, 6, 1], dim=0),
         Test(self=(10, 7, 10, 10), sizes=[10], dim=0),
         # Split on Channel
+        Test(self=(7, 13, 4, 8), sizes=[3, 5, 2, 3], dim=1),
         Test(self=(7, 13, 4, 8), sizes=[3, 6, 1, 3], dim=1),
+        Test(self=(7, 13, 4, 8), sizes=[3, 3, 2, 5, 1], dim=1),
         Test(self=(7, 13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=1),
+        Test(self=(13, 4, 8), sizes=[3, 5, 2, 1, 2], dim=0),
         Test(self=(13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=0),
         Test(self=(13, 4, 8), sizes=[2, 9, 2], dim=0),
         Test(self=(13, 4, 8), sizes=[13], dim=0),
     ]
     test_suite = VkTestSuite([tuple(tc) for tc in test_cases])
 
     test_suite.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
         "utils::kChannelsPacked",
     ]
     test_suite.data_gen = "make_seq_tensor"
     test_suite.dtypes = ["at::kFloat"]
     return test_suite
 
 
-@register_test_suite("aten.split.Tensor")
-def get_split_tensor_inputs():
-    test_suite = VkTestSuite(
-        [
-            # Split on Width
-            ((S1, 7, 10, 12), 12, 3),
-            ((S1, 7, 10, 12), 3, 3),
-            ((S1, 7, 10, 12), 1, 3),
-            ((7, 10, 12), 12, 2),
-            ((7, 10, 12), 3, 2),
-            ((7, 10, 12), 1, 2),
-            ((10, 12), 12, 1),
-            ((10, 12), 3, 1),
-            ((10, 12), 1, 1),
-            ((12,), 12, 0),
-            ((12,), 3, 0),
-            ((12,), 1, 0),
-            # Split on Height
-            ((S1, 7, 12, 8), 12, 2),
-            ((S1, 7, 12, 8), 3, 2),
-            ((S1, 7, 12, 8), 1, 2),
-            ((7, 12, 8), 12, 1),
-            ((7, 12, 8), 3, 1),
-            ((7, 12, 8), 1, 1),
-            ((12, 8), 12, 0),
-            ((12, 8), 3, 0),
-            ((12, 8), 1, 0),
-            # Split on Batch
-            ((12, 7, 10, 10), 12, 0),
-            ((12, 7, 10, 10), 3, 0),
-            ((12, 7, 10, 10), 1, 0),
-            # Split on Channel
-            ((7, 15, 10, 10), 15, 1),
-            ((7, 15, 10, 10), 5, 1),
-            ((7, 15, 10, 10), 3, 1),
-            ((7, 15, 10, 10), 1, 1),
-            ((15, 10, 10), 15, 0),
-            ((15, 10, 10), 5, 0),
-            ((15, 10, 10), 3, 0),
-            ((15, 10, 10), 1, 0),
-        ]
-    )
-
-    test_suite.layouts = [
-        "utils::kChannelsPacked",
-    ]
-    test_suite.data_gen = "make_seq_tensor"
-    test_suite.dtypes = ["at::kFloat"]
-    return test_suite
+# @register_test_suite("aten.split.Tensor")
+# def get_split_tensor_inputs():
+#     test_suite = VkTestSuite(
+#         [
+#             # Split on Width
+#             ((M1, 7, 10, 12), 12, 3),
+#             ((S1, 7, 10, 12), 12, 3),
+#             ((M1, 7, 10, 12), 3, 3),
+#             ((S1, 7, 10, 12), 3, 3),
+#             ((M1, 7, 10, 12), 1, 3),
+#             ((S1, 7, 10, 12), 1, 3),
+#             ((7, 10, 12), 12, 2),
+#             ((7, 10, 12), 3, 2),
+#             ((7, 10, 12), 1, 2),
+#             ((2, 3, 4), 1, 2),
+#             ((10, 12), 12, 1),
+#             ((10, 12), 3, 1),
+#             ((10, 12), 1, 1),
+#             ((12,), 12, 0),
+#             ((12,), 3, 0),
+#             ((12,), 1, 0),
+#             # Split on Height
+#             ((S1, 7, 12, 8), 12, 2),
+#             ((S1, 7, 12, 8), 3, 2),
+#             ((S1, 7, 12, 8), 1, 2),
+#             ((7, 12, 8), 12, 1),
+#             ((7, 12, 8), 3, 1),
+#             ((7, 12, 8), 1, 1),
+#             ((12, 8), 12, 0),
+#             ((12, 8), 3, 0),
+#             ((12, 8), 1, 0),
+#             # Split on Batch
+#             ((12, 7, 10, 10), 12, 0),
+#             ((12, 7, 10, 10), 3, 0),
+#             ((12, 7, 10, 10), 1, 0),
+#             # Split on Channel
+#             ((7, 15, 10, 10), 15, 1),
+#             ((7, 15, 10, 10), 5, 1),
+#             ((7, 15, 10, 10), 3, 1),
+#             ((7, 15, 10, 10), 1, 1),
+#             ((15, 10, 10), 15, 0),
+#             ((15, 10, 10), 5, 0),
+#             ((15, 10, 10), 3, 0),
+#             ((15, 10, 10), 1, 0),
+#         ]
+#     )
+
+#     test_suite.layouts = [
+#         "utils::kWidthPacked",
+#         "utils::kHeightPacked",
+#         "utils::kChannelsPacked",
+#     ]
+#     test_suite.data_gen = "make_seq_tensor"
+#     test_suite.dtypes = ["at::kFloat"]
+#     return test_suite
 
 
 def get_reduce_inputs(is_softmax: bool = False):
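Worth noting about the new cases: extents such as 11 and uneven split sizes such as [1, 3, 3, 5] place split boundaries off 4-element texel boundaries, which is presumably what stresses the new packed-dim offset copy path. In eager terms, the first new width case reads:

```python
import torch

# Test(self=(7, 10, 11), sizes=[1, 3, 3, 5], dim=2) from cases.py, as plain
# PyTorch; make_seq_tensor is approximated here with arange.
x = torch.arange(7 * 10 * 11, dtype=torch.float32).reshape(7, 10, 11)
outs = torch.split_with_sizes(x, [1, 3, 3, 5], dim=2)  # split on width
assert [o.size(2) for o in outs] == [1, 3, 3, 5]
```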
