From aaf9924097a7a18187e357eeb1bea760f0a4461e Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 19 Mar 2025 15:48:08 -0700 Subject: [PATCH 1/3] [ET-VK] Adding boolean parameters to add_copy_offset_node to specify index calculation function in copy op's shader. Pull Request resolved: https://github.com/pytorch/executorch/pull/9343 This diff adds two new boolean flags, `calc_out_pos_using_src_chnl` and `calc_in_pos_using_dst_chnl` to add_copy_offset_node, which can be used to specify an indexing function in the shader. ghstack-source-id: 272554190 @exported-using-ghexport Differential Revision: [D71343588](https://our.internmc.facebook.com/intern/diff/D71343588/) --- .../runtime/graph/ops/glsl/copy_offset.glsl | 20 +++++++++++++------ .../vulkan/runtime/graph/ops/impl/Cat.cpp | 2 +- .../vulkan/runtime/graph/ops/impl/Copy.cpp | 13 +++++++++--- backends/vulkan/runtime/graph/ops/impl/Copy.h | 17 +++++++++++++++- .../vulkan/runtime/graph/ops/impl/Repeat.cpp | 9 +++++---- .../vulkan/runtime/graph/ops/impl/Split.cpp | 9 ++++++--- 6 files changed, 52 insertions(+), 18 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index a23822765a3..178814a90c3 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -35,6 +35,8 @@ const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); +${layout_declare_spec_const(C, "int", "batch_index_function", "0")} + void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); @@ -42,14 +44,20 @@ void main() { return; } - const ivec3 in_pos = pos + src_offset.xyz; + ivec3 in_pos = pos + src_offset.xyz; ivec3 out_pos = pos + dst_offset.xyz; - - // If source channel size is specified compose 
output z based on channel and batch index if (src_offset.w > 0) { - const int channel_index = in_pos.z % src_offset.w; - const int batch_index = in_pos.z / src_offset.w; - out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w; + if (batch_index_function == 1) { + // batch index is calculated using source channel size + const int channel_index = pos.z % src_offset.w; + const int batch_index = pos.z / src_offset.w; + out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w; + } else if (batch_index_function == 2) { + // batch index is calculated using destination channel size + const int channel_index = pos.z % dst_offset.w; + const int batch_index = pos.z / dst_offset.w; + in_pos.z = channel_index + src_offset.z + batch_index * src_offset.w; + } } write_texel_lpos( diff --git a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp index 5f172454121..25a0ff9a7f5 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp @@ -80,7 +80,7 @@ void add_cat_default_node( // concatenating channels src_offset[3] = is_concat_channel ? in_channel_size : 0; add_copy_offset_node( - graph, input_ref, range, src_offset, dst_offset, out); + graph, input_ref, range, src_offset, dst_offset, out, true, false); dst_offset[dim_xyz_index] += is_concat_channel ? 
in_channel_size : range[dim_xyz_index]; } diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index 4b09fbe8619..2ecc7400d3e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -25,7 +25,9 @@ void add_copy_offset_node( const ivec3& range, const ivec4& src_offset, const ivec4& dst_offset, - const ValueRef out) { + const ValueRef out, + bool calc_out_pos_using_src_chnl, + bool calc_in_pos_using_dst_chnl) { vTensorPtr t_in = graph.get_tensor(in); vTensorPtr t_out = graph.get_tensor(out); @@ -49,7 +51,11 @@ void add_copy_offset_node( // Parameter buffers {}, // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(in), + (calc_out_pos_using_src_chnl ? 1 + : calc_in_pos_using_dst_chnl ? 2 + : 0)}, nullptr, {}, { @@ -256,7 +262,8 @@ void add_copy_offset_node( ivec4 src_offset = {src[0], src[1], src[2], 0}; ivec4 dst_offset = {dst[0], dst[1], dst[2], 0}; - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out); + add_copy_offset_node( + graph, in, range, src_offset, dst_offset, out, false, false); } void copy_offset(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h index d4b4c0dcc03..e9388345afa 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.h +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.h @@ -22,13 +22,28 @@ namespace vkcompute { // It is possible to have input and output to point to the same image // object. But when the source range and destination range overlap, the behavior // is undefined. 
+//
+// boolean flags calc_out_pos_using_src_chnl and calc_in_pos_using_dst_chnl
+// can be used to specify an indexing function in the shader
+// If calc_out_pos_using_src_chnl is set to true, channel and batch index will
+// be calculated based on source channel size and will be used to determine
+// destination texel position.
+//
+// If calc_in_pos_using_dst_chnl is set to true, channel and batch index will
+// be calculated based on destination channel size and will be used to
+// determine source texel position.
+//
+// If both are true, calc_out_pos_using_src_chnl is picked. If both are false,
+// no index calculation happens.
 void add_copy_offset_node(
     ComputeGraph& graph,
     const ValueRef in,
     const utils::ivec3& range,
     const utils::ivec4& src_offset,
     const utils::ivec4& dst_offset,
-    const ValueRef out);
+    const ValueRef out,
+    bool calc_out_pos_using_src_chnl,
+    bool calc_in_pos_using_dst_chnl);
 
 // add_copy_packed_dim_offset_node behaves similar to add_copy_node, except that
 // its used when copying packed dimension, if tensor is width or height packed.
diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp index 49daabdcb76..3f4ed4f1090 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -151,7 +151,8 @@ void add_repeat_node( utils::ivec4 src_offset{0, 0, 0, 0}; utils::ivec4 dst_offset{0, 0, 0, 0}; - add_copy_offset_node(graph, in, running_range, src_offset, dst_offset, out); + add_copy_offset_node( + graph, in, running_range, src_offset, dst_offset, out, false, false); } else { add_repeat_channel_node(graph, in, channel_repeat, out, running_range); @@ -166,7 +167,7 @@ void add_repeat_node( utils::ivec4 dst_offset{i * dim_at(in_sizes), 0, 0, 0}; add_copy_offset_node( - graph, out, running_range, src_offset, dst_offset, out); + graph, out, running_range, src_offset, dst_offset, out, true, false); } running_range[0] = running_range[0] * width_repeat; @@ -180,7 +181,7 @@ void add_repeat_node( utils::ivec4 dst_offset = {0, i * dim_at(in_sizes), 0, 0}; add_copy_offset_node( - graph, out, running_range, src_offset, dst_offset, out); + graph, out, running_range, src_offset, dst_offset, out, true, false); } running_range[1] = running_range[1] * height_repeat; @@ -194,7 +195,7 @@ void add_repeat_node( utils::ivec4 dst_offset = {0, 0, i * running_range[2], 0}; add_copy_offset_node( - graph, out, running_range, src_offset, dst_offset, out); + graph, out, running_range, src_offset, dst_offset, out, true, false); } running_range[2] = running_range[2] * batch_repeat; diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index ca585f1fb6d..b74317b078e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -51,7 +51,8 @@ void add_split_with_sizes_default_node( // output tensor's size matches with the split_size. 
vTensorPtr t_out = graph.get_tensor(out_ref); utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); + add_copy_offset_node( + graph, in, range, src_offset, dst_offset, out_ref, false, true); src_offset[0] += range[0]; } @@ -62,7 +63,8 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); + add_copy_offset_node( + graph, in, range, src_offset, dst_offset, out_ref, false, true); src_offset[1] += range[1]; } @@ -73,7 +75,8 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); + add_copy_offset_node( + graph, in, range, src_offset, dst_offset, out_ref, false, true); src_offset[2] += range[2]; } From 22b480e1e0a2bd47bb1512ef78da8924517e6f48 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 19 Mar 2025 15:48:10 -0700 Subject: [PATCH 2/3] [ET-VK] Adding source_offset processing to copy_packed_dim_offset function. Pull Request resolved: https://github.com/pytorch/executorch/pull/9344 This diff change `copy_packed_dim_offset` function and associated shader to handle the source_offset parameter. This change will help enable all tensor packing for slice op. 
ghstack-source-id: 272554186 @exported-using-ghexport Differential Revision: [D71349217](https://our.internmc.facebook.com/intern/diff/D71349217/) --- .../ops/glsl/copy_packed_dim_offset.glsl | 48 ++++++++++++++++--- .../vulkan/runtime/graph/ops/impl/Copy.cpp | 28 +++++++++-- 2 files changed, 64 insertions(+), 12 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl index 02ea6405b4a..e0f09f0be43 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl @@ -44,15 +44,49 @@ void main() { return; } - // Starting offset to write at within a texel - const int out_lane_offset = dst_offset[packed_dim] & 0x3; - const bool has_lane_offset = out_lane_offset != 0; - // Position in input tensor - const ivec3 in_pos = pos + src_offset.xyz; + ivec3 in_pos = pos + src_offset.xyz; + in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2); // Read input value mapping to this output texel - const VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); + VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); + + // Starting offset to read from a texel + const int src_lane_offset = src_offset[packed_dim] & 0x3; + const bool has_src_lane_offset = src_lane_offset != 0; + + // If input lane offset is non zero i.e packed texel is composed from multiple sources + if (has_src_lane_offset) { + // Boundary values will come from next input texel in the packed dim. 
+ ivec3 next_in_pos = in_pos; + next_in_pos[packed_dim] = in_pos[packed_dim] + 1; + VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map); + + // Keep input values from the end of current input pixel based on src_lane_offset + // offset 1 means the first lane of current input texel is not a part of the output texel + // offset 2 means first 2 lanes are not and so on + if (src_lane_offset == 1) { + in_value.xyz = in_value.yzw; + } else if (src_lane_offset == 2) { + in_value.xy = in_value.zw; + } else { + in_value.x = in_value.w; + } + // Copy next texel's values towards the end of input texel, based on lane offset + // offset 1 means the first lane from next texel is part of the input texel + // offset 2 means first 2 lanes from next texel is part of the input texel and so on + if (src_lane_offset == 1) { + in_value.w = next_value.x; + } else if (src_lane_offset == 2) { + in_value.zw = next_value.xy; + } else { + in_value.yzw = next_value.xyz; + } + } + + // Starting offset to write at within a texel + const int out_lane_offset = dst_offset[packed_dim] & 0x3; + const bool has_dst_lane_offset = out_lane_offset != 0; ivec3 out_pos = pos + dst_offset.xyz; out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2); @@ -60,7 +94,7 @@ void main() { VEC4_T out_value; // If lane offset is non zero i.e packed texel is composed from multiple sources - if (has_lane_offset) { + if (has_dst_lane_offset) { // When position in packed dim is > 0 if (pos[packed_dim] > 0) { // Boundary values will come from previous input texel in the packed dim. 
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index 2ecc7400d3e..5756d3a9052 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -92,19 +92,37 @@ void add_copy_packed_dim_offset_node( ivec4 final_range = { range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)}; ivec3 global_wg_size = t_out->logical_limits(); + // The starting offset in a texel where this tensor will start copying from + const auto src_lane_offset = src_offset[packed_dim] & 0x3; // The starting offset in a texel where this tensor will start copying to const auto dst_lane_offset = dst_offset[packed_dim] & 0x3; + + // The total packed texels this tensor will be copied from + // The first texel of tensor data in packed dimension will be copied from + // remaining lanes from current source Hence (4 - src_lane_offset) is added + // to tensor size in packed dimension + const auto src_packed_size = utils::div_up_4( + (4 - src_lane_offset) + + dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim))); + // The total packed texels this tensor will be copied to - // The first texel of tensor data in packed dimension will be copied to remain - // lanes from previous write Hence (4 - dst_lane_offset) is added to tensor - // size in packed dimension + // The first texel of tensor data in packed dimension will be copied to + // remaining lanes from previous write Hence (4 - dst_lane_offset) is added to + // tensor size in packed dimension const auto dst_packed_size = utils::div_up_4( (4 - dst_lane_offset) + dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim))); - // If the starting offset is not 0, and the total packed texels is greater + // If the starting src offset is not 0, and the total packed texels is greater // than the source texel range - if (dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]) { + const bool 
has_additional_src_work = + src_lane_offset != 0 && src_packed_size > final_range[packed_dim]; + // If the starting dst offset is not 0, and the total packed texels is greater + // than the source texel range + const bool has_additional_dst_work = + dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]; + + if (has_additional_src_work || has_additional_dst_work) { global_wg_size[packed_dim]++; // Increase the global work group size in // packed dimension final_range[packed_dim]++; // Increase the range in packed dimension From 6b58d463d57be9c5f14dcd9004a4d56de77307d2 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 19 Mar 2025 15:48:11 -0700 Subject: [PATCH 3/3] [ET-VK] Adding all tensor packing support to split op. Pull Request resolved: https://github.com/pytorch/executorch/pull/9345 This diff updates Executorch Vulkan backend's `split` operation to support width, height and channel packed tensors. It also updates the op_registry.py file to indicate `split` operation supports all packing and adds new test cases to the cases.py file to test the operation. 
ghstack-source-id: 272554188 @exported-using-ghexport Differential Revision: [D71345589](https://our.internmc.facebook.com/intern/diff/D71345589/) --- backends/vulkan/op_registry.py | 4 +- .../vulkan/runtime/graph/ops/impl/Split.cpp | 90 +++++++++---------- backends/vulkan/test/op_tests/cases.py | 13 +++ 3 files changed, 58 insertions(+), 49 deletions(-) diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index f2b80c2e544..5aa805dc1b3 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -528,8 +528,6 @@ def register_view_op(features: OpFeatures): exir_ops.edge.aten.index_select.default, exir_ops.edge.aten.select_copy.int, # Tensor combination - exir_ops.edge.aten.split_with_sizes_copy.default, - exir_ops.edge.aten.split.Tensor, exir_ops.edge.aten.repeat.default, # Tensor creation exir_ops.edge.aten.arange.start_step, @@ -563,6 +561,8 @@ def register_ported_op(features: OpFeatures): exir_ops.edge.aten.permute_copy.default, # Tensor combination exir_ops.edge.aten.cat.default, + exir_ops.edge.aten.split_with_sizes_copy.default, + exir_ops.edge.aten.split.Tensor, ] ) def register_ported_op_all_packed_dims(features: OpFeatures): diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index b74317b078e..8002dadc538 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -25,8 +25,6 @@ void add_split_with_sizes_default_node( ValueRef out_list_ref) { vTensorPtr t_in = graph.get_tensor(in); - VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); - ValueListPtr out_list = graph.get_value_list(out_list_ref); DimIndex dim_index = normalize_to_dim_index(*t_in, dim); @@ -38,62 +36,60 @@ void add_split_with_sizes_default_node( ValueRef out_ref = (*out_list)[split_idx]; vTensorPtr t_out = graph.get_tensor(out_ref); - VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); 
VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size); } - if (dim_index == kWidth4D) { - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); + const auto packed_dim = t_in->packed_dim(); + const auto packed_dim_index = static_cast(kWidth4D - packed_dim); - for (ValueRef out_ref : *out_list) { - // Doesn't need to use split_size since we have already verified that the - // output tensor's size matches with the split_size. - vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out_ref, false, true); + // Index of dimension to be concatenated in (w, h, c * b) coordinate system + const auto dim_xyz_index = std::min(2, -dim_index - 1); - src_offset[0] += range[0]; - } - } else if (dim_index == kHeight4D) { - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); + utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); + utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); - for (ValueRef out_ref : *out_list) { - vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out_ref, false, true); + const bool is_splitting_channel = (dim_index == kChannel4D); - src_offset[1] += range[1]; - } - } else if (dim_index == kBatch4D) { - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); + // if splitting channels + if (is_splitting_channel) { + // set source offset w as channel size of the input tensor + src_offset[3] = dim_at(t_in->sizes(), kChannel4D); + } - for (ValueRef out_ref : *out_list) { - vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->logical_limits(); + for (ValueRef 
out_ref : *out_list) { + // Doesn't need to use split_size since we have already verified that the + // output tensor's size matches with the split_size. + vTensorPtr t_out = graph.get_tensor(out_ref); + const auto out_channel_size = dim_at(t_out->sizes(), kChannel4D); + utils::ivec3 range = t_out->logical_limits(); + + if (dim_index == packed_dim_index) { + // if splitting channels, use add_copy_channel_offset_node function as + // add_copy_packed_dim_offset_node does not support channel packing + if (is_splitting_channel) { + add_copy_channel_offset_node( + graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref); + src_offset[dim_xyz_index] += out_channel_size; + } else { + // dst_offset[3] is not used now but will be used in the future when + // add_copy_packed_dim_offset_node will support channel packing + // + // set destination offset w as channel size of the output tensor if + // splitting channel + dst_offset[3] = is_splitting_channel ? out_channel_size : 0; + add_copy_packed_dim_offset_node( + graph, in, range, src_offset, dst_offset, out_ref); + src_offset[dim_xyz_index] += dim_at(t_out->sizes(), packed_dim_index); + } + } else { + // set destination offset w as channel size of the output tensor if + // splitting channels + dst_offset[3] = is_splitting_channel ? out_channel_size : 0; add_copy_offset_node( graph, in, range, src_offset, dst_offset, out_ref, false, true); - - src_offset[2] += range[2]; - } - } else if (dim_index == kChannel4D) { - int32_t src_offset = 0; - int32_t dst_offset = 0; - - for (ValueRef out_ref : *out_list) { - vTensorPtr t_out = graph.get_tensor(out_ref); - int32_t range = dim_at(t_out->sizes()); - add_copy_channel_offset_node( - graph, in, range, src_offset, dst_offset, out_ref); - src_offset += range; + src_offset[dim_xyz_index] += + is_splitting_channel ? 
out_channel_size : range[dim_xyz_index]; } - - } else { - VK_THROW("not ipmlemented"); } } diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index e4f7ac15434..41d8edf1f25 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -922,14 +922,20 @@ def get_split_with_sizes_inputs(): Test = namedtuple("VkSliceTest", ["self", "sizes", "dim"]) test_cases = [ # Split on Width + Test(self=(S1, 7, 10, 11), sizes=[1, 3, 2, 5], dim=3), Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=3), + Test(self=(7, 10, 11), sizes=[1, 3, 2, 5], dim=2), Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=2), + Test(self=(7, 10, 11), sizes=[3, 8], dim=2), Test(self=(7, 10, 10), sizes=[1, 9], dim=2), Test(self=(10, 10), sizes=[1, 9], dim=1), Test(self=(10,), sizes=[1, 9], dim=0), # Split on Height + Test(self=(S1, 7, 11, 10), sizes=[1, 3, 2, 5], dim=2), Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=2), + Test(self=(7, 11, 10), sizes=[1, 3, 2, 5], dim=1), Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=1), + Test(self=(7, 11, 11), sizes=[3, 8], dim=1), Test(self=(7, 10, 10), sizes=[10], dim=1), Test(self=(7, 6, 10), sizes=[1, 1, 1, 1, 1, 1], dim=1), Test(self=(10, 10), sizes=[1, 2, 3, 4], dim=0), @@ -937,8 +943,11 @@ def get_split_with_sizes_inputs(): Test(self=(10, 7, 10, 10), sizes=[3, 6, 1], dim=0), Test(self=(10, 7, 10, 10), sizes=[10], dim=0), # Split on Channel + Test(self=(7, 13, 4, 8), sizes=[3, 5, 2, 3], dim=1), Test(self=(7, 13, 4, 8), sizes=[3, 6, 1, 3], dim=1), + Test(self=(7, 13, 4, 8), sizes=[3, 2, 2, 5, 1], dim=1), Test(self=(7, 13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=1), + Test(self=(13, 4, 8), sizes=[3, 5, 2, 1, 2], dim=0), Test(self=(13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=0), Test(self=(13, 4, 8), sizes=[2, 9, 2], dim=0), Test(self=(13, 4, 8), sizes=[13], dim=0), @@ -946,6 +955,8 @@ def get_split_with_sizes_inputs(): test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) 
test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kHeightPacked", "utils::kChannelsPacked", ] test_suite.data_gen = "make_seq_tensor" @@ -997,6 +1008,8 @@ def get_split_tensor_inputs(): ) test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kHeightPacked", "utils::kChannelsPacked", ] test_suite.data_gen = "make_seq_tensor"