From 5a5bca0b2b343ad0dfbcf88320fb73f9b773ead3 Mon Sep 17 00:00:00 2001
From: Jorge Pineda
Date: Thu, 7 Mar 2024 17:09:34 -0800
Subject: [PATCH] [ET-VK] Remove test dependency on ATen/native/vulkan/impl

This must land before https://github.com/pytorch/executorch/pull/2305.
Otherwise, we will have two definitions of the same function signature in
`executorch/backends/vulkan/runtime/graph/ops/OpUtils.cpp` and
`ATen/native/vulkan/impl/Common.cpp`, resulting in a linker error when
building `vulkan_compute_api_test_bin`:

```
ld.lld: error: duplicate symbol: at::native::vulkan::adaptive_work_group_size(at::native::vulkan::api::utils::detail::vec<unsigned int, 3u> const&)
>>> defined at Common.cpp:8 (./xplat/caffe2/aten/src/ATen/native/vulkan/impl/Common.cpp:8)
>>>            __objects__/aten/src/ATen/native/vulkan/impl/Common.cpp.o:(at::native::vulkan::adaptive_work_group_size(at::native::vulkan::api::utils::detail::vec<unsigned int, 3u> const&)) in archive buck-out/v2/gen/fbsource/a9b839d0bc77bff6/xplat/caffe2/__torch_vulkan_ops__/libtorch_vulkan_ops.a
>>> defined at OpUtils.cpp:16 (./xplat/executorch/backends/vulkan/runtime/graph/ops/OpUtils.cpp:16)
>>>            __objects__/runtime/graph/ops/OpUtils.cpp.o:(.text._ZN2at6native6vulkan24adaptive_work_group_sizeERKNS1_3api5utils6detail3vecIjLj3EEE+0x0) in archive buck-out/v2/gen/fbsource/a9b839d0bc77bff6/xplat/executorch/backends/vulkan/__vulkan_graph_runtime__/libvulkan_graph_runtime.a
clang-15: error: linker command failed with exit code 1 (use -v to see invocation)
```

These files are part of the `torch_vulkan_ops` and `vulkan_graph_runtime`
libraries, respectively. We resolve the issue by removing the dependency of
`vulkan_compute_api_test_bin` on `torch_vulkan_ops`. Unfortunately, this
requires copy-pasting more code into `vulkan_compute_api_test.cpp`. (A
minimal standalone reproduction of this failure mode is sketched after the
diff.)

Differential Revision: [D54659269](https://our.internmc.facebook.com/intern/diff/D54659269/)

[ghstack-poisoned]
---
 .../vulkan/runtime/graph/ops/impl/Staging.h   |   3 +
 .../vulkan/test/vulkan_compute_api_test.cpp   | 191 ++++++++++++++----
 2 files changed, 159 insertions(+), 35 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h
index 2a49026e8e3..99bdf667c6b 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.h
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h
@@ -35,6 +35,9 @@ struct StagingParams final {
 
 ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v);
 
+// Expose for the Vulkan Compute API tests.
+StagingParams create_staging_params(const vTensor& t);
+
 } // namespace vulkan
 } // namespace native
 } // namespace at

diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 5c1fc8f3c50..a646e0c4ed5 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -10,10 +10,7 @@
 
 #include <...>
 
-#include <ATen/native/vulkan/impl/Arithmetic.h>
-#include <ATen/native/vulkan/impl/Common.h>
-#include <ATen/native/vulkan/impl/Packing.h>
-
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
 #include <...>
 #include <...>
 
@@ -21,10 +18,6 @@
 
 using namespace at::native::vulkan;
 
-//
-// Utilities
-//
-
 #define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \
   vTensor(                                           \
       api::context(),                                \
@@ -43,23 +36,159 @@ using namespace at::native::vulkan;
       api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,  \
       allocate_memory);
 
+//
+// Simplified versions of ATen Vulkan legacy functions
+//
+
+void record_nchw_to_buffer_op(
+    api::Context* const context,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst) {
+  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_dst.gpu_numel());
+  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {32u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_dst.get_cpu_buffer_metadata());
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      VK_KERNEL(buffer_to_buffer),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_dst.buffer_metadata(),
+      src_buffer,
+      cpu_buffer_metadata.buffer());
+}
+
+bool record_buffer_to_nchw_op(
+    api::Context* const context,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer) {
+  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_src.numel());
+  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {4u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_src.get_cpu_buffer_metadata());
+  api::PipelineBarrier pipeline_barrier{};
+
+  return context->submit_compute_job(
+      VK_KERNEL(buffer_to_buffer),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      dst_buffer,
+      cpu_buffer_metadata.buffer(),
+      v_src.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_src.buffer_metadata());
+}
+
+void record_nchw_to_image_op(
+    api::Context* const context,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst) {
+  api::utils::uvec3 global_size = v_dst.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  api::UniformParamsBuffer params(context, create_staging_params(v_dst));
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      get_nchw_to_image_shader(v_dst),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      src_buffer,
+      params.buffer());
+}
+
+bool record_image_to_nchw_op(
+    api::Context* const context,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer) {
+  api::utils::uvec3 global_size = v_src.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  api::UniformParamsBuffer params(context, create_staging_params(v_src));
+  api::PipelineBarrier pipeline_barrier{};
+
+  return context->submit_compute_job(
+      get_image_to_nchw_shader(v_src),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_src.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      dst_buffer,
+      params.buffer());
+}
+
+void record_arithmetic_op(
+    api::Context* const context,
+    const api::ShaderInfo& compute_shader,
+    vTensor& v_in1,
+    vTensor& v_in2,
+    vTensor& v_dst,
+    const float alpha) {
+  api::utils::uvec3 global_size = v_dst.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  ArithmeticParams block{
+      get_size_as_ivec4(v_dst),
+      get_size_as_ivec4(v_in1),
+      get_size_as_ivec4(v_in2),
+      alpha,
+  };
+  api::UniformParamsBuffer params(context, block);
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      compute_shader,
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_in1.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      v_in2.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      params.buffer());
+}
+
+//
+// Utilities
+//
+
 void fill_vtensor(vTensor& vten, std::vector<float>& data) {
   api::StorageBuffer staging_buffer(api::context(), api::kFloat, data.size());
   copy_ptr_to_staging(data.data(), staging_buffer, vten.gpu_nbytes());
 
   if (vten.storage_type() == api::StorageType::BUFFER) {
-    packing::record_nchw_to_buffer_op(
-        api::context(), staging_buffer.buffer(), vten, {}, VK_NULL_HANDLE);
+    record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten);
   } else {
-    api::ShaderInfo compute_shader = packing::get_nchw_to_image_shader(vten);
-    packing::record_nchw_to_image_op(
-        api::context(),
-        compute_shader,
-        staging_buffer.buffer(),
-        vten,
-        {},
-        VK_NULL_HANDLE);
+    record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten);
   }
 }
 
@@ -75,17 +204,9 @@ void extract_vtensor(vTensor& vten, std::vector<float>& data) {
       api::context(), api::kFloat, vten.gpu_numel());
 
   if (vten.storage_type() == api::StorageType::BUFFER) {
-    packing::record_buffer_to_nchw_op(
-        api::context(), vten, staging_buffer.buffer(), {}, VK_NULL_HANDLE);
+    record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
   } else {
-    api::ShaderInfo compute_shader = packing::get_image_to_nchw_shader(vten);
-    packing::record_image_to_nchw_op(
-        api::context(),
-        compute_shader,
-        vten,
-        staging_buffer.buffer(),
-        {},
-        VK_NULL_HANDLE);
+    record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
   }
 
   api::VulkanFence fence = api::context()->fences().get_fence();
@@ -208,14 +329,14 @@ TEST_F(VulkanComputeAPITest, texture_add_sanity_check) {
   std::fill(data_b.begin(), data_b.end(), 1.5f);
 
   // Add shader kernel
-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);
 
   // Fill input tensors
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);
 
   // a + b -> c
-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);
 
   // Extract output tensor
   std::vector<float> data_out(c.gpu_numel());
@@ -244,7 +365,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
   std::vector<float> data_b(b.gpu_numel());
   std::fill(data_b.begin(), data_b.end(), 1.5f);
 
-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);
 
   // Allocate memory at the last possible opportunity
   api::MemoryAllocation a_mem = allocate_memory_for(a);
@@ -260,7 +381,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);
 
-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);
 
   std::vector<float> data_c(c.gpu_numel());
   extract_vtensor(c, data_c);
 
@@ -310,20 +431,20 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) {
   std::fill(data_d.begin(), data_d.end(), 1.0f);
 
   // Get shader kernel for add
-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);
 
   // First, fill a and b with data
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);
 
   // a + b -> c
-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);
 
   // Now d can be filled with data
   fill_vtensor(d, data_d);
 
   // c + d -> e
-  arithmetic::record_op(api::context(), kernel, c, d, e, 1.0f);
+  record_arithmetic_op(api::context(), kernel, c, d, e, 1.0f);
 
   // Extract data from e
   std::vector<float> data_e(e.gpu_numel());
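
For reference, the duplicate-symbol failure described in the commit message can be reproduced in isolation. The sketch below uses hypothetical file and function names (`common_a.cpp`, `common_b.cpp`, `demo::adaptive_work_group_size`); only the mechanism mirrors the real `Common.cpp` / `OpUtils.cpp` conflict, namely two translation units that each provide a non-inline definition of the same function and are then linked into one binary:

```cpp
// common_a.cpp -- stands in for ATen/native/vulkan/impl/Common.cpp,
// compiled into the analogue of libtorch_vulkan_ops.a
namespace demo {
int adaptive_work_group_size(int global_size) {
  return global_size > 64 ? 8 : 4;  // non-inline definition #1
}
} // namespace demo

// common_b.cpp -- stands in for backends/vulkan/runtime/graph/ops/OpUtils.cpp,
// compiled into the analogue of libvulkan_graph_runtime.a
namespace demo {
int adaptive_work_group_size(int global_size) {
  return global_size > 64 ? 8 : 4;  // definition #2: same mangled symbol
}
} // namespace demo

// main.cpp -- pulls in both definitions, like vulkan_compute_api_test_bin
// linking both archives
namespace demo {
int adaptive_work_group_size(int global_size);  // declaration only
}

int main() {
  return demo::adaptive_work_group_size(128);
}
```

Building all three together (`clang++ main.cpp common_a.cpp common_b.cpp`) fails at link time with `duplicate symbol: demo::adaptive_work_group_size(int)`. Dropping one of the two definitions from the link line, as this patch does for the test binary's dependency graph, resolves the conflict.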