From 5a5bca0b2b343ad0dfbcf88320fb73f9b773ead3 Mon Sep 17 00:00:00 2001
From: Jorge Pineda
Date: Thu, 7 Mar 2024 17:09:34 -0800
Subject: [PATCH] [ET-VK] Remove test dependency on ATen/native/vulkan/impl

This must land before https://github.com/pytorch/executorch/pull/2305.
Otherwise, we will have two definitions of the same function signature in
`executorch/backends/vulkan/runtime/graph/ops/OpUtils.cpp` and
`ATen/native/vulkan/impl/Common.cpp`, resulting in a linker error when
building `vulkan_compute_api_test_bin`:

```
ld.lld: error: duplicate symbol: at::native::vulkan::adaptive_work_group_size(at::native::vulkan::api::utils::detail::vec<unsigned int, 3u> const&)
>>> defined at Common.cpp:8 (./xplat/caffe2/aten/src/ATen/native/vulkan/impl/Common.cpp:8)
>>>            __objects__/aten/src/ATen/native/vulkan/impl/Common.cpp.o:(at::native::vulkan::adaptive_work_group_size(at::native::vulkan::api::utils::detail::vec<unsigned int, 3u> const&)) in archive buck-out/v2/gen/fbsource/a9b839d0bc77bff6/xplat/caffe2/__torch_vulkan_ops__/libtorch_vulkan_ops.a
>>> defined at OpUtils.cpp:16 (./xplat/executorch/backends/vulkan/runtime/graph/ops/OpUtils.cpp:16)
>>>            __objects__/runtime/graph/ops/OpUtils.cpp.o:(.text._ZN2at6native6vulkan24adaptive_work_group_sizeERKNS1_3api5utils6detail3vecIjLj3EEE+0x0) in archive buck-out/v2/gen/fbsource/a9b839d0bc77bff6/xplat/executorch/backends/vulkan/__vulkan_graph_runtime__/libvulkan_graph_runtime.a
clang-15: error: linker command failed with exit code 1 (use -v to see invocation)
```

These files are part of the `torch_vulkan_ops` and `vulkan_graph_runtime`
libraries, respectively. We resolve the issue by removing the dependency of
`vulkan_compute_api_test_bin` on `torch_vulkan_ops`. Unfortunately, this
requires copy-pasting more code into `vulkan_compute_api_test.cpp`. (A
minimal standalone reproduction of this failure mode is sketched after the
diff.)

Differential Revision: [D54659269](https://our.internmc.facebook.com/intern/diff/D54659269/)

[ghstack-poisoned]
---
 .../vulkan/runtime/graph/ops/impl/Staging.h   |   3 +
 .../vulkan/test/vulkan_compute_api_test.cpp   | 191 ++++++++++++++----
 2 files changed, 159 insertions(+), 35 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h
index 2a49026e8e3..99bdf667c6b 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.h
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h
@@ -35,6 +35,9 @@ struct StagingParams final {
 
 ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v);
 
+// Expose for the Vulkan Compute API tests.
+StagingParams create_staging_params(const vTensor& t);
+
 } // namespace vulkan
 } // namespace native
 } // namespace at

diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 5c1fc8f3c50..a646e0c4ed5 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -10,10 +10,7 @@
 
 #include <...>
 
-#include <ATen/native/vulkan/impl/Arithmetic.h>
-#include <ATen/native/vulkan/impl/Common.h>
-#include <ATen/native/vulkan/impl/Packing.h>
-
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
 #include <...>
 #include <...>
 
@@ -21,10 +18,6 @@
 
 using namespace at::native::vulkan;
 
-//
-// Utilities
-//
-
 #define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \
   vTensor(                                           \
       api::context(),                                \
@@ -43,23 +36,159 @@ using namespace at::native::vulkan;
       api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,  \
       allocate_memory);
 
+//
+// Simplified versions of ATen Vulkan legacy functions
+//
+
+void record_nchw_to_buffer_op(
+    api::Context* const context,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst) {
+  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_dst.gpu_numel());
+  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {32u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_dst.get_cpu_buffer_metadata());
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      VK_KERNEL(buffer_to_buffer),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_dst.buffer_metadata(),
+      src_buffer,
+      cpu_buffer_metadata.buffer());
+}
+
+bool record_buffer_to_nchw_op(
+    api::Context* const context,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer) {
+  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_src.numel());
+  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {4u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_src.get_cpu_buffer_metadata());
+  api::PipelineBarrier pipeline_barrier{};
+
+  return context->submit_compute_job(
+      VK_KERNEL(buffer_to_buffer),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      dst_buffer,
+      cpu_buffer_metadata.buffer(),
+      v_src.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_src.buffer_metadata());
+}
+
+void record_nchw_to_image_op(
+    api::Context* const context,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst) {
+  api::utils::uvec3 global_size = v_dst.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  api::UniformParamsBuffer params(context, create_staging_params(v_dst));
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      get_nchw_to_image_shader(v_dst),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      src_buffer,
+      params.buffer());
+}
+
+bool record_image_to_nchw_op(
+    api::Context* const context,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer) {
+  api::utils::uvec3 global_size = v_src.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  api::UniformParamsBuffer params(context, create_staging_params(v_src));
+  api::PipelineBarrier pipeline_barrier{};
+
+  return context->submit_compute_job(
+      get_image_to_nchw_shader(v_src),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_src.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      dst_buffer,
+      params.buffer());
+}
+
+void record_arithmetic_op(
+    api::Context* const context,
+    const api::ShaderInfo& compute_shader,
+    vTensor& v_in1,
+    vTensor& v_in2,
+    vTensor& v_dst,
+    const float alpha) {
+  api::utils::uvec3 global_size = v_dst.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  ArithmeticParams block{
+      get_size_as_ivec4(v_dst),
+      get_size_as_ivec4(v_in1),
+      get_size_as_ivec4(v_in2),
+      alpha,
+  };
+  api::UniformParamsBuffer params(context, block);
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      compute_shader,
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_in1.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      v_in2.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      params.buffer());
+}
+
+//
+// Utilities
+//
+
 void fill_vtensor(vTensor& vten, std::vector<float>& data) {
   api::StorageBuffer staging_buffer(api::context(), api::kFloat, data.size());
   copy_ptr_to_staging(data.data(), staging_buffer, vten.gpu_nbytes());
 
   if (vten.storage_type() == api::StorageType::BUFFER) {
-    packing::record_nchw_to_buffer_op(
-        api::context(), staging_buffer.buffer(), vten, {}, VK_NULL_HANDLE);
+    record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten);
   } else {
-    api::ShaderInfo compute_shader = packing::get_nchw_to_image_shader(vten);
-    packing::record_nchw_to_image_op(
-        api::context(),
-        compute_shader,
-        staging_buffer.buffer(),
-        vten,
-        {},
-        VK_NULL_HANDLE);
+    record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten);
   }
 }
 
@@ -75,17 +204,9 @@ void extract_vtensor(vTensor& vten, std::vector<float>& data) {
       api::context(), api::kFloat, vten.gpu_numel());
 
   if (vten.storage_type() == api::StorageType::BUFFER) {
-    packing::record_buffer_to_nchw_op(
-        api::context(), vten, staging_buffer.buffer(), {}, VK_NULL_HANDLE);
+    record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
   } else {
-    api::ShaderInfo compute_shader = packing::get_image_to_nchw_shader(vten);
-    packing::record_image_to_nchw_op(
-        api::context(),
-        compute_shader,
-        vten,
-        staging_buffer.buffer(),
-        {},
-        VK_NULL_HANDLE);
+    record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
   }
 
   api::VulkanFence fence = api::context()->fences().get_fence();
@@ -208,14 +329,14 @@ TEST_F(VulkanComputeAPITest, texture_add_sanity_check) {
   std::fill(data_b.begin(), data_b.end(), 1.5f);
 
   // Add shader kernel
-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);
 
   // Fill input tensors
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);
 
   // a + b -> c
-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);
 
   // Extract output tensor
   std::vector<float> data_out(c.gpu_numel());
@@ -244,7 +365,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
   std::vector<float> data_b(b.gpu_numel());
   std::fill(data_b.begin(), data_b.end(), 1.5f);
 
-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);
 
   // Allocate memory at the last possible opportunity
   api::MemoryAllocation a_mem = allocate_memory_for(a);
@@ -260,7 +381,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);
 
-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);
 
   std::vector<float> data_c(c.gpu_numel());
   extract_vtensor(c, data_c);
 
@@ -310,20 +431,20 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) {
   std::fill(data_d.begin(), data_d.end(), 1.0f);
 
   // Get shader kernel for add
-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);
 
   // First, fill a and b with data
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);
 
   // a + b -> c
-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);
 
   // Now d can be filled with data
   fill_vtensor(d, data_d);
 
   // c + d -> e
-  arithmetic::record_op(api::context(), kernel, c, d, e, 1.0f);
+  record_arithmetic_op(api::context(), kernel, c, d, e, 1.0f);
 
   // Extract data from e
   std::vector<float> data_e(e.gpu_numel());
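
For reference, the duplicate-symbol failure described in the commit message can be reproduced in isolation. The sketch below uses hypothetical file and function names (`common_a.cpp`, `common_b.cpp`, `demo::adaptive_work_group_size`); only the mechanism mirrors the real `Common.cpp` / `OpUtils.cpp` conflict, namely two translation units that each provide a non-inline definition of the same function and are then linked into one binary:

```cpp
// common_a.cpp -- stands in for ATen/native/vulkan/impl/Common.cpp,
// compiled into the analogue of libtorch_vulkan_ops.a
namespace demo {
int adaptive_work_group_size(int global_size) {
  return global_size > 64 ? 8 : 4;  // non-inline definition #1
}
} // namespace demo

// common_b.cpp -- stands in for backends/vulkan/runtime/graph/ops/OpUtils.cpp,
// compiled into the analogue of libvulkan_graph_runtime.a
namespace demo {
int adaptive_work_group_size(int global_size) {
  return global_size > 64 ? 8 : 4;  // definition #2: same mangled symbol
}
} // namespace demo

// main.cpp -- pulls in both definitions, like vulkan_compute_api_test_bin
// linking both archives
namespace demo {
int adaptive_work_group_size(int global_size);  // declaration only
}

int main() {
  return demo::adaptive_work_group_size(128);
}
```

Building all three together (`clang++ main.cpp common_a.cpp common_b.cpp`) fails at link time with `duplicate symbol: demo::adaptive_work_group_size(int)`. Dropping one of the two definitions from the link line, as this patch does for the test binary's dependency graph, resolves the conflict.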