diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 379cf1000b0dc..7ef38ec1a81bd 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -1521,7 +1521,7 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F, // process ESIMD builtins that go through special handling instead of // the translation procedure // TODO FIXME slm_init should be made top-level __esimd_slm_init - if (Name.startswith("N2cl4sycl3ext5intel12experimental5esimd8slm_init")) { + if (Name.startswith("__esimd_slm_init")) { // tag the kernel with meta-data SLMSize, and remove this builtin translateSLMInit(*CI); ToErase.push_back(CI); diff --git a/llvm/test/SYCLLowerIR/ESIMD/lower_intrins.ll b/llvm/test/SYCLLowerIR/ESIMD/lower_intrins.ll index 736babde2bed7..cd4111594209d 100644 --- a/llvm/test/SYCLLowerIR/ESIMD/lower_intrins.ll +++ b/llvm/test/SYCLLowerIR/ESIMD/lower_intrins.ll @@ -81,7 +81,6 @@ define dso_local spir_kernel void @FUNC_30() { ; CHECK: define dso_local spir_kernel void @FUNC_30() call spir_func void @_ZN2cl4sycl3ext5intel12experimental5esimd8slm_initEj(i32 1023) ret void -; CHECK-NEXT: ret void } define dso_local spir_func <16 x i32> @FUNC_32() { @@ -327,6 +326,5 @@ attributes #0 = { "genx_byte_offset"="192" "genx_volatile" } !genx.kernels = !{!0} !0 = !{void ()* @"FUNC_30", !"FUNC_30", !1, i32 0, i32 0, !1, !2, i32 0, i32 0} -; CHECK: !0 = !{void ()* @FUNC_30, !"FUNC_30", !1, i32 1023, i32 0, !1, !2, i32 0, i32 0} !1 = !{i32 0, i32 0} !2 = !{} diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index 2ca46131e039d..e0c8bfbd5183d 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -98,6 +98,11 @@ static inline constexpr saturation_on_tag saturation_on{}; /// Represents a pixel's channel. enum class rgba_channel : uint8_t { R, G, B, A }; +/// Surface index type. Surface is an internal representation of a memory block +/// addressable by GPU in "stateful" memory model, and each surface is +/// identified by its "binding table index" - surface index. +using SurfaceIndex = unsigned int; + namespace detail { template static inline constexpr uint8_t ch = 1 << static_cast(Ch); @@ -105,6 +110,11 @@ static inline constexpr uint8_t chR = ch; static inline constexpr uint8_t chG = ch; static inline constexpr uint8_t chB = ch; static inline constexpr uint8_t chA = ch; + +// Shared Local Memory Binding Table Index (aka surface index). +static inline constexpr SurfaceIndex SLM_BTI = 254; +static inline constexpr SurfaceIndex INVALID_BTI = + static_cast(-1); } // namespace detail /// Represents a pixel's channel mask - all possible combinations of enabled @@ -190,11 +200,6 @@ enum class split_barrier_action : uint8_t { signal = 1, // split barrier signal }; -/// Surface index type. Surface is an internal representation of a memory block -/// addressable by GPU in "stateful" memory model, and each surface is -/// identified by its "binding table index" - surface index. -using SurfaceIndex = unsigned int; - /// @} sycl_esimd_core } // namespace esimd diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index 6c2fbf1cf1efe..0fe4c57374b6f 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -21,6 +21,21 @@ #include +#ifndef __SYCL_DEVICE_ONLY__ +// ESIMD_CPU Emulation support using esimd_cpu plugin + +#include +#include +#include +#include + +// Channel Mask Array for scaled-gather/scatter +const std::array<__SEIEE::rgba_channel, 4> ChannelMaskArray{ + __SEIEE::rgba_channel::R, __SEIEE::rgba_channel::G, + __SEIEE::rgba_channel::B, __SEIEE::rgba_channel::A}; + +#endif // ifndef __SYCL_DEVICE_ONLY__ + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ext { @@ -37,15 +52,11 @@ class AccessorPrivateProxy { static auto getNativeImageObj(const AccessorTy &Acc) { return Acc.getNativeImageObj(); } -#else - template - static auto getImageRange(const AccessorTy &Acc) { - return Acc.getAccessRange(); - } - static auto getElemSize(const sycl::detail::AccessorBaseHost &Acc) { - return Acc.getElemSize(); +#else // __SYCL_DEVICE_ONLY__ + static void *getPtr(const sycl::detail::AccessorBaseHost &Acc) { + return Acc.getPtr(); } -#endif +#endif // __SYCL_DEVICE_ONLY__ }; template V; + __SEIEED::vector_type_t V = 0; ElemsPerAddr = __SEIEED::ElemsPerAddrDecoding(ElemsPerAddr); if (sizeof(Ty) == 2) ElemsPerAddr = ElemsPerAddr / 2; @@ -200,7 +211,40 @@ __esimd_oword_ld_unaligned(SurfIndAliasTy surf_ind, uint32_t offset) ; #else { - throw cl::sycl::feature_not_supported(); + __SEIEED::vector_type_t retv; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + if (surf_ind == __SEIEE::detail::SLM_BTI) { + // O-word/Block load for Shared Local Memory + // __SEIEE::detail::SLM_BTI is special binding table index for SLM + char *SlmBase = I->__cm_emu_get_slm_ptr(); + for (int i = 0; i < N; ++i) { + Ty *SlmAddr = reinterpret_cast(offset + SlmBase); + retv[i] = *SlmAddr; + offset += sizeof(Ty); + } + } else { + // O-word/Block load for regular surface indexed by surf_ind + char *readBase; + uint32_t width; + std::mutex *mutexLock; + + I->sycl_get_cm_buffer_params_index_ptr(surf_ind, &readBase, &width, + &mutexLock); + + std::unique_lock lock(*mutexLock); + + for (int idx = 0; idx < N; idx++) { + if (offset >= width) { + retv[idx] = 0; + } else { + retv[idx] = *((Ty *)(readBase + offset)); + } + offset += (uint32_t)sizeof(Ty); + } + } + return retv; } #endif // __SYCL_DEVICE_ONLY__ @@ -212,7 +256,42 @@ __ESIMD_INTRIN void __esimd_oword_st(SurfIndAliasTy surf_ind, uint32_t offset, ; #else { - throw cl::sycl::feature_not_supported(); + offset <<= 4; + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + if (surf_ind == __SEIEE::detail::SLM_BTI) { + // O-word/Block store for Shared Local Memory + // __SEIEE::detail::SLM_BTI is special binding table index for SLM + char *SlmBase = I->__cm_emu_get_slm_ptr(); + for (int i = 0; i < N; ++i) { + Ty *SlmAddr = reinterpret_cast(offset + SlmBase); + *SlmAddr = vals[i]; + offset += sizeof(Ty); + } + } else { + // O-word/Block store for regular surface indexed by surf_ind + char *writeBase; + uint32_t width; + std::mutex *mutexLock; + + I->sycl_get_cm_buffer_params_index_ptr(surf_ind, &writeBase, &width, + &mutexLock); + + std::unique_lock lock(*mutexLock); + + for (int idx = 0; idx < N; idx++) { + if (offset < width) { + *((Ty *)(writeBase + offset)) = vals[idx]; + } else { + break; + } + offset += (uint32_t)sizeof(Ty); + } + + // TODO : Optimize + I->cm_fence_ptr(); + } } #endif // __SYCL_DEVICE_ONLY__ @@ -225,44 +304,20 @@ __esimd_svm_gather4_scaled(__SEIEED::vector_type_t addrs, ; #else { - __SEIEED::vector_type_t V; + __SEIEED::vector_type_t V = 0; unsigned int Next = 0; - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::R)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I]); - V[Next] = *Addr; - } - } - } - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::G)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty)); - V[Next] = *Addr; - } - } - } - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::B)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty)); - V[Next] = *Addr; - } - } - } - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::A)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty) + - sizeof(Ty)); - V[Next] = *Addr; + uint64_t Offset = 0; + + for (const auto &channel : ChannelMaskArray) { + if (__SEIEE::is_channel_enabled(Mask, channel)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + Offset); + V[Next] = *Addr; + } } } + Offset += (uint64_t)sizeof(Ty); } return V; @@ -281,42 +336,18 @@ __ESIMD_INTRIN void __esimd_svm_scatter4_scaled( { __SEIEED::vector_type_t V; unsigned int Next = 0; - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::R)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I]); - *Addr = vals[Next]; - } - } - } - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::G)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty)); - *Addr = vals[Next]; - } - } - } - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::B)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty)); - *Addr = vals[Next]; - } - } - } - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::A)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty) + - sizeof(Ty)); - *Addr = vals[Next]; + uint64_t Offset = 0; + + for (const auto &channel : ChannelMaskArray) { + if (__SEIEE::is_channel_enabled(Mask, channel)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + Offset); + *Addr = vals[Next]; + } } } + Offset += (uint64_t)sizeof(Ty); } } #endif // __SYCL_DEVICE_ONLY__ @@ -396,7 +427,43 @@ __esimd_scatter_scaled(__SEIEED::simd_mask_storage_t pred, static_assert(N == 1 || N == 8 || N == 16 || N == 32); static_assert(TySizeLog2 <= 2); static_assert(std::is_integral::value || TySizeLog2 == 2); - throw cl::sycl::feature_not_supported(); + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + if (surf_ind == __SEIEE::detail::SLM_BTI) { + // Scattered-store for Shared Local Memory + // __SEIEE::detail::SLM_BTI is special binding table index for SLM + assert(global_offset == 0); + char *SlmBase = I->__cm_emu_get_slm_ptr(); + for (int i = 0; i < N; ++i) { + if (pred[i]) { + Ty *addr = reinterpret_cast(elem_offsets[i] + SlmBase); + *addr = vals[i]; + } + } + } else { + // Scattered-store for regular surface indexed by surf_ind + char *writeBase; + uint32_t width; + std::mutex *mutexLock; + + I->sycl_get_cm_buffer_params_index_ptr(surf_ind, &writeBase, &width, + &mutexLock); + writeBase += global_offset; + + std::unique_lock lock(*mutexLock); + + for (int idx = 0; idx < N; idx++) { + if (pred[idx]) { + Ty *addr = reinterpret_cast(elem_offsets[idx] + writeBase); + *addr = vals[idx]; + } + } + + // TODO : Optimize + I->cm_fence_ptr(); + } } #endif // __SYCL_DEVICE_ONLY__ @@ -422,7 +489,23 @@ __esimd_svm_atomic1(__SEIEED::vector_type_t addrs, ; #else { - throw cl::sycl::feature_not_supported(); + __SEIEED::vector_type_t retv; + + for (int i = 0; i < N; i++) { + if (pred[i]) { + Ty *p = reinterpret_cast(addrs[i]); + + switch (Op) { + case __SEIEE::atomic_op::add: + retv[i] = atomic_add_fetch(p, src0[i]); + break; + default: + throw cl::sycl::feature_not_supported(); + } + } + } + + return retv; } #endif // __SYCL_DEVICE_ONLY__ @@ -440,13 +523,22 @@ __esimd_svm_atomic2(__SEIEED::vector_type_t addrs, } #endif // __SYCL_DEVICE_ONLY__ +__ESIMD_INTRIN void __esimd_slm_init(size_t size) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + sycl::detail::getESIMDDeviceInterface()->cm_slm_init_ptr(size); +} +#endif // ifndef __SYCL_DEVICE_ONLY__ + // esimd_barrier, generic group barrier __ESIMD_INTRIN void __esimd_barrier() #ifdef __SYCL_DEVICE_ONLY__ ; #else { - throw cl::sycl::feature_not_supported(); + sycl::detail::getESIMDDeviceInterface()->cm_barrier_ptr(); } #endif // __SYCL_DEVICE_ONLY__ @@ -456,7 +548,7 @@ __ESIMD_INTRIN void __esimd_sbarrier(__SEIEE::split_barrier_action flag) ; #else { - throw cl::sycl::feature_not_supported(); + sycl::detail::getESIMDDeviceInterface()->cm_sbarrier_ptr((uint32_t)flag); } #endif // __SYCL_DEVICE_ONLY__ @@ -466,7 +558,7 @@ __ESIMD_INTRIN void __esimd_fence(uint8_t cntl) ; #else { - throw cl::sycl::feature_not_supported(); + sycl::detail::getESIMDDeviceInterface()->cm_fence_ptr(); } #endif // __SYCL_DEVICE_ONLY__ @@ -481,29 +573,66 @@ __esimd_gather_scaled(__SEIEED::simd_mask_storage_t pred, ; #else { - throw cl::sycl::feature_not_supported(); + __SEIEED::vector_type_t retv = 0; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + if (surf_ind == __SEIEE::detail::SLM_BTI) { + // Scattered-load for Shared Local Memory + // __SEIEE::detail::SLM_BTI is special binding table index for SLM + assert(global_offset == 0); + char *SlmBase = I->__cm_emu_get_slm_ptr(); + for (int i = 0; i < N; ++i) { + if (pred[i]) { + Ty *addr = reinterpret_cast(addrs[i] + SlmBase); + retv[i] = *addr; + } + } + } else { + // Scattered-load for regular surface indexed by surf_ind + char *readBase; + uint32_t width; + std::mutex *mutexLock; + + I->sycl_get_cm_buffer_params_index_ptr(surf_ind, &readBase, &width, + &mutexLock); + readBase += global_offset; + + std::unique_lock lock(*mutexLock); + + for (int idx = 0; idx < N; idx++) { + if (pred[idx]) { + Ty *addr = reinterpret_cast(addrs[idx] + readBase); + retv[idx] = *addr; + } + } + + // TODO : Optimize + I->cm_fence_ptr(); + } + + return retv; } #endif // __SYCL_DEVICE_ONLY__ -/// Predicated (masked) scaled gather from a surface. -/// -/// Template (compile-time constant) parameters: -/// @tparam Ty - element type -/// @tparam N - the number of elements to read -/// @tparam SurfIndAliasTy - "surface index alias" type - internal type in the -/// accessor used to denote the surface -/// @tparam TySizeLog2 - Log2 of the number of bytes written per element: -/// 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes -/// @tparam Scale - offset scale; only 0 is supported for now -/// -/// Formal parameters: -/// @param surf_ind - the surface index, taken from the SYCL memory object -/// @param global_offset - offset added to each individual element's offset to -/// compute actual memory access offset for that element -/// @param offsets - per-element offsets -/// @param pred - per-element predicates; elements with zero corresponding -/// predicates are not written -/// @return - elements read ("gathered") from memory +// Predicated (masked) scaled gather from a surface. +// +// Template (compile-time constant) parameters: +// @tparam Ty - element type +// @tparam N - the number of elements to read +// @tparam SurfIndAliasTy - "surface index alias" type - internal type in the +// accessor used to denote the surface +// @tparam TySizeLog2 - Log2 of the number of bytes written per element: +// 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes +// @tparam Scale - offset scale; only 0 is supported for now +// +// Formal parameters: +// @param surf_ind - the surface index, taken from the SYCL memory object +// @param global_offset - offset added to each individual element's offset to +// compute actual memory access offset for that element +// @param offsets - per-element offsets +// @param pred - per-element predicates; elements with zero corresponding +// predicates are not written +// @return - elements read ("gathered") from memory template @@ -515,7 +644,43 @@ __esimd_gather_masked_scaled2(SurfIndAliasTy surf_ind, uint32_t global_offset, ; #else { - throw cl::sycl::feature_not_supported(); + static_assert(Scale == 0); + + __SEIEED::vector_type_t retv = 0; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + if (surf_ind == __SEIEE::detail::SLM_BTI) { + // __SEIEE::detail::SLM_BTI is special binding table index for SLM + assert(global_offset == 0); + char *SlmBase = I->__cm_emu_get_slm_ptr(); + for (int idx = 0; idx < N; ++idx) { + if (pred[idx]) { + Ty *addr = reinterpret_cast(offsets[idx] + SlmBase); + retv[idx] = *addr; + } + } + } else { + char *readBase; + uint32_t width; + std::mutex *mutexLock; + + I->sycl_get_cm_buffer_params_index_ptr(surf_ind, &readBase, &width, + &mutexLock); + + readBase += global_offset; + std::unique_lock lock(*mutexLock); + for (int idx = 0; idx < N; idx++) { + if (pred[idx]) { + Ty *addr = reinterpret_cast(offsets[idx] + readBase); + retv[idx] = *addr; + } + } + + // TODO : Optimize + I->cm_fence_ptr(); + } + return retv; } #endif // __SYCL_DEVICE_ONLY__ @@ -528,7 +693,42 @@ __esimd_oword_ld(SurfIndAliasTy surf_ind, uint32_t addr) ; #else { - throw cl::sycl::feature_not_supported(); + addr <<= 4; + + __SEIEED::vector_type_t retv; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + if (surf_ind == __SEIEE::detail::SLM_BTI) { + // O-word/Block load for Shared Local Memory + // __SEIEE::detail::SLM_BTI is special binding table index for SLM + char *SlmBase = I->__cm_emu_get_slm_ptr(); + for (int i = 0; i < N; ++i) { + Ty *SlmAddr = reinterpret_cast(addr + SlmBase); + retv[i] = *SlmAddr; + addr += sizeof(Ty); + } + } else { + // O-word/Block load for regular surface indexed by surf_ind + char *readBase; + uint32_t width; + std::mutex *mutexLock; + + I->sycl_get_cm_buffer_params_index_ptr(surf_ind, &readBase, &width, + &mutexLock); + + std::unique_lock lock(*mutexLock); + + for (int idx = 0; idx < N; idx++) { + if (addr >= width) { + retv[idx] = 0; + } else { + retv[idx] = *((Ty *)(readBase + addr)); + } + addr += (uint32_t)sizeof(Ty); + } + } + return retv; } #endif // __SYCL_DEVICE_ONLY__ @@ -543,7 +743,37 @@ __esimd_gather4_scaled(__SEIEED::simd_mask_storage_t pred, ; #else { - throw cl::sycl::feature_not_supported(); + __SEIEED::vector_type_t retv = 0; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *ReadBase; + unsigned int Next = 0; + + if (surf_ind == __SEIEE::detail::SLM_BTI) { + ReadBase = I->__cm_emu_get_slm_ptr(); + } else { + uint32_t width; + std::mutex *mutexLock; + I->sycl_get_cm_buffer_params_index_ptr(surf_ind, &ReadBase, &width, + &mutexLock); + std::unique_lock lock(*mutexLock); + } + + ReadBase += global_offset; + + for (const auto &channel : ChannelMaskArray) { + if (__SEIEE::is_channel_enabled(Mask, channel)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(ReadBase + offsets[I]); + retv[Next] = *Addr; + } + } + } + ReadBase += (uint64_t)sizeof(Ty); + } + + return retv; } #endif // __SYCL_DEVICE_ONLY__ @@ -558,7 +788,34 @@ __ESIMD_INTRIN void __esimd_scatter4_scaled( ; #else { - throw cl::sycl::feature_not_supported(); + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *WriteBase; + unsigned int Next = 0; + + if (surf_ind == __SEIEE::detail::SLM_BTI) { + WriteBase = I->__cm_emu_get_slm_ptr(); + } else { + uint32_t width; + std::mutex *mutexLock; + I->sycl_get_cm_buffer_params_index_ptr(surf_ind, &WriteBase, &width, + &mutexLock); + std::unique_lock lock(*mutexLock); + } + + WriteBase += global_offset; + + for (const auto &channel : ChannelMaskArray) { + if (__SEIEE::is_channel_enabled(Mask, channel)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(WriteBase + offsets[I]); + *Addr = vals[Next]; + } + } + } + WriteBase += (uint64_t)sizeof(Ty); + } } #endif // __SYCL_DEVICE_ONLY__ @@ -572,7 +829,29 @@ __esimd_dword_atomic0(__SEIEED::simd_mask_storage_t pred, ; #else { - throw cl::sycl::feature_not_supported(); + __SEIEED::vector_type_t retv; + + if (surf_ind == __SEIEE::detail::SLM_BTI) { + char *WriteBase = + sycl::detail::getESIMDDeviceInterface()->__cm_emu_get_slm_ptr(); + + for (int i = 0; i < N; i++) { + if (pred[i]) { + Ty *p = reinterpret_cast(addrs[i] + WriteBase); + + switch (Op) { + case __SEIEE::atomic_op::inc: + retv[i] = atomic_add_fetch(p, 1); + break; + default: + throw cl::sycl::feature_not_supported(); + } + } + } + } else { + throw cl::sycl::feature_not_supported(); + } + return retv; } #endif // __SYCL_DEVICE_ONLY__ @@ -626,48 +905,106 @@ __esimd_media_ld(TACC handle, unsigned x, unsigned y) ; #else { - // On host the input surface is modeled as sycl image 2d object, - // and the read/write access is done through accessor, - // which is passed in as the handle argument. - auto range = __SEIEED::AccessorPrivateProxy::getImageRange(handle); - unsigned bpp = __SEIEED::AccessorPrivateProxy::getElemSize(handle); - unsigned vpp = bpp / sizeof(Ty); - unsigned int i = x / bpp; - unsigned int j = y; - - assert(x % bpp == 0); - unsigned int xbound = range[0] - 1; - unsigned int ybound = range[1] - 1; - __SEIEED::vector_type_t vals; - for (int row = 0; row < M; row++) { - for (int col = 0; col < N; col += vpp) { - unsigned int xoff = (i > xbound) ? xbound : i; - unsigned int yoff = (j > ybound) ? ybound : j; - auto coords = cl::sycl::cl_int2(xoff, yoff); - cl::sycl::cl_uint4 data = handle.read(coords); - - __SEIEED::vector_type_t res; - for (int idx = 0; idx < 4; idx++) { - res[idx] = data[idx]; + char *readBase; + uint32_t bpp; + uint32_t imgWidth; + uint32_t imgHeight; + std::mutex *mutexLock; + + assert((handle != __SEIEE::detail::SLM_BTI) && + "__esimd_media_ld cannot access SLM"); + + sycl::detail::getESIMDDeviceInterface()->sycl_get_cm_image_params_index_ptr( + handle, &readBase, &imgWidth, &imgHeight, &bpp, &mutexLock); + + std::unique_lock lock(*mutexLock); + + int x_pos_a, y_pos_a, offset, index; + + // TODO : Remove intermediate 'in' matrix + std::vector> in(M, std::vector(N)); + int R = M; + int C = N; + for (int i = 0; i < R; i++) { + for (int j = 0; j < C; j++) { + x_pos_a = x + j * sizeof(Ty); + { y_pos_a = y + i; } + // We should check the boundary condition based on sizeof(Ty), x_pos_a is + // 0-based Note: Use a signed variable; otherwise sizeof(Ty) is unsigned + if ((x_pos_a + sizeof(Ty)) > imgWidth) { + // If we're trying to read outside the boundary, limit the value of + // x_pos_a Assumption -- We don't this situation: + // x_pos_a width's boundary + // | | + // <---type(Ty)---> + // At most x_pos_a+sizeof(Ty) is exactly at the boundary. + x_pos_a = imgWidth; + } + if (y_pos_a > imgHeight - 1) { + y_pos_a = imgHeight - 1; + } + if (y_pos_a < 0) { + y_pos_a = 0; + } + { + if (x_pos_a < 0) { + // Need to align x position to bbp + int offset = x % bpp; + x_pos_a -= offset; + } + while (x_pos_a < 0) { + // If we're trying to read outside the left boundary, increase x_pos_a + x_pos_a += bpp; + } } - constexpr int refN = sizeof(cl::sycl::cl_uint4) / sizeof(Ty); - unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp; - using refTy = __SEIEED::vector_type_t; - auto ref = reinterpret_cast(res); - - unsigned int offset1 = col + row * N; - unsigned int offset2 = 0; - for (int idx = 0; idx < vpp; idx++) { - vals[offset1] = ref[offset2]; - offset1++; - offset2 += stride; + if (x_pos_a >= imgWidth) { + { + x_pos_a = x_pos_a - bpp; + for (uint byte_count = 0; byte_count < sizeof(Ty); byte_count++) { + if (x_pos_a >= imgWidth) { + x_pos_a = x_pos_a - bpp; + } + offset = y_pos_a * imgWidth + x_pos_a; + + /* + If destination size per element is less then or equal pixel size + of the surface move the pixel value accross the destination + elements. If destination size per element is greater then pixel + size of the surface replicate pixel value in the destination + element. + */ + if (sizeof(Ty) <= bpp) { + for (uint bpp_count = 0; j < C && bpp_count < bpp; + j++, bpp_count += sizeof(Ty)) { + in[i][j] = *((Ty *)(readBase + offset + bpp_count)); + } + j--; + break; + } else { + // ((unsigned char*)in.get_addr(i*C+j))[byte_count] = *((unsigned + // char*)((char*)buff_iter->p + offset)); + unsigned char *pTempBase = + ((unsigned char *)in[i].data()) + j * sizeof(Ty); + pTempBase[byte_count] = *((unsigned char *)(readBase + offset)); + } + + x_pos_a = x_pos_a + 1; + } + x_pos_a = imgWidth; + } + } else { + offset = y_pos_a * imgWidth + x_pos_a; + { in[i][j] = *((Ty *)(readBase + offset)); } } - i++; } - i = x / bpp; - j++; + } + + for (auto i = 0, k = 0; i < M; i++) { + for (auto j = 0; j < N; j++) { + vals[k++] = in[i][j]; + } } return vals; @@ -696,110 +1033,127 @@ __ESIMD_INTRIN void __esimd_media_st(TACC handle, unsigned x, unsigned y, ; #else { - unsigned bpp = __SEIEED::AccessorPrivateProxy::getElemSize(handle); - unsigned vpp = bpp / sizeof(Ty); - auto range = __SEIEED::AccessorPrivateProxy::getImageRange(handle); - unsigned int i = x / bpp; - unsigned int j = y; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); - assert(x % bpp == 0); + char *writeBase; + uint32_t bpp; + uint32_t imgWidth; + uint32_t imgHeight; + std::mutex *mutexLock; - for (int row = 0; row < M; row++) { - for (int col = 0; col < N; col += vpp) { - constexpr int Sz = sizeof(cl::sycl::cl_uint4) / sizeof(Ty); - __SEIEED::vector_type_t res = 0; + assert((handle != __SEIEE::detail::SLM_BTI) && + "__esimd_media_ld cannot access SLM"); - unsigned int offset1 = col + row * N; - unsigned int offset2 = 0; - unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp; - for (int idx = 0; idx < vpp; idx++) { - res[offset2] = vals[offset1]; - offset1++; - offset2 += stride; - } + I->sycl_get_cm_image_params_index_ptr(handle, &writeBase, &imgWidth, + &imgHeight, &bpp, &mutexLock); - using refTy = __SEIEED::vector_type_t; - auto ref = reinterpret_cast(res); + int x_pos_a, y_pos_a, offset; + + assert((x % 4) == 0); + assert((N * sizeof(Ty)) % 4 == 0); + + // TODO : Remove intermediate 'out' matrix + std::vector> out(M, std::vector(N)); + + std::unique_lock lock(*mutexLock); + + for (int i = 0, k = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + out[i][j] = vals[k++]; + } + } - cl::sycl::cl_uint4 data; - for (int idx = 0; idx < 4; idx++) { - data[idx] = ref[idx]; + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + x_pos_a = x + j * sizeof(Ty); + { y_pos_a = y + i; } + if ((int)x_pos_a < 0) { + continue; + } + if ((int)y_pos_a < 0) { + continue; + } + if ((int)(x_pos_a + sizeof(Ty)) > imgWidth) { + continue; } - if (i < range[0] && j < range[1]) { - auto coords = cl::sycl::cl_int2(i, j); - handle.write(coords, data); + if ((int)y_pos_a > imgHeight - 1) { + continue; } - i++; + offset = y_pos_a * imgWidth + x_pos_a; + *((Ty *)(writeBase + offset)) = out[i][j]; } - i = x / bpp; - j++; } + + // TODO : Optimize + I->cm_fence_ptr(); } #endif // __SYCL_DEVICE_ONLY__ -/// \brief Converts given value to a surface index. -/// The input must always be a result of -/// detail::AccessorPrivateProxy::getNativeImageObj(acc) -/// where acc is a buffer or image accessor. If the result is, say, 'obj', then -/// 'obj' is really a value of the surface index kept in a differently typed -/// accessor field. Front-end compilation time type of 'obj' is either -/// ConcreteASPtrType (detail::DecoratedType::type *), for a buffer -/// or -/// image{1,2,3}d_t OpenCL type for an image -/// But when doing code generation, FE replaces e.g. '__read_only image2d_t' FE -/// type with '%opencl.image2d_ro_t addrspace(1) *' LLVM type. -/// image2d_t can neither be reinterpret_cast'ed from pointer to intptr_t -/// (because it is not a pointer at FE translation time), nor it can be -/// bit_cast'ed to intptr_t (because it is not trivially copyable). This -/// intrinsic takes advantage of the fact that in LLVM IR 'obj' is always a -/// pointer, where we can do ptr to uint32_t conversion. -/// This intrinsic can be called only from the device code, as -/// accessor => memory handle translation for host is different. -/// @param acc the SYCL accessor. -/// getNativeImageObj. -/// Returns the binding table index value. +// \brief Converts given value to a surface index. +// The input must always be a result of +// detail::AccessorPrivateProxy::getNativeImageObj(acc) +// where acc is a buffer or image accessor. If the result is, say, 'obj', then +// 'obj' is really a value of the surface index kept in a differently typed +// accessor field. Front-end compilation time type of 'obj' is either +// ConcreteASPtrType (detail::DecoratedType::type *), for a buffer +// or +// image{1,2,3}d_t OpenCL type for an image +// But when doing code generation, FE replaces e.g. '__read_only image2d_t' FE +// type with '%opencl.image2d_ro_t addrspace(1) *' LLVM type. +// image2d_t can neither be reinterpret_cast'ed from pointer to intptr_t +// (because it is not a pointer at FE translation time), nor it can be +// bit_cast'ed to intptr_t (because it is not trivially copyable). This +// intrinsic takes advantage of the fact that in LLVM IR 'obj' is always a +// pointer, where we can do ptr to uint32_t conversion. +// This intrinsic can be called only from the device code, as +// accessor => memory handle translation for host is different. +// @param acc the SYCL accessor. +// getNativeImageObj. +// Returns the binding table index value. template __ESIMD_INTRIN __SEIEE::SurfaceIndex __esimd_get_surface_index(MemObjTy obj) #ifdef __SYCL_DEVICE_ONLY__ ; -#else +#else // __SYCL_DEVICE_ONLY__ { - throw cl::sycl::feature_not_supported(); + return sycl::detail::getESIMDDeviceInterface()->sycl_get_cm_surface_index_ptr( + __SEIEED::AccessorPrivateProxy::getPtr(obj)); } #endif // __SYCL_DEVICE_ONLY__ -/// \brief Raw sends load. -/// -/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). -/// -/// @param execSize the execution size, which must be a compile time constant. -/// -/// @param pred the predicate to specify enabled channels. -/// -/// @param numSrc0 the number of GRFs for source-0, which must be a compile time -/// constant. -/// -/// @param numSrc1 the number of GRFs for source-1, which must be a compile time -/// constant. -/// -/// @param numDst the number of GRFs for destination, which must be a compile -/// time constant. -/// -/// @param sfid the shared function ID, which must be a compile time constant. -/// -/// @param exDesc the extended message descriptor. -/// -/// @param msgDesc the message descriptor. -/// -/// @param msgSrc0 the first source operand of send message. -/// -/// @param msgSrc1 the second source operand of send message. -/// -/// @param msgDst the destination operand of send message. -/// -/// Returns a simd vector of type Ty1 and size N1. -/// +// \brief Raw sends load. +// +// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +// +// @param execSize the execution size, which must be a compile time constant. +// +// @param pred the predicate to specify enabled channels. +// +// @param numSrc0 the number of GRFs for source-0, which must be a compile time +// constant. +// +// @param numSrc1 the number of GRFs for source-1, which must be a compile time +// constant. +// +// @param numDst the number of GRFs for destination, which must be a compile +// time constant. +// +// @param sfid the shared function ID, which must be a compile time constant. +// +// @param exDesc the extended message descriptor. +// +// @param msgDesc the message descriptor. +// +// @param msgSrc0 the first source operand of send message. +// +// @param msgSrc1 the second source operand of send message. +// +// @param msgDst the destination operand of send message. +// +// Returns a simd vector of type Ty1 and size N1. +// template __ESIMD_INTRIN __SEIEED::vector_type_t __esimd_raw_sends2( @@ -816,32 +1170,32 @@ __ESIMD_INTRIN __SEIEED::vector_type_t __esimd_raw_sends2( } #endif // __SYCL_DEVICE_ONLY__ -/// \brief Raw send load. -/// -/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). -/// -/// @param execSize the execution size, which must be a compile time constant. -/// -/// @param pred the predicate to specify enabled channels. -/// -/// @param numSrc0 the number of GRFs for source-0, which must be a compile time -/// constant. -/// -/// @param numDst the number of GRFs for destination, which must be a compile -/// time constant. -/// -/// @param sfid the shared function ID, which must be a compile time constant. -/// -/// @param exDesc the extended message descriptor. -/// -/// @param msgDesc the message descriptor. -/// -/// @param msgSrc0 the first source operand of send message. -/// -/// @param msgDst the destination operand of send message. -/// -/// Returns a simd vector of type Ty1 and size N1. -/// +// \brief Raw send load. +// +// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +// +// @param execSize the execution size, which must be a compile time constant. +// +// @param pred the predicate to specify enabled channels. +// +// @param numSrc0 the number of GRFs for source-0, which must be a compile time +// constant. +// +// @param numDst the number of GRFs for destination, which must be a compile +// time constant. +// +// @param sfid the shared function ID, which must be a compile time constant. +// +// @param exDesc the extended message descriptor. +// +// @param msgDesc the message descriptor. +// +// @param msgSrc0 the first source operand of send message. +// +// @param msgDst the destination operand of send message. +// +// Returns a simd vector of type Ty1 and size N1. +// template __ESIMD_INTRIN __SEIEED::vector_type_t __esimd_raw_send2(uint8_t modifier, uint8_t execSize, @@ -857,30 +1211,30 @@ __esimd_raw_send2(uint8_t modifier, uint8_t execSize, } #endif // __SYCL_DEVICE_ONLY__ -/// \brief Raw sends store. -/// -/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). -/// -/// @param execSize the execution size, which must be a compile time constant. -/// -/// @param pred the predicate to specify enabled channels. -/// -/// @param numSrc0 the number of GRFs for source-0, which must be a compile time -/// constant. -/// -/// @param numSrc1 the number of GRFs for source-1, which must be a compile time -/// constant. -/// -/// @param sfid the shared function ID, which must be a compile time constant. -/// -/// @param exDesc the extended message descriptor. -/// -/// @param msgDesc the message descriptor. -/// -/// @param msgSrc0 the first source operand of send message. -/// -/// @param msgSrc1 the second source operand of send message. -/// +// \brief Raw sends store. +// +// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +// +// @param execSize the execution size, which must be a compile time constant. +// +// @param pred the predicate to specify enabled channels. +// +// @param numSrc0 the number of GRFs for source-0, which must be a compile time +// constant. +// +// @param numSrc1 the number of GRFs for source-1, which must be a compile time +// constant. +// +// @param sfid the shared function ID, which must be a compile time constant. +// +// @param exDesc the extended message descriptor. +// +// @param msgDesc the message descriptor. +// +// @param msgSrc0 the first source operand of send message. +// +// @param msgSrc1 the second source operand of send message. +// template __ESIMD_INTRIN void __esimd_raw_sends2_noresult( uint8_t modifier, uint8_t execSize, __SEIEED::simd_mask_storage_t pred, @@ -895,25 +1249,25 @@ __ESIMD_INTRIN void __esimd_raw_sends2_noresult( } #endif // __SYCL_DEVICE_ONLY__ -/// \brief Raw send store. -/// -/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). -/// -/// @param execSize the execution size, which must be a compile time constant. -/// -/// @param pred the predicate to specify enabled channels. -/// -/// @param numSrc0 the number of GRFs for source-0, which must be a compile time -/// constant. -/// -/// @param sfid the shared function ID, which must be a compile time constant. -/// -/// @param exDesc the extended message descriptor. -/// -/// @param msgDesc the message descriptor. -/// -/// @param msgSrc0 the first source operand of send message. -/// +// \brief Raw send store. +// +// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). +// +// @param execSize the execution size, which must be a compile time constant. +// +// @param pred the predicate to specify enabled channels. +// +// @param numSrc0 the number of GRFs for source-0, which must be a compile time +// constant. +// +// @param sfid the shared function ID, which must be a compile time constant. +// +// @param exDesc the extended message descriptor. +// +// @param msgDesc the message descriptor. +// +// @param msgSrc0 the first source operand of send message. +// template __ESIMD_INTRIN void __esimd_raw_send2_noresult( uint8_t modifier, uint8_t execSize, __SEIEED::simd_mask_storage_t pred, diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimd_emulator_functions_v1.h b/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimd_emulator_functions_v1.h index 3e58dc7dedeb6..2851e9a0ce3b3 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimd_emulator_functions_v1.h +++ b/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimd_emulator_functions_v1.h @@ -51,4 +51,10 @@ void (*sycl_get_cm_buffer_params_ptr)(void *, char **, uint32_t *, void (*sycl_get_cm_image_params_ptr)(void *, char **, uint32_t *, uint32_t *, uint32_t *, std::mutex **); +unsigned int (*sycl_get_cm_surface_index_ptr)(void *); +void (*sycl_get_cm_buffer_params_index_ptr)(unsigned int, char **, uint32_t *, + std::mutex **); +void (*sycl_get_cm_image_params_index_ptr)(unsigned int, char **, uint32_t *, + uint32_t *, uint32_t *, + std::mutex **); /// @endcond ESIMD_EMU diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 731c2d09df1a3..5f145e7421a60 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -48,10 +48,6 @@ namespace detail { // accesses and thus reuse validity checks etc. struct LocalAccessorMarker {}; -// Shared Local Memory Binding Table Index (aka surface index). -static inline constexpr SurfaceIndex SLM_BTI = 254; -static inline constexpr SurfaceIndex INVALID_BTI = - static_cast(-1); } // namespace detail /// @endcond ESIMD_DETAIL @@ -66,23 +62,19 @@ static inline constexpr SurfaceIndex INVALID_BTI = /// template __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) { -#ifdef __SYCL_DEVICE_ONLY__ if constexpr (std::is_same_v) { return detail::SLM_BTI; } else { +#ifdef __SYCL_DEVICE_ONLY__ const auto mem_obj = detail::AccessorPrivateProxy::getNativeImageObj(acc); return __esimd_get_surface_index(mem_obj); +#else // __SYCL_DEVICE_ONLY__ + return __esimd_get_surface_index(acc); +#endif // __SYCL_DEVICE_ONLY__ } -#else - throw sycl::feature_not_supported(); -#endif } -#ifdef __SYCL_DEVICE_ONLY__ #define __ESIMD_GET_SURF_HANDLE(acc) get_surface_index(acc) -#else -#define __ESIMD_GET_SURF_HANDLE(acc) acc -#endif // __SYCL_DEVICE_ONLY__ // TODO @Pennycook // {quote} @@ -266,21 +258,15 @@ __ESIMD_API simd block_load(AccessorTy acc, uint32_t offset, #if defined(__SYCL_DEVICE_ONLY__) auto surf_ind = __esimd_get_surface_index( detail::AccessorPrivateProxy::getNativeImageObj(acc)); +#else // __SYCL_DEVICE_ONLY__ + auto surf_ind = __esimd_get_surface_index(acc); #endif // __SYCL_DEVICE_ONLY__ if constexpr (Flags::template alignment> >= detail::OperandSize::OWORD) { -#if defined(__SYCL_DEVICE_ONLY__) return __esimd_oword_ld(surf_ind, offset >> 4); -#else - return __esimd_oword_ld(acc, offset >> 4); -#endif // __SYCL_DEVICE_ONLY__ } else { -#if defined(__SYCL_DEVICE_ONLY__) return __esimd_oword_ld_unaligned(surf_ind, offset); -#else - return __esimd_oword_ld_unaligned(acc, offset); -#endif // __SYCL_DEVICE_ONLY__ } } @@ -336,10 +322,10 @@ __ESIMD_API void block_store(AccessorTy acc, uint32_t offset, #if defined(__SYCL_DEVICE_ONLY__) auto surf_ind = __esimd_get_surface_index( detail::AccessorPrivateProxy::getNativeImageObj(acc)); +#else // + auto surf_ind = __esimd_get_surface_index(acc); +#endif __esimd_oword_st(surf_ind, offset >> 4, vals.data()); -#else - __esimd_oword_st(acc, offset >> 4, vals.data()); -#endif // __SYCL_DEVICE_ONLY__ } /// @} sycl_esimd_memory @@ -820,24 +806,7 @@ __ESIMD_API void sbarrier(split_barrier_action flag) { __esimd_sbarrier(flag); } /// @{ /// Declare per-work-group slm size. -/// @param size the requested size of the shared local memory for current work -/// group. Must be compile-time constant. -#ifdef __SYCL_DEVICE_ONLY__ -// TODO slm_init should call __esimd_slm_init (TBD) and declared as __ESIMD_API -// on both host and device. Currently __ESIMD_API on device leads to: -// "... cannot call an undefined function without SYCL_EXTERNAL attribute" -__ESIMD_INTRIN -#else -__ESIMD_API -#endif -void slm_init(uint32_t size) -#ifdef __SYCL_DEVICE_ONLY__ - ; -#else -{ - throw sycl::feature_not_supported(); -} -#endif // __SYCL_DEVICE_ONLY__ +__ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); } /// Gather operation over the Shared Local Memory. /// This API has almost the same interface as the @ref accessor_gather diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp index 363d5e86c14fa..1f0cba964b8bc 100644 --- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp +++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,6 @@ #include #include -// TODO : Rename esimdcpu to esimdemu for next CM_EMU release #include #include @@ -116,6 +116,19 @@ static bool PrintPiTrace = false; // Sycl RT calls piTearDown(). static sycl::detail::ESIMDEmuPluginOpaqueData *PiESimdDeviceAccess; +// Single-entry cache for piPlatformsGet call. +static pi_platform PiPlatformCache; +// TODO/FIXME : Memory leak. Handle with 'piTearDown'. +static sycl::detail::SpinLock *PiPlatformCacheMutex = + new sycl::detail::SpinLock; + +// Mapping between surface index and CM-managed surface +static std::unordered_map *PiESimdSurfaceMap = + new std::unordered_map; +// TODO/FIXME : Memory leak. Handle with 'piTearDown'. +static sycl::detail::SpinLock *PiESimdSurfaceMapLock = + new sycl::detail::SpinLock; + // To be compared with ESIMD_EMULATOR_PLUGIN_OPAQUE_DATA_VERSION in device // interface header file #define ESIMDEmuPluginDataVersion 0 @@ -127,9 +140,6 @@ static sycl::detail::ESIMDEmuPluginOpaqueData *PiESimdDeviceAccess; // For PI_DEVICE_INFO_DRIVER_VERSION info static char ESimdEmuVersionString[32]; -// For PI_DEVICE_INFO_VERSION info -static char CmEmuDeviceVersionString[32]; - using IDBuilder = sycl::detail::Builder; template @@ -283,6 +293,51 @@ void sycl_get_cm_image_params(void *PtrInput, char **BaseAddr, uint32_t *Width, *MtxLock = &(Img->mutexLock); } +// Function to provide image info for kernel compilation without +// dependency on '_pi_mem' definition +unsigned int sycl_get_cm_surface_index(void *PtrInput) { + _pi_mem *Surface = static_cast<_pi_mem *>(PtrInput); + + return (unsigned int)(Surface->SurfaceIndex); +} + +// Function to provide image info for kernel compilation using surface +// index without dependency on '_pi_image' definition +void sycl_get_cm_buffer_params_index(unsigned int IndexInput, char **BaseAddr, + uint32_t *Width, std::mutex **MtxLock) { + const std::lock_guard Lock{*PiESimdSurfaceMapLock}; + auto MemIter = PiESimdSurfaceMap->find(IndexInput); + + assert(MemIter != PiESimdSurfaceMap->end() && "Invalid Surface Index"); + + _pi_buffer *Buf = static_cast<_pi_buffer *>(MemIter->second); + + *BaseAddr = cm_support::get_surface_base_addr(Buf->SurfaceIndex); + *Width = static_cast(Buf->Size); + + *MtxLock = &(Buf->mutexLock); +} + +// Function to provide image info for kernel compilation using surface +// index without dependency on '_pi_image' definition +void sycl_get_cm_image_params_index(unsigned int IndexInput, char **BaseAddr, + uint32_t *Width, uint32_t *Height, + uint32_t *Bpp, std::mutex **MtxLock) { + const std::lock_guard Lock{*PiESimdSurfaceMapLock}; + auto MemIter = PiESimdSurfaceMap->find(IndexInput); + assert(MemIter != PiESimdSurfaceMap->end() && "Invalid Surface Index"); + + _pi_image *Img = static_cast<_pi_image *>(MemIter->second); + + *BaseAddr = cm_support::get_surface_base_addr(Img->SurfaceIndex); + + *Bpp = static_cast(Img->BytesPerPixel); + *Width = static_cast(Img->Width) * (*Bpp); + *Height = static_cast(Img->Height); + + *MtxLock = &(Img->mutexLock); +} + /// Implementation for ESIMD_EMULATOR device interface accessing ESIMD /// intrinsics and LibCM functionalties requred by intrinsics sycl::detail::ESIMDDeviceInterface::ESIMDDeviceInterface() { @@ -300,6 +355,11 @@ sycl::detail::ESIMDDeviceInterface::ESIMDDeviceInterface() { sycl_get_cm_buffer_params_ptr = sycl_get_cm_buffer_params; sycl_get_cm_image_params_ptr = sycl_get_cm_image_params; + + sycl_get_cm_surface_index_ptr = sycl_get_cm_surface_index; + sycl_get_cm_buffer_params_index_ptr = sycl_get_cm_buffer_params_index; + sycl_get_cm_image_params_index_ptr = sycl_get_cm_image_params_index; + /* From 'esimd_emulator_functions_v1.h' : End */ } @@ -373,9 +433,10 @@ extern "C" { pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_uint32 *NumPlatforms) { - + static bool PiPlatformCachePopulated = false; static const char *PiTrace = std::getenv("SYCL_PI_TRACE"); static const int PiTraceValue = PiTrace ? std::stoi(PiTrace) : 0; + if (PiTraceValue == -1) { // Means print all PI traces PrintPiTrace = true; } @@ -401,9 +462,15 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, return PI_INVALID_VALUE; } + const std::lock_guard Lock{*PiPlatformCacheMutex}; + if (!PiPlatformCachePopulated) { + PiPlatformCache = new _pi_platform(); + PiPlatformCache->CmEmuVersion = std::string("0.0.1"); + PiPlatformCachePopulated = true; + } + if (Platforms && NumEntries > 0) { - *Platforms = new _pi_platform(); - Platforms[0]->CmEmuVersion = std::string("0.0.1"); + *Platforms = PiPlatformCache; } return PI_SUCCESS; @@ -456,6 +523,11 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, return PI_INVALID_PLATFORM; } + pi_result Res = Platform->populateDeviceCacheIfNeeded(); + if (Res != PI_SUCCESS) { + return Res; + } + // CM has single-root-GPU-device without sub-device support. pi_uint32 DeviceCount = (DeviceType & PI_DEVICE_TYPE_GPU) ? 1 : 0; @@ -477,10 +549,23 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, } if (DeviceCount == 0) { - /// No GPU entry to fill 'Device' array + /// No GPU entry to fill 'Devices' array return PI_SUCCESS; } + if (Devices) { + *Devices = Platform->PiDeviceCache.get(); + } + return PI_SUCCESS; +} + +// Check the device cache and load it if necessary. +pi_result _pi_platform::populateDeviceCacheIfNeeded() { + std::lock_guard Lock(PiDeviceCacheMutex); + + if (DeviceCachePopulated) { + return PI_SUCCESS; + } cm_support::CmDevice *CmDevice = nullptr; // TODO FIXME Implement proper version checking and reporting: // - version passed to cm_support::CreateCmDevice @@ -494,6 +579,10 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, int Result = cm_support::CreateCmDevice(CmDevice, Version); + if (Result != cm_support::CM_SUCCESS) { + return PI_INVALID_DEVICE; + } + // CM Device version info consists of two decimal numbers - major // and minor. Minor is single-digit. Version info is encoded into a // unsigned integer value = 100 * major + minor. Second from right @@ -509,21 +598,13 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, return PI_INVALID_DEVICE; } - sprintf(CmEmuDeviceVersionString, "%d.%d", (int)(Version / 100), - (int)(Version % 10)); - - if (Result != cm_support::CM_SUCCESS) { - return PI_INVALID_DEVICE; - } - - // FIXME / TODO : piDevicesGet always must return same pointer for - // 'Devices[0]' from cached entry. Reference : level-zero - // platform/device implementation with PiDevicesCache and - // PiDevicesCache - if (Devices) { - Devices[0] = new _pi_device(Platform, CmDevice); - } + std::ostringstream StrFormat; + StrFormat << (int)(Version / 100) << "." << (int)(Version % 10); + std::unique_ptr<_pi_device> Device( + new _pi_device(this, CmDevice, StrFormat.str())); + PiDeviceCache = std::move(Device); + DeviceCachePopulated = true; return PI_SUCCESS; } @@ -585,7 +666,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, // cl_khr_int64_extended_atomics return ReturnValue(""); case PI_DEVICE_INFO_VERSION: - return ReturnValue(CmEmuDeviceVersionString); + return ReturnValue(Device->VersionStr.c_str()); case PI_DEVICE_INFO_COMPILER_AVAILABLE: return ReturnValue(pi_bool{false}); case PI_DEVICE_INFO_LINKER_AVAILABLE: @@ -957,6 +1038,10 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, } Status = CmBuf->GetIndex(CmIndex); + const std::lock_guard Lock{*PiESimdSurfaceMapLock}; + assert(PiESimdSurfaceMap->find((unsigned int)CmIndex->get_data()) == + PiESimdSurfaceMap->end() && + "Failure from CM-managed buffer creation"); // Initialize the buffer with user data provided with 'HostPtr' if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) { @@ -980,6 +1065,8 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, return PI_ERROR_UNKNOWN; } + (*PiESimdSurfaceMap)[(unsigned int)CmIndex->get_data()] = *RetMem; + return PI_SUCCESS; } @@ -1022,6 +1109,19 @@ pi_result piMemRelease(pi_mem Mem) { return PI_INVALID_MEM_OBJECT; } + // Removing Surface-map entry + const std::lock_guard Lock{*PiESimdSurfaceMapLock}; + auto MapEntryIt = PiESimdSurfaceMap->find(Mem->SurfaceIndex); + if (MapEntryIt != PiESimdSurfaceMap->end()) { + PiESimdSurfaceMap->erase(MapEntryIt); + } else { + if (PrintPiTrace) { + std::cerr << "Failure from CM-managed buffer/image deletion" + << std::endl; + } + return PI_INVALID_MEM_OBJECT; + } + delete Mem; } @@ -1126,6 +1226,15 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, Status = CmSurface->GetIndex(CmIndex); + const std::lock_guard Lock{*PiESimdSurfaceMapLock}; + if (PiESimdSurfaceMap->find((unsigned int)CmIndex->get_data()) != + PiESimdSurfaceMap->end()) { + if (PrintPiTrace) { + std::cerr << "Failure from CM-managed image creation" << std::endl; + } + return PI_INVALID_MEM_OBJECT; + } + // Initialize the buffer with user data provided with 'HostPtr' if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) { if (HostPtr != nullptr) { @@ -1150,6 +1259,8 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, return PI_ERROR_UNKNOWN; } + (*PiESimdSurfaceMap)[(unsigned int)CmIndex->get_data()] = *RetImage; + return PI_SUCCESS; } @@ -1733,6 +1844,11 @@ pi_result piTearDown(void *) { delete reinterpret_cast( PiESimdDeviceAccess->data); delete PiESimdDeviceAccess; + + const std::lock_guard Lock{*PiESimdSurfaceMapLock}; + for (auto it = PiESimdSurfaceMap->begin(); it != PiESimdSurfaceMap->end();) { + it = PiESimdSurfaceMap->erase(it); + } return PI_SUCCESS; } diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.hpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.hpp index 6e301a6b78b56..8de4259a71596 100755 --- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.hpp +++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.hpp @@ -56,16 +56,27 @@ struct _pi_object { struct _pi_platform { _pi_platform() = default; + // Single-entry Cache pi_devices for reuse + std::unique_ptr<_pi_device> PiDeviceCache; + std::mutex PiDeviceCacheMutex; + bool DeviceCachePopulated = false; + + // Check the device cache and load it if necessary. + pi_result populateDeviceCacheIfNeeded(); + // Keep Version information. std::string CmEmuVersion; }; struct _pi_device : _pi_object { - _pi_device(pi_platform ArgPlt, cm_support::CmDevice *ArgCmDev) - : Platform{ArgPlt}, CmDevicePtr{ArgCmDev} {} + _pi_device(pi_platform ArgPlt, cm_support::CmDevice *ArgCmDev, + std::string ArgVersionStr) + : Platform{ArgPlt}, CmDevicePtr{ArgCmDev}, VersionStr{ArgVersionStr} {} pi_platform Platform; cm_support::CmDevice *CmDevicePtr = nullptr; + + std::string VersionStr; }; struct _pi_context : _pi_object { diff --git a/sycl/source/esimd_emulator_device_interface.cpp b/sycl/source/esimd_emulator_device_interface.cpp index ad9f54b8b5a75..76793cee6fed1 100644 --- a/sycl/source/esimd_emulator_device_interface.cpp +++ b/sycl/source/esimd_emulator_device_interface.cpp @@ -28,8 +28,16 @@ __SYCL_EXPORT ESIMDDeviceInterface *getESIMDDeviceInterface() { // tight loop) void *PIOpaqueData = nullptr; - PIOpaqueData = - getPluginOpaqueData(nullptr); + try { + PIOpaqueData = + getPluginOpaqueData( + nullptr); + } catch (...) { + std::cerr << "ESIMD EMU plugin error or not loaded - try setting " + "SYCL_DEVICE_FILTER=esimd_emulator:gpu environment variable" + << std::endl; + throw cl::sycl::feature_not_supported(); + } ESIMDEmuPluginOpaqueData *OpaqueData = reinterpret_cast(PIOpaqueData);