diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 8cbd33b279e8f..e89cead64ca30 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -2812,10 +2812,11 @@ bool Util::isSyclHalfType(const QualType &Ty) { bool Util::isSyclSpecConstantType(const QualType &Ty) { const StringRef &Name = "spec_constant"; - std::array Scopes = { + std::array Scopes = { Util::DeclContextDesc{clang::Decl::Kind::Namespace, "cl"}, Util::DeclContextDesc{clang::Decl::Kind::Namespace, "sycl"}, - Util::DeclContextDesc{clang::Decl::Kind::Namespace, "experimental"}, + Util::DeclContextDesc{clang::Decl::Kind::Namespace, "ext"}, + Util::DeclContextDesc{clang::Decl::Kind::Namespace, "oneapi"}, Util::DeclContextDesc{Decl::Kind::ClassTemplateSpecialization, Name}}; return matchQualifiedTypeName(Ty, Scopes); } diff --git a/clang/test/CodeGenSYCL/Inputs/sycl.hpp b/clang/test/CodeGenSYCL/Inputs/sycl.hpp index 3184c58edcbfc..82bd00d6b32a3 100644 --- a/clang/test/CodeGenSYCL/Inputs/sycl.hpp +++ b/clang/test/CodeGenSYCL/Inputs/sycl.hpp @@ -242,7 +242,8 @@ struct get_kernel_name_t { using name = Type; }; -namespace experimental { +namespace ext { +namespace oneapi { template class spec_constant { public: @@ -256,7 +257,8 @@ class spec_constant { return get(); } }; -} // namespace experimental +} // namespace oneapi +} // namespace ext #define ATTR_SYCL_KERNEL __attribute__((sycl_kernel)) template diff --git a/clang/test/CodeGenSYCL/int_header_spec_const.cpp b/clang/test/CodeGenSYCL/int_header_spec_const.cpp index e6743c4ea2e91..31d26dfc67e4b 100644 --- a/clang/test/CodeGenSYCL/int_header_spec_const.cpp +++ b/clang/test/CodeGenSYCL/int_header_spec_const.cpp @@ -20,18 +20,18 @@ class MyDoubleConst; int main() { // Create specialization constants. 
- cl::sycl::experimental::spec_constant i1(false); - cl::sycl::experimental::spec_constant i8(0); - cl::sycl::experimental::spec_constant ui8(0); - cl::sycl::experimental::spec_constant i16(0); - cl::sycl::experimental::spec_constant ui16(0); - cl::sycl::experimental::spec_constant i32(0); + cl::sycl::ext::oneapi::spec_constant i1(false); + cl::sycl::ext::oneapi::spec_constant i8(0); + cl::sycl::ext::oneapi::spec_constant ui8(0); + cl::sycl::ext::oneapi::spec_constant i16(0); + cl::sycl::ext::oneapi::spec_constant ui16(0); + cl::sycl::ext::oneapi::spec_constant i32(0); // Constant used twice, but there must be single entry in the int header, // otherwise compilation error would be issued. - cl::sycl::experimental::spec_constant i32_1(0); - cl::sycl::experimental::spec_constant ui32(0); - cl::sycl::experimental::spec_constant f32(0); - cl::sycl::experimental::spec_constant f64(0); + cl::sycl::ext::oneapi::spec_constant i32_1(0); + cl::sycl::ext::oneapi::spec_constant ui32(0); + cl::sycl::ext::oneapi::spec_constant f32(0); + cl::sycl::ext::oneapi::spec_constant f64(0); double val; double *ptr = &val; // to avoid "unused" warnings diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp index 4fd3c55b0952c..c5c20813d014a 100644 --- a/sycl/include/CL/sycl.hpp +++ b/sycl/include/CL/sycl.hpp @@ -18,16 +18,16 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include #include #include -#include -#include -#include -#include -#include -#include #include #include #include diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp index 18084a8a2d2b2..49bd575cefc40 100755 --- a/sycl/include/CL/sycl/accessor.hpp +++ b/sycl/include/CL/sycl/accessor.hpp @@ -197,12 +197,14 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { namespace gpu { // Forward declare a "back-door" access class to support ESIMD. 
class AccessorPrivateProxy; } // namespace gpu } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) @@ -432,7 +434,7 @@ class image_accessor #endif private: - friend class sycl::intel::gpu::AccessorPrivateProxy; + friend class sycl::ext::intel::gpu::AccessorPrivateProxy; #if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__) const OCLImageTy getNativeImageObj() const { return MImageObj; } @@ -883,7 +885,7 @@ class accessor : #endif // __SYCL_DEVICE_ONLY__ private: - friend class sycl::intel::gpu::AccessorPrivateProxy; + friend class sycl::ext::intel::gpu::AccessorPrivateProxy; public: using value_type = DataT; diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp index 9671987643f41..fbf01679463bf 100644 --- a/sycl/include/CL/sycl/builtins.hpp +++ b/sycl/include/CL/sycl/builtins.hpp @@ -724,14 +724,16 @@ detail::enable_if_t::value, T> clz(T x) __NOEXC { return __sycl_std::__invoke_clz(x); } -namespace intel { +namespace ext { +namespace oneapi { // geninteger ctz (geninteger x) template sycl::detail::enable_if_t::value, T> ctz(T x) __NOEXC { return __sycl_std::__invoke_ctz(x); } -} // namespace intel +} // namespace oneapi +} // namespace ext // geninteger mad_hi (geninteger a, geninteger b, geninteger c) template diff --git a/sycl/include/CL/sycl/detail/accessor_impl.hpp b/sycl/include/CL/sycl/detail/accessor_impl.hpp index 76676014975c2..c5913a6cf5272 100644 --- a/sycl/include/CL/sycl/detail/accessor_impl.hpp +++ b/sycl/include/CL/sycl/detail/accessor_impl.hpp @@ -17,12 +17,14 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { namespace gpu { // Forward declare a "back-door" access class to support ESIMD. 
class AccessorPrivateProxy; } // namespace gpu } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) @@ -170,7 +172,7 @@ class AccessorBaseHost { AccessorImplPtr impl; private: - friend class sycl::intel::gpu::AccessorPrivateProxy; + friend class sycl::ext::intel::gpu::AccessorPrivateProxy; }; class __SYCL_EXPORT LocalAccessorImplHost { diff --git a/sycl/include/CL/sycl/detail/spirv.hpp b/sycl/include/CL/sycl/detail/spirv.hpp index d662e2afc7880..989cf6a3a096a 100644 --- a/sycl/include/CL/sycl/detail/spirv.hpp +++ b/sycl/include/CL/sycl/detail/spirv.hpp @@ -12,14 +12,16 @@ #include #include #include -#include +#include #ifdef __SYCL_DEVICE_ONLY__ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace ext { +namespace oneapi { struct sub_group; -} // namespace intel +} // namespace oneapi +} // namespace ext namespace detail { namespace spirv { @@ -29,7 +31,7 @@ template struct group_scope> { static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Workgroup; }; -template <> struct group_scope<::cl::sycl::intel::sub_group> { +template <> struct group_scope<::cl::sycl::ext::oneapi::sub_group> { static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup; }; @@ -87,23 +89,23 @@ T GroupBroadcast(T x, id local_id) { // Single happens-before means semantics should always apply to all spaces // Although consume is unsupported, forwarding to acquire is valid static inline constexpr __spv::MemorySemanticsMask::Flag -getMemorySemanticsMask(intel::memory_order Order) { +getMemorySemanticsMask(ext::oneapi::memory_order Order) { __spv::MemorySemanticsMask::Flag SpvOrder = __spv::MemorySemanticsMask::None; switch (Order) { - case intel::memory_order::relaxed: + case ext::oneapi::memory_order::relaxed: SpvOrder = __spv::MemorySemanticsMask::None; break; - case intel::memory_order::__consume_unsupported: - case intel::memory_order::acquire: + case ext::oneapi::memory_order::__consume_unsupported: + 
case ext::oneapi::memory_order::acquire: SpvOrder = __spv::MemorySemanticsMask::Acquire; break; - case intel::memory_order::release: + case ext::oneapi::memory_order::release: SpvOrder = __spv::MemorySemanticsMask::Release; break; - case intel::memory_order::acq_rel: + case ext::oneapi::memory_order::acq_rel: SpvOrder = __spv::MemorySemanticsMask::AcquireRelease; break; - case intel::memory_order::seq_cst: + case ext::oneapi::memory_order::seq_cst: SpvOrder = __spv::MemorySemanticsMask::SequentiallyConsistent; break; } @@ -113,17 +115,18 @@ getMemorySemanticsMask(intel::memory_order Order) { __spv::MemorySemanticsMask::CrossWorkgroupMemory); } -static inline constexpr __spv::Scope::Flag getScope(intel::memory_scope Scope) { +static inline constexpr __spv::Scope::Flag +getScope(ext::oneapi::memory_scope Scope) { switch (Scope) { - case intel::memory_scope::work_item: + case ext::oneapi::memory_scope::work_item: return __spv::Scope::Invocation; - case intel::memory_scope::sub_group: + case ext::oneapi::memory_scope::sub_group: return __spv::Scope::Subgroup; - case intel::memory_scope::work_group: + case ext::oneapi::memory_scope::work_group: return __spv::Scope::Workgroup; - case intel::memory_scope::device: + case ext::oneapi::memory_scope::device: return __spv::Scope::Device; - case intel::memory_scope::system: + case ext::oneapi::memory_scope::system: return __spv::Scope::CrossDevice; } } @@ -131,8 +134,10 @@ static inline constexpr __spv::Scope::Flag getScope(intel::memory_scope Scope) { template inline typename detail::enable_if_t::value, T> AtomicCompareExchange(multi_ptr MPtr, - intel::memory_scope Scope, intel::memory_order Success, - intel::memory_order Failure, T Desired, T Expected) { + ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Success, + ext::oneapi::memory_order Failure, T Desired, + T Expected) { auto SPIRVSuccess = getMemorySemanticsMask(Success); auto SPIRVFailure = getMemorySemanticsMask(Failure); auto SPIRVScope = getScope(Scope); 
@@ -144,8 +149,10 @@ AtomicCompareExchange(multi_ptr MPtr, template inline typename detail::enable_if_t::value, T> AtomicCompareExchange(multi_ptr MPtr, - intel::memory_scope Scope, intel::memory_order Success, - intel::memory_order Failure, T Desired, T Expected) { + ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Success, + ext::oneapi::memory_order Failure, T Desired, + T Expected) { using I = detail::make_unsinged_integer_t; auto SPIRVSuccess = getMemorySemanticsMask(Success); auto SPIRVFailure = getMemorySemanticsMask(Failure); @@ -162,8 +169,8 @@ AtomicCompareExchange(multi_ptr MPtr, template inline typename detail::enable_if_t::value, T> -AtomicLoad(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order) { +AtomicLoad(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order) { auto *Ptr = MPtr.get(); auto SPIRVOrder = getMemorySemanticsMask(Order); auto SPIRVScope = getScope(Scope); @@ -172,8 +179,8 @@ AtomicLoad(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value, T> -AtomicLoad(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order) { +AtomicLoad(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order) { using I = detail::make_unsinged_integer_t; auto *PtrInt = reinterpret_cast::pointer_t>( @@ -186,8 +193,8 @@ AtomicLoad(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value> -AtomicStore(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order, T Value) { +AtomicStore(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order, T Value) { auto *Ptr = MPtr.get(); auto SPIRVOrder = getMemorySemanticsMask(Order); auto SPIRVScope = getScope(Scope); @@ -196,8 +203,8 @@ AtomicStore(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value> -AtomicStore(multi_ptr MPtr, intel::memory_scope Scope, - 
intel::memory_order Order, T Value) { +AtomicStore(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order, T Value) { using I = detail::make_unsinged_integer_t; auto *PtrInt = reinterpret_cast::pointer_t>( @@ -210,8 +217,8 @@ AtomicStore(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value, T> -AtomicExchange(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order, T Value) { +AtomicExchange(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order, T Value) { auto *Ptr = MPtr.get(); auto SPIRVOrder = getMemorySemanticsMask(Order); auto SPIRVScope = getScope(Scope); @@ -220,8 +227,8 @@ AtomicExchange(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value, T> -AtomicExchange(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order, T Value) { +AtomicExchange(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order, T Value) { using I = detail::make_unsinged_integer_t; auto *PtrInt = reinterpret_cast::pointer_t>( @@ -236,8 +243,8 @@ AtomicExchange(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value, T> -AtomicIAdd(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order, T Value) { +AtomicIAdd(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order, T Value) { auto *Ptr = MPtr.get(); auto SPIRVOrder = getMemorySemanticsMask(Order); auto SPIRVScope = getScope(Scope); @@ -246,8 +253,8 @@ AtomicIAdd(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value, T> -AtomicISub(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order, T Value) { +AtomicISub(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order, T Value) { auto *Ptr = MPtr.get(); auto SPIRVOrder = getMemorySemanticsMask(Order); auto SPIRVScope = getScope(Scope); 
@@ -256,8 +263,8 @@ AtomicISub(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value, T> -AtomicAnd(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order, T Value) { +AtomicAnd(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order, T Value) { auto *Ptr = MPtr.get(); auto SPIRVOrder = getMemorySemanticsMask(Order); auto SPIRVScope = getScope(Scope); @@ -266,8 +273,8 @@ AtomicAnd(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value, T> -AtomicOr(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order, T Value) { +AtomicOr(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order, T Value) { auto *Ptr = MPtr.get(); auto SPIRVOrder = getMemorySemanticsMask(Order); auto SPIRVScope = getScope(Scope); @@ -276,8 +283,8 @@ AtomicOr(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value, T> -AtomicXor(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order, T Value) { +AtomicXor(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order, T Value) { auto *Ptr = MPtr.get(); auto SPIRVOrder = getMemorySemanticsMask(Order); auto SPIRVScope = getScope(Scope); @@ -286,8 +293,8 @@ AtomicXor(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value, T> -AtomicMin(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order, T Value) { +AtomicMin(multi_ptr MPtr, ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order, T Value) { auto *Ptr = MPtr.get(); auto SPIRVOrder = getMemorySemanticsMask(Order); auto SPIRVScope = getScope(Scope); @@ -296,8 +303,8 @@ AtomicMin(multi_ptr MPtr, intel::memory_scope Scope, template inline typename detail::enable_if_t::value, T> -AtomicMax(multi_ptr MPtr, intel::memory_scope Scope, - intel::memory_order Order, T Value) { +AtomicMax(multi_ptr MPtr, 
ext::oneapi::memory_scope Scope, + ext::oneapi::memory_order Order, T Value) { auto *Ptr = MPtr.get(); auto SPIRVOrder = getMemorySemanticsMask(Order); auto SPIRVScope = getScope(Scope); diff --git a/sycl/include/CL/sycl/detail/type_traits.hpp b/sycl/include/CL/sycl/detail/type_traits.hpp index 3f52acc8a2de2..5c90569490924 100644 --- a/sycl/include/CL/sycl/detail/type_traits.hpp +++ b/sycl/include/CL/sycl/detail/type_traits.hpp @@ -18,9 +18,11 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { template class group; -namespace intel { +namespace ext { +namespace oneapi { struct sub_group; -} // namespace intel +} // namespace oneapi +} // namespace ext namespace detail { namespace half_impl { class half; @@ -313,7 +315,7 @@ struct is_group> : std::true_type {}; template struct is_sub_group : std::false_type {}; -template <> struct is_sub_group : std::true_type {}; +template <> struct is_sub_group : std::true_type {}; template struct is_generic_group diff --git a/sycl/include/CL/sycl/intel/esimd.hpp b/sycl/include/CL/sycl/ext/intel/esimd.hpp similarity index 78% rename from sycl/include/CL/sycl/intel/esimd.hpp rename to sycl/include/CL/sycl/ext/intel/esimd.hpp index 7f4b7886d2d2c..3a1cffdcd2a68 100644 --- a/sycl/include/CL/sycl/intel/esimd.hpp +++ b/sycl/include/CL/sycl/ext/intel/esimd.hpp @@ -10,10 +10,10 @@ #pragma once -#include -#include -#include -#include +#include +#include +#include +#include #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_ESIMD_KERNEL __attribute__((sycl_explicit_simd)) diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_host_util.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_host_util.hpp similarity index 100% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_host_util.hpp rename to sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_host_util.hpp diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_intrin.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_intrin.hpp similarity index 82% rename from 
sycl/include/CL/sycl/intel/esimd/detail/esimd_intrin.hpp rename to sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_intrin.hpp index 23674ac3d3e91..e1c19df0b9076 100644 --- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_intrin.hpp +++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_intrin.hpp @@ -11,9 +11,9 @@ #pragma once -#include -#include -#include +#include +#include +#include #include // \brief __esimd_rdregion: region access intrinsic. @@ -60,8 +60,9 @@ // template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_rdregion(sycl::intel::gpu::vector_type_t Input, uint16_t Offset); +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_rdregion(sycl::ext::intel::gpu::vector_type_t Input, + uint16_t Offset); // __esimd_wrregion returns the updated vector with the region updated. // @@ -112,13 +113,15 @@ __esimd_rdregion(sycl::intel::gpu::vector_type_t Input, uint16_t Offset); // template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_wrregion(sycl::intel::gpu::vector_type_t OldVal, - sycl::intel::gpu::vector_type_t NewVal, uint16_t Offset, - sycl::intel::gpu::mask_type_t Mask = 1); +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_wrregion(sycl::ext::intel::gpu::vector_type_t OldVal, + sycl::ext::intel::gpu::vector_type_t NewVal, + uint16_t Offset, + sycl::ext::intel::gpu::mask_type_t Mask = 1); __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { namespace gpu { // TODO dependencies on the std SYCL concepts like images @@ -209,6 +212,7 @@ readRegion(const vector_type_t &Base, std::pair Region) { } // namespace gpu } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) @@ -218,37 +222,41 @@ readRegion(const vector_type_t &Base, std::pair Region) { // optimization on simd object // template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_vload(const sycl::intel::gpu::vector_type_t *ptr); +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t 
+__esimd_vload(const sycl::ext::intel::gpu::vector_type_t *ptr); // vstore // // map to the backend vstore intrinsic, used by compiler to control // optimization on simd object template -SYCL_EXTERNAL void __esimd_vstore(sycl::intel::gpu::vector_type_t *ptr, - sycl::intel::gpu::vector_type_t vals); +SYCL_EXTERNAL void +__esimd_vstore(sycl::ext::intel::gpu::vector_type_t *ptr, + sycl::ext::intel::gpu::vector_type_t vals); template -SYCL_EXTERNAL uint16_t __esimd_any(sycl::intel::gpu::vector_type_t src); +SYCL_EXTERNAL uint16_t +__esimd_any(sycl::ext::intel::gpu::vector_type_t src); template -SYCL_EXTERNAL uint16_t __esimd_all(sycl::intel::gpu::vector_type_t src); +SYCL_EXTERNAL uint16_t +__esimd_all(sycl::ext::intel::gpu::vector_type_t src); #ifndef __SYCL_DEVICE_ONLY__ // Implementations of ESIMD intrinsics for the SYCL host device template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_rdregion(sycl::intel::gpu::vector_type_t Input, uint16_t Offset) { +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_rdregion(sycl::ext::intel::gpu::vector_type_t Input, + uint16_t Offset) { uint16_t EltOffset = Offset / sizeof(T); assert(Offset % sizeof(T) == 0); int NumRows = M / Width; assert(M % Width == 0); - sycl::intel::gpu::vector_type_t Result; + sycl::ext::intel::gpu::vector_type_t Result; int Index = 0; for (int i = 0; i < NumRows; ++i) { for (int j = 0; j < Width; ++j) { @@ -260,17 +268,17 @@ __esimd_rdregion(sycl::intel::gpu::vector_type_t Input, uint16_t Offset) { template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_wrregion(sycl::intel::gpu::vector_type_t OldVal, - sycl::intel::gpu::vector_type_t NewVal, uint16_t Offset, - sycl::intel::gpu::mask_type_t Mask) { +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_wrregion(sycl::ext::intel::gpu::vector_type_t OldVal, + sycl::ext::intel::gpu::vector_type_t NewVal, + uint16_t Offset, sycl::ext::intel::gpu::mask_type_t Mask) { uint16_t EltOffset = Offset / sizeof(T); assert(Offset % 
sizeof(T) == 0); int NumRows = M / Width; assert(M % Width == 0); - sycl::intel::gpu::vector_type_t Result = OldVal; + sycl::ext::intel::gpu::vector_type_t Result = OldVal; int Index = 0; for (int i = 0; i < NumRows; ++i) { for (int j = 0; j < Width; ++j) { diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_math_intrin.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_math_intrin.hpp similarity index 90% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_math_intrin.hpp rename to sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_math_intrin.hpp index c3f5a9d141305..ad091c82fc694 100644 --- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_math_intrin.hpp +++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_math_intrin.hpp @@ -11,12 +11,12 @@ #pragma once -#include -#include -#include +#include +#include +#include #include -using sycl::intel::gpu::vector_type_t; +using sycl::ext::intel::gpu::vector_type_t; // saturation intrinsics template @@ -210,39 +210,39 @@ SYCL_EXTERNAL vector_type_t __esimd_dp4a(vector_type_t src0, // Reduction functions template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_fmax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_fmax(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2); template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_umax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_umax(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2); template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_smax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_smax(sycl::ext::intel::gpu::vector_type_t src1, + 
sycl::ext::intel::gpu::vector_type_t src2); template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_fmin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_fmin(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2); template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_umin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_umin(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2); template -sycl::intel::gpu::vector_type_t SYCL_EXTERNAL -__esimd_reduced_smin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +sycl::ext::intel::gpu::vector_type_t SYCL_EXTERNAL +__esimd_reduced_smin(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2); template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_dp4(sycl::intel::gpu::vector_type_t v1, - sycl::intel::gpu::vector_type_t v2); +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_dp4(sycl::ext::intel::gpu::vector_type_t v1, + sycl::ext::intel::gpu::vector_type_t v2); #ifndef __SYCL_DEVICE_ONLY__ @@ -1096,10 +1096,10 @@ SYCL_EXTERNAL vector_type_t __esimd_dp4a(vector_type_t src0, }; template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_max(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { - sycl::intel::gpu::vector_type_t retv; +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_max(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2) { + sycl::ext::intel::gpu::vector_type_t retv; for (int I = 0; I < N; I++) { if (src1[I] >= src2[I]) { retv[I] = src1[I]; @@ -1111,31 +1111,31 @@ __esimd_reduced_max(sycl::intel::gpu::vector_type_t src1, } template 
-SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_fmax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_fmax(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2) { return __esimd_reduced_max(src1, src2); } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_umax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_umax(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2) { return __esimd_reduced_max(src1, src2); } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_smax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_smax(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2) { return __esimd_reduced_max(src1, src2); } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_min(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { - sycl::intel::gpu::vector_type_t retv; +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_min(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2) { + sycl::ext::intel::gpu::vector_type_t retv; for (int I = 0; I < N; I++) { if (src1[I] <= src2[I]) { retv[I] = src1[I]; @@ -1147,23 +1147,23 @@ __esimd_reduced_min(sycl::intel::gpu::vector_type_t src1, } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_fmin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_fmin(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2) { return 
__esimd_reduced_min(src1, src2); } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_umin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_umin(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2) { return __esimd_reduced_min(src1, src2); } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_smin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_reduced_smin(sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t src2) { return __esimd_reduced_min(src1, src2); } diff --git a/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_memory_intrin.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_memory_intrin.hpp new file mode 100644 index 0000000000000..e28dad78b048e --- /dev/null +++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_memory_intrin.hpp @@ -0,0 +1,690 @@ +//==------------ esimd_memory_intrin.hpp - DPC++ Explicit SIMD API ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Declares Explicit SIMD intrinsics used to implement working with +// the SIMD classes objects. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +// flat_read does flat-address gather +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t< + Ty, N * sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk)> +__esimd_flat_read(sycl::ext::intel::gpu::vector_type_t addrs, + int ElemsPerAddr = NumBlk, + sycl::ext::intel::gpu::vector_type_t pred = 1); + +// flat_write does flat-address scatter +template +SYCL_EXTERNAL void __esimd_flat_write( + sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t< + Ty, N * sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk)> + vals, + int ElemsPerAddr = NumBlk, + sycl::ext::intel::gpu::vector_type_t pred = 1); + +// flat_block_read reads a block of data from one flat address +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_flat_block_read_unaligned(uint64_t addr); + +// flat_block_write writes a block of data using one flat address +template +SYCL_EXTERNAL void +__esimd_flat_block_write(uint64_t addr, + sycl::ext::intel::gpu::vector_type_t vals); + +// Reads a block of data from given surface at given offset. +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset); + +// Writes given block of data to a surface with given index at given offset. 
+template +SYCL_EXTERNAL void +__esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset, + sycl::ext::intel::gpu::vector_type_t vals); + +// flat_read4 does flat-address gather4 +template +sycl::ext::intel::gpu::vector_type_t SYCL_EXTERNAL +__esimd_flat_read4(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t pred = 1); + +// flat_write does flat-address scatter +template +SYCL_EXTERNAL void __esimd_flat_write4( + sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t vals, + sycl::ext::intel::gpu::vector_type_t pred = 1); + +// flat_atomic: flat-address atomic +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_flat_atomic0(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t pred); + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_flat_atomic1(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t src0, + sycl::ext::intel::gpu::vector_type_t pred); + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_flat_atomic2(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t src0, + sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t pred); + +// esimd_barrier, generic group barrier +SYCL_EXTERNAL void __esimd_barrier(); + +// slm_fence sets the SLM read/write order +SYCL_EXTERNAL void __esimd_slm_fence(uint8_t cntl); + +// slm_read does SLM gather +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_slm_read(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t pred = 1); + +// slm_write does SLM scatter +template +SYCL_EXTERNAL void +__esimd_slm_write(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t vals, + sycl::ext::intel::gpu::vector_type_t pred = 1); + +// slm_block_read reads a block of data from SLM +template +SYCL_EXTERNAL 
sycl::ext::intel::gpu::vector_type_t +__esimd_slm_block_read(uint32_t addr); + +// slm_block_write writes a block of data to SLM +template +SYCL_EXTERNAL void +__esimd_slm_block_write(uint32_t addr, + sycl::ext::intel::gpu::vector_type_t vals); + +// slm_read4 does SLM gather4 +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_slm_read4(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t pred = 1); + +// slm_write4 does SLM scatter4 +template +SYCL_EXTERNAL void __esimd_slm_write4( + sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t vals, + sycl::ext::intel::gpu::vector_type_t pred = 1); + +// slm_atomic: SLM atomic +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_slm_atomic0(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t pred); + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_slm_atomic1(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t src0, + sycl::ext::intel::gpu::vector_type_t pred); + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_slm_atomic2(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t src0, + sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t pred); + +// Media block load +// +// @param Ty the element data type. +// +// @param M the height of the 2D block. +// +// @param N the width of the 2D block. +// +// @param TACC type of the surface handle. +// +// @param modifier top/bottom field surface access control. +// +// @param handle the surface handle. +// +// @param plane planar surface index. +// +// @param width the width of the return block. +// +// @param x X-coordinate of the left upper rectangle corner in BYTES. +// +// @param y Y-coordinate of the left upper rectangle corner in ROWS. +// +// @return the linearized 2D block data read from surface.
+// +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane, + unsigned width, unsigned x, unsigned y); + +// Media block store +// +// @param Ty the element data type. +// +// @param M the height of the 2D block. +// +// @param N the width of the 2D block. +// +// @param TACC type of the surface handle. +// +// @param modifier top/bottom field surface access control. +// +// @param handle the surface handle. +// +// @param plane planar surface index. +// +// @param width the width of the return block. +// +// @param x X-coordinate of the left upper rectangle corner in BYTES. +// +// @param y Y-coordinate of the left upper rectangle corner in ROWS. +// +// @param vals the linearized 2D block data to be written to surface. +// +template +SYCL_EXTERNAL void +__esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane, + unsigned width, unsigned x, unsigned y, + sycl::ext::intel::gpu::vector_type_t vals); + +#ifndef __SYCL_DEVICE_ONLY__ + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t< + Ty, N * sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk)> +__esimd_flat_read(sycl::ext::intel::gpu::vector_type_t addrs, + int ElemsPerAddr, + sycl::ext::intel::gpu::vector_type_t pred) { + auto NumBlkDecoded = sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk); + sycl::ext::intel::gpu::vector_type_t< + Ty, N * sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk)> + V; + ElemsPerAddr = sycl::ext::intel::gpu::ElemsPerAddrDecoding(ElemsPerAddr); + + for (int I = 0; I < N; I++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I]); + if (sizeof(Ty) == 2) + ElemsPerAddr = ElemsPerAddr / 2; + if (sizeof(Ty) <= 2) { + for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) + V[I * NumBlkDecoded + J] = *(Addr + J); + } else { + for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) + V[J * N + I] = *(Addr + J); + } + } + } + return V; +} + +template +SYCL_EXTERNAL
sycl::ext::intel::gpu::vector_type_t +__esimd_flat_read4(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t pred) { + sycl::ext::intel::gpu::vector_type_t V; + unsigned int Next = 0; + + if constexpr (HasR(Mask)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I]); + V[Next] = *Addr; + } + } + } + + if constexpr (HasG(Mask)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty)); + V[Next] = *Addr; + } + } + } + + if constexpr (HasB(Mask)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty)); + V[Next] = *Addr; + } + } + } + + if constexpr (HasA(Mask)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty) + + sizeof(Ty)); + V[Next] = *Addr; + } + } + } + + return V; +} + +template +SYCL_EXTERNAL void __esimd_flat_write( + sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t< + Ty, N * sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk)> + vals, + int ElemsPerAddr, sycl::ext::intel::gpu::vector_type_t pred) { + auto NumBlkDecoded = sycl::ext::intel::gpu::ElemsPerAddrDecoding(NumBlk); + ElemsPerAddr = sycl::ext::intel::gpu::ElemsPerAddrDecoding(ElemsPerAddr); + + for (int I = 0; I < N; I++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I]); + if (sizeof(Ty) == 2) + ElemsPerAddr = ElemsPerAddr / 2; + if (sizeof(Ty) <= 2) { + for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) + *(Addr + J) = vals[I * NumBlkDecoded + J]; + } else { + for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) + *(Addr + J) = vals[J * N + I]; + } + } + } +} + +template +SYCL_EXTERNAL void __esimd_flat_write4( + sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t vals, + sycl::ext::intel::gpu::vector_type_t pred) { + 
sycl::ext::intel::gpu::vector_type_t V; + unsigned int Next = 0; + + if constexpr (HasR(Mask)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I]); + *Addr = vals[Next]; + } + } + } + + if constexpr (HasG(Mask)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty)); + *Addr = vals[Next]; + } + } + } + + if constexpr (HasB(Mask)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty)); + *Addr = vals[Next]; + } + } + } + + if constexpr (HasA(Mask)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty) + + sizeof(Ty)); + *Addr = vals[Next]; + } + } + } +} + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_flat_block_read_unaligned(uint64_t addr) { + sycl::ext::intel::gpu::vector_type_t V; + + for (int I = 0; I < N; I++) { + Ty *Addr = reinterpret_cast(addr + I * sizeof(Ty)); + V[I] = *Addr; + } + return V; +} + +template +SYCL_EXTERNAL void +__esimd_flat_block_write(uint64_t addr, + sycl::ext::intel::gpu::vector_type_t vals) { + for (int I = 0; I < N; I++) { + Ty *Addr = reinterpret_cast(addr + I * sizeof(Ty)); + *Addr = vals[I]; + } +} + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane, + unsigned width, unsigned x, unsigned y) { + // On host the input surface is modeled as sycl image 2d object, + // and the read/write access is done through accessor, + // which is passed in as the handle argument. 
+ auto range = + sycl::ext::intel::gpu::AccessorPrivateProxy::getImageRange(handle); + unsigned bpp = + sycl::ext::intel::gpu::AccessorPrivateProxy::getElemSize(handle); + unsigned vpp = bpp / sizeof(Ty); + unsigned int i = x / bpp; + unsigned int j = y; + + assert(x % bpp == 0); + unsigned int xbound = range[0] - 1; + unsigned int ybound = range[1] - 1; + + sycl::ext::intel::gpu::vector_type_t vals; + for (int row = 0; row < M; row++) { + for (int col = 0; col < N; col += vpp) { + unsigned int xoff = (i > xbound) ? xbound : i; + unsigned int yoff = (j > ybound) ? ybound : j; + auto coords = cl::sycl::cl_int2(xoff, yoff); + cl::sycl::cl_uint4 data = handle.read(coords); + + sycl::ext::intel::gpu::vector_type_t res; + for (int idx = 0; idx < 4; idx++) { + res[idx] = data[idx]; + } + + constexpr int refN = sizeof(cl::sycl::cl_uint4) / sizeof(Ty); + unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp; + using refTy = sycl::ext::intel::gpu::vector_type_t; + auto ref = reinterpret_cast(res); + + unsigned int offset1 = col + row * N; + unsigned int offset2 = 0; + for (int idx = 0; idx < vpp; idx++) { + vals[offset1] = ref[offset2]; + offset1++; + offset2 += stride; + } + i++; + } + i = x / bpp; + j++; + } + + return vals; +} + +template +SYCL_EXTERNAL void __esimd_media_block_store( + unsigned modififer, TACC handle, unsigned plane, unsigned width, unsigned x, + unsigned y, sycl::ext::intel::gpu::vector_type_t vals) { + unsigned bpp = + sycl::ext::intel::gpu::AccessorPrivateProxy::getElemSize(handle); + unsigned vpp = bpp / sizeof(Ty); + auto range = + sycl::ext::intel::gpu::AccessorPrivateProxy::getImageRange(handle); + unsigned int i = x / bpp; + unsigned int j = y; + + assert(x % bpp == 0); + + for (int row = 0; row < M; row++) { + for (int col = 0; col < N; col += vpp) { + constexpr int Sz = sizeof(cl::sycl::cl_uint4) / sizeof(Ty); + sycl::ext::intel::gpu::vector_type_t res = 0; + + unsigned int offset1 = col + row * N; + unsigned int offset2 = 0; + unsigned int 
stride = sizeof(cl::sycl::cl_uint4) / bpp; + for (int idx = 0; idx < vpp; idx++) { + res[offset2] = vals[offset1]; + offset1++; + offset2 += stride; + } + + using refTy = sycl::ext::intel::gpu::vector_type_t; + auto ref = reinterpret_cast(res); + + cl::sycl::cl_uint4 data; + for (int idx = 0; idx < 4; idx++) { + data[idx] = ref[idx]; + } + + if (i < range[0] && j < range[1]) { + auto coords = cl::sycl::cl_int2(i, j); + handle.write(coords, data); + } + i++; + } + i = x / bpp; + j++; + } +} + +template +SYCL_EXTERNAL uint16_t +__esimd_any(sycl::ext::intel::gpu::vector_type_t src) { + for (unsigned int i = 0; i != N; i++) { + if (src[i] != 0) + return 1; + } + return 0; +} + +template +SYCL_EXTERNAL uint16_t +__esimd_all(sycl::ext::intel::gpu::vector_type_t src) { + for (unsigned int i = 0; i != N; i++) { + if (src[i] == 0) + return 0; + } + return 1; +} + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_dp4(sycl::ext::intel::gpu::vector_type_t v1, + sycl::ext::intel::gpu::vector_type_t v2) { + sycl::ext::intel::gpu::vector_type_t retv; + for (auto i = 0; i != N; i += 4) { + Ty dp = (v1[i] * v2[i]) + (v1[i + 1] * v2[i + 1]) + + (v1[i + 2] * v2[i + 2]) + (v1[i + 3] * v2[i + 3]); + retv[i] = dp; + retv[i + 1] = dp; + retv[i + 2] = dp; + retv[i + 3] = dp; + } + return retv; +} + +/// TODO +SYCL_EXTERNAL void __esimd_barrier() {} + +SYCL_EXTERNAL void __esimd_slm_fence(uint8_t cntl) {} + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_slm_read(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t pred) { + sycl::ext::intel::gpu::vector_type_t retv; + return retv; +} + +// slm_write does SLM scatter +template +SYCL_EXTERNAL void +__esimd_slm_write(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t vals, + sycl::ext::intel::gpu::vector_type_t pred) {} + +// slm_block_read reads a block of data from SLM +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t 
+__esimd_slm_block_read(uint32_t addr) { + sycl::ext::intel::gpu::vector_type_t retv; + return retv; +} + +// slm_block_write writes a block of data to SLM +template +SYCL_EXTERNAL void +__esimd_slm_block_write(uint32_t addr, + sycl::ext::intel::gpu::vector_type_t vals) {} + +// slm_read4 does SLM gather4 +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_slm_read4(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t pred) { + sycl::ext::intel::gpu::vector_type_t retv; + return retv; +} + +// slm_write4 does SLM scatter4 +template +SYCL_EXTERNAL void __esimd_slm_write4( + sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t vals, + sycl::ext::intel::gpu::vector_type_t pred) {} + +// slm_atomic: SLM atomic +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_slm_atomic0(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t pred) { + sycl::ext::intel::gpu::vector_type_t retv; + return retv; +} + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_slm_atomic1(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t src0, + sycl::ext::intel::gpu::vector_type_t pred) { + sycl::ext::intel::gpu::vector_type_t retv; + return retv; +} + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_slm_atomic2(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t src0, + sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t pred) { + sycl::ext::intel::gpu::vector_type_t retv; + return retv; +} + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_flat_atomic0(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t pred) { + sycl::ext::intel::gpu::vector_type_t retv; + return retv; +} + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t 
+__esimd_flat_atomic1(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t src0, + sycl::ext::intel::gpu::vector_type_t pred) { + sycl::ext::intel::gpu::vector_type_t retv; + return retv; +} + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_flat_atomic2(sycl::ext::intel::gpu::vector_type_t addrs, + sycl::ext::intel::gpu::vector_type_t src0, + sycl::ext::intel::gpu::vector_type_t src1, + sycl::ext::intel::gpu::vector_type_t pred) { + sycl::ext::intel::gpu::vector_type_t retv; + return retv; +} + +template +SYCL_EXTERNAL sycl::ext::intel::gpu::vector_type_t +__esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset) { + throw cl::sycl::feature_not_supported(); + return sycl::ext::intel::gpu::vector_type_t(); +} + +template +SYCL_EXTERNAL void +__esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset, + sycl::ext::intel::gpu::vector_type_t vals) { + + throw cl::sycl::feature_not_supported(); +} + +#endif // __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_region.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_region.hpp similarity index 99% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_region.hpp rename to sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_region.hpp index c1576415a882b..39910609e7942 100644 --- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_region.hpp +++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_region.hpp @@ -17,6 +17,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { namespace gpu { @@ -115,5 +116,6 @@ template T getBaseRegion(std::pair Reg) { } // namespace gpu } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_types.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_types.hpp similarity index 98% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_types.hpp rename to 
sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_types.hpp index 7ff12e9113dda..eb6b1fd87914c 100644 --- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_types.hpp +++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_types.hpp @@ -12,14 +12,15 @@ #include #include // to define C++14,17 extensions +#include +#include #include -#include -#include #include #include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { namespace gpu { @@ -257,5 +258,6 @@ inline std::istream &operator>>(std::istream &I, half &rhs) { } // namespace gpu } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_util.hpp b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_util.hpp old mode 100755 new mode 100644 similarity index 87% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_util.hpp rename to sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_util.hpp index 4bd68905e069b..2a39fa57e46d8 --- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_util.hpp +++ b/sycl/include/CL/sycl/ext/intel/esimd/detail/esimd_util.hpp @@ -69,6 +69,7 @@ static ESIMD_INLINE constexpr bool isPowerOf2(unsigned int n, __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { namespace gpu { @@ -84,11 +85,11 @@ template struct is_esimd_vector { static const bool value = false; }; template -struct is_esimd_vector> { +struct is_esimd_vector> { static const bool value = true; }; template -struct is_esimd_vector> { +struct is_esimd_vector> { static const bool value = true; }; @@ -106,12 +107,12 @@ struct is_dword_type }; template -struct is_dword_type> { +struct is_dword_type> { static const bool value = is_dword_type::value; }; template -struct is_dword_type> { +struct is_dword_type> { static const bool value = is_dword_type::value; }; @@ -124,11 +125,12 @@ struct is_word_type typename std::remove_const::type>::value> {}; template -struct is_word_type> { +struct 
is_word_type> { static const bool value = is_word_type::value; }; -template struct is_word_type> { +template +struct is_word_type> { static const bool value = is_word_type::value; }; @@ -141,11 +143,12 @@ struct is_byte_type typename std::remove_const::type>::value> {}; template -struct is_byte_type> { +struct is_byte_type> { static const bool value = is_byte_type::value; }; -template struct is_byte_type> { +template +struct is_byte_type> { static const bool value = is_byte_type::value; }; @@ -179,33 +182,33 @@ struct is_qword_type typename std::remove_const::type>::value> {}; template -struct is_qword_type> { +struct is_qword_type> { static const bool value = is_qword_type::value; }; template -struct is_qword_type> { +struct is_qword_type> { static const bool value = is_qword_type::value; }; // Extends to ESIMD vector types. template -struct is_fp_or_dword_type> { +struct is_fp_or_dword_type> { static const bool value = is_fp_or_dword_type::value; }; template -struct is_fp_or_dword_type> { +struct is_fp_or_dword_type> { static const bool value = is_fp_or_dword_type::value; }; /// Convert types into vector types template struct simd_type { - using type = sycl::intel::gpu::simd; + using type = sycl::ext::intel::gpu::simd; }; template -struct simd_type> { - using type = sycl::intel::gpu::simd; +struct simd_type> { + using type = sycl::ext::intel::gpu::simd; }; template struct simd_type { @@ -239,5 +242,6 @@ template <> struct word_type { using type = ushort; }; } // namespace details } // namespace gpu } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/esimd/esimd.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd.hpp similarity index 98% rename from sycl/include/CL/sycl/intel/esimd/esimd.hpp rename to sycl/include/CL/sycl/ext/intel/esimd/esimd.hpp index 757055dfa00fe..6b2bd52ff4ca5 100644 --- a/sycl/include/CL/sycl/intel/esimd/esimd.hpp +++ 
b/sycl/include/CL/sycl/ext/intel/esimd/esimd.hpp @@ -10,11 +10,12 @@ #pragma once -#include -#include +#include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { namespace gpu { @@ -445,13 +446,14 @@ ESIMD_INLINE simd convert(simd val) { } // namespace gpu } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) #ifndef __SYCL_DEVICE_ONLY__ template std::ostream &operator<<(std::ostream &OS, - const sycl::intel::gpu::simd &V) { + const sycl::ext::intel::gpu::simd &V) { OS << "{"; for (int I = 0; I < N; I++) { OS << V[I]; diff --git a/sycl/include/CL/sycl/intel/esimd/esimd_enum.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd_enum.hpp similarity index 98% rename from sycl/include/CL/sycl/intel/esimd/esimd_enum.hpp rename to sycl/include/CL/sycl/ext/intel/esimd/esimd_enum.hpp index 4b901ea079119..626d6002af35a 100644 --- a/sycl/include/CL/sycl/intel/esimd/esimd_enum.hpp +++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd_enum.hpp @@ -15,6 +15,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { namespace gpu { @@ -107,5 +108,6 @@ enum class CacheHint : uint8_t { } // namespace gpu } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/esimd/esimd_math.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd_math.hpp similarity index 99% rename from sycl/include/CL/sycl/intel/esimd/esimd_math.hpp rename to sycl/include/CL/sycl/ext/intel/esimd/esimd_math.hpp index cd5b962dd60a2..18781748e782e 100644 --- a/sycl/include/CL/sycl/intel/esimd/esimd_math.hpp +++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd_math.hpp @@ -10,15 +10,16 @@ #pragma once -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { namespace gpu { @@ -1946,5 +1947,6 @@ simd esimd_dp4(simd v1, 
simd v2) { } // namespace gpu } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/esimd/esimd_memory.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd_memory.hpp similarity index 98% rename from sycl/include/CL/sycl/intel/esimd/esimd_memory.hpp rename to sycl/include/CL/sycl/ext/intel/esimd/esimd_memory.hpp index 77035cb16c9c3..0903b4d9030b9 100644 --- a/sycl/include/CL/sycl/intel/esimd/esimd_memory.hpp +++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd_memory.hpp @@ -10,16 +10,17 @@ #pragma once +#include +#include +#include +#include +#include #include -#include -#include -#include -#include -#include #include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { namespace gpu { @@ -645,5 +646,6 @@ SYCL_EXTERNAL void slm_init(uint32_t size) {} #endif } // namespace gpu } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/esimd/esimd_view.hpp b/sycl/include/CL/sycl/ext/intel/esimd/esimd_view.hpp similarity index 99% rename from sycl/include/CL/sycl/intel/esimd/esimd_view.hpp rename to sycl/include/CL/sycl/ext/intel/esimd/esimd_view.hpp index 57338a0c51e86..7455b13ef509f 100644 --- a/sycl/include/CL/sycl/intel/esimd/esimd_view.hpp +++ b/sycl/include/CL/sycl/ext/intel/esimd/esimd_view.hpp @@ -10,10 +10,11 @@ #pragma once -#include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { namespace gpu { @@ -381,5 +382,6 @@ template class simd_view { } // namespace gpu } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/fpga_device_selector.hpp b/sycl/include/CL/sycl/ext/intel/fpga_device_selector.hpp similarity index 97% rename from sycl/include/CL/sycl/intel/fpga_device_selector.hpp rename to sycl/include/CL/sycl/ext/intel/fpga_device_selector.hpp index 
d5f9cab31180c..83d9e7683bdf2 100644 --- a/sycl/include/CL/sycl/intel/fpga_device_selector.hpp +++ b/sycl/include/CL/sycl/ext/intel/fpga_device_selector.hpp @@ -12,6 +12,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { class platform_selector : public device_selector { @@ -48,5 +49,6 @@ class fpga_emulator_selector : public platform_selector { }; } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/fpga_extensions.hpp b/sycl/include/CL/sycl/ext/intel/fpga_extensions.hpp similarity index 68% rename from sycl/include/CL/sycl/intel/fpga_extensions.hpp rename to sycl/include/CL/sycl/ext/intel/fpga_extensions.hpp index 7140421fe5189..9b019db1c1d41 100644 --- a/sycl/include/CL/sycl/intel/fpga_extensions.hpp +++ b/sycl/include/CL/sycl/ext/intel/fpga_extensions.hpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #pragma once -#include -#include -#include -#include +#include +#include +#include +#include diff --git a/sycl/include/CL/sycl/intel/fpga_lsu.hpp b/sycl/include/CL/sycl/ext/intel/fpga_lsu.hpp similarity index 99% rename from sycl/include/CL/sycl/intel/fpga_lsu.hpp rename to sycl/include/CL/sycl/ext/intel/fpga_lsu.hpp index 5f8d37f802e76..2b8324970658a 100644 --- a/sycl/include/CL/sycl/intel/fpga_lsu.hpp +++ b/sycl/include/CL/sycl/ext/intel/fpga_lsu.hpp @@ -13,6 +13,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { constexpr uint8_t BURST_COALESCE = 0x1; constexpr uint8_t CACHE = 0x2; @@ -109,5 +110,6 @@ template class lsu final { } }; } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/fpga_reg.hpp b/sycl/include/CL/sycl/ext/intel/fpga_reg.hpp similarity index 92% rename from sycl/include/CL/sycl/intel/fpga_reg.hpp rename to sycl/include/CL/sycl/ext/intel/fpga_reg.hpp index 
0078dd66c383c..a8c1b3605dce6 100644 --- a/sycl/include/CL/sycl/intel/fpga_reg.hpp +++ b/sycl/include/CL/sycl/ext/intel/fpga_reg.hpp @@ -12,6 +12,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { template T fpga_reg(const T &t) { @@ -23,6 +24,7 @@ template T fpga_reg(const T &t) { } } // namespace intel +} // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) @@ -30,6 +32,6 @@ template T fpga_reg(const T &t) { // Currently clang does not support nested namespace for attributes namespace intelfpga { template T fpga_reg(const T &t) { - return cl::sycl::intel::fpga_reg(t); + return cl::sycl::ext::intel::fpga_reg(t); } } diff --git a/sycl/include/CL/sycl/intel/fpga_utils.hpp b/sycl/include/CL/sycl/ext/intel/fpga_utils.hpp similarity index 96% rename from sycl/include/CL/sycl/intel/fpga_utils.hpp rename to sycl/include/CL/sycl/ext/intel/fpga_utils.hpp index be9bf1a6fc5af..60324b4149c12 100644 --- a/sycl/include/CL/sycl/intel/fpga_utils.hpp +++ b/sycl/include/CL/sycl/ext/intel/fpga_utils.hpp @@ -13,6 +13,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +namespace ext { namespace intel { template