From 2938c8c51274d3692fde49051df49ba78d3650ce Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Thu, 5 Jun 2025 21:08:01 +0200 Subject: [PATCH 1/6] [SYCL][NFCI] Drop `__spirv_ops.hpp` from `core.hpp` Our device compiler have been capable of automagically declaring necessary SPIR-V built-ins on the fly for a while now, meaning that we don't need them to be forward-declared in headers. This PR drops includes of `__spirv_ops.hpp` so that they don't appear anymore in `core.hpp` (and some other headers). The header is not removed entirely, however, because not every built-in is known to the compiler, i.e. some of them still have to be forward-declared in the header. Most likely there are other places which can be made free of uses of the header and the header itself can probably be cleaned up agressively, but I will leave it for separate future PRs. --- sycl/include/sycl/__spirv/spirv_ops.hpp | 46 ------ sycl/include/sycl/access/access.hpp | 118 --------------- .../sycl/detail/address_space_cast.hpp | 137 ++++++++++++++++++ sycl/include/sycl/device_event.hpp | 9 +- .../ext/intel/esimd/detail/memory_intrin.hpp | 4 + .../ext/oneapi/experimental/address_cast.hpp | 2 +- .../oneapi/experimental/group_load_store.hpp | 1 + .../sycl/ext/oneapi/experimental/prefetch.hpp | 4 +- sycl/include/sycl/group.hpp | 67 ++++++--- sycl/include/sycl/kernel_handler.hpp | 4 - sycl/include/sycl/multi_ptr.hpp | 6 +- sycl/include/sycl/nd_item.hpp | 88 +++++++---- sycl/include/sycl/sub_group.hpp | 3 +- .../test/extensions/address_cast_negative.cpp | 4 +- sycl/test/include_deps/sycl_accessor.hpp.cpp | 6 +- sycl/test/include_deps/sycl_buffer.hpp.cpp | 7 +- .../include_deps/sycl_detail_core.hpp.cpp | 6 +- 17 files changed, 271 insertions(+), 241 deletions(-) create mode 100644 sycl/include/sycl/detail/address_space_cast.hpp diff --git a/sycl/include/sycl/__spirv/spirv_ops.hpp b/sycl/include/sycl/__spirv/spirv_ops.hpp index 88c503e02efe..0a4fbf400c74 100644 --- a/sycl/include/sycl/__spirv/spirv_ops.hpp +++ b/sycl/include/sycl/__spirv/spirv_ops.hpp @@ -233,9 +233,6 @@ template extern __DPCPP_SYCL_EXTERNAL RetT __spirv_ConvertHandleToSampledImageINTEL(HandleT); -#define __SYCL_OpGroupAsyncCopyGlobalToLocal __spirv_GroupAsyncCopy -#define __SYCL_OpGroupAsyncCopyLocalToGlobal __spirv_GroupAsyncCopy - // Atomic SPIR-V builtins #define __SPIRV_ATOMIC_LOAD(AS, Type) \ extern __DPCPP_SYCL_EXTERNAL Type __spirv_AtomicLoad(AS Type *P, int S, \ @@ -792,10 +789,6 @@ extern __DPCPP_SYCL_EXTERNAL __ocl_WPipeTy __spirv_CreatePipeFromPipeStorage_write( const ConstantPipeStorage *Storage) noexcept; -extern __DPCPP_SYCL_EXTERNAL void -__spirv_ocl_prefetch(const __attribute__((opencl_global)) char *Ptr, - size_t NumBytes) noexcept; - extern __DPCPP_SYCL_EXTERNAL float __spirv_ConvertBF16ToFINTEL(uint16_t) noexcept; extern __DPCPP_SYCL_EXTERNAL uint16_t @@ -967,43 +960,4 @@ extern __DPCPP_SYCL_EXTERNAL RetT __spirv_TaskSequenceGetINTEL( extern __DPCPP_SYCL_EXTERNAL void __spirv_TaskSequenceReleaseINTEL( __spv::__spirv_TaskSequenceINTEL *TaskSequence) noexcept; -#else // if !__SYCL_DEVICE_ONLY__ - -template -__SYCL_CONVERGENT__ extern __ocl_event_t -__SYCL_OpGroupAsyncCopyGlobalToLocal(int32_t, dataT *Dest, const dataT *Src, - size_t NumElements, size_t Stride, - __ocl_event_t) noexcept { - for (size_t i = 0; i < NumElements; i++) { - Dest[i] = Src[i * Stride]; - } - // A real instance of the class is not needed, return dummy pointer. - return nullptr; -} - -template -__SYCL_CONVERGENT__ extern __ocl_event_t -__SYCL_OpGroupAsyncCopyLocalToGlobal(int32_t, dataT *Dest, const dataT *Src, - size_t NumElements, size_t Stride, - __ocl_event_t) noexcept { - for (size_t i = 0; i < NumElements; i++) { - Dest[i * Stride] = Src[i]; - } - // A real instance of the class is not needed, return dummy pointer. - return nullptr; -} - -extern __SYCL_EXPORT void __spirv_ocl_prefetch(const char *Ptr, - size_t NumBytes) noexcept; - -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void -__spirv_ControlBarrier(__spv::Scope Execution, __spv::Scope Memory, - uint32_t Semantics) noexcept; - -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void -__spirv_MemoryBarrier(__spv::Scope Memory, uint32_t Semantics) noexcept; - -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void -__spirv_GroupWaitEvents(__spv::Scope Execution, uint32_t NumEvents, - __ocl_event_t *WaitEvents) noexcept; #endif // !__SYCL_DEVICE_ONLY__ diff --git a/sycl/include/sycl/access/access.hpp b/sycl/include/sycl/access/access.hpp index 9c3f4e8c7fd6..1ef5de5931f4 100644 --- a/sycl/include/sycl/access/access.hpp +++ b/sycl/include/sycl/access/access.hpp @@ -11,7 +11,6 @@ #include // for __SYCL2020_DEPRECATED #ifdef __SYCL_DEVICE_ONLY__ -#include #include #endif @@ -319,123 +318,6 @@ template struct remove_decoration { template using remove_decoration_t = typename remove_decoration::type; -namespace detail { -#ifdef __SYCL_DEVICE_ONLY__ -inline constexpr bool -address_space_cast_is_possible(access::address_space Src, - access::address_space Dst) { - // constant_space is unique and is not interchangeable with any other. - auto constant_space = access::address_space::constant_space; - if (Src == constant_space || Dst == constant_space) - return Src == Dst; - - auto generic_space = access::address_space::generic_space; - if (Src == Dst || Src == generic_space || Dst == generic_space) - return true; - - // global_host/global_device could be casted to/from global - auto global_space = access::address_space::global_space; - auto global_device = access::address_space::ext_intel_global_device_space; - auto global_host = access::address_space::ext_intel_global_host_space; - - if (Src == global_space || Dst == global_space) { - auto Other = Src == global_space ? Dst : Src; - if (Other == global_device || Other == global_host) - return true; - } - - // No more compatible combinations. - return false; -} - -template -auto static_address_cast(ElementType *Ptr) { - constexpr auto SrcAS = deduce_AS::value; - static_assert(address_space_cast_is_possible(SrcAS, Space)); - - using dst_type = typename DecoratedType< - std::remove_pointer_t>, Space>::type *; - - // Note: reinterpret_cast isn't enough for some of the casts between different - // address spaces, use C-style cast instead. - return (dst_type)Ptr; -} - -// Previous implementation (`castAS`, used in `multi_ptr` ctors among other -// places), used C-style cast instead of a proper dynamic check for some -// backends/spaces. `SupressNotImplementedAssert = true` parameter is emulating -// that previous behavior until the proper support is added for compatibility -// reasons. -template -auto dynamic_address_cast(ElementType *Ptr) { - constexpr auto generic_space = access::address_space::generic_space; - constexpr auto global_space = access::address_space::global_space; - constexpr auto local_space = access::address_space::local_space; - constexpr auto private_space = access::address_space::private_space; - constexpr auto global_device = - access::address_space::ext_intel_global_device_space; - constexpr auto global_host = - access::address_space::ext_intel_global_host_space; - - constexpr auto SrcAS = deduce_AS::value; - using dst_type = typename DecoratedType< - std::remove_pointer_t>, Space>::type *; - using RemoveCvT = std::remove_cv_t; - - if constexpr (!address_space_cast_is_possible(SrcAS, Space)) { - return (dst_type) nullptr; - } else if constexpr (Space == generic_space) { - return (dst_type)Ptr; - } else if constexpr (Space == global_space && - (SrcAS == global_device || SrcAS == global_host)) { - return (dst_type)Ptr; - } else if constexpr (SrcAS == global_space && - (Space == global_device || Space == global_host)) { -#if defined(__ENABLE_USM_ADDR_SPACE__) - static_assert(SupressNotImplementedAssert || Space != Space, - "Not supported yet!"); - return detail::static_address_cast(Ptr); -#else - // If __ENABLE_USM_ADDR_SPACE__ isn't defined then both - // global_device/global_host are just aliases for global_space. - static_assert(std::is_same_v); - return (dst_type)Ptr; -#endif - } else if constexpr (Space == global_space) { - return (dst_type)__spirv_GenericCastToPtrExplicit_ToGlobal( - const_cast(Ptr), __spv::StorageClass::CrossWorkgroup); - } else if constexpr (Space == local_space) { - return (dst_type)__spirv_GenericCastToPtrExplicit_ToLocal( - const_cast(Ptr), __spv::StorageClass::Workgroup); - } else if constexpr (Space == private_space) { - return (dst_type)__spirv_GenericCastToPtrExplicit_ToPrivate( - const_cast(Ptr), __spv::StorageClass::Function); -#if !defined(__ENABLE_USM_ADDR_SPACE__) - } else if constexpr (SrcAS == generic_space && - (Space == global_device || Space == global_host)) { - return (dst_type)__spirv_GenericCastToPtrExplicit_ToGlobal( - const_cast(Ptr), __spv::StorageClass::CrossWorkgroup); -#endif - } else { - static_assert(SupressNotImplementedAssert || Space != Space, - "Not supported yet!"); - return detail::static_address_cast(Ptr); - } -} -#else // __SYCL_DEVICE_ONLY__ -template -auto static_address_cast(ElementType *Ptr) { - return Ptr; -} -template -auto dynamic_address_cast(ElementType *Ptr) { - return Ptr; -} -#endif // __SYCL_DEVICE_ONLY__ -} // namespace detail - #undef __OPENCL_GLOBAL_AS__ #undef __OPENCL_GLOBAL_DEVICE_AS__ #undef __OPENCL_GLOBAL_HOST_AS__ diff --git a/sycl/include/sycl/detail/address_space_cast.hpp b/sycl/include/sycl/detail/address_space_cast.hpp new file mode 100644 index 000000000000..10d8ee4c1fc1 --- /dev/null +++ b/sycl/include/sycl/detail/address_space_cast.hpp @@ -0,0 +1,137 @@ +//==------- address_space_cast.hpp --- Implementation of AS casts ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +namespace sycl { +inline namespace _V1 { + +namespace detail { +#ifdef __SYCL_DEVICE_ONLY__ +inline constexpr bool +address_space_cast_is_possible(access::address_space Src, + access::address_space Dst) { + // constant_space is unique and is not interchangeable with any other. + auto constant_space = access::address_space::constant_space; + if (Src == constant_space || Dst == constant_space) + return Src == Dst; + + auto generic_space = access::address_space::generic_space; + if (Src == Dst || Src == generic_space || Dst == generic_space) + return true; + + // global_host/global_device could be casted to/from global + auto global_space = access::address_space::global_space; + auto global_device = access::address_space::ext_intel_global_device_space; + auto global_host = access::address_space::ext_intel_global_host_space; + + if (Src == global_space || Dst == global_space) { + auto Other = Src == global_space ? Dst : Src; + if (Other == global_device || Other == global_host) + return true; + } + + // No more compatible combinations. + return false; +} + +template +auto static_address_cast(ElementType *Ptr) { + constexpr auto SrcAS = deduce_AS::value; + static_assert(address_space_cast_is_possible(SrcAS, Space)); + + using dst_type = typename DecoratedType< + std::remove_pointer_t>, Space>::type *; + + // Note: reinterpret_cast isn't enough for some of the casts between different + // address spaces, use C-style cast instead. + return (dst_type)Ptr; +} + +// Previous implementation (`castAS`, used in `multi_ptr` ctors among other +// places), used C-style cast instead of a proper dynamic check for some +// backends/spaces. `SupressNotImplementedAssert = true` parameter is emulating +// that previous behavior until the proper support is added for compatibility +// reasons. +template +auto dynamic_address_cast(ElementType *Ptr) { + constexpr auto generic_space = access::address_space::generic_space; + constexpr auto global_space = access::address_space::global_space; + constexpr auto local_space = access::address_space::local_space; + constexpr auto private_space = access::address_space::private_space; + constexpr auto global_device = + access::address_space::ext_intel_global_device_space; + constexpr auto global_host = + access::address_space::ext_intel_global_host_space; + + constexpr auto SrcAS = deduce_AS::value; + using dst_type = typename DecoratedType< + std::remove_pointer_t>, Space>::type *; + using RemoveCvT = std::remove_cv_t; + + if constexpr (!address_space_cast_is_possible(SrcAS, Space)) { + return (dst_type) nullptr; + } else if constexpr (Space == generic_space) { + return (dst_type)Ptr; + } else if constexpr (Space == global_space && + (SrcAS == global_device || SrcAS == global_host)) { + return (dst_type)Ptr; + } else if constexpr (SrcAS == global_space && + (Space == global_device || Space == global_host)) { +#if defined(__ENABLE_USM_ADDR_SPACE__) + static_assert(SupressNotImplementedAssert || Space != Space, + "Not supported yet!"); + return detail::static_address_cast(Ptr); +#else + // If __ENABLE_USM_ADDR_SPACE__ isn't defined then both + // global_device/global_host are just aliases for global_space. + static_assert(std::is_same_v); + return (dst_type)Ptr; +#endif + } else if constexpr (Space == global_space) { + return (dst_type)__spirv_GenericCastToPtrExplicit_ToGlobal( + const_cast(Ptr), __spv::StorageClass::CrossWorkgroup); + } else if constexpr (Space == local_space) { + return (dst_type)__spirv_GenericCastToPtrExplicit_ToLocal( + const_cast(Ptr), __spv::StorageClass::Workgroup); + } else if constexpr (Space == private_space) { + return (dst_type)__spirv_GenericCastToPtrExplicit_ToPrivate( + const_cast(Ptr), __spv::StorageClass::Function); +#if !defined(__ENABLE_USM_ADDR_SPACE__) + } else if constexpr (SrcAS == generic_space && + (Space == global_device || Space == global_host)) { + return (dst_type)__spirv_GenericCastToPtrExplicit_ToGlobal( + const_cast(Ptr), __spv::StorageClass::CrossWorkgroup); +#endif + } else { + static_assert(SupressNotImplementedAssert || Space != Space, + "Not supported yet!"); + return detail::static_address_cast(Ptr); + } +} +#else // __SYCL_DEVICE_ONLY__ +template +auto static_address_cast(ElementType *Ptr) { + return Ptr; +} +template +auto dynamic_address_cast(ElementType *Ptr) { + return Ptr; +} +#endif // __SYCL_DEVICE_ONLY__ +} // namespace detail + +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/device_event.hpp b/sycl/include/sycl/device_event.hpp index e68662d731fa..ea54172f328d 100644 --- a/sycl/include/sycl/device_event.hpp +++ b/sycl/include/sycl/device_event.hpp @@ -8,7 +8,6 @@ #pragma once -#include #include namespace sycl { @@ -21,7 +20,7 @@ inline namespace _V1 { /// \ingroup sycl_api class device_event { private: - __ocl_event_t m_Event; + [[maybe_unused]] __ocl_event_t m_Event; public: device_event(const device_event &rhs) = default; @@ -31,7 +30,11 @@ class device_event { device_event(__ocl_event_t Event) : m_Event(Event) {} - void wait() { __spirv_GroupWaitEvents(__spv::Scope::Workgroup, 1, &m_Event); } + void wait() { +#ifdef __SYCL_DEVICE_ONLY__ + __spirv_GroupWaitEvents(__spv::Scope::Workgroup, 1, &m_Event); +#endif + } }; } // namespace _V1 diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp index 1bc5ae0b0b61..d39b0b9660f4 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp @@ -19,6 +19,10 @@ #include #include +#ifdef __SYCL_DEVICE_ONLY__ +#include +#endif + #include namespace sycl { diff --git a/sycl/include/sycl/ext/oneapi/experimental/address_cast.hpp b/sycl/include/sycl/ext/oneapi/experimental/address_cast.hpp index 4708cd2224e1..484f3a5feb9a 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/address_cast.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/address_cast.hpp @@ -8,7 +8,7 @@ #pragma once -#include +#include #include namespace sycl { diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp index 48542c8b6603..5ee3e1d1d059 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp @@ -10,6 +10,7 @@ #pragma once +#include #include #include #include diff --git a/sycl/include/sycl/ext/oneapi/experimental/prefetch.hpp b/sycl/include/sycl/ext/oneapi/experimental/prefetch.hpp index 29c25d6a0860..a10ac625acfc 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/prefetch.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/prefetch.hpp @@ -73,10 +73,10 @@ template void prefetch_impl(T *ptr, size_t bytes, Properties properties) { #ifdef __SYCL_DEVICE_ONLY__ auto *ptrGlobalAS = - reinterpret_cast<__attribute__((opencl_global)) const char *>( + reinterpret_cast<__attribute__((opencl_global)) const unsigned char *>( detail::static_address_cast( const_cast(ptr))); - const __attribute__((opencl_global)) char *ptrAnnotated = nullptr; + const __attribute__((opencl_global)) unsigned char *ptrAnnotated = nullptr; if constexpr (!properties.template has_property()) { ptrAnnotated = __builtin_intel_sycl_ptr_annotation( ptrGlobalAS, "sycl-prefetch-hint", static_cast(cache_level::L1)); diff --git a/sycl/include/sycl/group.hpp b/sycl/include/sycl/group.hpp index 50b5fe6e586a..e989782176e7 100644 --- a/sycl/include/sycl/group.hpp +++ b/sycl/include/sycl/group.hpp @@ -8,7 +8,6 @@ #pragma once -#include // for __spirv_MemoryBarrier #include // for Scope, __ocl_event_t #include // for decorated, mode, addr... #include // for NDLoop, __SYCL_ASSERT @@ -286,12 +285,15 @@ template class __SYCL_TYPE(group) group { /// space, global address space or both based on the value of \p accessSpace. template void mem_fence( + [[maybe_unused]] typename std::enable_if_t accessSpace = access::fence_space::global_and_local) const { - uint32_t flags = detail::getSPIRVMemorySemanticsMask(accessSpace); +#ifdef __SYCL_DEVICE_ONLY__ + uint32_t flags = + detail::getSPIRVMemorySemanticsMask(accessSpace); // TODO: currently, there is no good way in SPIR-V to set the memory // barrier only for load operations or only for store operations. // The full read-and-write barrier is used and the template parameter @@ -301,6 +303,7 @@ template class __SYCL_TYPE(group) group { // we can fix this later, for example, by using OpenCL 1.2 functions // read_mem_fence() and write_mem_fence(). __spirv_MemoryBarrier(__spv::Scope::Workgroup, flags); +#endif } /// Asynchronously copies a number of elements specified by \p numElements @@ -310,15 +313,23 @@ template class __SYCL_TYPE(group) group { /// Permitted types for dataT are all scalar and vector types, except boolean. template __SYCL2020_DEPRECATED("Use decorated multi_ptr arguments instead") - std::enable_if_t::value, - device_event> async_work_group_copy(local_ptr dest, - global_ptr src, - size_t numElements, - size_t srcStride) const { - __ocl_event_t E = __SYCL_OpGroupAsyncCopyGlobalToLocal( + std::enable_if_t< + !detail::is_bool::value, + device_event> async_work_group_copy([[maybe_unused]] local_ptr + dest, + [[maybe_unused]] global_ptr + src, + [[maybe_unused]] size_t numElements, + [[maybe_unused]] size_t srcStride) + const { +#ifdef __SYCL_DEVICE_ONLY__ + __ocl_event_t E = __spirv_GroupAsyncCopy( __spv::Scope::Workgroup, detail::convertToOpenCLType(dest), detail::convertToOpenCLType(src), numElements, srcStride, 0); return device_event(E); +#else + return nullptr; +#endif } /// Asynchronously copies a number of elements specified by \p numElements @@ -329,15 +340,19 @@ template class __SYCL_TYPE(group) group { template __SYCL2020_DEPRECATED("Use decorated multi_ptr arguments instead") std::enable_if_t::value, - device_event> async_work_group_copy(global_ptr dest, - local_ptr src, - size_t numElements, - size_t destStride) + device_event> async_work_group_copy([[maybe_unused]] global_ptr dest, + [[maybe_unused]] local_ptr src, + [[maybe_unused]] size_t numElements, + [[maybe_unused]] size_t destStride) const { - __ocl_event_t E = __SYCL_OpGroupAsyncCopyLocalToGlobal( +#ifdef __SYCL_DEVICE_ONLY__ + __ocl_event_t E = __spirv_GroupAsyncCopy( __spv::Scope::Workgroup, detail::convertToOpenCLType(dest), detail::convertToOpenCLType(src), numElements, destStride, 0); return device_event(E); +#else + return nullptr; +#endif } /// Asynchronously copies a number of elements specified by \p numElements @@ -350,13 +365,18 @@ template class __SYCL_TYPE(group) group { std::enable_if_t::value && std::is_same_v, DestDataT>, device_event> - async_work_group_copy(decorated_local_ptr dest, - decorated_global_ptr src, size_t numElements, - size_t srcStride) const { - __ocl_event_t E = __SYCL_OpGroupAsyncCopyGlobalToLocal( + async_work_group_copy([[maybe_unused]] decorated_local_ptr dest, + [[maybe_unused]] decorated_global_ptr src, + [[maybe_unused]] size_t numElements, + [[maybe_unused]] size_t srcStride) const { +#ifdef __SYCL_DEVICE_ONLY__ + __ocl_event_t E = __spirv_GroupAsyncCopy( __spv::Scope::Workgroup, detail::convertToOpenCLType(dest), detail::convertToOpenCLType(src), numElements, srcStride, 0); return device_event(E); +#else + return nullptr; +#endif } /// Asynchronously copies a number of elements specified by \p numElements @@ -369,13 +389,18 @@ template class __SYCL_TYPE(group) group { std::enable_if_t::value && std::is_same_v, DestDataT>, device_event> - async_work_group_copy(decorated_global_ptr dest, - decorated_local_ptr src, size_t numElements, - size_t destStride) const { - __ocl_event_t E = __SYCL_OpGroupAsyncCopyLocalToGlobal( + async_work_group_copy([[maybe_unused]] decorated_global_ptr dest, + [[maybe_unused]] decorated_local_ptr src, + [[maybe_unused]] size_t numElements, + [[maybe_unused]] size_t destStride) const { +#ifdef __SYCL_DEVICE_ONLY__ + __ocl_event_t E = __spirv_GroupAsyncCopy( __spv::Scope::Workgroup, detail::convertToOpenCLType(dest), detail::convertToOpenCLType(src), numElements, destStride, 0); return device_event(E); +#else + return nullptr; +#endif } /// Specialization for scalar bool type. diff --git a/sycl/include/sycl/kernel_handler.hpp b/sycl/include/sycl/kernel_handler.hpp index 2e2ab51e9336..6ae2a7313296 100644 --- a/sycl/include/sycl/kernel_handler.hpp +++ b/sycl/include/sycl/kernel_handler.hpp @@ -12,10 +12,6 @@ #include // for __SYCL_ALWAYS_INLINE #include // for feature_not_supported -#ifdef __SYCL_DEVICE_ONLY__ -#include -#endif - #include // for remove_reference_t #ifdef __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/sycl/multi_ptr.hpp b/sycl/include/sycl/multi_ptr.hpp index b1946fb68afc..90261452f483 100644 --- a/sycl/include/sycl/multi_ptr.hpp +++ b/sycl/include/sycl/multi_ptr.hpp @@ -8,9 +8,9 @@ #pragma once -#include // for __spirv_ocl_prefetch -#include // for address_space, decorated -#include // for half +#include // for address_space, decorated +#include // for half +#include #include // for __SYCL2020_DEPRECATED #include // for const_if_const_AS #include // for BIsRepresentationT diff --git a/sycl/include/sycl/nd_item.hpp b/sycl/include/sycl/nd_item.hpp index f1a5bfc63c91..76ec89f2bba7 100644 --- a/sycl/include/sycl/nd_item.hpp +++ b/sycl/include/sycl/nd_item.hpp @@ -8,7 +8,6 @@ #pragma once -#include // for __spirv_ControlBarrier #include // for Scope #include // for initLocalInvocationId #include // for mode, fence_space @@ -198,11 +197,13 @@ template class nd_item { get_offset()); } - void barrier(access::fence_space accessSpace = + void barrier([[maybe_unused]] access::fence_space accessSpace = access::fence_space::global_and_local) const { +#ifdef __SYCL_DEVICE_ONLY__ uint32_t flags = _V1::detail::getSPIRVMemorySemanticsMask(accessSpace); __spirv_ControlBarrier(__spv::Scope::Workgroup, __spv::Scope::Workgroup, flags); +#endif } /// Executes a work-group mem-fence with memory ordering on the local address @@ -210,11 +211,13 @@ template class nd_item { template __SYCL2020_DEPRECATED("use sycl::atomic_fence() free function instead") void mem_fence( + [[maybe_unused]] typename std::enable_if_t accessSpace = access::fence_space::global_and_local) const { +#if __SYCL_DEVICE_ONLY__ uint32_t flags = detail::getSPIRVMemorySemanticsMask(accessSpace); // TODO: currently, there is no good way in SPIR-V to set the memory // barrier only for load operations or only for store operations. @@ -225,6 +228,7 @@ template class nd_item { // we can fix this later, for example, by using OpenCL 1.2 functions // read_mem_fence() and write_mem_fence(). __spirv_MemoryBarrier(__spv::Scope::Workgroup, flags); +#endif } /// Asynchronously copies a number of elements specified by \p numElements @@ -234,18 +238,26 @@ template class nd_item { /// Permitted types for dataT are all scalar and vector types, except boolean. template __SYCL2020_DEPRECATED("Use decorated multi_ptr arguments instead") - std::enable_if_t::value, - device_event> async_work_group_copy(local_ptr dest, - global_ptr src, - size_t numElements, - size_t srcStride) const { + std::enable_if_t< + !detail::is_bool::value, + device_event> async_work_group_copy([[maybe_unused]] local_ptr + dest, + [[maybe_unused]] global_ptr + src, + [[maybe_unused]] size_t numElements, + [[maybe_unused]] size_t srcStride) + const { +#ifdef __SYCL_DEVICE_ONLY__ using DestT = detail::ConvertToOpenCLType_t; using SrcT = detail::ConvertToOpenCLType_t; - __ocl_event_t E = __SYCL_OpGroupAsyncCopyGlobalToLocal( - __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, srcStride, 0); + __ocl_event_t E = + __spirv_GroupAsyncCopy(__spv::Scope::Workgroup, DestT(dest.get()), + SrcT(src.get()), numElements, srcStride, 0); return device_event(E); +#else + return nullptr; +#endif } /// Asynchronously copies a number of elements specified by \p numElements @@ -255,19 +267,25 @@ template class nd_item { /// Permitted types for dataT are all scalar and vector types, except boolean. template __SYCL2020_DEPRECATED("Use decorated multi_ptr arguments instead") - std::enable_if_t::value, - device_event> async_work_group_copy(global_ptr dest, - local_ptr src, - size_t numElements, - size_t destStride) + std::enable_if_t< + !detail::is_bool::value, + device_event> async_work_group_copy([[maybe_unused]] global_ptr + dest, + [[maybe_unused]] local_ptr src, + [[maybe_unused]] size_t numElements, + [[maybe_unused]] size_t destStride) const { +#ifdef __SYCL_DEVICE_ONLY__ using DestT = detail::ConvertToOpenCLType_t; using SrcT = detail::ConvertToOpenCLType_t; - __ocl_event_t E = __SYCL_OpGroupAsyncCopyLocalToGlobal( - __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, destStride, 0); + __ocl_event_t E = + __spirv_GroupAsyncCopy(__spv::Scope::Workgroup, DestT(dest.get()), + SrcT(src.get()), numElements, destStride, 0); return device_event(E); +#else + return nullptr; +#endif } /// Asynchronously copies a number of elements specified by \p numElements @@ -280,16 +298,21 @@ template class nd_item { std::enable_if_t::value && std::is_same_v, DestDataT>, device_event> - async_work_group_copy(decorated_local_ptr dest, - decorated_global_ptr src, size_t numElements, - size_t srcStride) const { + async_work_group_copy([[maybe_unused]] decorated_local_ptr dest, + [[maybe_unused]] decorated_global_ptr src, + [[maybe_unused]] size_t numElements, + [[maybe_unused]] size_t srcStride) const { +#ifdef __SYCL_DEVICE_ONLY__ using DestT = detail::ConvertToOpenCLType_t; using SrcT = detail::ConvertToOpenCLType_t; - __ocl_event_t E = __SYCL_OpGroupAsyncCopyGlobalToLocal( - __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, srcStride, 0); + __ocl_event_t E = + __spirv_GroupAsyncCopy(__spv::Scope::Workgroup, DestT(dest.get()), + SrcT(src.get()), numElements, srcStride, 0); return device_event(E); +#else + return nullptr; +#endif } /// Asynchronously copies a number of elements specified by \p numElements @@ -302,16 +325,21 @@ template class nd_item { std::enable_if_t::value && std::is_same_v, DestDataT>, device_event> - async_work_group_copy(decorated_global_ptr dest, - decorated_local_ptr src, size_t numElements, - size_t destStride) const { + async_work_group_copy([[maybe_unused]] decorated_global_ptr dest, + [[maybe_unused]] decorated_local_ptr src, + [[maybe_unused]] size_t numElements, + [[maybe_unused]] size_t destStride) const { +#ifdef __SYCL_DEVICE_ONLY__ using DestT = detail::ConvertToOpenCLType_t; using SrcT = detail::ConvertToOpenCLType_t; - __ocl_event_t E = __SYCL_OpGroupAsyncCopyLocalToGlobal( - __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, destStride, 0); + __ocl_event_t E = + __spirv_GroupAsyncCopy(__spv::Scope::Workgroup, DestT(dest.get()), + SrcT(src.get()), numElements, destStride, 0); return device_event(E); +#else + return nullptr; +#endif } /// Specialization for scalar bool type. diff --git a/sycl/include/sycl/sub_group.hpp b/sycl/include/sycl/sub_group.hpp index a44f42c44f30..d7a29d70149e 100644 --- a/sycl/include/sycl/sub_group.hpp +++ b/sycl/include/sycl/sub_group.hpp @@ -8,7 +8,8 @@ #pragma once -#include // for address_space, decorated +#include // for address_space, decorated +#include #include // for __SYCL_DEPRECATED #include // for select_cl_scalar_inte... #include // for is_scalar_arithmetic diff --git a/sycl/test/extensions/address_cast_negative.cpp b/sycl/test/extensions/address_cast_negative.cpp index f325d283eed9..01aafaf85530 100644 --- a/sycl/test/extensions/address_cast_negative.cpp +++ b/sycl/test/extensions/address_cast_negative.cpp @@ -5,10 +5,10 @@ using namespace sycl::ext::oneapi::experimental; SYCL_EXTERNAL void test(int *p) { - // expected-error-re@sycl/access/access.hpp:* {{{{.*}}Not supported yet!}} + // expected-error-re@sycl/detail/address_space_cast.hpp:* {{{{.*}}Not supported yet!}} std::ignore = dynamic_address_cast< sycl::access::address_space::ext_intel_global_device_space>(p); - // expected-error-re@sycl/access/access.hpp:* {{{{.*}}Not supported yet!}} + // expected-error-re@sycl/detail/address_space_cast.hpp:* {{{{.*}}Not supported yet!}} std::ignore = dynamic_address_cast< sycl::access::address_space::ext_intel_global_host_space>(p); } diff --git a/sycl/test/include_deps/sycl_accessor.hpp.cpp b/sycl/test/include_deps/sycl_accessor.hpp.cpp index af86805dc146..9df04c4d8d9b 100644 --- a/sycl/test/include_deps/sycl_accessor.hpp.cpp +++ b/sycl/test/include_deps/sycl_accessor.hpp.cpp @@ -7,16 +7,16 @@ // CHECK-NEXT: accessor.hpp // CHECK-NEXT: access/access.hpp // CHECK-NEXT: detail/defines_elementary.hpp -// CHECK-NEXT: __spirv/spirv_ops.hpp +// CHECK-NEXT: atomic.hpp // CHECK-NEXT: __spirv/spirv_types.hpp // CHECK-NEXT: detail/defines.hpp -// CHECK-NEXT: detail/export.hpp -// CHECK-NEXT: atomic.hpp // CHECK-NEXT: detail/helpers.hpp +// CHECK-NEXT: detail/export.hpp // CHECK-NEXT: memory_enums.hpp // CHECK-NEXT: __spirv/spirv_vars.hpp // CHECK-NEXT: multi_ptr.hpp // CHECK-NEXT: aliases.hpp +// CHECK-NEXT: detail/address_space_cast.hpp // CHECK-NEXT: detail/type_traits.hpp // CHECK-NEXT: detail/type_traits/vec_marray_traits.hpp // CHECK-NEXT: half_type.hpp diff --git a/sycl/test/include_deps/sycl_buffer.hpp.cpp b/sycl/test/include_deps/sycl_buffer.hpp.cpp index 0c38ee6bde31..0a821e1fe79c 100644 --- a/sycl/test/include_deps/sycl_buffer.hpp.cpp +++ b/sycl/test/include_deps/sycl_buffer.hpp.cpp @@ -7,18 +7,17 @@ // CHECK-NEXT: buffer.hpp // CHECK-NEXT: access/access.hpp // CHECK-NEXT: detail/defines_elementary.hpp -// CHECK-NEXT: __spirv/spirv_ops.hpp -// CHECK-NEXT: __spirv/spirv_types.hpp -// CHECK-NEXT: detail/defines.hpp -// CHECK-NEXT: detail/export.hpp // CHECK-NEXT: backend_types.hpp // CHECK-NEXT: detail/array.hpp // CHECK-NEXT: exception.hpp +// CHECK-NEXT: detail/export.hpp // CHECK-NEXT: detail/string.hpp // CHECK-NEXT: detail/common.hpp // CHECK-NEXT: stl_wrappers/cassert // CHECK-NEXT: stl_wrappers/assert.h // CHECK-NEXT: __spirv/spirv_vars.hpp +// CHECK-NEXT: __spirv/spirv_types.hpp +// CHECK-NEXT: detail/defines.hpp // CHECK-NEXT: detail/helpers.hpp // CHECK-NEXT: memory_enums.hpp // CHECK-NEXT: detail/iostream_proxy.hpp diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index 1240a1e51405..6b5b40470350 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -8,16 +8,16 @@ // CHECK-NEXT: accessor.hpp // CHECK-NEXT: access/access.hpp // CHECK-NEXT: detail/defines_elementary.hpp -// CHECK-NEXT: __spirv/spirv_ops.hpp +// CHECK-NEXT: atomic.hpp // CHECK-NEXT: __spirv/spirv_types.hpp // CHECK-NEXT: detail/defines.hpp -// CHECK-NEXT: detail/export.hpp -// CHECK-NEXT: atomic.hpp // CHECK-NEXT: detail/helpers.hpp +// CHECK-NEXT: detail/export.hpp // CHECK-NEXT: memory_enums.hpp // CHECK-NEXT: __spirv/spirv_vars.hpp // CHECK-NEXT: multi_ptr.hpp // CHECK-NEXT: aliases.hpp +// CHECK-NEXT: detail/address_space_cast.hpp // CHECK-NEXT: detail/type_traits.hpp // CHECK-NEXT: detail/type_traits/vec_marray_traits.hpp // CHECK-NEXT: half_type.hpp From ffe05b7e7d29324474ae45d4dc47cdf9a31851a8 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Fri, 6 Jun 2025 10:45:57 +0200 Subject: [PATCH 2/6] Fix tests --- sycl/include/sycl/__spirv/spirv_ops.hpp | 5 +++++ sycl/include/sycl/detail/spirv.hpp | 7 +++++++ sycl/include/sycl/ext/oneapi/experimental/builtins.hpp | 1 + sycl/include/sycl/sub_group.hpp | 1 + 4 files changed, 14 insertions(+) diff --git a/sycl/include/sycl/__spirv/spirv_ops.hpp b/sycl/include/sycl/__spirv/spirv_ops.hpp index 0a4fbf400c74..7672a76e193b 100644 --- a/sycl/include/sycl/__spirv/spirv_ops.hpp +++ b/sycl/include/sycl/__spirv/spirv_ops.hpp @@ -234,6 +234,11 @@ extern __DPCPP_SYCL_EXTERNAL RetT __spirv_ConvertHandleToSampledImageINTEL(HandleT); // Atomic SPIR-V builtins +// TODO: drop these forward-declarations. +// As of right know, compiler does not forward-declare long long overloads for +// these and as such we can't drop anything from here. But ideally, we should +// rely on the compiler to generate those - that would allow to drop +// spirv_ops.hpp include from more files. #define __SPIRV_ATOMIC_LOAD(AS, Type) \ extern __DPCPP_SYCL_EXTERNAL Type __spirv_AtomicLoad(AS Type *P, int S, \ int O) noexcept; diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 541668ed374d..cbdc1e7e0ffc 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -10,6 +10,13 @@ #ifdef __SYCL_DEVICE_ONLY__ +// Some __spirv_* inrinsics are automatically forward-declared by the compiler, +// but not all of them. For example: +// __spirv_AtomicStore(unsigned long long*, ...) +// Therefore, we need the following include to get forward-declarations of those +// versions. +#include + #include // for IdToMaskPosition #if defined(__NVPTX__) diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp index e6fdba4cdad4..52d21c1a2883 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp @@ -8,6 +8,7 @@ #pragma once +#include #include // for half #include // for to_vec2 #include // for __SYCL_ALWAYS_INLINE diff --git a/sycl/include/sycl/sub_group.hpp b/sycl/include/sycl/sub_group.hpp index d7a29d70149e..005b7746539f 100644 --- a/sycl/include/sycl/sub_group.hpp +++ b/sycl/include/sycl/sub_group.hpp @@ -8,6 +8,7 @@ #pragma once +#include #include // for address_space, decorated #include #include // for __SYCL_DEPRECATED From 27dacd00372588efcfc826d84255ad7a2c87a926 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Wed, 11 Jun 2025 16:26:49 +0200 Subject: [PATCH 3/6] Fix remaining test --- sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp index e6544010ad17..4a12191f7440 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp @@ -8,6 +8,7 @@ #pragma once +#include #include // for sycl::bit_cast #include // for ceil, cos, exp, exp10, exp2 #include // sycl::detail::memcpy From 52202878f8afb982265dc3158b436950b9377862 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Wed, 11 Jun 2025 18:02:04 +0200 Subject: [PATCH 4/6] Clang-format + fix build with old gcc --- sycl/include/sycl/device_event.hpp | 3 ++- sycl/include/sycl/group.hpp | 15 ++++++++------- sycl/include/sycl/multi_ptr.hpp | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sycl/include/sycl/device_event.hpp b/sycl/include/sycl/device_event.hpp index ea54172f328d..cf14d265bf71 100644 --- a/sycl/include/sycl/device_event.hpp +++ b/sycl/include/sycl/device_event.hpp @@ -20,7 +20,7 @@ inline namespace _V1 { /// \ingroup sycl_api class device_event { private: - [[maybe_unused]] __ocl_event_t m_Event; + __ocl_event_t m_Event; public: device_event(const device_event &rhs) = default; @@ -31,6 +31,7 @@ class device_event { device_event(__ocl_event_t Event) : m_Event(Event) {} void wait() { + (void)m_Event; #ifdef __SYCL_DEVICE_ONLY__ __spirv_GroupWaitEvents(__spv::Scope::Workgroup, 1, &m_Event); #endif diff --git a/sycl/include/sycl/group.hpp b/sycl/include/sycl/group.hpp index e989782176e7..9402d2fb83d0 100644 --- a/sycl/include/sycl/group.hpp +++ b/sycl/include/sycl/group.hpp @@ -292,8 +292,7 @@ template class __SYCL_TYPE(group) group { access::fence_space> accessSpace = access::fence_space::global_and_local) const { #ifdef __SYCL_DEVICE_ONLY__ - uint32_t flags = - detail::getSPIRVMemorySemanticsMask(accessSpace); + uint32_t flags = detail::getSPIRVMemorySemanticsMask(accessSpace); // TODO: currently, there is no good way in SPIR-V to set the memory // barrier only for load operations or only for store operations. // The full read-and-write barrier is used and the template parameter @@ -339,11 +338,13 @@ template class __SYCL_TYPE(group) group { /// Permitted types for dataT are all scalar and vector types, except boolean. template __SYCL2020_DEPRECATED("Use decorated multi_ptr arguments instead") - std::enable_if_t::value, - device_event> async_work_group_copy([[maybe_unused]] global_ptr dest, - [[maybe_unused]] local_ptr src, - [[maybe_unused]] size_t numElements, - [[maybe_unused]] size_t destStride) + std::enable_if_t< + !detail::is_bool::value, + device_event> async_work_group_copy([[maybe_unused]] global_ptr + dest, + [[maybe_unused]] local_ptr src, + [[maybe_unused]] size_t numElements, + [[maybe_unused]] size_t destStride) const { #ifdef __SYCL_DEVICE_ONLY__ __ocl_event_t E = __spirv_GroupAsyncCopy( diff --git a/sycl/include/sycl/multi_ptr.hpp b/sycl/include/sycl/multi_ptr.hpp index 90261452f483..824a1470b2c9 100644 --- a/sycl/include/sycl/multi_ptr.hpp +++ b/sycl/include/sycl/multi_ptr.hpp @@ -8,8 +8,8 @@ #pragma once -#include // for address_space, decorated -#include // for half +#include // for address_space, decorated +#include // for half #include #include // for __SYCL2020_DEPRECATED #include // for const_if_const_AS From 11e4b38dfaa47b2aa3d5d136f55c9abfdb71a704 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Thu, 12 Jun 2025 18:26:37 +0200 Subject: [PATCH 5/6] Clang-format --- sycl/include/sycl/detail/address_space_cast.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/detail/address_space_cast.hpp b/sycl/include/sycl/detail/address_space_cast.hpp index 10d8ee4c1fc1..3d878631900f 100644 --- a/sycl/include/sycl/detail/address_space_cast.hpp +++ b/sycl/include/sycl/detail/address_space_cast.hpp @@ -8,8 +8,8 @@ #pragma once -#include #include +#include #include From 642852ae0f6aca85a0f9bb6e564d8c25c734472c Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Mon, 16 Jun 2025 11:15:32 +0200 Subject: [PATCH 6/6] Update sycl/include/sycl/__spirv/spirv_ops.hpp Co-authored-by: Udit Kumar Agarwal --- sycl/include/sycl/__spirv/spirv_ops.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/__spirv/spirv_ops.hpp b/sycl/include/sycl/__spirv/spirv_ops.hpp index 7672a76e193b..b001bc914b19 100644 --- a/sycl/include/sycl/__spirv/spirv_ops.hpp +++ b/sycl/include/sycl/__spirv/spirv_ops.hpp @@ -235,7 +235,7 @@ extern __DPCPP_SYCL_EXTERNAL // Atomic SPIR-V builtins // TODO: drop these forward-declarations. -// As of right know, compiler does not forward-declare long long overloads for +// As of now, compiler does not forward-declare long long overloads for // these and as such we can't drop anything from here. But ideally, we should // rely on the compiler to generate those - that would allow to drop // spirv_ops.hpp include from more files.