From da707813513b3fa2e3b4da843c5cbfa474bc8deb Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Tue, 4 Apr 2023 09:57:35 +0100 Subject: [PATCH 01/45] [SYCL][CUDA] Export loader interface for CUDA UR adapter [UR] add ur_adapter_cuda target [UR] add license --- sycl/plugins/cuda/CMakeLists.txt | 11 + sycl/plugins/unified_runtime/CMakeLists.txt | 15 + .../ur/adapters/cuda/ur_interface_loader.cpp | 257 ++++++++++++++++++ 3 files changed, 283 insertions(+) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index d3e742267af34..7df7f549c9981 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -48,6 +48,15 @@ endif() add_sycl_plugin(cuda SOURCES + # Some code is shared with the UR adapter + "../unified_runtime/pi2ur.hpp" + "../unified_runtime/pi2ur.cpp" + "../unified_runtime/ur/ur.hpp" + "../unified_runtime/ur/ur.cpp" + "../unified_runtime/ur/usm_allocator.cpp" + "../unified_runtime/ur/usm_allocator.hpp" + "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" + # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" "pi_cuda.hpp" @@ -57,9 +66,11 @@ add_sycl_plugin(cuda INCLUDE_DIRS ${sycl_inc_dir} ${XPTI_INCLUDE} + ${CMAKE_CURRENT_SOURCE_DIR}/../unified_runtime LIBRARIES cudadrv ${XPTI_LIBS} + UnifiedRuntime-Headers HEADER "${CMAKE_CURRENT_SOURCE_DIR}/include/features.hpp" ) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index e829d012e55b4..6ed2b57fcd4ce 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -123,6 +123,21 @@ set_target_properties("ur_adapter_level_zero" PROPERTIES SOVERSION "0" ) +# Build CUDA adapter +add_sycl_library("ur_adapter_cuda" SHARED + SOURCES + "ur/ur.hpp" + "ur/ur.cpp" + "ur/usm_allocator.cpp" + "ur/usm_allocator.hpp" + "ur/adapters/cuda/ur_interface_loader.cpp" + # --- + INCLUDE_DIRS + ${sycl_inc_dir} + LIBRARIES + UnifiedRuntime-Headers + Threads::Threads +) if (TARGET UnifiedRuntimeLoader) set_target_properties(hello_world PROPERTIES EXCLUDE_FROM_ALL 1 EXCLUDE_FROM_DEFAULT_BUILD 1) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp new file mode 100644 index 0000000000000..9446515bd435e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -0,0 +1,257 @@ +//===--------- ur_interface_loader.cpp - Unified Runtime ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +namespace { + +// TODO - this is a duplicate of what is in the L0 plugin +// We should move this to somewhere common +ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + // Pre 1.0 we enforce loader and adapter must have same version. + // Post 1.0 only major version match should be required. 
+ if (version != UR_API_VERSION_CURRENT) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + return UR_RESULT_SUCCESS; +} +} // namespace + +#if defined(__cplusplus) +extern "C" { +#endif + +UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( + ur_api_version_t version, ur_platform_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGet = nullptr; + pDdiTable->pfnGetApiVersion = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( + ur_api_version_t version, ur_context_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSetExtendedDeleter = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( + ur_api_version_t version, ur_event_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetProfilingInfo = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSetCallback = nullptr; + pDdiTable->pfnWait = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( + ur_api_version_t version, ur_program_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBuild = nullptr; + pDdiTable->pfnCompile = nullptr; + pDdiTable->pfnCreateWithBinary = nullptr; + pDdiTable->pfnCreateWithIL = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetBuildInfo = nullptr; + pDdiTable->pfnGetFunctionPointer = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnLink = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSetSpecializationConstants = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( + ur_api_version_t version, ur_kernel_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetGroupInfo = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetSubGroupInfo = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSetArgLocal = nullptr; + pDdiTable->pfnSetArgMemObj = nullptr; + pDdiTable->pfnSetArgPointer = nullptr; + pDdiTable->pfnSetArgSampler = nullptr; + pDdiTable->pfnSetArgValue = nullptr; + pDdiTable->pfnSetExecInfo = nullptr; + pDdiTable->pfnSetSpecializationConstants = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( + 
ur_api_version_t version, ur_sampler_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBufferCreate = nullptr; + pDdiTable->pfnBufferPartition = nullptr; + pDdiTable->pfnBufferCreateWithNativeHandle = nullptr; + pDdiTable->pfnImageCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnImageCreate = nullptr; + pDdiTable->pfnImageGetInfo = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( + ur_api_version_t version, ur_enqueue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnDeviceGlobalVariableRead = nullptr; + pDdiTable->pfnDeviceGlobalVariableWrite = nullptr; + pDdiTable->pfnEventsWait = nullptr; + pDdiTable->pfnEventsWaitWithBarrier = nullptr; + pDdiTable->pfnKernelLaunch = nullptr; + pDdiTable->pfnMemBufferCopy = nullptr; + pDdiTable->pfnMemBufferCopyRect = nullptr; + pDdiTable->pfnMemBufferFill = nullptr; + pDdiTable->pfnMemBufferMap = nullptr; + pDdiTable->pfnMemBufferRead = nullptr; + pDdiTable->pfnMemBufferReadRect = nullptr; + pDdiTable->pfnMemBufferWrite = nullptr; + pDdiTable->pfnMemBufferWriteRect = nullptr; + pDdiTable->pfnMemImageCopy = nullptr; + pDdiTable->pfnMemImageRead = nullptr; + pDdiTable->pfnMemImageWrite = nullptr; + pDdiTable->pfnMemUnmap = nullptr; + pDdiTable->pfnUSMFill2D = nullptr; + pDdiTable->pfnUSMFill = nullptr; + pDdiTable->pfnUSMAdvise = nullptr; + pDdiTable->pfnUSMMemcpy2D = nullptr; + pDdiTable->pfnUSMMemcpy = nullptr; + pDdiTable->pfnUSMPrefetch = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( + ur_api_version_t version, ur_global_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnGetLastResult = nullptr; + pDdiTable->pfnInit = nullptr; + pDdiTable->pfnTearDown = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( + ur_api_version_t version, ur_queue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnFinish = nullptr; + pDdiTable->pfnFlush = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + 
pDdiTable->pfnDeviceAlloc = nullptr; + pDdiTable->pfnFree = nullptr; + pDdiTable->pfnGetMemAllocInfo = nullptr; + pDdiTable->pfnHostAlloc = nullptr; + pDdiTable->pfnPoolCreate = nullptr; + pDdiTable->pfnPoolDestroy = nullptr; + pDdiTable->pfnPoolDestroy = nullptr; + pDdiTable->pfnSharedAlloc = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( + ur_api_version_t version, ur_device_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGet = nullptr; + pDdiTable->pfnGetGlobalTimestamps = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnPartition = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSelectBinary = nullptr; + return UR_RESULT_SUCCESS; +} + +#if defined(__cplusplus) +} // extern "C" +#endif From 46dca606721ee9fdd22321e17a90d77bdfa497c5 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 6 Apr 2023 12:36:32 +0100 Subject: [PATCH 02/45] [SYCL][PI][UR][CUDA] Port CUDA platform, device, context to Unified Runtime --- sycl/plugins/cuda/CMakeLists.txt | 8 + sycl/plugins/cuda/pi_cuda.cpp | 1491 +---------------- sycl/plugins/cuda/pi_cuda.hpp | 135 +- sycl/plugins/unified_runtime/CMakeLists.txt | 11 +- sycl/plugins/unified_runtime/pi2ur.hpp | 4 + .../ur/adapters/cuda/common.cpp | 87 + .../ur/adapters/cuda/common.hpp | 51 + .../ur/adapters/cuda/context.cpp | 151 ++ .../ur/adapters/cuda/context.hpp | 108 ++ .../ur/adapters/cuda/device.cpp | 1119 +++++++++++++ .../ur/adapters/cuda/device.hpp | 59 + .../ur/adapters/cuda/platform.cpp | 174 ++ .../ur/adapters/cuda/platform.hpp | 15 + .../ur/adapters/cuda/ur_interface_loader.cpp | 37 +- sycl/plugins/unified_runtime/ur/ur.hpp | 15 +- sycl/unittests/pi/cuda/CMakeLists.txt | 2 + 16 files changed, 1876 insertions(+), 1591 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 7df7f549c9981..e4fa949eca8e9 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -55,6 +55,14 @@ add_sycl_plugin(cuda "../unified_runtime/ur/ur.cpp" "../unified_runtime/ur/usm_allocator.cpp" "../unified_runtime/ur/usm_allocator.hpp" + "../unified_runtime/ur/adapters/cuda/common.cpp" + "../unified_runtime/ur/adapters/cuda/common.hpp" + "../unified_runtime/ur/adapters/cuda/context.cpp" + "../unified_runtime/ur/adapters/cuda/context.hpp" + "../unified_runtime/ur/adapters/cuda/device.cpp" + "../unified_runtime/ur/adapters/cuda/device.hpp" + "../unified_runtime/ur/adapters/cuda/platform.cpp" + "../unified_runtime/ur/adapters/cuda/platform.hpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" diff --git a/sycl/plugins/cuda/pi_cuda.cpp 
b/sycl/plugins/cuda/pi_cuda.cpp index dd68c196e94c1..b1183b662b137 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -33,16 +33,6 @@ void enableCUDATracing(); void disableCUDATracing(); namespace { -std::string getCudaVersionString() { - int driver_version = 0; - cuDriverGetVersion(&driver_version); - // The version is returned as (1000 major + 10 minor). - std::stringstream stream; - stream << "CUDA " << driver_version / 1000 << "." - << driver_version % 1000 / 10; - return stream.str(); -} - pi_result map_error(CUresult result) { switch (result) { case CUDA_SUCCESS: @@ -185,55 +175,13 @@ pi_result check_error(CUresult result, const char *function, int line, /// \cond NODOXY #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__) -/// ScopedContext is used across all PI CUDA plugin implementation to ensure -/// that the proper CUDA context is active for the given PI context. -// -/// This class will only replace the context if necessary, and will leave the -/// new context active on the current thread. If there was an active context -/// already it will simply be replaced. -// -/// Previously active contexts are not restored for two reasons: -/// * Performance: context switches are expensive so leaving the context active -/// means subsequent SYCL calls with the same context will be cheaper. -/// * Multi-threading cleanup: contexts are set active per thread and deleting a -/// context will only deactivate it for the current thread. This means other -/// threads may end up with deleted active contexts. In particular this can -/// happen with host_tasks as they run in a thread pool. When the context -/// associated with these tasks is deleted it will remain active in the -/// threads of the thread pool. So it would be invalid for any other task -/// running on these threads to try to restore the deleted context. With the -/// current implementation this is not an issue because the active deleted -/// context will just be replaced. -// -/// This approach does mean that CUDA interop tasks should NOT expect their -/// contexts to be restored by SYCL. 
-class ScopedContext { -public: - ScopedContext(pi_context ctxt) { - if (!ctxt) { - throw PI_ERROR_INVALID_CONTEXT; - } - - set_context(ctxt->get()); +ScopedContext::ScopedContext(pi_context ctxt) { + if (!ctxt) { + throw PI_ERROR_INVALID_CONTEXT; } - ScopedContext(CUcontext ctxt) { set_context(ctxt); } - - ~ScopedContext() {} - -private: - void set_context(CUcontext desired) { - CUcontext original = nullptr; - - PI_CHECK_ERROR(cuCtxGetCurrent(&original)); - - // Make sure the desired context is active on the current thread, setting - // it if necessary - if (original != desired) { - PI_CHECK_ERROR(cuCtxSetCurrent(desired)); - } - } -}; + set_context(ctxt->get()); +} /// \cond NODOXY template @@ -648,7 +596,7 @@ _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue, if (queue_ != nullptr) { cuda_piQueueRetain(queue_); } - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); } _pi_event::_pi_event(pi_context context, CUevent eventNative) @@ -657,14 +605,14 @@ _pi_event::_pi_event(pi_context context, CUevent eventNative) streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, context_{context} { - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); } _pi_event::~_pi_event() { if (queue_ != nullptr) { cuda_piQueueRelease(queue_); } - cuda_piContextRelease(context_); + pi2ur::piContextRelease(context_); } pi_result _pi_event::start() { @@ -702,14 +650,6 @@ bool _pi_event::is_completed() const noexcept { return true; } -pi_uint64 _pi_device::get_elapsed_time(CUevent ev) const { - float miliSeconds = 0.0f; - - PI_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, evBase_, ev)); - - return static_cast(miliSeconds * 1.0e6); -} - pi_uint64 _pi_event::get_queued_time() const { assert(is_started()); return queue_->get_device()->get_elapsed_time(evQueued_); @@ -797,10 +737,10 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { _pi_program::_pi_program(pi_context ctxt) : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); } -_pi_program::~_pi_program() { cuda_piContextRelease(context_); } +_pi_program::~_pi_program() { pi2ur::piContextRelease(context_); } std::pair splitMetadataName(const std::string &metadataName) { @@ -917,201 +857,6 @@ std::string getKernelNames(pi_program) { //-- PI API implementation extern "C" { - -pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret); - -/// Obtains the CUDA platform. -/// There is only one CUDA platform, and contains all devices on the system. -/// Triggers the CUDA Driver initialization (cuInit) the first time, so this -/// must be the first PI API called. -/// -/// However because multiple devices in a context is not currently supported, -/// place each device in a separate platform. 
-/// -pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, - pi_uint32 *num_platforms) { - - try { - static std::once_flag initFlag; - static pi_uint32 numPlatforms = 1; - static std::vector<_pi_platform> platformIds; - - if (num_entries == 0 && platforms != nullptr) { - return PI_ERROR_INVALID_VALUE; - } - if (platforms == nullptr && num_platforms == nullptr) { - return PI_ERROR_INVALID_VALUE; - } - - pi_result err = PI_SUCCESS; - - std::call_once( - initFlag, - [](pi_result &err) { - if (cuInit(0) != CUDA_SUCCESS) { - numPlatforms = 0; - return; - } - int numDevices = 0; - err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices)); - if (numDevices == 0) { - numPlatforms = 0; - return; - } - try { - // make one platform per device - numPlatforms = numDevices; - platformIds.resize(numDevices); - - for (int i = 0; i < numDevices; ++i) { - CUdevice device; - err = PI_CHECK_ERROR(cuDeviceGet(&device, i)); - CUcontext context; - err = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device)); - - ScopedContext active(context); - CUevent evBase; - err = PI_CHECK_ERROR(cuEventCreate(&evBase, CU_EVENT_DEFAULT)); - - // Use default stream to record base event counter - err = PI_CHECK_ERROR(cuEventRecord(evBase, 0)); - - platformIds[i].devices_.emplace_back( - new _pi_device{device, context, evBase, &platformIds[i]}); - - { - const auto &dev = platformIds[i].devices_.back().get(); - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - pi_result retError = cuda_piDeviceGetInfo( - dev, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, - sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); - assert(retError == PI_SUCCESS); - (void)retError; - - retError = cuda_piDeviceGetInfo( - dev, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE, - sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); - assert(retError == PI_SUCCESS); - - dev->save_max_work_item_sizes(sizeof(maxThreadsPerBlock), - maxThreadsPerBlock); - dev->save_max_work_group_size(maxWorkGroupSize); - } - } - } catch (const std::bad_alloc &) { - // Signal out-of-memory situation - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); - err = PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - // Clear and rethrow to allow retry - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); - throw; - } - }, - err); - - if (num_platforms != nullptr) { - *num_platforms = numPlatforms; - } - - if (platforms != nullptr) { - for (unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) { - platforms[i] = &platformIds[i]; - } - } - - return err; - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -pi_result cuda_piPlatformGetInfo([[maybe_unused]] pi_platform platform, - pi_platform_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(platform != nullptr); - - switch (param_name) { - case PI_PLATFORM_INFO_NAME: - return getInfo(param_value_size, param_value, param_value_size_ret, - "NVIDIA CUDA BACKEND"); - case PI_PLATFORM_INFO_VENDOR: - return getInfo(param_value_size, param_value, param_value_size_ret, - "NVIDIA Corporation"); - case PI_PLATFORM_INFO_PROFILE: - return getInfo(param_value_size, param_value, param_value_size_ret, - "FULL PROFILE"); - case PI_PLATFORM_INFO_VERSION: { - auto version = getCudaVersionString(); - return getInfo(param_value_size, param_value, param_value_size_ret, - version.c_str()); - } - case PI_PLATFORM_INFO_EXTENSIONS: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_EXT_PLATFORM_INFO_BACKEND: { - return getInfo(param_value_size, param_value, - param_value_size_ret, - PI_EXT_PLATFORM_BACKEND_CUDA); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Platform info request not implemented"); - return {}; -} - -/// \param devices List of devices available on the system -/// \param num_devices Number of elements in the list of devices -/// Requesting a non-GPU device triggers an error, all PI CUDA devices -/// are GPUs. -/// -pi_result cuda_piDevicesGet(pi_platform platform, pi_device_type device_type, - pi_uint32 num_entries, pi_device *devices, - pi_uint32 *num_devices) { - - pi_result err = PI_SUCCESS; - const bool askingForDefault = device_type == PI_DEVICE_TYPE_DEFAULT; - const bool askingForGPU = device_type & PI_DEVICE_TYPE_GPU; - const bool returnDevices = askingForDefault || askingForGPU; - - size_t numDevices = returnDevices ? platform->devices_.size() : 0; - - try { - if (num_devices) { - *num_devices = numDevices; - } - - if (returnDevices && devices) { - for (size_t i = 0; i < std::min(size_t(num_entries), numDevices); ++i) { - devices[i] = platform->devices_[i].get(); - } - } - - return err; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -/// \return PI_SUCCESS if the function is executed successfully -/// CUDA devices are always root devices so retain always returns success. -pi_result cuda_piDeviceRetain(pi_device) { return PI_SUCCESS; } - pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { @@ -1150,27 +895,6 @@ pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, return PI_ERROR_OUT_OF_RESOURCES; } -pi_result cuda_piContextRetain(pi_context context) { - assert(context != nullptr); - assert(context->get_reference_count() > 0); - - context->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result cuda_piextContextSetExtendedDeleter( - pi_context context, pi_context_extended_deleter function, void *user_data) { - context->set_extended_deleter(function, user_data); - return PI_SUCCESS; -} - -/// Not applicable to CUDA, devices cannot be partitioned. 
-pi_result cuda_piDevicePartition(pi_device, - const pi_device_partition_property *, - pi_uint32, pi_device *, pi_uint32 *) { - return {}; -} - /// \return If available, the first binary that is PTX /// pi_result cuda_piextDeviceSelectBinary(pi_device device, @@ -1224,1155 +948,6 @@ pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -/// \return PI_SUCCESS always since CUDA devices are always root devices. -/// -pi_result cuda_piDeviceRelease(pi_device) { return PI_SUCCESS; } - -pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - static constexpr pi_uint32 max_work_item_dimensions = 3u; - - assert(device != nullptr); - - ScopedContext active(device->get_context()); - - switch (param_name) { - case PI_DEVICE_INFO_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_TYPE_GPU); - } - case PI_DEVICE_INFO_VENDOR_ID: { - return getInfo(param_value_size, param_value, param_value_size_ret, 4318u); - } - case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int compute_units = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&compute_units, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(compute_units >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint32(compute_units)); - } - case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - max_work_item_dimensions); - } - case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { - size_t return_sizes[max_work_item_dimensions]; - - int max_x = 0, max_y = 0, max_z = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_x >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_y >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_z >= 0); - - return_sizes[0] = size_t(max_x); - return_sizes[1] = size_t(max_y); - return_sizes[2] = size_t(max_z); - return getInfoArray(max_work_item_dimensions, param_value_size, param_value, - param_value_size_ret, return_sizes); - } - - case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: { - size_t return_sizes[max_work_item_dimensions]; - int max_x = 0, max_y = 0, max_z = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_x >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_y >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_z >= 0); - - return_sizes[0] = size_t(max_x); - return_sizes[1] = size_t(max_y); - return_sizes[2] = size_t(max_z); - return getInfoArray(max_work_item_dimensions, param_value_size, param_value, - param_value_size_ret, return_sizes); - } - - case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { - int max_work_group_size = 0; - sycl::detail::pi::assertion( - 
cuDeviceGetAttribute(&max_work_group_size, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - - sycl::detail::pi::assertion(max_work_group_size >= 0); - - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_work_group_size)); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_threads, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(maxWarps)); - } - case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { - // Volta provides independent thread scheduling - // TODO: Revisit for previous generation GPUs - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - bool ifp = (major >= 7); - return getInfo(param_value_size, param_value, param_value_size_ret, ifp); - } - - case PI_DEVICE_INFO_ATOMIC_64: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - - bool atomic64 = (major >= 6) ? 
true : false; - return getInfo(param_value_size, param_value, param_value_size_ret, - atomic64); - } - case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - pi_memory_order_capabilities capabilities = - PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE | - PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - pi_memory_order_capabilities capabilities = - (major >= 7) ? PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP | - PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE | - PI_MEMORY_SCOPE_SYSTEM - : PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP | - PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { - // SYCL2020 4.6.4.2 minimum mandated capabilities for - // atomic_fence_order_capabilities. - pi_memory_order_capabilities capabilities = - PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE | - PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // SYCL2020 4.6.4.2 minimum mandated capabilities for - // atomic_fence/memory_scope_capabilities. - // Because scopes are hierarchical, wider scopes support all narrower - // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and - // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) - pi_memory_scope_capabilities capabilities = PI_MEMORY_SCOPE_WORK_ITEM | - PI_MEMORY_SCOPE_SUB_GROUP | - PI_MEMORY_SCOPE_WORK_GROUP; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - - bool bfloat16 = (major >= 8) ? true : false; - return getInfo(param_value_size, param_value, param_value_size_ret, - bfloat16); - } - case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - // NVIDIA devices only support one sub-group size (the warp size) - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - size_t sizes[1] = {static_cast(warpSize)}; - return getInfoArray(1, param_value_size, param_value, - param_value_size_ret, sizes); - } - case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { - int clock_freq = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&clock_freq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(clock_freq >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint32(clock_freq) / 1000u); - } - case PI_DEVICE_INFO_ADDRESS_BITS: { - auto bits = pi_uint32{std::numeric_limits::digits}; - return getInfo(param_value_size, param_value, param_value_size_ret, bits); - } - case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { - // Max size of memory object allocation in bytes. 
- // The minimum value is max(min(1024 × 1024 × - // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), - // 32 × 1024 × 1024) for devices that are not of type - // CL_DEVICE_TYPE_CUSTOM. - - size_t global = 0; - sycl::detail::pi::assertion(cuDeviceTotalMem(&global, device->get()) == - CUDA_SUCCESS); - - auto quarter_global = static_cast(global / 4u); - - auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global), - 32u * 1024u * 1024u); - - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64{max_alloc}); - } - case PI_DEVICE_INFO_IMAGE_SUPPORT: { - pi_bool enabled = PI_FALSE; - - if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { - enabled = PI_TRUE; - } else { - sycl::detail::pi::cuPrint( - "Images are not fully supported by the CUDA BE, their support is " - "disabled by default. Their partial support can be activated by " - "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " - "runtime."); - } - - return getInfo(param_value_size, param_value, param_value_size_ret, - enabled); - } - case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { - // This call doesn't match to CUDA as it doesn't have images, but instead - // surfaces and textures. No clear call in the CUDA API to determine this, - // but some searching found as of SM 2.x 128 are supported. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: { - // This call doesn't match to CUDA as it doesn't have images, but instead - // surfaces and textures. No clear call in the CUDA API to determine this, - // but some searching found as of SM 2.x 128 are supported. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int tex_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_height >= 0); - int surf_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_height >= 0); - - int min = std::min(tex_height, surf_height); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. 
- int tex_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_height >= 0); - int surf_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_height >= 0); - - int min = std::min(tex_height, surf_height); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { - // Take the smaller of maximum surface and maximum texture depth. - int tex_depth = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_depth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_depth >= 0); - int surf_depth = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_depth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_depth >= 0); - - int min = std::min(tex_depth, surf_depth); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { - // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(0)); - } - case PI_DEVICE_INFO_MAX_SAMPLERS: { - // This call is kind of meaningless for cuda, as samplers don't exist. - // Closest thing is textures, which is 128. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_MAX_PARAMETER_SIZE: { - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters - // __global__ function parameters are passed to the device via constant - // memory and are limited to 4 KB. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{4000u}); - } - case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { - int mem_base_addr_align = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&mem_base_addr_align, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - device->get()) == CUDA_SUCCESS); - // Multiply by 8 as clGetDeviceInfo returns this value in bits - mem_base_addr_align *= 8; - return getInfo(param_value_size, param_value, param_value_size_ret, - mem_base_addr_align); - } - case PI_DEVICE_INFO_HALF_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_SINGLE_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | - PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA | - PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; - return getInfo(param_value_size, param_value, param_value_size_ret, config); - } - case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | - PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA; - return getInfo(param_value_size, param_value, param_value_size_ret, config); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { - // TODO: is this config consistent across all NVIDIA GPUs? - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: { - // The value is documented for all existing GPUs in the CUDA programming - // guidelines, section "H.3.2. Global Memory". - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { - int cache_size = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&cache_size, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(cache_size >= 0); - // The L2 cache is global to the GPU. - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(cache_size)); - } - case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: { - size_t bytes = 0; - // Runtime API has easy access to this value, driver API info is scarse. - sycl::detail::pi::assertion(cuDeviceTotalMem(&bytes, device->get()) == - CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64{bytes}); - } - case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { - int constant_memory = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&constant_memory, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(constant_memory >= 0); - - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(constant_memory)); - } - case PI_DEVICE_INFO_MAX_CONSTANT_ARGS: { - // TODO: is there a way to retrieve this from CUDA driver API? - // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX - // 1060 3GB - return getInfo(param_value_size, param_value, param_value_size_ret, 9u); - } - case PI_DEVICE_INFO_LOCAL_MEM_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_LOCAL_MEM_TYPE_LOCAL); - } - case PI_DEVICE_INFO_LOCAL_MEM_SIZE: { - // OpenCL's "local memory" maps most closely to CUDA's "shared memory". 
- // CUDA has its own definition of "local memory", which maps to OpenCL's - // "private memory". - int local_mem_size = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&local_mem_size, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(local_mem_size >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(local_mem_size)); - } - case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { - int ecc_enabled = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&ecc_enabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, - device->get()) == CUDA_SUCCESS); - - sycl::detail::pi::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); - auto result = static_cast(ecc_enabled); - return getInfo(param_value_size, param_value, param_value_size_ret, result); - } - case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: { - int is_integrated = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&is_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, - device->get()) == CUDA_SUCCESS); - - sycl::detail::pi::assertion((is_integrated == 0) | (is_integrated == 1)); - auto result = static_cast(is_integrated); - return getInfo(param_value_size, param_value, param_value_size_ret, result); - } - case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { - // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX - // 1060 3GB - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{1000u}); - } - case PI_DEVICE_INFO_ENDIAN_LITTLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_COMPILER_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_LINKER_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_EXECUTION_CAPABILITIES: { - auto capability = PI_DEVICE_EXEC_CAPABILITIES_KERNEL; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { - // The mandated minimum capability: - auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE | - PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { - // The mandated minimum capability: - auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_BUILT_IN_KERNELS: { - // An empty string is returned if no built-in kernels are supported by the - // device. 
- return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_DEVICE_INFO_PLATFORM: { - return getInfo(param_value_size, param_value, param_value_size_ret, - device->get_platform()); - } - case PI_DEVICE_INFO_NAME: { - static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; - char name[MAX_DEVICE_NAME_LENGTH]; - sycl::detail::pi::assertion(cuDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, - device->get()) == CUDA_SUCCESS); - return getInfoArray(strlen(name) + 1, param_value_size, param_value, - param_value_size_ret, name); - } - case PI_DEVICE_INFO_VENDOR: { - return getInfo(param_value_size, param_value, param_value_size_ret, - "NVIDIA Corporation"); - } - case PI_DEVICE_INFO_DRIVER_VERSION: { - auto version = getCudaVersionString(); - return getInfo(param_value_size, param_value, param_value_size_ret, - version.c_str()); - } - case PI_DEVICE_INFO_PROFILE: { - return getInfo(param_value_size, param_value, param_value_size_ret, "CUDA"); - } - case PI_DEVICE_INFO_REFERENCE_COUNT: { - return getInfo(param_value_size, param_value, param_value_size_ret, - device->get_reference_count()); - } - case PI_DEVICE_INFO_VERSION: { - std::stringstream s; - int major; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - s << major; - - int minor; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); - s << "." << minor; - return getInfo(param_value_size, param_value, param_value_size_ret, - s.str().c_str()); - } - case PI_DEVICE_INFO_OPENCL_C_VERSION: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_DEVICE_INFO_EXTENSIONS: { - - std::string SupportedExtensions = "cl_khr_fp64 cl_khr_subgroups "; - SupportedExtensions += PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT; - SupportedExtensions += " "; - - int major = 0; - int minor = 0; - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); - - if ((major >= 6) || ((major == 5) && (minor >= 3))) { - SupportedExtensions += "cl_khr_fp16 "; - } - - return getInfo(param_value_size, param_value, param_value_size_ret, - SupportedExtensions.c_str()); - } - case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: { - // The minimum value for the FULL profile is 1 MB. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{1024u}); - } - case PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_PARENT_DEVICE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - nullptr); - } - case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_PARTITION_PROPERTIES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(0u)); - } - case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_PARTITION_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(0u)); - } - - // Intel USM extensions - - case PI_DEVICE_INFO_USM_HOST_SUPPORT: { - // from cl_intel_unified_shared_memory: "The host memory access capabilities - // apply to any host allocation." - // - // query if/how the device can access page-locked host memory, possibly - // through PCIe, using the same pointer as the host - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { - // the device shares a unified address space with the host - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; - } else { - // on GPU architectures with compute capability lower than 6.x, atomic - // operations from the GPU to CPU memory will not be atomic with respect - // to CPU initiated atomic operations - value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; - } - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_DEVICE_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The device memory access capabilities apply to any device allocation - // associated with this device." - // - // query how the device can access memory allocated on the device itself (?) - pi_bitfield value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | - PI_USM_CONCURRENT_ATOMIC_ACCESS; - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The single device shared memory access capabilities apply to any shared - // allocation associated with this device." 
- // - // query if/how the device can access managed memory associated to it - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { - // the device can allocate managed memory on this system - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS; - } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - // the device can coherently access managed memory concurrently with the - // CPU - value |= PI_USM_CONCURRENT_ACCESS; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - value |= PI_USM_CONCURRENT_ATOMIC_ACCESS; - } - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The cross-device shared memory access capabilities apply to any shared - // allocation associated with this device, or to any shared memory - // allocation on another device that also supports the same cross-device - // shared memory access capability." - // - // query if/how the device can access managed memory associated to other - // devices - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { - // the device can allocate managed memory on this system - value |= PI_USM_ACCESS; - } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS - // attribute can coherently access managed memory concurrently with the - // CPU - value |= PI_USM_CONCURRENT_ACCESS; - } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - if (value & PI_USM_ACCESS) - value |= PI_USM_ATOMIC_ACCESS; - if (value & PI_USM_CONCURRENT_ACCESS) - value |= PI_USM_CONCURRENT_ATOMIC_ACCESS; - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The shared system memory access capabilities apply to any allocations - // made by a system allocator, such as malloc or new." 
- // - // query if/how the device can access pageable host memory allocated by the - // system allocator - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) { - // the device suppports coherently accessing pageable memory without - // calling cuMemHostRegister/cudaHostRegister on it - if (getAttribute(device, - CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) { - // the link between the device and the host supports native atomic - // operations - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; - } else { - // the link between the device and the host does not support native - // atomic operations - value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; - } - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_EXT_ONEAPI_DEVICE_INFO_CUDA_ASYNC_BARRIER: { - int value = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8; - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_BACKEND_VERSION: { - int major = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); - int minor = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); - std::string result = std::to_string(major) + "." + std::to_string(minor); - return getInfo(param_value_size, param_value, param_value_size_ret, - result.c_str()); - } - - case PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY: { - size_t FreeMemory = 0; - size_t TotalMemory = 0; - sycl::detail::pi::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == - CUDA_SUCCESS, - "failed cuMemGetInfo() API."); - return getInfo(param_value_size, param_value, param_value_size_ret, - FreeMemory); - } - case PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE: { - int value = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(value >= 0); - // Convert kilohertz to megahertz when returning. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - value / 1000); - } - case PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH: { - int value = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(value >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_EXT_INTEL_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_int32{1}); - } - - case PI_DEVICE_INFO_DEVICE_ID: { - int value = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(value >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - - case PI_DEVICE_INFO_UUID: { - CUuuid uuid; -#if (CUDA_VERSION >= 11040) - sycl::detail::pi::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) == - CUDA_SUCCESS); -#else - sycl::detail::pi::assertion(cuDeviceGetUuid(&uuid, device->get()) == - CUDA_SUCCESS); -#endif - std::array name; - std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); - return getInfoArray(16, param_value_size, param_value, param_value_size_ret, - name.data()); - } - - case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - - int minor = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); - - // Some specific devices seem to need special handling. See reference - // https://github.com/jeffhammond/HPCInfo/blob/master/cuda/gpu-detect.cu - bool is_xavier_agx = major == 7 && minor == 2; - bool is_orin_agx = major == 8 && minor == 7; - - int memory_clock_khz = 0; - if (is_xavier_agx) { - memory_clock_khz = 2133000; - } else if (is_orin_agx) { - memory_clock_khz = 3200000; - } else { - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&memory_clock_khz, - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - } - - int memory_bus_width = 0; - if (is_orin_agx) { - memory_bus_width = 256; - } else { - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&memory_bus_width, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - device->get()) == CUDA_SUCCESS); - } - - uint64_t memory_bandwidth = - uint64_t(memory_clock_khz) * memory_bus_width * 250; - - return getInfo(param_value_size, param_value, param_value_size_ret, - memory_bandwidth); - } - case PI_EXT_INTEL_DEVICE_INFO_MEM_CHANNEL_SUPPORT: { - // The mem-channel buffer property is not supported on CUDA devices. - return getInfo(param_value_size, param_value, param_value_size_ret, - false); - } - case PI_DEVICE_INFO_IMAGE_SRGB: { - // The sRGB images are not supported on CUDA. - return getInfo(param_value_size, param_value, param_value_size_ret, - false); - } - - case PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { - // Maximum number of 32-bit registers available to a thread block. - // Note: This number is shared by all thread blocks simultaneously resident - // on a multiprocessor. 
- int max_registers{-1}; - PI_CHECK_ERROR(cuDeviceGetAttribute( - &max_registers, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - device->get())); - - sycl::detail::pi::assertion(max_registers >= 0); - - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(max_registers)); - } - - case PI_DEVICE_INFO_PCI_ADDRESS: { - constexpr size_t AddressBufferSize = 13; - char AddressBuffer[AddressBufferSize]; - sycl::detail::pi::assertion( - cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == - CUDA_SUCCESS); - // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written - sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) == - 12); - return getInfoArray(strnlen(AddressBuffer, AddressBufferSize - 1) + 1, - param_value_size, param_value, param_value_size_ret, - AddressBuffer); - } - // TODO: Investigate if this information is available on CUDA. - case PI_DEVICE_INFO_GPU_EU_COUNT: - case PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH: - case PI_DEVICE_INFO_GPU_SLICES: - case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: - case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: - case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU: - return PI_ERROR_INVALID_VALUE; - - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Device info request not implemented"); - return {}; -} - -/// Gets the native CUDA handle of a PI device object -/// -/// \param[in] device The PI device to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI device object. -/// -/// \return PI_SUCCESS -pi_result cuda_piextDeviceGetNativeHandle(pi_device device, - pi_native_handle *nativeHandle) { - *nativeHandle = static_cast(device->get()); - return PI_SUCCESS; -} - -/// Created a PI device object from a CUDA device handle. -/// NOTE: The created PI object does not take ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI device object from. -/// \param[in] platform is the PI platform of the device. -/// \param[out] device Set to the PI device object created from native handle. 
-/// -/// \return TBD -pi_result cuda_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_platform platform, - pi_device *piDevice) { - assert(piDevice != nullptr); - - CUdevice cu_device = static_cast(nativeHandle); - - auto is_device = [=](std::unique_ptr<_pi_device> &dev) { - return dev->get() == cu_device; - }; - - // If a platform is provided just check if the device is in it - if (platform) { - auto search_res = std::find_if(begin(platform->devices_), - end(platform->devices_), is_device); - if (search_res != end(platform->devices_)) { - *piDevice = (*search_res).get(); - return PI_SUCCESS; - } - } - - // Get list of platforms - pi_uint32 num_platforms; - pi_result result = cuda_piPlatformsGet(0, nullptr, &num_platforms); - if (result != PI_SUCCESS) - return result; - - pi_platform *plat = - static_cast(malloc(num_platforms * sizeof(pi_platform))); - result = cuda_piPlatformsGet(num_platforms, plat, nullptr); - if (result != PI_SUCCESS) - return result; - - // Iterate through platforms to find device that matches nativeHandle - for (pi_uint32 j = 0; j < num_platforms; ++j) { - auto search_res = std::find_if(begin(plat[j]->devices_), - end(plat[j]->devices_), is_device); - if (search_res != end(plat[j]->devices_)) { - *piDevice = (*search_res).get(); - return PI_SUCCESS; - } - } - - // If the provided nativeHandle cannot be matched to an - // existing device return error - return PI_ERROR_INVALID_OPERATION; -} - -/* Context APIs */ - -/// Create a PI CUDA context. -/// -/// By default creates a scoped context and keeps the last active CUDA context -/// on top of the CUDA context stack. -/// With the __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of -/// PI_TRUE creates a primary CUDA context and activates it on the CUDA context -/// stack. -/// -/// \param[in] properties 0 terminated array of key/id-value combinations. Can -/// be nullptr. Only accepts property key/id -/// __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY with a pi_bool value. -/// \param[in] num_devices Number of devices to create the context for. -/// \param[in] devices Devices to create the context for. -/// \param[in] pfn_notify Callback, currently unused. -/// \param[in] user_data User data for callback. -/// \param[out] retcontext Set to created context on success. -/// -/// \return PI_SUCCESS on success, otherwise an error return code. -pi_result cuda_piContextCreate( - [[maybe_unused]] const pi_context_properties *properties, - [[maybe_unused]] pi_uint32 num_devices, const pi_device *devices, - [[maybe_unused]] void (*pfn_notify)(const char *errinfo, - const void *private_info, size_t cb, - void *user_data), - [[maybe_unused]] void *user_data, pi_context *retcontext) { - - assert(devices != nullptr); - // TODO: How to implement context callback? - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - assert(num_devices == 1); - // Need input context - assert(retcontext != nullptr); - pi_result errcode_ret = PI_SUCCESS; - - std::unique_ptr<_pi_context> piContextPtr{nullptr}; - try { - piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{*devices}); - *retcontext = piContextPtr.release(); - } catch (pi_result err) { - errcode_ret = err; - } catch (...) 
{ - errcode_ret = PI_ERROR_OUT_OF_RESOURCES; - } - return errcode_ret; -} - -pi_result cuda_piContextRelease(pi_context ctxt) { - assert(ctxt != nullptr); - - if (ctxt->decrement_reference_count() > 0) { - return PI_SUCCESS; - } - ctxt->invoke_extended_deleters(); - - std::unique_ptr<_pi_context> context{ctxt}; - - return PI_SUCCESS; -} - -/// Gets the native CUDA handle of a PI context object -/// -/// \param[in] context The PI context to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI context object. -/// -/// \return PI_SUCCESS -pi_result cuda_piextContextGetNativeHandle(pi_context context, - pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(context->get()); - return PI_SUCCESS; -} - -/// Created a PI context object from a CUDA context handle. -/// NOTE: The created PI object does not take ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI context object from. -/// \param[out] context Set to the PI context object created from native handle. -/// -/// \return TBD -pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_uint32 num_devices, - const pi_device *devices, - bool ownNativeHandle, - pi_context *piContext) { - (void)nativeHandle; - (void)num_devices; - (void)devices; - (void)ownNativeHandle; - (void)piContext; - assert(piContext != nullptr); - assert(ownNativeHandle == false); - - return PI_ERROR_INVALID_OPERATION; -} - /// Creates a PI Memory object using a CUDA memory allocation. /// Can trigger a manual copy depending on the mode. /// \TODO Implement USE_HOST_PTR using cuHostRegister @@ -2874,7 +1449,7 @@ pi_result cuda_piextQueueCreateWithNativeHandle( *queue = new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), context, - context->get_device(), + reinterpret_cast(context->get_device()), properties, flags, /*backend_owns*/ false}; @@ -5389,7 +3964,8 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - pi_device device = queue->get_context()->get_device(); + pi_device device = + reinterpret_cast(queue->get_context()->get_device()); // Certain cuda devices and Windows do not have support for some Unified // Memory features. 
cuMemPrefetchAsync requires concurrent memory access @@ -5459,7 +4035,8 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, advice == PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY || advice == PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY || advice == PI_MEM_ADVICE_RESET) { - pi_device device = queue->get_context()->get_device(); + pi_device device = + reinterpret_cast(queue->get_context()->get_device()); if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Mem advise ignored as device does not support " "concurrent managed access", @@ -5727,10 +4304,12 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, // the same index std::vector platforms; platforms.resize(device_idx + 1); - result = cuda_piPlatformsGet(device_idx + 1, platforms.data(), nullptr); + result = pi2ur::piPlatformsGet(device_idx + 1, platforms.data(), nullptr); // get the device from the platform - pi_device device = platforms[device_idx]->devices_[0].get(); + // TODO(ur): Remove cast when this entry point is moved to UR + pi_device device = + reinterpret_cast(platforms[device_idx]->devices_[0].get()); return getInfo(param_value_size, param_value, param_value_size_ret, device); } @@ -5915,28 +4494,28 @@ pi_result piPluginInit(pi_plugin *PluginInit) { (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&cuda_api); // Platform - _PI_CL(piPlatformsGet, cuda_piPlatformsGet) - _PI_CL(piPlatformGetInfo, cuda_piPlatformGetInfo) + _PI_CL(piPlatformsGet, pi2ur::piPlatformsGet) + _PI_CL(piPlatformGetInfo, pi2ur::piPlatformGetInfo) // Device - _PI_CL(piDevicesGet, cuda_piDevicesGet) - _PI_CL(piDeviceGetInfo, cuda_piDeviceGetInfo) - _PI_CL(piDevicePartition, cuda_piDevicePartition) - _PI_CL(piDeviceRetain, cuda_piDeviceRetain) - _PI_CL(piDeviceRelease, cuda_piDeviceRelease) + _PI_CL(piDevicesGet, pi2ur::piDevicesGet) + _PI_CL(piDeviceGetInfo, pi2ur::piDeviceGetInfo) + _PI_CL(piDevicePartition, pi2ur::piDevicePartition) + _PI_CL(piDeviceRetain, pi2ur::piDeviceRetain) + _PI_CL(piDeviceRelease, pi2ur::piDeviceRelease) _PI_CL(piextDeviceSelectBinary, cuda_piextDeviceSelectBinary) _PI_CL(piextGetDeviceFunctionPointer, cuda_piextGetDeviceFunctionPointer) - _PI_CL(piextDeviceGetNativeHandle, cuda_piextDeviceGetNativeHandle) + _PI_CL(piextDeviceGetNativeHandle, pi2ur::piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, - cuda_piextDeviceCreateWithNativeHandle) + pi2ur::piextDeviceCreateWithNativeHandle) // Context - _PI_CL(piextContextSetExtendedDeleter, cuda_piextContextSetExtendedDeleter) - _PI_CL(piContextCreate, cuda_piContextCreate) - _PI_CL(piContextGetInfo, cuda_piContextGetInfo) - _PI_CL(piContextRetain, cuda_piContextRetain) - _PI_CL(piContextRelease, cuda_piContextRelease) - _PI_CL(piextContextGetNativeHandle, cuda_piextContextGetNativeHandle) + _PI_CL(piextContextSetExtendedDeleter, pi2ur::piextContextSetExtendedDeleter) + _PI_CL(piContextCreate, pi2ur::piContextCreate) + _PI_CL(piContextGetInfo, pi2ur::piContextGetInfo) + _PI_CL(piContextRetain, pi2ur::piContextRetain) + _PI_CL(piContextRelease, pi2ur::piContextRelease) + _PI_CL(piextContextGetNativeHandle, pi2ur::piextContextGetNativeHandle) _PI_CL(piextContextCreateWithNativeHandle, - cuda_piextContextCreateWithNativeHandle) + pi2ur::piextContextCreateWithNativeHandle) // Queue _PI_CL(piQueueCreate, cuda_piQueueCreate) _PI_CL(piextQueueCreate, cuda_piextQueueCreate) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index a4864cf673392..f6a95ff8d0ab5 100644 
--- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -42,13 +42,16 @@ #include #include +#include +#include +#include + +// Share code between the PI Plugin and UR Adapter +#include + extern "C" { /// \cond IGNORE_BLOCK_IN_DOXYGEN -pi_result cuda_piContextRetain(pi_context); -pi_result cuda_piContextRelease(pi_context); -pi_result cuda_piDeviceRelease(pi_device); -pi_result cuda_piDeviceRetain(pi_device); pi_result cuda_piProgramRetain(pi_program); pi_result cuda_piProgramRelease(pi_program); pi_result cuda_piQueueRelease(pi_queue); @@ -71,8 +74,8 @@ using _pi_stream_guard = std::unique_lock; /// available devices since initialization is done /// when devices are used. /// -struct _pi_platform { - std::vector> devices_; +struct _pi_platform : ur_platform_handle_t_ { + using ur_platform_handle_t_::ur_platform_handle_t_; }; /// PI device mapping to a CUdevice. @@ -80,53 +83,8 @@ struct _pi_platform { /// and implements the reference counting semantics since /// CUDA objects are not refcounted. /// -struct _pi_device { -private: - using native_type = CUdevice; - - native_type cuDevice_; - CUcontext cuContext_; - CUevent evBase_; // CUDA event used as base counter - std::atomic_uint32_t refCount_; - pi_platform platform_; - - static constexpr pi_uint32 max_work_item_dimensions = 3u; - size_t max_work_item_sizes[max_work_item_dimensions]; - int max_work_group_size; - -public: - _pi_device(native_type cuDevice, CUcontext cuContext, CUevent evBase, - pi_platform platform) - : cuDevice_(cuDevice), cuContext_(cuContext), - evBase_(evBase), refCount_{1}, platform_(platform) {} - - ~_pi_device() { cuDevicePrimaryCtxRelease(cuDevice_); } - - native_type get() const noexcept { return cuDevice_; }; - - CUcontext get_context() const noexcept { return cuContext_; }; - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - pi_platform get_platform() const noexcept { return platform_; }; - - pi_uint64 get_elapsed_time(CUevent) const; - - void save_max_work_item_sizes(size_t size, - size_t *save_max_work_item_sizes) noexcept { - memcpy(max_work_item_sizes, save_max_work_item_sizes, size); - }; - - void save_max_work_group_size(int value) noexcept { - max_work_group_size = value; - }; - - void get_max_work_item_sizes(size_t ret_size, - size_t *ret_max_work_item_sizes) const noexcept { - memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size); - }; - - int get_max_work_group_size() const noexcept { return max_work_group_size; }; +struct _pi_device : ur_device_handle_t_ { + using ur_device_handle_t_::ur_device_handle_t_; }; /// PI context mapping to a CUDA context object. @@ -167,54 +125,8 @@ struct _pi_device { /// called upon destruction of the PI Context. /// See proposal for details. 
/// -struct _pi_context { - - struct deleter_data { - pi_context_extended_deleter function; - void *user_data; - - void operator()() { function(user_data); } - }; - - using native_type = CUcontext; - - native_type cuContext_; - _pi_device *deviceId_; - std::atomic_uint32_t refCount_; - - _pi_context(_pi_device *devId) - : cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} { - cuda_piDeviceRetain(deviceId_); - }; - - ~_pi_context() { cuda_piDeviceRelease(deviceId_); } - - void invoke_extended_deleters() { - std::lock_guard guard(mutex_); - for (auto &deleter : extended_deleters_) { - deleter(); - } - } - - void set_extended_deleter(pi_context_extended_deleter function, - void *user_data) { - std::lock_guard guard(mutex_); - extended_deleters_.emplace_back(deleter_data{function, user_data}); - } - - pi_device get_device() const noexcept { return deviceId_; } - - native_type get() const noexcept { return cuContext_; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - -private: - std::mutex mutex_; - std::vector extended_deleters_; +struct _pi_context : ur_context_handle_t_ { + using ur_context_handle_t_::ur_context_handle_t_; }; /// PI Mem mapping to CUDA memory allocations, both data and texture/surface. @@ -345,7 +257,7 @@ struct _pi_mem { if (is_sub_buffer()) { cuda_piMemRetain(mem_.buffer_mem_.parent_); } else { - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); } }; @@ -359,7 +271,7 @@ struct _pi_mem { mem_.surface_mem_.array_ = array; mem_.surface_mem_.surfObj_ = surf; mem_.surface_mem_.imageType_ = image_type; - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); } ~_pi_mem() { @@ -369,7 +281,7 @@ struct _pi_mem { return; } } - cuda_piContextRelease(context_); + pi2ur::piContextRelease(context_); } // TODO: Move as many shared funcs up as possible @@ -444,13 +356,13 @@ struct _pi_queue { num_compute_streams_{0}, num_transfer_streams_{0}, last_sync_compute_streams_{0}, last_sync_transfer_streams_{0}, flags_(flags), has_ownership_{backend_owns} { - cuda_piContextRetain(context_); - cuda_piDeviceRetain(device_); + pi2ur::piContextRetain(context_); + pi2ur::piDeviceRetain(device_); } ~_pi_queue() { - cuda_piContextRelease(context_); - cuda_piDeviceRelease(device_); + pi2ur::piContextRelease(context_); + pi2ur::piDeviceRelease(device_); } void compute_stream_wait_for_barrier_if_needed(CUstream stream, @@ -917,10 +829,11 @@ struct _pi_kernel { : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, name_{name}, context_{ctxt}, program_{program}, refCount_{1} { cuda_piProgramRetain(program_); - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); /// Note: this code assumes that there is only one device per context pi_result retError = cuda_piKernelGetGroupInfo( - this, ctxt->get_device(), PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + this, reinterpret_cast(ctxt->get_device()), + PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr); (void)retError; assert(retError == PI_SUCCESS); @@ -928,7 +841,7 @@ struct _pi_kernel { ~_pi_kernel() { cuda_piProgramRelease(program_); - cuda_piContextRelease(context_); + pi2ur::piContextRelease(context_); } pi_program get_program() const noexcept { return program_; } diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt 
b/sycl/plugins/unified_runtime/CMakeLists.txt index 6ed2b57fcd4ce..bec6aed6131c8 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -116,6 +116,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED LevelZeroLoader-Headers LevelZeroLoader Threads::Threads + cudadrv ) set_target_properties("ur_adapter_level_zero" PROPERTIES @@ -130,13 +131,21 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/ur.cpp" "ur/usm_allocator.cpp" "ur/usm_allocator.hpp" + "ur/adapters/cuda/common.cpp" + "ur/adapters/cuda/common.hpp" + "ur/adapters/cuda/context.cpp" + "ur/adapters/cuda/context.hpp" + "ur/adapters/cuda/device.cpp" + "ur/adapters/cuda/device.hpp" + "ur/adapters/cuda/platform.cpp" + "ur/adapters/cuda/platform.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" - # --- INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES UnifiedRuntime-Headers Threads::Threads + cudadrv ) if (TARGET UnifiedRuntimeLoader) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 4ba4104ce6c3a..2408fa452351f 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1017,6 +1017,10 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, InfoType = UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION; break; } + case PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { + InfoType = UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP; + break; + } default: return PI_ERROR_UNKNOWN; }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp new file mode 100644 index 0000000000000..264d7588f3229 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -0,0 +1,87 @@ +//===--------- common.cpp - CUDA Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "common.hpp" + +#include + +#include + +ur_result_t map_error_ur(CUresult result) { + switch (result) { + case CUDA_SUCCESS: + return UR_RESULT_SUCCESS; + case CUDA_ERROR_NOT_PERMITTED: + return UR_RESULT_ERROR_INVALID_OPERATION; + case CUDA_ERROR_INVALID_CONTEXT: + return UR_RESULT_ERROR_INVALID_CONTEXT; + case CUDA_ERROR_INVALID_DEVICE: + return UR_RESULT_ERROR_INVALID_DEVICE; + case CUDA_ERROR_INVALID_VALUE: + return UR_RESULT_ERROR_INVALID_VALUE; + case CUDA_ERROR_OUT_OF_MEMORY: + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + default: + return UR_RESULT_ERROR_UNKNOWN; + } +} + +ur_result_t check_error_ur(CUresult result, const char *function, int line, + const char *file) { + if (result == CUDA_SUCCESS || result == CUDA_ERROR_DEINITIALIZED) { + return UR_RESULT_SUCCESS; + } + + if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { + const char *errorString = nullptr; + const char *errorName = nullptr; + cuGetErrorName(result, &errorName); + cuGetErrorString(result, &errorString); + std::stringstream ss; + ss << "\nUR CUDA ERROR:" + << "\n\tValue: " << result + << "\n\tName: " << errorName + << "\n\tDescription: " << errorString + << "\n\tFunction: " << function << "\n\tSource Location: " << file + << ":" << line << "\n" + << std::endl; + std::cerr << ss.str(); + } + + if (std::getenv("PI_CUDA_ABORT") != nullptr) { + std::abort(); + } + + throw map_error_ur(result); +} + +std::string getCudaVersionString() { + int driver_version = 0; + cuDriverGetVersion(&driver_version); + // The version is returned as (1000 major + 10 minor). + std::stringstream stream; + stream << "CUDA " << driver_version / 1000 << "." + << driver_version % 1000 / 10; + return stream.str(); +} + +void sycl::detail::ur::die(const char *Message) { + std::cerr << "ur_die: " << Message << std::endl; + std::terminate(); +} + +void sycl::detail::ur::assertion(bool Condition, const char *Message) { + if (!Condition) + die(Message); +} + +void sycl::detail::ur::cuPrint(const char *Message) { + std::cerr << "ur_print: " << Message << std::endl; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp new file mode 100644 index 0000000000000..16cabc37a2b16 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp @@ -0,0 +1,51 @@ +//===--------- common.hpp - CUDA Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include +#include + +ur_result_t map_error_ur(CUresult result); + +/// Converts CUDA error into UR error codes, and outputs error information +/// to stderr. +/// If PI_CUDA_ABORT env variable is defined, it aborts directly instead of +/// throwing the error. This is intended for debugging purposes. +/// \return UR_RESULT_SUCCESS if \param result was CUDA_SUCCESS. +/// \throw ur_result_t exception (integer) if input was not success. 
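+/// Typically invoked through the UR_CHECK_ERROR(result) macro defined below,
+/// which fills in the call-site details. A minimal illustrative use (not part
+/// of this patch):
+///   CUcontext current = nullptr;
+///   UR_CHECK_ERROR(cuCtxGetCurrent(&current));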
+/// +ur_result_t check_error_ur(CUresult result, const char *function, int line, + const char *file); + +#define UR_CHECK_ERROR(result) \ + check_error_ur(result, __func__, __LINE__, __FILE__) + +std::string getCudaVersionString(); + +/// ------ Error handling, matching OpenCL plugin semantics. +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace detail { +namespace ur { + +// Report error and no return (keeps compiler from printing warnings). +// TODO: Probably change that to throw a catchable exception, +// but for now it is useful to see every failure. +// +[[noreturn]] void die(const char *Message); + +// Reports error messages +void cuPrint(const char *Message); + +void assertion(bool Condition, const char *Message = nullptr); + +} // namespace ur +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp new file mode 100644 index 0000000000000..a84d4c71c8dd2 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp @@ -0,0 +1,151 @@ +//===--------- context.cpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "context.hpp" + +#include + +/// Create a UR CUDA context. +/// +/// By default creates a scoped context and keeps the last active CUDA context +/// on top of the CUDA context stack. +/// With the __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of +/// PI_TRUE creates a primary CUDA context and activates it on the CUDA context +/// stack. +/// +UR_APIEXPORT ur_result_t UR_APICALL +urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, + const ur_context_properties_t *pProperties, + ur_context_handle_t *phContext) { + UR_ASSERT(phDevices, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + assert(DeviceCount == 1); + ur_result_t errcode_ret = UR_RESULT_SUCCESS; + + std::unique_ptr piContextPtr{nullptr}; + try { + piContextPtr = std::unique_ptr( + new ur_context_handle_t_{*phDevices}); + *phContext = piContextPtr.release(); + } catch (ur_result_t err) { + errcode_ret = err; + } catch (...) 
{ + errcode_ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + return errcode_ret; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( + ur_context_handle_t hContext, ur_context_info_t ContextInfoType, + size_t propSize, void *pContextInfo, size_t *pPropSizeRet) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pContextInfo, pPropSizeRet); + + switch (uint32_t{ContextInfoType}) { + case UR_CONTEXT_INFO_NUM_DEVICES: + return ReturnValue(1); + case UR_CONTEXT_INFO_DEVICES: + return ReturnValue(hContext->get_device()); + case UR_CONTEXT_INFO_REFERENCE_COUNT: + return ReturnValue(hContext->get_reference_count()); + case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + uint32_t capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(capabilities); + } + case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hContext->get_device()->get()) == CUDA_SUCCESS); + uint32_t capabilities = + (major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM + : UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; + return ReturnValue(capabilities); + } + case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: + // 2D USM memcpy is supported. + return ReturnValue(true); + case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: + // 2D USM operations currently not supported. 
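+    // (The CUDA driver API does expose cuMemsetD2D8/16/32 for pitched fills,
+    // but they are not hooked up in this adapter yet.)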
+ return ReturnValue(false); + + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextRelease(ur_context_handle_t ctxt) { + UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + if (ctxt->decrement_reference_count() > 0) { + return UR_RESULT_SUCCESS; + } + ctxt->invoke_extended_deleters(); + + std::unique_ptr context{ctxt}; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextRetain(ur_context_handle_t ctxt) { + UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + assert(ctxt->get_reference_count() > 0); + + ctxt->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( + ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + *phNativeContext = reinterpret_cast(hContext->get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( + ur_native_handle_t hNativeContext, uint32_t numDevices, + const ur_device_handle_t *phDevices, + const ur_context_native_properties_t *pProperties, + ur_context_handle_t *phContext) { + (void)hNativeContext; + (void)phContext; + + // TODO(ur): Needed for the conformance test to pass, but it may be valid + // to have a null CUDA context + UR_ASSERT(hNativeContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + return UR_RESULT_ERROR_INVALID_OPERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( + ur_context_handle_t hContext, ur_context_extended_deleter_t pfnDeleter, + void *pUserData) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pfnDeleter, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + hContext->set_extended_deleter(pfnDeleter, pUserData); + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp new file mode 100644 index 0000000000000..34575829c318b --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp @@ -0,0 +1,108 @@ +//===--------- context.hpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +#include +#include +#include + +// We need this declaration temporarily while UR and PI share ScopedContext +class _pi_context; +using pi_context = _pi_context *; + +#include "common.hpp" +#include "device.hpp" + +typedef void (*ur_context_extended_deleter_t)(void *user_data); + +struct ur_context_handle_t_ { + + struct deleter_data { + ur_context_extended_deleter_t function; + void *user_data; + + void operator()() { function(user_data); } + }; + + using native_type = CUcontext; + + native_type cuContext_; + ur_device_handle_t deviceId_; + std::atomic_uint32_t refCount_; + + ur_context_handle_t_(ur_device_handle_t_ *devId) + : cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} { + urDeviceRetain(deviceId_); + }; + + ~ur_context_handle_t_() { urDeviceRelease(deviceId_); } + + void invoke_extended_deleters() { + std::lock_guard guard(mutex_); + for (auto &deleter : extended_deleters_) { + deleter(); + } + } + + void set_extended_deleter(ur_context_extended_deleter_t function, + void *user_data) { + std::lock_guard guard(mutex_); + extended_deleters_.emplace_back(deleter_data{function, user_data}); + } + + ur_device_handle_t get_device() const noexcept { return deviceId_; } + + native_type get() const noexcept { return cuContext_; } + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } + +private: + std::mutex mutex_; + std::vector extended_deleters_; +}; + +namespace { +class ScopedContext { +public: + // TODO(ur): Needed for compatibility with PI; once the CUDA PI plugin is + // fully moved over we can drop this constructor + ScopedContext(pi_context ctxt); + + ScopedContext(ur_context_handle_t ctxt) { + if (!ctxt) { + throw UR_RESULT_ERROR_INVALID_CONTEXT; + } + + set_context(ctxt->get()); + } + + ScopedContext(CUcontext ctxt) { set_context(ctxt); } + + ~ScopedContext() {} + +private: + void set_context(CUcontext desired) { + CUcontext original = nullptr; + + UR_CHECK_ERROR(cuCtxGetCurrent(&original)); + + // Make sure the desired context is active on the current thread, setting + // it if necessary + if (original != desired) { + UR_CHECK_ERROR(cuCtxSetCurrent(desired)); + } + } +}; +} // namespace diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp new file mode 100644 index 0000000000000..d0b11b23cc74d --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -0,0 +1,1119 @@ +//===--------- device.cpp - CUDA Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include +#include + +#include "context.hpp" +#include "device.hpp" +#include "platform.hpp" + +int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { + int value; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS); + return value; +} + +uint64_t ur_device_handle_t_::get_elapsed_time(CUevent ev) const { + float miliSeconds = 0.0f; + + UR_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, evBase_, ev)); + + return static_cast(miliSeconds * 1.0e6); +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, + ur_device_info_t infoType, + size_t propSize, + void *pDeviceInfo, + size_t *pPropSizeRet) { + UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propSize, pDeviceInfo, pPropSizeRet); + + static constexpr uint32_t max_work_item_dimensions = 3u; + + ScopedContext active(device->get_context()); + + switch ((uint32_t)infoType) { + case UR_DEVICE_INFO_TYPE: { + return ReturnValue(UR_DEVICE_TYPE_GPU); + } + case UR_DEVICE_INFO_VENDOR_ID: { + return ReturnValue(4318u); + } + case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { + int compute_units = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&compute_units, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(compute_units >= 0); + return ReturnValue(static_cast(compute_units)); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { + return ReturnValue(max_work_item_dimensions); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { + struct { + size_t sizes[max_work_item_dimensions]; + } return_sizes; + + int max_x = 0, max_y = 0, max_z = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_x >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_y >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_z >= 0); + + return_sizes.sizes[0] = size_t(max_x); + return_sizes.sizes[1] = size_t(max_y); + return_sizes.sizes[2] = size_t(max_z); + return ReturnValue(return_sizes); + } + + case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: { + struct { + size_t sizes[max_work_item_dimensions]; + } return_sizes; + int max_x = 0, max_y = 0, max_z = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_x >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_y >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_z >= 0); + + return_sizes.sizes[0] = size_t(max_x); + return_sizes.sizes[1] = size_t(max_y); + return_sizes.sizes[2] = size_t(max_z); + return ReturnValue(return_sizes); + } + + case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { + int max_work_group_size = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_work_group_size, + 
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + device->get()) == CUDA_SUCCESS); + + sycl::detail::ur::assertion(max_work_group_size >= 0); + + return ReturnValue(size_t(max_work_group_size)); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int max_threads = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_threads, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + device->get()) == CUDA_SUCCESS); + int warpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + device->get()) == CUDA_SUCCESS); + int maxWarps = (max_threads + warpSize - 1) / warpSize; + return ReturnValue(maxWarps); + } + case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { + // Volta provides independent thread scheduling + // TODO: Revisit for previous generation GPUs + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + bool ifp = (major >= 7); + return ReturnValue(ifp); + } + + case UR_DEVICE_INFO_ATOMIC_64: { + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + + bool atomic64 = (major >= 6) ? true : false; + return ReturnValue(atomic64); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + uint64_t capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(capabilities); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + uint64_t capabilities = + (major >= 7) ? 
UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM + : UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; + return ReturnValue(capabilities); + } + + case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { + // SYCL2020 4.6.4.2 minimum mandated capabilities for + // atomic_fence_order_capabilities. + ur_memory_order_capability_flags_t capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(capabilities); + } + case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + // SYCL2020 4.6.4.2 minimum mandated capabilities for + // atomic_fence/memory_scope_capabilities. + // Because scopes are hierarchical, wider scopes support all narrower + // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and + // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) + ur_memory_scope_capability_flags_t capabilities = + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP; + return ReturnValue(capabilities); + } + case UR_DEVICE_INFO_BFLOAT16: { + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + + bool bfloat16 = (major >= 8) ? true : false; + return ReturnValue(bfloat16); + } + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + // NVIDIA devices only support one sub-group size (the warp size) + int warpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + device->get()) == CUDA_SUCCESS); + size_t sizes[1] = {static_cast(warpSize)}; + return ReturnValue(sizes, 1); + } + case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { + int clock_freq = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&clock_freq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(clock_freq >= 0); + return ReturnValue(static_cast(clock_freq) / 1000u); + } + case UR_DEVICE_INFO_ADDRESS_BITS: { + auto bits = uint32_t{std::numeric_limits::digits}; + return ReturnValue(bits); + } + case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { + // Max size of memory object allocation in bytes. + // The minimum value is max(min(1024 × 1024 × + // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), + // 32 × 1024 × 1024) for devices that are not of type + // CL_DEVICE_TYPE_CUSTOM. + + size_t global = 0; + sycl::detail::ur::assertion(cuDeviceTotalMem(&global, device->get()) == + CUDA_SUCCESS); + + auto quarter_global = static_cast(global / 4u); + + auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global), + 32u * 1024u * 1024u); + + return ReturnValue(uint64_t{max_alloc}); + } + case UR_DEVICE_INFO_IMAGE_SUPPORTED: { + bool enabled = false; + + if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { + enabled = true; + } else { + sycl::detail::ur::cuPrint( + "Images are not fully supported by the CUDA BE, their support is " + "disabled by default. 
Their partial support can be activated by " + "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " + "runtime."); + } + + return ReturnValue(uint32_t{enabled}); + } + case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { + // This call doesn't match to CUDA as it doesn't have images, but instead + // surfaces and textures. No clear call in the CUDA API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: { + // This call doesn't match to CUDA as it doesn't have images, but instead + // surfaces and textures. No clear call in the CUDA API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. + int tex_height = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_height, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_height >= 0); + int surf_height = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_height, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_height >= 0); + + int min = std::min(tex_height, surf_height); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int tex_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_width >= 0); + int surf_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. + int tex_height = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_height, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_height >= 0); + int surf_height = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_height, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_height >= 0); + + int min = std::min(tex_height, surf_height); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int tex_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_width >= 0); + int surf_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { + // Take the smaller of maximum surface and maximum texture depth. 
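+    // (As with the 2D queries above, the minimum of the texture and surface
+    // limits is reported, since an image may be read through a texture and
+    // written through a surface, so both limits have to hold.)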
+ int tex_depth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_depth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_depth >= 0); + int surf_depth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_depth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_depth >= 0); + + int min = std::min(tex_depth, surf_depth); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { + // Take the smaller of maximum surface and maximum texture width. + int tex_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_width >= 0); + int surf_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { + return ReturnValue(0lu); + } + case UR_DEVICE_INFO_MAX_SAMPLERS: { + // This call is kind of meaningless for cuda, as samplers don't exist. + // Closest thing is textures, which is 128. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: { + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters + // __global__ function parameters are passed to the device via constant + // memory and are limited to 4 KB. + return ReturnValue(4000lu); + } + case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { + int mem_base_addr_align = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&mem_base_addr_align, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, + device->get()) == CUDA_SUCCESS); + // Multiply by 8 as clGetDeviceInfo returns this value in bits + mem_base_addr_align *= 8; + return ReturnValue(mem_base_addr_align); + } + case UR_DEVICE_INFO_HALF_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + return ReturnValue(0u); + } + case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + uint64_t config = + UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + UR_DEVICE_FP_CAPABILITY_FLAG_FMA | + UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + return ReturnValue(config); + } + case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + uint64_t config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + UR_DEVICE_FP_CAPABILITY_FLAG_FMA; + return ReturnValue(config); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { + // TODO: is this config consistent across all NVIDIA GPUs? + return ReturnValue(UR_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: { + // The value is documented for all existing GPUs in the CUDA programming + // guidelines, section "H.3.2. Global Memory". 
+ return ReturnValue(128u); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { + int cache_size = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&cache_size, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(cache_size >= 0); + // The L2 cache is global to the GPU. + return ReturnValue(static_cast(cache_size)); + } + case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { + size_t bytes = 0; + // Runtime API has easy access to this value, driver API info is scarse. + sycl::detail::ur::assertion(cuDeviceTotalMem(&bytes, device->get()) == + CUDA_SUCCESS); + return ReturnValue(uint64_t{bytes}); + } + case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { + int constant_memory = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&constant_memory, + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(constant_memory >= 0); + + return ReturnValue(static_cast(constant_memory)); + } + case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: { + // TODO: is there a way to retrieve this from CUDA driver API? + // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX + // 1060 3GB + return ReturnValue(9u); + } + case UR_DEVICE_INFO_LOCAL_MEM_TYPE: { + return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); + } + case UR_DEVICE_INFO_LOCAL_MEM_SIZE: { + // OpenCL's "local memory" maps most closely to CUDA's "shared memory". + // CUDA has its own definition of "local memory", which maps to OpenCL's + // "private memory". + int local_mem_size = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&local_mem_size, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(local_mem_size >= 0); + return ReturnValue(static_cast(local_mem_size)); + } + case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { + int ecc_enabled = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&ecc_enabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, + device->get()) == CUDA_SUCCESS); + + sycl::detail::ur::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); + auto result = static_cast(ecc_enabled); + return ReturnValue(result); + } + case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { + int is_integrated = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&is_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, + device->get()) == CUDA_SUCCESS); + + sycl::detail::ur::assertion((is_integrated == 0) | (is_integrated == 1)); + auto result = static_cast(is_integrated); + return ReturnValue(result); + } + case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { + // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX + // 1060 3GB + return ReturnValue(1000lu); + } + case UR_DEVICE_INFO_ENDIAN_LITTLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_COMPILER_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_LINKER_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: { + auto capability = ur_device_exec_capability_flags_t{ + UR_DEVICE_EXEC_CAPABILITY_FLAG_KERNEL}; + return ReturnValue(capability); + } + case UR_DEVICE_INFO_QUEUE_PROPERTIES: + return ReturnValue( + ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | + UR_QUEUE_FLAG_PROFILING_ENABLE)); + case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { + // The mandated minimum capability: + uint64_t capability = 
UR_QUEUE_FLAG_PROFILING_ENABLE | + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + return ReturnValue(capability); + } + case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { + // The mandated minimum capability: + uint64_t capability = UR_QUEUE_FLAG_PROFILING_ENABLE; + return ReturnValue(capability); + } + case UR_DEVICE_INFO_BUILT_IN_KERNELS: { + // An empty string is returned if no built-in kernels are supported by the + // device. + return ReturnValue(""); + } + case UR_DEVICE_INFO_PLATFORM: { + return ReturnValue(device->get_platform()); + } + case UR_DEVICE_INFO_NAME: { + static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; + char name[MAX_DEVICE_NAME_LENGTH]; + sycl::detail::ur::assertion(cuDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, + device->get()) == CUDA_SUCCESS); + return ReturnValue(name, strlen(name) + 1); + } + case UR_DEVICE_INFO_VENDOR: { + return ReturnValue("NVIDIA Corporation"); + } + case UR_DEVICE_INFO_DRIVER_VERSION: { + auto version = getCudaVersionString(); + return ReturnValue(version.c_str()); + } + case UR_DEVICE_INFO_PROFILE: { + return ReturnValue("CUDA"); + } + case UR_DEVICE_INFO_REFERENCE_COUNT: { + return ReturnValue(device->get_reference_count()); + } + case UR_DEVICE_INFO_VERSION: { + std::stringstream SS; + int Major; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + SS << Major; + int Minor; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device->get()) == CUDA_SUCCESS); + SS << "." << Minor; + return ReturnValue(SS.str().c_str()); + } + case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: { + return ReturnValue(""); + } + case UR_DEVICE_INFO_EXTENSIONS: { + + std::string SupportedExtensions = "cl_khr_fp64 cl_khr_subgroups "; + SupportedExtensions += "pi_ext_intel_devicelib_assert "; + SupportedExtensions += " "; + + int major = 0; + int minor = 0; + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device->get()) == CUDA_SUCCESS); + + if ((major >= 6) || ((major == 5) && (minor >= 3))) { + SupportedExtensions += "cl_khr_fp16 "; + } + + return ReturnValue(SupportedExtensions.c_str()); + } + case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: { + // The minimum value for the FULL profile is 1 MB. + return ReturnValue(1024lu); + } + case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_PARENT_DEVICE: { + return ReturnValue(nullptr); + } + case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_PARTITION_PROPERTIES: { + return ReturnValue(static_cast(0u)); + } + case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_PARTITION_TYPE: { + return ReturnValue(static_cast(0u)); + } + + // Intel USM extensions + + case UR_DEVICE_INFO_USM_HOST_SUPPORT: { + // from cl_intel_unified_shared_memory: "The host memory access capabilities + // apply to any host allocation." 
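+    // (Host USM allocations on this backend are expected to be page-locked
+    // memory from cuMemAllocHost/cuMemHostAlloc, hence the unified-addressing
+    // check below.)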
+ // + // query if/how the device can access page-locked host memory, possibly + // through PCIe, using the same pointer as the host + uint64_t value = {}; + if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { + // the device shares a unified address space with the host + if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } else { + // on GPU architectures with compute capability lower than 6.x, atomic + // operations from the GPU to CPU memory will not be atomic with respect + // to CPU initiated atomic operations + value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + } + } + return ReturnValue(value); + } + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The device memory access capabilities apply to any device allocation + // associated with this device." + // + // query how the device can access memory allocated on the device itself (?) + uint64_t value = + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + return ReturnValue(value); + } + case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The single device shared memory access capabilities apply to any shared + // allocation associated with this device." + // + // query if/how the device can access managed memory associated to it + uint64_t value = {}; + if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { + // the device can allocate managed memory on this system + value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; + } + if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + // the device can coherently access managed memory concurrently with the + // CPU + value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } + } + return ReturnValue(value); + } + case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The cross-device shared memory access capabilities apply to any shared + // allocation associated with this device, or to any shared memory + // allocation on another device that also supports the same cross-device + // shared memory access capability." 
+ //
+ // query if/how the device can access managed memory associated to other
+ // devices
+ uint64_t value = {};
+ if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
+ // the device can allocate managed memory on this system
+ value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS;
+ }
+ if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
+ // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+ // attribute can coherently access managed memory concurrently with the
+ // CPU
+ value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS;
+ }
+ if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
+ 6) {
+ // compute capability 6.x introduces operations that are atomic with
+ // respect to other CPUs and GPUs in the system
+ if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)
+ value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS;
+ if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS)
+ value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS;
+ }
+ return ReturnValue(value);
+ }
+ case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: {
+ // from cl_intel_unified_shared_memory:
+ // "The shared system memory access capabilities apply to any allocations
+ // made by a system allocator, such as malloc or new."
+ //
+ // query if/how the device can access pageable host memory allocated by the
+ // system allocator
+ uint64_t value = {};
+ if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) {
+ // the device supports coherently accessing pageable memory without
+ // calling cuMemHostRegister/cudaHostRegister on it
+ if (getAttribute(device,
+ CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) {
+ // the link between the device and the host supports native atomic
+ // operations
+ value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS |
+ UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS |
+ UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS |
+ UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS;
+ } else {
+ // the link between the device and the host does not support native
+ // atomic operations
+ value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS |
+ UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS;
+ }
+ }
+ return ReturnValue(value);
+ }
+ case UR_DEVICE_INFO_ASYNC_BARRIER: {
+ int value =
+ getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8;
+ return ReturnValue(static_cast<bool>(value));
+ }
+ case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: {
+ int major =
+ getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
+ int minor =
+ getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
+ std::string result = std::to_string(major) + "." + std::to_string(minor);
+ return ReturnValue(result.c_str());
+ }
+
+ case UR_DEVICE_INFO_GLOBAL_MEM_FREE: {
+ size_t FreeMemory = 0;
+ size_t TotalMemory = 0;
+ sycl::detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) ==
+ CUDA_SUCCESS,
+ "failed cuMemGetInfo() API.");
+ return ReturnValue(FreeMemory);
+ }
+ case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: {
+ int value = 0;
+ sycl::detail::ur::assertion(
+ cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
+ device->get()) == CUDA_SUCCESS);
+ sycl::detail::ur::assertion(value >= 0);
+ // Convert kilohertz to megahertz when returning.
+ return ReturnValue(value / 1000); + } + case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { + int value = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&value, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(value >= 0); + return ReturnValue(value); + } + case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { + return ReturnValue(int32_t{1}); + } + case UR_DEVICE_INFO_DEVICE_ID: { + int value = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(value >= 0); + return ReturnValue(value); + } + case UR_DEVICE_INFO_UUID: { + int driver_version = 0; + cuDriverGetVersion(&driver_version); + int major = driver_version / 1000; + int minor = driver_version % 1000 / 10; + CUuuid uuid; + if ((major > 11) || (major == 11 && minor >= 4)) { + sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) == + CUDA_SUCCESS); + } else { + sycl::detail::ur::assertion(cuDeviceGetUuid(&uuid, device->get()) == + CUDA_SUCCESS); + } + std::array name; + std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); + return ReturnValue(name.data(), 16); + } + case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + + int minor = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device->get()) == CUDA_SUCCESS); + + // Some specific devices seem to need special handling. See reference + // https://github.com/jeffhammond/HPCInfo/blob/master/cuda/gpu-detect.cu + bool is_xavier_agx = major == 7 && minor == 2; + bool is_orin_agx = major == 8 && minor == 7; + + int memory_clock_khz = 0; + if (is_xavier_agx) { + memory_clock_khz = 2133000; + } else if (is_orin_agx) { + memory_clock_khz = 3200000; + } else { + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&memory_clock_khz, + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + device->get()) == CUDA_SUCCESS); + } + + int memory_bus_width = 0; + if (is_orin_agx) { + memory_bus_width = 256; + } else { + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&memory_bus_width, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + device->get()) == CUDA_SUCCESS); + } + + uint64_t memory_bandwidth = + uint64_t(memory_clock_khz) * memory_bus_width * 250; + + return ReturnValue(memory_bandwidth); + } + case UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { + // Maximum number of 32-bit registers available to a thread block. + // Note: This number is shared by all thread blocks simultaneously resident + // on a multiprocessor. 
+ int max_registers{-1}; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &max_registers, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + device->get())); + + sycl::detail::ur::assertion(max_registers >= 0); + + return ReturnValue(static_cast(max_registers)); + } + case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + return ReturnValue(false); + case UR_DEVICE_INFO_IMAGE_SRGB: + return ReturnValue(false); + case UR_DEVICE_INFO_PCI_ADDRESS: { + constexpr size_t AddressBufferSize = 13; + char AddressBuffer[AddressBufferSize]; + sycl::detail::ur::assertion( + cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == + CUDA_SUCCESS); + // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written + sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); + return ReturnValue(AddressBuffer, + strnlen(AddressBuffer, AddressBufferSize - 1) + 1); + } + // TODO: Investigate if this information is available on CUDA. + case UR_DEVICE_INFO_GPU_EU_COUNT: + case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: + case UR_DEVICE_INFO_GPU_EU_SLICES: + case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: + case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: + case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +/// \return PI_SUCCESS if the function is executed successfully +/// CUDA devices are always root devices so retain always returns success. +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t device) { + UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, + uint32_t, ur_device_handle_t *, uint32_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// \return UR_RESULT_SUCCESS always since CUDA devices are always root +/// devices. +ur_result_t urDeviceRelease(ur_device_handle_t device) { + UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, + ur_device_type_t DeviceType, + uint32_t NumEntries, + ur_device_handle_t *phDevices, + uint32_t *pNumDevices) { + ur_result_t err = UR_RESULT_SUCCESS; + const bool askingForAll = DeviceType == UR_DEVICE_TYPE_ALL; + const bool askingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT; + const bool askingForGPU = DeviceType == UR_DEVICE_TYPE_GPU; + const bool returnDevices = askingForDefault || askingForAll || askingForGPU; + + UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t numDevices = returnDevices ? hPlatform->devices_.size() : 0; + + try { + UR_ASSERT(pNumDevices || phDevices, UR_RESULT_ERROR_INVALID_VALUE); + + if (pNumDevices) { + *pNumDevices = numDevices; + } + + if (returnDevices && phDevices) { + for (size_t i = 0; i < std::min(size_t(NumEntries), numDevices); ++i) { + phDevices[i] = hPlatform->devices_[i].get(); + } + } + + return err; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +/// Gets the native CUDA handle of a UR device object +/// +/// \param[in] device The UR device to get the native CUDA object of. +/// \param[out] nativeHandle Set to the native handle of the UR device object. 
+///
+/// \return PI_SUCCESS
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle(
+ ur_device_handle_t hDevice, ur_native_handle_t *phNativeHandle) {
+ UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+ UR_ASSERT(phNativeHandle, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+ *phNativeHandle = reinterpret_cast<ur_native_handle_t>(hDevice->get());
+ return UR_RESULT_SUCCESS;
+}
+
+/// Creates a UR device object from a CUDA device handle.
+/// NOTE: The created UR object does not take ownership of the native handle.
+///
+/// \param[in] nativeHandle The native handle to create UR device object from.
+/// \param[in] platform is the UR platform of the device.
+/// \param[out] device Set to the UR device object created from native handle.
+///
+/// \return TBD
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle(
+ ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform,
+ ur_device_handle_t *phDevice) {
+ // TODO(ur): This is needed for the UR CTS, but it might be valid to have a
+ // null native handle
+ UR_ASSERT(hNativeDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+ UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+ // We can't cast between ur_native_handle_t and CUdevice, so memcpy the bits
+ // instead
+ CUdevice cu_device = 0;
+ memcpy(&cu_device, hNativeDevice, sizeof(CUdevice));
+
+ auto is_device = [=](std::unique_ptr<ur_device_handle_t_> &dev) {
+ return dev->get() == cu_device;
+ };
+
+ // If a platform is provided just check if the device is in it
+ if (hPlatform) {
+ auto search_res = std::find_if(begin(hPlatform->devices_),
+ end(hPlatform->devices_), is_device);
+ if (search_res != end(hPlatform->devices_)) {
+ *phDevice = search_res->get();
+ return UR_RESULT_SUCCESS;
+ }
+ }
+
+ // Get list of platforms
+ uint32_t num_platforms = 0;
+ ur_result_t result = urPlatformGet(0, nullptr, &num_platforms);
+ if (result != UR_RESULT_SUCCESS)
+ return result;
+
+ ur_platform_handle_t *plat = static_cast<ur_platform_handle_t *>(
+ malloc(num_platforms * sizeof(ur_platform_handle_t)));
+ result = urPlatformGet(num_platforms, plat, nullptr);
+ if (result != UR_RESULT_SUCCESS)
+ return result;
+
+ // Iterate through platforms to find device that matches nativeHandle
+ for (uint32_t j = 0; j < num_platforms; ++j) {
+ auto search_res = std::find_if(begin(plat[j]->devices_),
+ end(plat[j]->devices_), is_device);
+ if (search_res != end(plat[j]->devices_)) {
+ *phDevice = static_cast<ur_device_handle_t>((*search_res).get());
+ return UR_RESULT_SUCCESS;
+ }
+ }
+
+ // If the provided nativeHandle cannot be matched to an
+ // existing device return error
+ return UR_RESULT_ERROR_INVALID_OPERATION;
+}
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp
new file mode 100644
index 0000000000000..c2195c958cfd7
--- /dev/null
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp
@@ -0,0 +1,59 @@
+//===--------- device.hpp - CUDA Adapter -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------===//
+#pragma once
+
+#include
+
+struct ur_device_handle_t_ {
+private:
+ using native_type = CUdevice;
+
+ native_type cuDevice_;
+ CUcontext cuContext_;
+ CUevent evBase_; // CUDA event used as base counter
+ std::atomic_uint32_t refCount_;
+ ur_platform_handle_t platform_;
+
+ static constexpr uint32_t max_work_item_dimensions = 3u;
+ size_t max_work_item_sizes[max_work_item_dimensions];
+ int max_work_group_size;
+
+public:
+ ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase,
+ ur_platform_handle_t platform)
+ : cuDevice_(cuDevice), cuContext_(cuContext), evBase_(evBase),
+ refCount_{1}, platform_(platform) {}
+
+ ~ur_device_handle_t_() { cuDevicePrimaryCtxRelease(cuDevice_); }
+
+ native_type get() const noexcept { return cuDevice_; };
+
+ CUcontext get_context() const noexcept { return cuContext_; };
+
+ uint32_t get_reference_count() const noexcept { return refCount_; }
+
+ ur_platform_handle_t get_platform() const noexcept { return platform_; };
+
+ uint64_t get_elapsed_time(CUevent) const;
+
+ void save_max_work_item_sizes(size_t size,
+ size_t *save_max_work_item_sizes) noexcept {
+ memcpy(max_work_item_sizes, save_max_work_item_sizes, size);
+ };
+
+ void save_max_work_group_size(int value) noexcept {
+ max_work_group_size = value;
+ };
+
+ void get_max_work_item_sizes(size_t ret_size,
+ size_t *ret_max_work_item_sizes) const noexcept {
+ memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size);
+ };
+
+ int get_max_work_group_size() const noexcept { return max_work_group_size; };
+};
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp
new file mode 100644
index 0000000000000..dd8503f1f8907
--- /dev/null
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp
@@ -0,0 +1,174 @@
+//===--------- platform.cpp - CUDA Adapter ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------===//
+
+#include "platform.hpp"
+#include "common.hpp"
+#include "context.hpp"
+#include "device.hpp"
+
+#include
+#include
+#include
+
+ur_result_t urPlatformGetInfo(ur_platform_handle_t hPlatform,
+ ur_platform_info_t PlatformInfoType, size_t Size,
+ void *pPlatformInfo, size_t *pSizeRet) {
+
+ UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+ UrReturnHelper ReturnValue(Size, pPlatformInfo, pSizeRet);
+
+ switch (PlatformInfoType) {
+ case UR_PLATFORM_INFO_NAME:
+ return ReturnValue("NVIDIA CUDA BACKEND");
+ case UR_PLATFORM_INFO_VENDOR_NAME:
+ return ReturnValue("NVIDIA Corporation");
+ case UR_PLATFORM_INFO_PROFILE:
+ return ReturnValue("FULL PROFILE");
+ case UR_PLATFORM_INFO_VERSION: {
+ auto version = getCudaVersionString();
+ return ReturnValue(version.c_str());
+ }
+ case UR_PLATFORM_INFO_EXTENSIONS: {
+ return ReturnValue("");
+ }
+ case UR_PLATFORM_INFO_BACKEND: {
+ return ReturnValue(UR_PLATFORM_BACKEND_CUDA);
+ }
+ default:
+ return UR_RESULT_ERROR_INVALID_ENUMERATION;
+ }
+
+ return UR_RESULT_SUCCESS;
+}
+
+/// Obtains the CUDA platform.
+/// There is only one CUDA platform, and it contains all devices on the system.
+/// Triggers the CUDA Driver initialization (cuInit) the first time, so this +/// must be the first PI API called. +/// +/// However because multiple devices in a context is not currently supported, +/// place each device in a separate platform. +/// +ur_result_t urPlatformGet(uint32_t NumEntries, + ur_platform_handle_t *phPlatforms, + uint32_t *pNumPlatforms) { + + try { + static std::once_flag initFlag; + static uint32_t numPlatforms = 1; + static std::vector platformIds; + + UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t err = UR_RESULT_SUCCESS; + + std::call_once( + initFlag, + [](ur_result_t &err) { + if (cuInit(0) != CUDA_SUCCESS) { + numPlatforms = 0; + return; + } + int numDevices = 0; + err = UR_CHECK_ERROR(cuDeviceGetCount(&numDevices)); + if (numDevices == 0) { + numPlatforms = 0; + return; + } + try { + // make one platform per device + numPlatforms = numDevices; + platformIds.resize(numDevices); + + for (int i = 0; i < numDevices; ++i) { + CUdevice device; + err = UR_CHECK_ERROR(cuDeviceGet(&device, i)); + CUcontext context; + err = UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device)); + + ScopedContext active(context); + CUevent evBase; + err = UR_CHECK_ERROR(cuEventCreate(&evBase, CU_EVENT_DEFAULT)); + + // Use default stream to record base event counter + err = UR_CHECK_ERROR(cuEventRecord(evBase, 0)); + + platformIds[i].devices_.emplace_back(new ur_device_handle_t_{ + device, context, evBase, &platformIds[i]}); + { + const auto &dev = platformIds[i].devices_.back().get(); + size_t maxWorkGroupSize = 0u; + size_t maxThreadsPerBlock[3] = {}; + ur_result_t retError = urDeviceGetInfo( + dev, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, + sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); + if (retError != UR_RESULT_SUCCESS) { + throw retError; + } + + retError = urDeviceGetInfo( + dev, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, + sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); + if (retError != UR_RESULT_SUCCESS) { + throw retError; + } + + dev->save_max_work_item_sizes(sizeof(maxThreadsPerBlock), + maxThreadsPerBlock); + dev->save_max_work_group_size(maxWorkGroupSize); + } + } + } catch (const std::bad_alloc &) { + // Signal out-of-memory situation + for (int i = 0; i < numDevices; ++i) { + platformIds[i].devices_.clear(); + } + platformIds.clear(); + err = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + // Clear and rethrow to allow retry + for (int i = 0; i < numDevices; ++i) { + platformIds[i].devices_.clear(); + } + platformIds.clear(); + throw; + } + }, + err); + + if (pNumPlatforms != nullptr) { + *pNumPlatforms = numPlatforms; + } + + if (phPlatforms != nullptr) { + for (unsigned i = 0; i < std::min(NumEntries, numPlatforms); ++i) { + phPlatforms[i] = &platformIds[i]; + } + } + + return err; + } catch (ur_result_t err) { + return err; + } catch (...) 
{ + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +ur_result_t urPlatformGetApiVersion(ur_platform_handle_t hDriver, + ur_api_version_t *pVersion) { + UR_ASSERT(hDriver, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pVersion, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + *pVersion = UR_API_VERSION_CURRENT; + return UR_RESULT_SUCCESS; +} + +ur_result_t urInit(ur_device_init_flags_t) { return UR_RESULT_SUCCESS; } + +ur_result_t urTearDown(void *) { return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp new file mode 100644 index 0000000000000..5b2e79f49be8d --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp @@ -0,0 +1,15 @@ +//===--------- platform.hpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +struct ur_platform_handle_t_ { + std::vector> devices_; +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 9446515bd435e..015dadcbaa074 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -6,6 +6,7 @@ // //===-----------------------------------------------------------------===// +#include #include namespace { @@ -36,9 +37,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( return result; } pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGet = nullptr; - pDdiTable->pfnGetApiVersion = nullptr; - pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGet = urPlatformGet; + pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; + pDdiTable->pfnGetInfo = urPlatformGetInfo; pDdiTable->pfnGetNativeHandle = nullptr; return UR_RESULT_SUCCESS; } @@ -49,13 +50,13 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnSetExtendedDeleter = nullptr; + pDdiTable->pfnCreate = urContextCreate; + pDdiTable->pfnCreateWithNativeHandle = urContextCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urContextGetInfo; + pDdiTable->pfnGetNativeHandle = urContextGetNativeHandle; + pDdiTable->pfnRelease = urContextRelease; + pDdiTable->pfnRetain = urContextRetain; + pDdiTable->pfnSetExtendedDeleter = urContextSetExtendedDeleter; return UR_RESULT_SUCCESS; } @@ -195,8 +196,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( return result; } pDdiTable->pfnGetLastResult = nullptr; - pDdiTable->pfnInit = nullptr; - pDdiTable->pfnTearDown = nullptr; + pDdiTable->pfnInit = urInit; + pDdiTable->pfnTearDown = urTearDown; return UR_RESULT_SUCCESS; } @@ -240,14 +241,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGet = nullptr; + pDdiTable->pfnCreateWithNativeHandle = 
urDeviceCreateWithNativeHandle; + pDdiTable->pfnGet = urDeviceGet; pDdiTable->pfnGetGlobalTimestamps = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetInfo = urDeviceGetInfo; + pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle; pDdiTable->pfnPartition = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnRelease = urDeviceRelease; + pDdiTable->pfnRetain = urDeviceRetain; pDdiTable->pfnSelectBinary = nullptr; return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index d0d1fb8f46912..c2f3a3782f9a0 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -39,6 +39,9 @@ template <> uint32_t inline ur_cast(uint64_t Value) { const ur_device_info_t UR_EXT_DEVICE_INFO_OPENCL_C_VERSION = (ur_device_info_t)0x103D; +const ur_device_info_t UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = + (ur_device_info_t)((uint32_t)UR_DEVICE_INFO_FORCE_UINT32 - 1); + const ur_command_t UR_EXT_COMMAND_TYPE_USER = (ur_command_t)((uint32_t)UR_COMMAND_FORCE_UINT32 - 1); @@ -197,6 +200,7 @@ extern bool PiPlatformCachePopulated; // The getInfo*/ReturnHelper facilities provide shortcut way of // writing return bytes for the various getInfo APIs. +namespace ur { template ur_result_t getInfoImpl(size_t param_value_size, void *param_value, size_t *param_value_size_ret, T value, @@ -260,6 +264,7 @@ getInfo(size_t param_value_size, void *param_value, return getInfoArray(strlen(value) + 1, param_value_size, param_value, param_value_size_ret, value); } +} // namespace ur class UrReturnHelper { public: @@ -276,20 +281,20 @@ class UrReturnHelper { // Scalar return value template ur_result_t operator()(const T &t) { - return getInfo(param_value_size, param_value, param_value_size_ret, t); + return ur::getInfo(param_value_size, param_value, param_value_size_ret, t); } // Array return value template ur_result_t operator()(const T *t, size_t s) { - return getInfoArray(s, param_value_size, param_value, param_value_size_ret, - t); + return ur::getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); } // Array return value where element type is differrent from T template ur_result_t operator()(const T *t, size_t s) { - return getInfoArray(s, param_value_size, param_value, - param_value_size_ret, t); + return ur::getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); } protected: diff --git a/sycl/unittests/pi/cuda/CMakeLists.txt b/sycl/unittests/pi/cuda/CMakeLists.txt index 94ac39f07e474..7808340cc4302 100644 --- a/sycl/unittests/pi/cuda/CMakeLists.txt +++ b/sycl/unittests/pi/cuda/CMakeLists.txt @@ -22,9 +22,11 @@ target_include_directories(PiCudaTests "${sycl_inc_dir}/sycl/detail/" "${sycl_inc_dir}" "${sycl_plugin_dir}/cuda/" + "${sycl_plugin_dir}/unified_runtime/" ) target_link_libraries(PiCudaTests PRIVATE cudadrv + UnifiedRuntime-Headers ) From 8073f6c46b8c93b84bbfbae37d3847dab72d1fae Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Mon, 10 Apr 2023 13:53:15 +0100 Subject: [PATCH 03/45] Port program and kernel entry points --- sycl/plugins/cuda/CMakeLists.txt | 4 + sycl/plugins/cuda/pi_cuda.cpp | 840 +----------------- sycl/plugins/cuda/pi_cuda.hpp | 217 +---- sycl/plugins/unified_runtime/CMakeLists.txt | 4 + .../ur/adapters/cuda/kernel.cpp | 281 ++++++ .../ur/adapters/cuda/kernel.hpp | 183 ++++ .../ur/adapters/cuda/program.cpp | 439 +++++++++ .../ur/adapters/cuda/program.hpp | 55 ++ 
8 files changed, 997 insertions(+), 1026 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index e4fa949eca8e9..76d730967a7c0 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -63,6 +63,10 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/device.hpp" "../unified_runtime/ur/adapters/cuda/platform.cpp" "../unified_runtime/ur/adapters/cuda/platform.hpp" + "../unified_runtime/ur/adapters/cuda/program.cpp" + "../unified_runtime/ur/adapters/cuda/program.hpp" + "../unified_runtime/ur/adapters/cuda/kernel.cpp" + "../unified_runtime/ur/adapters/cuda/kernel.hpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index b1183b662b137..c09ccea8ef6a3 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -228,14 +228,6 @@ pi_result getInfoArray(size_t array_length, size_t param_value_size, array_length * sizeof(T), memcpy); } -template <> -pi_result getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, - const char *value) { - return getInfoArray(strlen(value) + 1, param_value_size, param_value, - param_value_size_ret, value); -} - int getAttribute(pi_device device, CUdevice_attribute attribute) { int value; sycl::detail::pi::assertion( @@ -370,41 +362,6 @@ void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type, } } -bool getMaxRegistersJitOptionValue(const std::string &build_options, - unsigned int &value) { - using namespace std::string_view_literals; - const std::size_t optionPos = build_options.find_first_of("maxrregcount"sv); - if (optionPos == std::string::npos) { - return false; - } - - const std::size_t delimPos = build_options.find('=', optionPos + 1u); - if (delimPos == std::string::npos) { - return false; - } - - const std::size_t length = build_options.length(); - const std::size_t startPos = delimPos + 1u; - if (delimPos == std::string::npos || startPos >= length) { - return false; - } - - std::size_t pos = startPos; - while (pos < length && - std::isdigit(static_cast(build_options[pos]))) { - pos++; - } - - const std::string valueString = - build_options.substr(startPos, pos - startPos); - if (valueString.empty()) { - return false; - } - - value = static_cast(std::stoi(valueString)); - return true; -} - // Helper to verify out-of-registers case (exceeded block max registers). 
// If the kernel requires a number of registers for the entire thread // block exceeds the hardware limitations, then the cuLaunchKernel call @@ -734,127 +691,6 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { return PI_SUCCESS; } -_pi_program::_pi_program(pi_context ctxt) - : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, - context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { - pi2ur::piContextRetain(context_); -} - -_pi_program::~_pi_program() { pi2ur::piContextRelease(context_); } - -std::pair -splitMetadataName(const std::string &metadataName) { - size_t splitPos = metadataName.rfind('@'); - if (splitPos == std::string::npos) - return std::make_pair(metadataName, std::string{}); - return std::make_pair(metadataName.substr(0, splitPos), - metadataName.substr(splitPos, metadataName.length())); -} - -pi_result _pi_program::set_metadata(const pi_device_binary_property *metadata, - size_t length) { - for (size_t i = 0; i < length; ++i) { - const pi_device_binary_property metadataElement = metadata[i]; - std::string metadataElementName{metadataElement->Name}; - - auto [prefix, tag] = splitMetadataName(metadataElementName); - - if (tag == __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { - // If metadata is reqd_work_group_size, record it for the corresponding - // kernel name. - size_t MDElemsSize = metadataElement->ValSize - sizeof(std::uint64_t); - - // Expect between 1 and 3 32-bit integer values. - assert(MDElemsSize >= sizeof(std::uint32_t) && - MDElemsSize <= sizeof(std::uint32_t) * 3 && - "Unexpected size for reqd_work_group_size metadata"); - - // Get pointer to data, skipping 64-bit size at the start of the data. - const char *ValuePtr = - reinterpret_cast(metadataElement->ValAddr) + - sizeof(std::uint64_t); - // Read values and pad with 1's for values not present. 
- std::uint32_t reqdWorkGroupElements[] = {1, 1, 1}; - std::memcpy(reqdWorkGroupElements, ValuePtr, MDElemsSize); - kernelReqdWorkGroupSizeMD_[prefix] = - std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1], - reqdWorkGroupElements[2]); - } else if (tag == __SYCL_PI_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { - const char *metadataValPtr = - reinterpret_cast(metadataElement->ValAddr) + - sizeof(std::uint64_t); - const char *metadataValPtrEnd = - metadataValPtr + metadataElement->ValSize - sizeof(std::uint64_t); - globalIDMD_[prefix] = std::string{metadataValPtr, metadataValPtrEnd}; - } - } - return PI_SUCCESS; -} - -pi_result _pi_program::set_binary(const char *source, size_t length) { - assert((binary_ == nullptr && binarySizeInBytes_ == 0) && - "Re-setting program binary data which has already been set"); - binary_ = source; - binarySizeInBytes_ = length; - return PI_SUCCESS; -} - -pi_result _pi_program::build_program(const char *build_options) { - - this->buildOptions_ = build_options; - - constexpr const unsigned int numberOfOptions = 4u; - - std::vector options(numberOfOptions); - std::vector optionVals(numberOfOptions); - - // Pass a buffer for info messages - options[0] = CU_JIT_INFO_LOG_BUFFER; - optionVals[0] = (void *)infoLog_; - // Pass the size of the info buffer - options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - optionVals[1] = (void *)(long)MAX_LOG_SIZE; - // Pass a buffer for error message - options[2] = CU_JIT_ERROR_LOG_BUFFER; - optionVals[2] = (void *)errorLog_; - // Pass the size of the error buffer - options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - optionVals[3] = (void *)(long)MAX_LOG_SIZE; - - if (!buildOptions_.empty()) { - unsigned int maxRegs; - bool valid = getMaxRegistersJitOptionValue(buildOptions_, maxRegs); - if (valid) { - options.push_back(CU_JIT_MAX_REGISTERS); - optionVals.push_back(reinterpret_cast(maxRegs)); - } - } - - auto result = PI_CHECK_ERROR( - cuModuleLoadDataEx(&module_, static_cast(binary_), - options.size(), options.data(), optionVals.data())); - - const auto success = (result == PI_SUCCESS); - - buildStatus_ = - success ? PI_PROGRAM_BUILD_STATUS_SUCCESS : PI_PROGRAM_BUILD_STATUS_ERROR; - - // If no exception, result is correct - return success ? PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE; -} - -/// Finds kernel names by searching for entry points in the PTX source, as the -/// CUDA driver API doesn't expose an operation for this. -/// Note: This is currently only being used by the SYCL program class for the -/// has_kernel method, so an alternative would be to move the has_kernel -/// query to PI and use cuModuleGetFunction to check for a kernel. -/// Note: Another alternative is to add kernel names as metadata, like with -/// reqd_work_group_size. 
-std::string getKernelNames(pi_program) { - sycl::detail::pi::die("getKernelNames not implemented"); - return {}; -} - //-- PI API implementation extern "C" { pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, @@ -1586,63 +1422,6 @@ pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { } } -pi_result cuda_piKernelCreate(pi_program program, const char *kernel_name, - pi_kernel *kernel) { - assert(kernel != nullptr); - assert(program != nullptr); - - pi_result retErr = PI_SUCCESS; - std::unique_ptr<_pi_kernel> retKernel{nullptr}; - - try { - ScopedContext active(program->get_context()); - - CUfunction cuFunc; - retErr = PI_CHECK_ERROR( - cuModuleGetFunction(&cuFunc, program->get(), kernel_name)); - - std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset"; - CUfunction cuFuncWithOffsetParam; - CUresult offsetRes = cuModuleGetFunction( - &cuFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str()); - - // If there is no kernel with global offset parameter we mark it as missing - if (offsetRes == CUDA_ERROR_NOT_FOUND) { - cuFuncWithOffsetParam = nullptr; - } else { - retErr = PI_CHECK_ERROR(offsetRes); - } - - retKernel = std::unique_ptr<_pi_kernel>( - new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program, - program->get_context()}); - } catch (pi_result err) { - retErr = err; - } catch (...) { - retErr = PI_ERROR_OUT_OF_HOST_MEMORY; - } - - *kernel = retKernel.release(); - return retErr; -} - -pi_result cuda_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, - size_t arg_size, const void *arg_value) { - - assert(kernel != nullptr); - pi_result retErr = PI_SUCCESS; - try { - if (arg_value) { - kernel->set_kernel_arg(arg_index, arg_size, arg_value); - } else { - kernel->set_kernel_local_arg(arg_index, arg_size); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value) { @@ -1700,119 +1479,6 @@ pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, return retErr; } -pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, - pi_kernel_group_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - // Here we want to query about a kernel's cuda blocks! 
- - if (kernel != nullptr) { - - switch (param_name) { - case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - size_t global_work_size[3] = {0, 0, 0}; - - int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimX, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimY, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimZ, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()) == CUDA_SUCCESS); - - int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimX, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimY, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimZ, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - device->get()) == CUDA_SUCCESS); - - global_work_size[0] = max_block_dimX * max_grid_dimX; - global_work_size[1] = max_block_dimY * max_grid_dimY; - global_work_size[2] = max_block_dimZ * max_grid_dimZ; - return getInfoArray(3, param_value_size, param_value, - param_value_size_ret, global_work_size); - } - case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_threads)); - } - case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - size_t group_size[3] = {0, 0, 0}; - const auto &reqd_wg_size_md_map = - kernel->program_->kernelReqdWorkGroupSizeMD_; - const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_); - if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { - const auto reqd_wg_size = reqd_wg_size_md->second; - group_size[0] = std::get<0>(reqd_wg_size); - group_size[1] = std::get<1>(reqd_wg_size); - group_size[2] = std::get<2>(reqd_wg_size); - } - return getInfoArray(3, param_value_size, param_value, - param_value_size_ret, group_size); - } - case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == CUDA SHARED - int bytes = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == CUDA LOCAL - int bytes = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_NUM_REGS: { - int numRegs = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, 
param_value, param_value_size_ret, - pi_uint32(numRegs)); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - - return PI_ERROR_INVALID_KERNEL; -} - pi_result cuda_piEnqueueKernelLaunch( pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, @@ -1984,13 +1650,6 @@ pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, return {}; } -pi_result cuda_piextKernelCreateWithNativeHandle(pi_native_handle, pi_context, - pi_program, bool, - pi_kernel *) { - sycl::detail::pi::die("Unsupported operation"); - return PI_SUCCESS; -} - /// \TODO Not implemented pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags, const pi_image_format *image_format, @@ -2161,457 +1820,6 @@ pi_result cuda_piMemRetain(pi_mem mem) { return PI_SUCCESS; } -/// Not used as CUDA backend only creates programs from binary. -/// See \ref cuda_piclProgramCreateWithBinary. -/// -pi_result cuda_piclProgramCreateWithSource(pi_context, pi_uint32, const char **, - const size_t *, pi_program *) { - sycl::detail::pi::cuPrint("cuda_piclProgramCreateWithSource not implemented"); - return PI_ERROR_INVALID_OPERATION; -} - -/// Loads the images from a PI program into a CUmodule that can be -/// used later on to extract functions (kernels). -/// See \ref _pi_program for implementation details. -/// -pi_result cuda_piProgramBuild( - pi_program program, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const char *options, - [[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data), - [[maybe_unused]] void *user_data) { - - assert(program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(program->get_context()); - - program->build_program(options); - - } catch (pi_result err) { - retError = err; - } - return retError; -} - -/// \TODO Not implemented -pi_result cuda_piProgramCreate(pi_context, const void *, size_t, pi_program *) { - sycl::detail::pi::die("cuda_piProgramCreate not implemented"); - return {}; -} - -/// Loads images from a list of PTX or CUBIN binaries. -/// Note: No calls to CUDA driver API in this function, only store binaries -/// for later. -/// -/// Note: Only supports one device -/// -pi_result cuda_piProgramCreateWithBinary( - pi_context context, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const size_t *lengths, - const unsigned char **binaries, size_t num_metadata_entries, - const pi_device_binary_property *metadata, pi_int32 *binary_status, - pi_program *program) { - // Ignore unused parameter - (void)binary_status; - - assert(context != nullptr); - assert(binaries != nullptr); - assert(program != nullptr); - assert(device_list != nullptr); - assert(num_devices == 1 && "CUDA contexts are for a single device"); - assert((context->get_device()->get() == device_list[0]->get()) && - "Mismatch between devices context and passed context when creating " - "program from binary"); - - pi_result retError = PI_SUCCESS; - - std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; - - retProgram->set_metadata(metadata, num_metadata_entries); - - const bool has_length = (lengths != nullptr); - size_t length = has_length - ? 
lengths[0] - : strlen(reinterpret_cast(binaries[0])) + 1; - - assert(length != 0); - - retProgram->set_binary(reinterpret_cast(binaries[0]), length); - - *program = retProgram.release(); - - return retError; -} - -pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(program != nullptr); - - switch (param_name) { - case PI_PROGRAM_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->get_reference_count()); - case PI_PROGRAM_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->context_); - case PI_PROGRAM_INFO_NUM_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - case PI_PROGRAM_INFO_DEVICES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->context_->deviceId_); - case PI_PROGRAM_INFO_SOURCE: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->binary_); - case PI_PROGRAM_INFO_BINARY_SIZES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->binarySizeInBytes_); - case PI_PROGRAM_INFO_BINARIES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->binary_); - case PI_PROGRAM_INFO_KERNEL_NAMES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - getKernelNames(program).c_str()); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Program info request not implemented"); - return {}; -} - -/// Creates a new PI program object that is the outcome of linking all input -/// programs. -/// \TODO Implement linker options, requires mapping of OpenCL to CUDA -/// -pi_result cuda_piProgramLink( - pi_context context, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const char *options, - pi_uint32 num_input_programs, const pi_program *input_programs, - [[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data), - [[maybe_unused]] void *user_data, pi_program *ret_program) { - - assert(ret_program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(context); - - CUlinkState state; - std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; - - retError = PI_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &state)); - try { - for (size_t i = 0; i < num_input_programs; ++i) { - pi_program program = input_programs[i]; - retError = PI_CHECK_ERROR(cuLinkAddData( - state, CU_JIT_INPUT_PTX, const_cast(program->binary_), - program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr)); - } - void *cubin = nullptr; - size_t cubinSize = 0; - retError = PI_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize)); - - retError = - retProgram->set_binary(static_cast(cubin), cubinSize); - - if (retError != PI_SUCCESS) { - return retError; - } - - retError = retProgram->build_program(options); - - if (retError != PI_SUCCESS) { - return retError; - } - } catch (...) 
{ - // Upon error attempt cleanup - PI_CHECK_ERROR(cuLinkDestroy(state)); - throw; - } - - retError = PI_CHECK_ERROR(cuLinkDestroy(state)); - *ret_program = retProgram.release(); - - } catch (pi_result err) { - retError = err; - } - return retError; -} - -/// Creates a new program that is the outcome of the compilation of the headers -/// and the program. -/// \TODO Implement asynchronous compilation -/// -pi_result cuda_piProgramCompile( - pi_program program, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const char *options, - [[maybe_unused]] pi_uint32 num_input_headers, - const pi_program *input_headers, const char **header_include_names, - [[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data), - [[maybe_unused]] void *user_data) { - // Ignore unused parameters - (void)header_include_names; - (void)input_headers; - - assert(program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - assert(num_input_headers == 0); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(program->get_context()); - - program->build_program(options); - - } catch (pi_result err) { - retError = err; - } - return retError; -} - -pi_result cuda_piProgramGetBuildInfo(pi_program program, pi_device device, - pi_program_build_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - // Ignore unused parameter - (void)device; - - assert(program != nullptr); - - switch (param_name) { - case PI_PROGRAM_BUILD_INFO_STATUS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - program->buildStatus_); - } - case PI_PROGRAM_BUILD_INFO_OPTIONS: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->buildOptions_.c_str()); - case PI_PROGRAM_BUILD_INFO_LOG: - return getInfoArray(program->MAX_LOG_SIZE, param_value_size, param_value, - param_value_size_ret, program->infoLog_); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Program Build info request not implemented"); - return {}; -} - -pi_result cuda_piProgramRetain(pi_program program) { - assert(program != nullptr); - assert(program->get_reference_count() > 0); - program->increment_reference_count(); - return PI_SUCCESS; -} - -/// Decreases the reference count of a pi_program object. -/// When the reference count reaches 0, it unloads the module from -/// the context. -pi_result cuda_piProgramRelease(pi_program program) { - assert(program != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - assert(program->get_reference_count() != 0 && - "Reference count overflow detected in cuda_piProgramRelease."); - - // decrement ref count. If it is 0, delete the program. - if (program->decrement_reference_count() == 0) { - - std::unique_ptr<_pi_program> program_ptr{program}; - - pi_result result = PI_ERROR_INVALID_PROGRAM; - - try { - ScopedContext active(program->get_context()); - auto cuModule = program->get(); - result = PI_CHECK_ERROR(cuModuleUnload(cuModule)); - } catch (...) { - result = PI_ERROR_OUT_OF_RESOURCES; - } - - return result; - } - - return PI_SUCCESS; -} - -/// Gets the native CUDA handle of a PI program object -/// -/// \param[in] program The PI program to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI program object. 
-/// -/// \return TBD -pi_result cuda_piextProgramGetNativeHandle(pi_program program, - pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(program->get()); - return PI_SUCCESS; -} - -/// Created a PI program object from a CUDA program handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI program object from. -/// \param[in] context The PI context of the program. -/// \param[out] program Set to the PI program object created from native handle. -/// -/// \return TBD -pi_result cuda_piextProgramCreateWithNativeHandle(pi_native_handle, pi_context, - bool, pi_program *) { - sycl::detail::pi::die( - "Creation of PI program from native handle not implemented"); - return {}; -} - -pi_result cuda_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - if (kernel != nullptr) { - - switch (param_name) { - case PI_KERNEL_INFO_FUNCTION_NAME: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_name()); - case PI_KERNEL_INFO_NUM_ARGS: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_num_args()); - case PI_KERNEL_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_reference_count()); - case PI_KERNEL_INFO_CONTEXT: { - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_context()); - } - case PI_KERNEL_INFO_PROGRAM: { - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_program()); - } - case PI_KERNEL_INFO_ATTRIBUTES: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - default: { - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - } - - return PI_ERROR_INVALID_KERNEL; -} - -pi_result cuda_piKernelGetSubGroupInfo( - pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name, - size_t input_value_size, const void *input_value, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) { - // Ignore unused parameters - (void)input_value_size; - (void)input_value; - - if (kernel != nullptr) { - switch (param_name) { - case PI_KERNEL_MAX_SUB_GROUP_SIZE: { - // Sub-group size is equivalent to warp size - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); - int warpSize = 0; - cuda_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE, - 0, nullptr, sizeof(uint32_t), &warpSize, - nullptr); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(maxWarps)); - } - case PI_KERNEL_COMPILE_NUM_SUB_GROUPS: { - // Return value of 0 => not specified - // TODO: Revisit if PTX is generated for compile-time work-group sizes - return getInfo(param_value_size, param_value, param_value_size_ret, 0); - } - case PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: { - // Return value of 0 => unspecified 
or "auto" sub-group size - // Correct for now, since warp size may be read from special register - // TODO: Return warp size once default is primary sub-group size - // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX - return getInfo(param_value_size, param_value, param_value_size_ret, 0); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - return PI_ERROR_INVALID_KERNEL; -} - -pi_result cuda_piKernelRetain(pi_kernel kernel) { - assert(kernel != nullptr); - assert(kernel->get_reference_count() > 0u); - - kernel->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result cuda_piKernelRelease(pi_kernel kernel) { - assert(kernel != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - assert(kernel->get_reference_count() != 0 && - "Reference count overflow detected in cuda_piKernelRelease."); - - // decrement ref count. If it is 0, delete the program. - if (kernel->decrement_reference_count() == 0) { - // no internal cuda resources to clean up. Just delete it. - delete kernel; - return PI_SUCCESS; - } - - return PI_SUCCESS; -} - -// A NOP for the CUDA backend -pi_result cuda_piKernelSetExecInfo(pi_kernel, pi_kernel_exec_info, size_t, - const void *) { - return PI_SUCCESS; -} - -pi_result cuda_piextProgramSetSpecializationConstant(pi_program, pi_uint32, - size_t, const void *) { - // This entry point is only used for native specialization constants (SPIR-V), - // and the CUDA plugin is AOT only so this entry point is not supported. - sycl::detail::pi::die("Native specialization constants are not supported"); - return {}; -} - -pi_result cuda_piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, - size_t arg_size, - const void *arg_value) { - kernel->set_kernel_arg(arg_index, arg_size, arg_value); - return PI_SUCCESS; -} - // // Events // @@ -4538,33 +3746,35 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextMemGetNativeHandle, cuda_piextMemGetNativeHandle) _PI_CL(piextMemCreateWithNativeHandle, cuda_piextMemCreateWithNativeHandle) // Program - _PI_CL(piProgramCreate, cuda_piProgramCreate) - _PI_CL(piclProgramCreateWithSource, cuda_piclProgramCreateWithSource) - _PI_CL(piProgramCreateWithBinary, cuda_piProgramCreateWithBinary) - _PI_CL(piProgramGetInfo, cuda_piProgramGetInfo) - _PI_CL(piProgramCompile, cuda_piProgramCompile) - _PI_CL(piProgramBuild, cuda_piProgramBuild) - _PI_CL(piProgramLink, cuda_piProgramLink) - _PI_CL(piProgramGetBuildInfo, cuda_piProgramGetBuildInfo) - _PI_CL(piProgramRetain, cuda_piProgramRetain) - _PI_CL(piProgramRelease, cuda_piProgramRelease) - _PI_CL(piextProgramGetNativeHandle, cuda_piextProgramGetNativeHandle) + _PI_CL(piProgramCreate, pi2ur::piProgramCreate) + _PI_CL(piclProgramCreateWithSource, pi2ur::piclProgramCreateWithSource) + _PI_CL(piProgramCreateWithBinary, pi2ur::piProgramCreateWithBinary) + _PI_CL(piProgramGetInfo, pi2ur::piProgramGetInfo) + _PI_CL(piProgramCompile, pi2ur::piProgramCompile) + _PI_CL(piProgramBuild, pi2ur::piProgramBuild) + _PI_CL(piProgramLink, pi2ur::piProgramLink) + _PI_CL(piProgramGetBuildInfo, pi2ur::piProgramGetBuildInfo) + _PI_CL(piProgramRetain, pi2ur::piProgramRetain) + _PI_CL(piProgramRelease, pi2ur::piProgramRelease) + _PI_CL(piextProgramGetNativeHandle, pi2ur::piextProgramGetNativeHandle) _PI_CL(piextProgramCreateWithNativeHandle, - cuda_piextProgramCreateWithNativeHandle) + pi2ur::piextProgramCreateWithNativeHandle) // Kernel - _PI_CL(piKernelCreate, cuda_piKernelCreate) - 
_PI_CL(piKernelSetArg, cuda_piKernelSetArg) - _PI_CL(piKernelGetInfo, cuda_piKernelGetInfo) - _PI_CL(piKernelGetGroupInfo, cuda_piKernelGetGroupInfo) - _PI_CL(piKernelGetSubGroupInfo, cuda_piKernelGetSubGroupInfo) - _PI_CL(piKernelRetain, cuda_piKernelRetain) - _PI_CL(piKernelRelease, cuda_piKernelRelease) - _PI_CL(piKernelSetExecInfo, cuda_piKernelSetExecInfo) + _PI_CL(piKernelCreate, pi2ur::piKernelCreate) + _PI_CL(piKernelSetArg, pi2ur::piKernelSetArg) + _PI_CL(piKernelGetInfo, pi2ur::piKernelGetInfo) + _PI_CL(piKernelGetGroupInfo, pi2ur::piKernelGetGroupInfo) + _PI_CL(piKernelGetSubGroupInfo, pi2ur::piKernelGetSubGroupInfo) + _PI_CL(piKernelRetain, pi2ur::piKernelRetain) + _PI_CL(piKernelRelease, pi2ur::piKernelRelease) + _PI_CL(piextKernelGetNativeHandle, pi2ur::piextKernelGetNativeHandle) + _PI_CL(piKernelSetExecInfo, pi2ur::piKernelSetExecInfo) _PI_CL(piextProgramSetSpecializationConstant, - cuda_piextProgramSetSpecializationConstant) - _PI_CL(piextKernelSetArgPointer, cuda_piextKernelSetArgPointer) + pi2ur::piextProgramSetSpecializationConstant) + _PI_CL(piextKernelSetArgPointer, pi2ur::piKernelSetArgPointer) _PI_CL(piextKernelCreateWithNativeHandle, - cuda_piextKernelCreateWithNativeHandle) + pi2ur::piextKernelCreateWithNativeHandle) + // Event _PI_CL(piEventCreate, cuda_piEventCreate) _PI_CL(piEventGetInfo, cuda_piEventGetInfo) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index f6a95ff8d0ab5..51f6b7f2a34b4 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -44,7 +44,9 @@ #include #include +#include #include +#include // Share code between the PI Plugin and UR Adapter #include @@ -52,18 +54,10 @@ extern "C" { /// \cond IGNORE_BLOCK_IN_DOXYGEN -pi_result cuda_piProgramRetain(pi_program); -pi_result cuda_piProgramRelease(pi_program); pi_result cuda_piQueueRelease(pi_queue); pi_result cuda_piQueueRetain(pi_queue); pi_result cuda_piMemRetain(pi_mem); pi_result cuda_piMemRelease(pi_mem); -pi_result cuda_piKernelRetain(pi_kernel); -pi_result cuda_piKernelRelease(pi_kernel); -pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, - pi_kernel_group_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret); /// \endcond } @@ -670,44 +664,8 @@ struct _pi_event { /// Implementation of PI Program on CUDA Module object /// -struct _pi_program { - using native_type = CUmodule; - native_type module_; - const char *binary_; - size_t binarySizeInBytes_; - std::atomic_uint32_t refCount_; - _pi_context *context_; - - // Metadata - std::unordered_map> - kernelReqdWorkGroupSizeMD_; - std::unordered_map globalIDMD_; - - constexpr static size_t MAX_LOG_SIZE = 8192u; - - char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; - std::string buildOptions_; - pi_program_build_status buildStatus_ = PI_PROGRAM_BUILD_STATUS_NONE; - - _pi_program(pi_context ctxt); - ~_pi_program(); - - pi_result set_metadata(const pi_device_binary_property *metadata, - size_t length); - - pi_result set_binary(const char *binary, size_t binarySizeInBytes); - - pi_result build_program(const char *build_options); - - pi_context get_context() const { return context_; }; - - native_type get() const noexcept { return module_; }; - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_program : ur_program_handle_t_ { + using 
ur_program_handle_t_::ur_program_handle_t_; }; /// Implementation of a PI Kernel for CUDA @@ -726,171 +684,8 @@ struct _pi_program { /// CUDA shared model. This object simply calculates the total of /// shared memory, and the initial offsets of each parameter. /// -struct _pi_kernel { - using native_type = CUfunction; - - native_type function_; - native_type functionWithOffsetParam_; - std::string name_; - pi_context context_; - pi_program program_; - std::atomic_uint32_t refCount_; - - static constexpr pi_uint32 REQD_THREADS_PER_BLOCK_DIMENSIONS = 3u; - size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]; - - /// Structure that holds the arguments to the kernel. - /// Note earch argument size is known, since it comes - /// from the kernel signature. - /// This is not something can be queried from the CUDA API - /// so there is a hard-coded size (\ref MAX_PARAM_BYTES) - /// and a storage. - /// - struct arguments { - static constexpr size_t MAX_PARAM_BYTES = 4000u; - using args_t = std::array; - using args_size_t = std::vector; - using args_index_t = std::vector; - args_t storage_; - args_size_t paramSizes_; - args_index_t indices_; - args_size_t offsetPerIndex_; - - std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; - - arguments() { - // Place the implicit offset index at the end of the indicies collection - indices_.emplace_back(&implicitOffsetArgs_); - } - - /// Adds an argument to the kernel. - /// If the argument existed before, it is replaced. - /// Otherwise, it is added. - /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. - void add_arg(size_t index, size_t size, const void *arg, - size_t localSize = 0) { - if (index + 2 > indices_.size()) { - // Move implicit offset argument index with the end - indices_.resize(index + 2, indices_.back()); - // Ensure enough space for the new argument - paramSizes_.resize(index + 1); - offsetPerIndex_.resize(index + 1); - } - paramSizes_[index] = size; - // calculate the insertion point on the array - size_t insertPos = std::accumulate(std::begin(paramSizes_), - std::begin(paramSizes_) + index, 0); - // Update the stored value for the argument - std::memcpy(&storage_[insertPos], arg, size); - indices_[index] = &storage_[insertPos]; - offsetPerIndex_[index] = localSize; - } - - void add_local_arg(size_t index, size_t size) { - size_t localOffset = this->get_local_size(); - - // maximum required alignment is the size of the largest vector type - const size_t max_alignment = sizeof(double) * 16; - - // for arguments smaller than the maximum alignment simply align to the - // size of the argument - const size_t alignment = std::min(max_alignment, size); - - // align the argument - size_t alignedLocalOffset = localOffset; - if (localOffset % alignment != 0) { - alignedLocalOffset += alignment - (localOffset % alignment); - } - - add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset), - size + (alignedLocalOffset - localOffset)); - } - - void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { - assert(size == sizeof(std::uint32_t) * 3); - std::memcpy(implicitOffsetArgs_, implicitOffset, size); - } - - void clear_local_size() { - std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); - } - - const args_index_t &get_indices() const noexcept { return indices_; } - - pi_uint32 get_local_size() const { - return std::accumulate(std::begin(offsetPerIndex_), - std::end(offsetPerIndex_), 0); - } - } args_; - - _pi_kernel(CUfunction func, 
CUfunction funcWithOffsetParam, const char *name, - pi_program program, pi_context ctxt) - : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, - name_{name}, context_{ctxt}, program_{program}, refCount_{1} { - cuda_piProgramRetain(program_); - pi2ur::piContextRetain(context_); - /// Note: this code assumes that there is only one device per context - pi_result retError = cuda_piKernelGetGroupInfo( - this, reinterpret_cast(ctxt->get_device()), - PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, - sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr); - (void)retError; - assert(retError == PI_SUCCESS); - } - - ~_pi_kernel() { - cuda_piProgramRelease(program_); - pi2ur::piContextRelease(context_); - } - - pi_program get_program() const noexcept { return program_; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - native_type get() const noexcept { return function_; }; - - native_type get_with_offset_parameter() const noexcept { - return functionWithOffsetParam_; - }; - - bool has_with_offset_parameter() const noexcept { - return functionWithOffsetParam_ != nullptr; - } - - pi_context get_context() const noexcept { return context_; }; - - const char *get_name() const noexcept { return name_.c_str(); } - - /// Returns the number of arguments, excluding the implicit global offset. - /// Note this only returns the current known number of arguments, not the - /// real one required by the kernel, since this cannot be queried from - /// the CUDA Driver API - pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; } - - void set_kernel_arg(int index, size_t size, const void *arg) { - args_.add_arg(index, size, arg); - } - - void set_kernel_local_arg(int index, size_t size) { - args_.add_local_arg(index, size); - } - - void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { - args_.set_implicit_offset(size, implicitOffset); - } - - const arguments::args_index_t &get_arg_indices() const { - return args_.get_indices(); - } - - pi_uint32 get_local_size() const noexcept { return args_.get_local_size(); } - - void clear_local_size() { args_.clear_local_size(); } +struct _pi_kernel : ur_kernel_handle_t_ { + using ur_kernel_handle_t_::ur_kernel_handle_t_; }; /// Implementation of samplers for CUDA diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index bec6aed6131c8..c5ac46747fd73 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -139,6 +139,10 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/device.hpp" "ur/adapters/cuda/platform.cpp" "ur/adapters/cuda/platform.hpp" + "ur/adapters/cuda/program.cpp" + "ur/adapters/cuda/program.hpp" + "ur/adapters/cuda/kernel.cpp" + "ur/adapters/cuda/kernel.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" INCLUDE_DIRS ${sycl_inc_dir} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp new file mode 100644 index 0000000000000..ea341f47ee167 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -0,0 +1,281 @@ +//===--------- kernel.cpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "kernel.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, + ur_kernel_handle_t *phKernel) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t retErr = UR_RESULT_SUCCESS; + std::unique_ptr retKernel{nullptr}; + + try { + ScopedContext active(hProgram->get_context()); + + CUfunction cuFunc; + retErr = UR_CHECK_ERROR( + cuModuleGetFunction(&cuFunc, hProgram->get(), pKernelName)); + + std::string kernel_name_woffset = std::string(pKernelName) + "_with_offset"; + CUfunction cuFuncWithOffsetParam; + CUresult offsetRes = cuModuleGetFunction( + &cuFuncWithOffsetParam, hProgram->get(), kernel_name_woffset.c_str()); + + // If there is no kernel with global offset parameter we mark it as missing + if (offsetRes == CUDA_ERROR_NOT_FOUND) { + cuFuncWithOffsetParam = nullptr; + } else { + retErr = UR_CHECK_ERROR(offsetRes); + } + retKernel = std::unique_ptr( + new ur_kernel_handle_t_{cuFunc, cuFuncWithOffsetParam, pKernelName, + hProgram, hProgram->get_context()}); + } catch (ur_result_t err) { + retErr = err; + } catch (...) { + retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phKernel = retKernel.release(); + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + ur_kernel_group_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + // Here we want to query about a kernel's cuda blocks! 
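+  // UR work-groups map onto CUDA thread blocks, so the queries below are
+  // answered from CUfunction/CUdevice attributes such as
+  // CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK and CU_DEVICE_ATTRIBUTE_WARP_SIZE.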
+ UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int max_threads = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&max_threads, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(size_t(max_threads)); + } + case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + size_t group_size[3] = {0, 0, 0}; + const auto &reqd_wg_size_md_map = + hKernel->program_->kernelReqdWorkGroupSizeMD_; + const auto reqd_wg_size_md = reqd_wg_size_md_map.find(hKernel->name_); + if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { + const auto reqd_wg_size = reqd_wg_size_md->second; + group_size[0] = std::get<0>(reqd_wg_size); + group_size[1] = std::get<1>(reqd_wg_size); + group_size[2] = std::get<2>(reqd_wg_size); + } + return ReturnValue(group_size, 3); + } + case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == CUDA SHARED + int bytes = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint64_t(bytes)); + } + case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int warpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(warpSize)); + } + case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == CUDA LOCAL + int bytes = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint64_t(bytes)); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->get_reference_count() > 0u, + UR_RESULT_ERROR_INVALID_KERNEL); + + hKernel->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelRelease(ur_kernel_handle_t hKernel) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + UR_ASSERT(hKernel->get_reference_count() != 0, + UR_RESULT_ERROR_INVALID_KERNEL); + + // decrement ref count. If it is 0, delete the program. + if (hKernel->decrement_reference_count() == 0) { + // no internal cuda resources to clean up. Just delete it. + delete hKernel; + return UR_RESULT_SUCCESS; + } + + return UR_RESULT_SUCCESS; +} + +// TODO(ur): Not implemented on cuda atm. Also, need to add tests for this +// feature. 
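+// A possible direction (a sketch only, not implemented in this patch): since a
+// kernel wraps a CUfunction, the handle could be exposed the same way
+// urProgramGetNativeHandle exposes its CUmodule, e.g.
+//   *phNativeKernel = reinterpret_cast<ur_native_handle_t>(hKernel->get());
+// Ownership and lifetime semantics would need to be settled first.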
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( + ur_kernel_handle_t hKernel, ur_native_handle_t *phNativeKernel) { + (void)hKernel; + (void)phNativeKernel; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgValue(ur_kernel_handle_t hKernel, uint32_t argIndex, + size_t argSize, const void *pArgValue) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + try { + if (pArgValue) { + hKernel->set_kernel_arg(argIndex, argSize, pArgValue); + } else { + hKernel->set_kernel_local_arg(argIndex, argSize); + } + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, + ur_kernel_info_t propName, + size_t propSize, + void *pKernelInfo, + size_t *pPropSizeRet) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pKernelInfo, pPropSizeRet); + + switch (propName) { + case UR_KERNEL_INFO_FUNCTION_NAME: + return ReturnValue(hKernel->get_name()); + case UR_KERNEL_INFO_NUM_ARGS: + return ReturnValue(hKernel->get_num_args()); + case UR_KERNEL_INFO_REFERENCE_COUNT: + return ReturnValue(hKernel->get_reference_count()); + case UR_KERNEL_INFO_CONTEXT: + return ReturnValue(hKernel->get_context()); + case UR_KERNEL_INFO_PROGRAM: + return ReturnValue(hKernel->get_program()); + case UR_KERNEL_INFO_ATTRIBUTES: + return ReturnValue(""); + case UR_KERNEL_INFO_NUM_REGS: { + int numRegs = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint32_t{numRegs}); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + ur_kernel_sub_group_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + switch (propName) { + case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { + // Sub-group size is equivalent to warp size + int warpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(warpSize)); + } + case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int max_threads = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&max_threads, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == CUDA_SUCCESS); + int warpSize = 0; + urKernelGetSubGroupInfo(hKernel, hDevice, + UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE, + sizeof(uint32_t), &warpSize, nullptr); + int maxWarps = (max_threads + warpSize - 1) / warpSize; + return ReturnValue(static_cast(maxWarps)); + } + case UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS: { + // Return value of 0 => not specified + // TODO: Revisit if PTX is generated for compile-time work-group sizes + return ReturnValue(0); + } + case UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL: { + // Return value of 0 => unspecified or "auto" sub-group size + // Correct for now, since warp size may be read from special register + // TODO: Return warp size once default is primary sub-group size + // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX + 
return ReturnValue(0); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t hKernel, uint32_t argIndex, const void *pArgValue) { + hKernel->set_kernel_arg(argIndex, sizeof(pArgValue), pArgValue); + return UR_RESULT_SUCCESS; +} + +// A NOP for the CUDA backend +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, + size_t propSize, const void *pPropValue) { + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( + ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { + // This entry point is only used for native specialization constants (SPIR-V), + // and the CUDA plugin is AOT only so this entry point is not supported. + sycl::detail::ur::die("Native specialization constants are not supported"); + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( + ur_native_handle_t hNativeKernel, ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const ur_kernel_native_properties_t *pProperties, + ur_kernel_handle_t *phKernel) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp new file mode 100644 index 0000000000000..42e624cefba48 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp @@ -0,0 +1,183 @@ +//===--------- kernel.hpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +#include +#include +#include + +#include "program.hpp" + +struct ur_kernel_handle_t_ { + using native_type = CUfunction; + + native_type function_; + native_type functionWithOffsetParam_; + std::string name_; + ur_context_handle_t context_; + ur_program_handle_t program_; + std::atomic_uint32_t refCount_; + + static constexpr uint32_t REQD_THREADS_PER_BLOCK_DIMENSIONS = 3u; + size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]; + + /// Structure that holds the arguments to the kernel. + /// Note earch argument size is known, since it comes + /// from the kernel signature. + /// This is not something can be queried from the CUDA API + /// so there is a hard-coded size (\ref MAX_PARAM_BYTES) + /// and a storage. + /// + struct arguments { + static constexpr size_t MAX_PARAM_BYTES = 4000u; + using args_t = std::array; + using args_size_t = std::vector; + using args_index_t = std::vector; + args_t storage_; + args_size_t paramSizes_; + args_index_t indices_; + args_size_t offsetPerIndex_; + + std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; + + arguments() { + // Place the implicit offset index at the end of the indicies collection + indices_.emplace_back(&implicitOffsetArgs_); + } + + /// Adds an argument to the kernel. + /// If the argument existed before, it is replaced. + /// Otherwise, it is added. + /// Gaps are filled with empty arguments. + /// Implicit offset argument is kept at the back of the indices collection. 
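+    /// For illustration: after add_arg(0, 4, a) and add_arg(1, 8, b), the two
+    /// values sit back to back in storage_ at offsets 0 and 4, indices_[0] and
+    /// indices_[1] point at those offsets, and the implicit-offset pointer
+    /// remains the final entry of indices_.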
+ void add_arg(size_t index, size_t size, const void *arg, + size_t localSize = 0) { + if (index + 2 > indices_.size()) { + // Move implicit offset argument index with the end + indices_.resize(index + 2, indices_.back()); + // Ensure enough space for the new argument + paramSizes_.resize(index + 1); + offsetPerIndex_.resize(index + 1); + } + paramSizes_[index] = size; + // calculate the insertion point on the array + size_t insertPos = std::accumulate(std::begin(paramSizes_), + std::begin(paramSizes_) + index, 0); + // Update the stored value for the argument + std::memcpy(&storage_[insertPos], arg, size); + indices_[index] = &storage_[insertPos]; + offsetPerIndex_[index] = localSize; + } + + void add_local_arg(size_t index, size_t size) { + size_t localOffset = this->get_local_size(); + + // maximum required alignment is the size of the largest vector type + const size_t max_alignment = sizeof(double) * 16; + + // for arguments smaller than the maximum alignment simply align to the + // size of the argument + const size_t alignment = std::min(max_alignment, size); + + // align the argument + size_t alignedLocalOffset = localOffset; + if (localOffset % alignment != 0) { + alignedLocalOffset += alignment - (localOffset % alignment); + } + + add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset), + size + (alignedLocalOffset - localOffset)); + } + + void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { + assert(size == sizeof(std::uint32_t) * 3); + std::memcpy(implicitOffsetArgs_, implicitOffset, size); + } + + void clear_local_size() { + std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); + } + + const args_index_t &get_indices() const noexcept { return indices_; } + + uint32_t get_local_size() const { + return std::accumulate(std::begin(offsetPerIndex_), + std::end(offsetPerIndex_), 0); + } + } args_; + + ur_kernel_handle_t_(CUfunction func, CUfunction funcWithOffsetParam, + const char *name, ur_program_handle_t program, + ur_context_handle_t ctxt) + : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, + name_{name}, context_{ctxt}, program_{program}, refCount_{1} { + urProgramRetain(program_); + urContextRetain(context_); + /// Note: this code assumes that there is only one device per context + ur_result_t retError = urKernelGetGroupInfo( + this, ctxt->get_device(), UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr); + assert(retError == UR_RESULT_SUCCESS); + } + + ~ur_kernel_handle_t_() { + urProgramRelease(program_); + urContextRelease(context_); + } + + ur_program_handle_t get_program() const noexcept { return program_; } + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } + + native_type get() const noexcept { return function_; }; + + native_type get_with_offset_parameter() const noexcept { + return functionWithOffsetParam_; + }; + + bool has_with_offset_parameter() const noexcept { + return functionWithOffsetParam_ != nullptr; + } + + ur_context_handle_t get_context() const noexcept { return context_; }; + + const char *get_name() const noexcept { return name_.c_str(); } + + /// Returns the number of arguments, excluding the implicit global offset. 
+ /// Note this only returns the current known number of arguments, not the + /// real one required by the kernel, since this cannot be queried from + /// the CUDA Driver API + uint32_t get_num_args() const noexcept { return args_.indices_.size() - 1; } + + void set_kernel_arg(int index, size_t size, const void *arg) { + args_.add_arg(index, size, arg); + } + + void set_kernel_local_arg(int index, size_t size) { + args_.add_local_arg(index, size); + } + + void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { + return args_.set_implicit_offset(size, implicitOffset); + } + + const arguments::args_index_t &get_arg_indices() const { + return args_.get_indices(); + } + + uint32_t get_local_size() const noexcept { return args_.get_local_size(); } + + void clear_local_size() { args_.clear_local_size(); } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp new file mode 100644 index 0000000000000..7a56620180fef --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -0,0 +1,439 @@ +//===--------- program.cpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "program.hpp" + +bool getMaxRegistersJitOptionValue(const std::string &build_options, + unsigned int &value) { + using namespace std::string_view_literals; + const std::size_t optionPos = build_options.find_first_of("maxrregcount"sv); + if (optionPos == std::string::npos) { + return false; + } + + const std::size_t delimPos = build_options.find('=', optionPos + 1u); + if (delimPos == std::string::npos) { + return false; + } + + const std::size_t length = build_options.length(); + const std::size_t startPos = delimPos + 1u; + if (delimPos == std::string::npos || startPos >= length) { + return false; + } + + std::size_t pos = startPos; + while (pos < length && + std::isdigit(static_cast(build_options[pos]))) { + pos++; + } + + const std::string valueString = + build_options.substr(startPos, pos - startPos); + if (valueString.empty()) { + return false; + } + + value = static_cast(std::stoi(valueString)); + return true; +} + +ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t ctxt) + : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, + context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { + urContextRetain(context_); +} + +ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(context_); } + +std::pair +splitMetadataName(const std::string &metadataName) { + size_t splitPos = metadataName.rfind('@'); + if (splitPos == std::string::npos) + return std::make_pair(metadataName, std::string{}); + return std::make_pair(metadataName.substr(0, splitPos), + metadataName.substr(splitPos, metadataName.length())); +} + +ur_result_t +ur_program_handle_t_::set_metadata(const ur_program_metadata_t *metadata, + size_t length) { + for (size_t i = 0; i < length; ++i) { + const ur_program_metadata_t metadataElement = metadata[i]; + std::string metadataElementName{metadataElement.pName}; + + auto [prefix, tag] = splitMetadataName(metadataElementName); + + if (tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { + // If metadata is reqd_work_group_size, record it for the corresponding + // kernel name. 
+ size_t MDElemsSize = metadataElement.size - sizeof(std::uint64_t); + + // Expect between 1 and 3 32-bit integer values. + UR_ASSERT(MDElemsSize >= sizeof(std::uint32_t) && + MDElemsSize <= sizeof(std::uint32_t) * 3, + UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); + + // Get pointer to data, skipping 64-bit size at the start of the data. + const char *ValuePtr = + reinterpret_cast(metadataElement.value.pData) + + sizeof(std::uint64_t); + // Read values and pad with 1's for values not present. + std::uint32_t reqdWorkGroupElements[] = {1, 1, 1}; + std::memcpy(reqdWorkGroupElements, ValuePtr, MDElemsSize); + kernelReqdWorkGroupSizeMD_[prefix] = + std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1], + reqdWorkGroupElements[2]); + } else if (tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { + const char *metadataValPtr = + reinterpret_cast(metadataElement.value.pData) + + sizeof(std::uint64_t); + const char *metadataValPtrEnd = + metadataValPtr + metadataElement.size - sizeof(std::uint64_t); + globalIDMD_[prefix] = std::string{metadataValPtr, metadataValPtrEnd}; + } + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_program_handle_t_::set_binary(const char *source, + size_t length) { + // Do not re-set program binary data which has already been set as that will + // delete the old binary data. + UR_ASSERT(binary_ == nullptr && binarySizeInBytes_ == 0, + UR_RESULT_ERROR_INVALID_OPERATION); + binary_ = source; + binarySizeInBytes_ = length; + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_program_handle_t_::build_program(const char *build_options) { + + this->buildOptions_ = build_options; + + constexpr const unsigned int numberOfOptions = 4u; + + std::vector options(numberOfOptions); + std::vector optionVals(numberOfOptions); + + // Pass a buffer for info messages + options[0] = CU_JIT_INFO_LOG_BUFFER; + optionVals[0] = (void *)infoLog_; + // Pass the size of the info buffer + options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + optionVals[1] = (void *)(long)MAX_LOG_SIZE; + // Pass a buffer for error message + options[2] = CU_JIT_ERROR_LOG_BUFFER; + optionVals[2] = (void *)errorLog_; + // Pass the size of the error buffer + options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + optionVals[3] = (void *)(long)MAX_LOG_SIZE; + + if (!buildOptions_.empty()) { + unsigned int maxRegs; + bool valid = getMaxRegistersJitOptionValue(buildOptions_, maxRegs); + if (valid) { + options.push_back(CU_JIT_MAX_REGISTERS); + optionVals.push_back(reinterpret_cast(maxRegs)); + } + } + + auto result = UR_CHECK_ERROR( + cuModuleLoadDataEx(&module_, static_cast(binary_), + options.size(), options.data(), optionVals.data())); + + const auto success = (result == UR_RESULT_SUCCESS); + + buildStatus_ = + success ? UR_PROGRAM_BUILD_STATUS_SUCCESS : UR_PROGRAM_BUILD_STATUS_ERROR; + + // If no exception, result is correct + return success ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; +} + +/// Finds kernel names by searching for entry points in the PTX source, as the +/// CUDA driver API doesn't expose an operation for this. +/// Note: This is currently only being used by the SYCL program class for the +/// has_kernel method, so an alternative would be to move the has_kernel +/// query to PI and use cuModuleGetFunction to check for a kernel. +/// Note: Another alternative is to add kernel names as metadata, like with +/// reqd_work_group_size. 
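+/// Note: A sketch of the PTX search (assuming binary_ holds PTX text rather
+/// than CUBIN): each kernel in PTX is introduced by an ".entry" directive, so
+/// collecting the identifier that follows every ".entry" token in the module
+/// source would recover the kernel names.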
+std::string getKernelNames(ur_program_handle_t) { + sycl::detail::ur::die("getKernelNames not implemented"); + return {}; +} + +/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object. +/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in +/// terms of CUDA adapter. See \ref urProgramCreateWithBinary. +/// +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, + size_t length, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_device_handle_t hDevice = hContext->get_device(); + auto pBinary = reinterpret_cast(pIL); + + return urProgramCreateWithBinary(hContext, hDevice, length, pBinary, + pProperties, phProgram); +} + +/// CUDA will handle the PTX/CUBIN binaries internally through a call to +/// cuModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent +/// in terms of CUDA adapter. \TODO Implement asynchronous compilation +/// +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, + const char *pOptions) { + return urProgramBuild(hContext, hProgram, pOptions); +} + +/// Loads the images from a UR program into a CUmodule that can be +/// used later on to extract functions (kernels). +/// See \ref ur_program_handle_t for implementation details. +/// +UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const char *pOptions) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retError = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hProgram->get_context()); + + hProgram->build_program(pOptions); + + } catch (ur_result_t err) { + retError = err; + } + return retError; +} + +/// Creates a new UR program object that is the outcome of linking all input +/// programs. +/// \TODO Implement linker options, requires mapping of OpenCL to CUDA +/// +UR_APIEXPORT ur_result_t UR_APICALL +urProgramLink(ur_context_handle_t hContext, uint32_t count, + const ur_program_handle_t *phPrograms, const char *pOptions, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(count, UR_RESULT_ERROR_PROGRAM_LINK_FAILURE); + UR_ASSERT(phPrograms, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t retError = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hContext); + + CUlinkState state; + std::unique_ptr retProgram{ + new ur_program_handle_t_{hContext}}; + + retError = UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &state)); + try { + for (size_t i = 0; i < count; ++i) { + ur_program_handle_t program = phPrograms[i]; + retError = UR_CHECK_ERROR(cuLinkAddData( + state, CU_JIT_INPUT_PTX, const_cast(program->binary_), + program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr)); + } + void *cubin = nullptr; + size_t cubinSize = 0; + retError = UR_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize)); + + retError = + retProgram->set_binary(static_cast(cubin), cubinSize); + + retError = retProgram->build_program(pOptions); + } catch (...) 
{ + // Upon error attempt cleanup + UR_CHECK_ERROR(cuLinkDestroy(state)); + throw; + } + + retError = UR_CHECK_ERROR(cuLinkDestroy(state)); + *phProgram = retProgram.release(); + + } catch (ur_result_t err) { + retError = err; + } + return retError; +} + +/// Created a UR program object from a CUDA program handle. +/// TODO: Implement this. +/// NOTE: The created UR object takes ownership of the native handle. +/// +/// \param[in] nativeHandle The native handle to create UR program object from. +/// \param[in] context The UR context of the program. +/// \param[out] program Set to the UR program object created from native handle. +/// +/// \return TBD +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( + ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, + ur_program_handle_t *phProgram) { + sycl::detail::ur::die( + "Creation of UR program from native handle not implemented"); + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, + ur_program_build_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + // Ignore unused parameter + (void)hDevice; + + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_PROGRAM_BUILD_INFO_STATUS: { + return ReturnValue(hProgram->buildStatus_); + } + case UR_PROGRAM_BUILD_INFO_OPTIONS: + return ReturnValue(hProgram->buildOptions_.c_str()); + case UR_PROGRAM_BUILD_INFO_LOG: + return ReturnValue(hProgram->infoLog_, hProgram->MAX_LOG_SIZE); + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, + size_t propSize, void *pProgramInfo, size_t *pPropSizeRet) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pProgramInfo, pPropSizeRet); + + switch (propName) { + case UR_PROGRAM_INFO_REFERENCE_COUNT: + return ReturnValue(hProgram->get_reference_count()); + case UR_PROGRAM_INFO_CONTEXT: + return ReturnValue(hProgram->context_); + case UR_PROGRAM_INFO_NUM_DEVICES: + return ReturnValue(1u); + case UR_PROGRAM_INFO_DEVICES: + return ReturnValue(&hProgram->context_->deviceId_, 1); + case UR_PROGRAM_INFO_SOURCE: + return ReturnValue(hProgram->binary_); + case UR_PROGRAM_INFO_BINARY_SIZES: + return ReturnValue(&hProgram->binarySizeInBytes_, 1); + case UR_PROGRAM_INFO_BINARIES: + return ReturnValue(&hProgram->binary_, 1); + case UR_PROGRAM_INFO_NUM_KERNELS: + return ReturnValue(getKernelNames(hProgram).c_str()); + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRetain(ur_program_handle_t program) { + UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(program->get_reference_count() > 0, + UR_RESULT_ERROR_INVALID_PROGRAM); + program->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +/// Decreases the reference count of a ur_program_handle_t object. +/// When the reference count reaches 0, it unloads the module from +/// the context. +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRelease(ur_program_handle_t program) { + UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. 
+ UR_ASSERT(program->get_reference_count() != 0, + UR_RESULT_ERROR_INVALID_PROGRAM); + + // decrement ref count. If it is 0, delete the program. + if (program->decrement_reference_count() == 0) { + + std::unique_ptr program_ptr{program}; + + ur_result_t result = UR_RESULT_ERROR_INVALID_PROGRAM; + + try { + ScopedContext active(program->get_context()); + auto cuModule = program->get(); + result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); + } catch (...) { + result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + return result; + } + + return UR_RESULT_SUCCESS; +} + +/// Gets the native CUDA handle of a UR program object +/// +/// \param[in] program The PI program to get the native CUDA object of. +/// \param[out] nativeHandle Set to the native handle of the PI program object. +/// +/// \return TBD +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( + ur_program_handle_t program, ur_native_handle_t *nativeHandle) { + UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + *nativeHandle = reinterpret_cast(program->get()); + return UR_RESULT_SUCCESS; +} + +/// Loads images from a list of PTX or CUBIN binaries. +/// Note: No calls to CUDA driver API in this function, only store binaries +/// for later. +/// +/// Note: Only supports one device +/// +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( + ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, + const uint8_t *pBinary, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY); + UR_ASSERT(hContext->get_device()->get() == hDevice->get(), + UR_RESULT_ERROR_INVALID_CONTEXT); + + ur_result_t retError = UR_RESULT_SUCCESS; + + std::unique_ptr retProgram{ + new ur_program_handle_t_{hContext}}; + + retError = + retProgram->set_metadata(pProperties->pMetadatas, pProperties->count); + UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + + auto pBinary_string = reinterpret_cast(pBinary); + if (size == 0) { + size = strlen(pBinary_string) + 1; + } + + UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); + + retError = retProgram->set_binary(pBinary_string, size); + UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + + *phProgram = retProgram.release(); + + return retError; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp new file mode 100644 index 0000000000000..35ac6fb215ea0 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp @@ -0,0 +1,55 @@ +//===--------- program.hpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +#include +#include + +#include "context.hpp" + +struct ur_program_handle_t_ { + using native_type = CUmodule; + native_type module_; + const char *binary_; + size_t binarySizeInBytes_; + std::atomic_uint32_t refCount_; + ur_context_handle_t context_; + + // Metadata + std::unordered_map> + kernelReqdWorkGroupSizeMD_; + std::unordered_map globalIDMD_; + + constexpr static size_t MAX_LOG_SIZE = 8192u; + + char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; + std::string buildOptions_; + ur_program_build_status_t buildStatus_ = UR_PROGRAM_BUILD_STATUS_NONE; + + ur_program_handle_t_(ur_context_handle_t ctxt); + ~ur_program_handle_t_(); + + ur_result_t set_metadata(const ur_program_metadata_t *metadata, + size_t length); + + ur_result_t set_binary(const char *binary, size_t binarySizeInBytes); + + ur_result_t build_program(const char *build_options); + ur_context_handle_t get_context() const { return context_; }; + + native_type get() const noexcept { return module_; }; + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } +}; From 506130ec0bc7f44c034c1d2fbd4e3bfc9617e733 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 13 Apr 2023 17:18:54 +0100 Subject: [PATCH 04/45] Add UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE to kernel group info --- .../ur/adapters/cuda/kernel.cpp | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index ea341f47ee167..e34976394c5ff 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -52,12 +52,45 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, ur_kernel_group_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // Here we want to query about a kernel's cuda blocks! 
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { + case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + size_t global_work_size[3] = {0, 0, 0}; + + int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_block_dimX, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_block_dimY, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_block_dimZ, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + + int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_grid_dimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_grid_dimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_grid_dimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + + global_work_size[0] = max_block_dimX * max_grid_dimX; + global_work_size[1] = max_block_dimY * max_grid_dimY; + global_work_size[2] = max_block_dimZ * max_grid_dimZ; + return ReturnValue(global_work_size, 3); + } case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { int max_threads = 0; sycl::detail::ur::assertion( From 625f1f8958496054588d7bc2b5303dc7ecfcb3f6 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 14 Apr 2023 10:53:05 +0100 Subject: [PATCH 05/45] [SYCL][PI][UR][CUDA] Port a few miscellaneous CUDA entry points to UR Namely: * piTearDown * piPluginGetLastError * piGetDeviceAndHostTimer --- sycl/plugins/cuda/CMakeLists.txt | 2 +- sycl/plugins/cuda/pi_cuda.cpp | 75 +++---------------- sycl/plugins/unified_runtime/CMakeLists.txt | 1 + .../ur/adapters/cuda/common.cpp | 25 +++++++ .../ur/adapters/cuda/common.hpp | 8 ++ .../ur/adapters/cuda/device.cpp | 28 +++++++ .../ur/adapters/cuda/platform.cpp | 14 +++- .../ur/adapters}/cuda/tracing.cpp | 0 .../ur/adapters/cuda/ur_interface_loader.cpp | 6 +- 9 files changed, 88 insertions(+), 71 deletions(-) rename sycl/plugins/{ => unified_runtime/ur/adapters}/cuda/tracing.cpp (100%) diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 76d730967a7c0..cbc5e8f9e9638 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -68,12 +68,12 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/kernel.cpp" "../unified_runtime/ur/adapters/cuda/kernel.hpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" + "../unified_runtime/ur/adapters/cuda/tracing.cpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" "pi_cuda.hpp" "pi_cuda.cpp" - "tracing.cpp" ${XPTI_PROXY_SRC} INCLUDE_DIRS ${sycl_inc_dir} diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index c09ccea8ef6a3..31c85b3877091 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -54,24 +54,6 @@ pi_result map_error(CUresult result) { } } -// Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR -constexpr size_t MaxMessageSize = 256; -thread_local pi_result ErrorMessageCode = PI_SUCCESS; -thread_local char ErrorMessage[MaxMessageSize]; - -// Utility function for setting a message and warning -static void setErrorMessage(const char *message, pi_result error_code) { - assert(strlen(message) <= 
MaxMessageSize); - strcpy(ErrorMessage, message); - ErrorMessageCode = error_code; -} - -// Returns plugin specific error and warning messages -pi_result cuda_piPluginGetLastError(char **message) { - *message = &ErrorMessage[0]; - return ErrorMessageCode; -} - // Returns plugin specific backend option. // Current support is only for optimization options. // Return empty string for cuda. @@ -713,7 +695,7 @@ pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, // These queries should be dealt with in context_impl.cpp by calling the // queries of each device separately and building the intersection set. setErrorMessage("These queries should have never come here.", - PI_ERROR_INVALID_ARG_VALUE); + UR_RESULT_ERROR_INVALID_ARGUMENT); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: @@ -1448,7 +1430,7 @@ pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, arrayDesc.Format != CU_AD_FORMAT_FLOAT) { setErrorMessage("PI CUDA kernels only support images with channel " "types int32, uint32, float, and half.", - PI_ERROR_PLUGIN_SPECIFIC_ERROR); + UR_RESULT_ERROR_ADAPTER_SPECIFIC); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } CUsurfObject cuSurf = arg_mem->mem_.surface_mem_.get_surface(); @@ -1618,7 +1600,7 @@ pi_result cuda_piEnqueueKernelLaunch( if (env_val <= 0 || env_val > device_max_local_mem) { setErrorMessage("Invalid value specified for " "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", - PI_ERROR_PLUGIN_SPECIFIC_ERROR); + UR_RESULT_ERROR_ADAPTER_SPECIFIC); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } PI_CHECK_ERROR(cuFuncSetAttribute( @@ -3182,7 +3164,7 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Prefetch hint ignored as device does not support " "concurrent managed access", - PI_SUCCESS); + UR_RESULT_SUCCESS); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } @@ -3191,7 +3173,7 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr)); if (!is_managed) { setErrorMessage("Prefetch hint ignored as prefetch only works with USM", - PI_SUCCESS); + UR_RESULT_SUCCESS); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } @@ -3248,7 +3230,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Mem advise ignored as device does not support " "concurrent managed access", - PI_SUCCESS); + UR_RESULT_SUCCESS); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } @@ -3263,7 +3245,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, if (!is_managed) { setErrorMessage( "Memory advice ignored as memory advices only works with USM", - PI_SUCCESS); + UR_RESULT_SUCCESS); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } @@ -3641,43 +3623,6 @@ pi_result cuda_piextEnqueueWriteHostPipe( return {}; } -// This API is called by Sycl RT to notify the end of the plugin lifetime. -// Windows: dynamically loaded plugins might have been unloaded already -// when this is called. Sycl RT holds onto the PI plugin so it can be -// called safely. But this is not transitive. If the PI plugin in turn -// dynamically loaded a different DLL, that may have been unloaded. -// TODO: add a global variable lifetime management code here (see -// pi_level_zero.cpp for reference) Currently this is just a NOOP. 
-pi_result cuda_piTearDown(void *) { - disableCUDATracing(); - return PI_SUCCESS; -} - -pi_result cuda_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, - uint64_t *HostTime) { - _pi_event::native_type event; - ScopedContext active(Device->get_context()); - - if (DeviceTime) { - PI_CHECK_ERROR(cuEventCreate(&event, CU_EVENT_DEFAULT)); - PI_CHECK_ERROR(cuEventRecord(event, 0)); - } - if (HostTime) { - - using namespace std::chrono; - *HostTime = - duration_cast(steady_clock::now().time_since_epoch()) - .count(); - } - - if (DeviceTime) { - PI_CHECK_ERROR(cuEventSynchronize(event)); - *DeviceTime = Device->get_elapsed_time(event); - } - - return PI_SUCCESS; -} - const char SupportedVersion[] = _PI_CUDA_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { @@ -3835,9 +3780,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj) _PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler) - _PI_CL(piPluginGetLastError, cuda_piPluginGetLastError) - _PI_CL(piTearDown, cuda_piTearDown) - _PI_CL(piGetDeviceAndHostTimer, cuda_piGetDeviceAndHostTimer) + _PI_CL(piPluginGetLastError, pi2ur::piPluginGetLastError) + _PI_CL(piTearDown, pi2ur::piTearDown) + _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) _PI_CL(piPluginGetBackendOption, cuda_piPluginGetBackendOption) #undef _PI_CL diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index c5ac46747fd73..014938c9ba542 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -144,6 +144,7 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/kernel.cpp" "ur/adapters/cuda/kernel.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" + "ur/adapters/cuda/tracing.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp index 264d7588f3229..f25aa88b3e292 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -85,3 +85,28 @@ void sycl::detail::ur::assertion(bool Condition, const char *Message) { void sycl::detail::ur::cuPrint(const char *Message) { std::cerr << "ur_print: " << Message << std::endl; } + + +// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR +thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; +thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *message, + ur_result_t error_code) { + assert(strlen(message) <= MaxMessageSize); + strcpy(ErrorMessage, message); + ErrorMessageCode = error_code; +} + +ur_result_t zerPluginGetLastError(char **message) { + *message = &ErrorMessage[0]; + return ErrorMessageCode; +} + +// Returns plugin specific error and warning messages; common implementation +// that can be shared between adapters +ur_result_t urGetLastResult(ur_platform_handle_t, const char **ppMessage) { + *ppMessage = &ErrorMessage[0]; + return ErrorMessageCode; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp index 16cabc37a2b16..3aa23c67bf492 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp @@ -28,6 +28,14 @@ ur_result_t check_error_ur(CUresult result, const 
char *function, int line, std::string getCudaVersionString(); +constexpr size_t MaxMessageSize = 256; +extern thread_local ur_result_t ErrorMessageCode; +extern thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *message, + ur_result_t error_code); + /// ------ Error handling, matching OpenCL plugin semantics. namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index d0b11b23cc74d..ae987ab4a7c6e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -1117,3 +1117,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( // existing device return error return UR_RESULT_ERROR_INVALID_OPERATION; } + +ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, + uint64_t *pDeviceTimestamp, + uint64_t *pHostTimestamp) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + CUevent event; + ScopedContext active(hDevice->get_context()); + + if (pDeviceTimestamp) { + UR_CHECK_ERROR(cuEventCreate(&event, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR(cuEventRecord(event, 0)); + } + if (pHostTimestamp) { + + using namespace std::chrono; + *pHostTimestamp = + duration_cast(steady_clock::now().time_since_epoch()) + .count(); + } + + if (pDeviceTimestamp) { + UR_CHECK_ERROR(cuEventSynchronize(event)); + *pDeviceTimestamp = hDevice->get_elapsed_time(event); + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index dd8503f1f8907..5a4e43c320af0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -15,6 +15,9 @@ #include #include +void enableCUDATracing(); +void disableCUDATracing(); + ur_result_t urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t PlatformInfoType, size_t Size, void *pPlatformInfo, size_t *pSizeRet) { @@ -169,6 +172,13 @@ ur_result_t urPlatformGetApiVersion(ur_platform_handle_t hDriver, return UR_RESULT_SUCCESS; } -ur_result_t urInit(ur_device_init_flags_t) { return UR_RESULT_SUCCESS; } +ur_result_t urInit(ur_device_init_flags_t) { + enableCUDATracing(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urTearDown(void *) { + disableCUDATracing(); + return UR_RESULT_SUCCESS; +} -ur_result_t urTearDown(void *) { return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/cuda/tracing.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp similarity index 100% rename from sycl/plugins/cuda/tracing.cpp rename to sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 015dadcbaa074..d7f9ad75d38cd 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -195,7 +195,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnGetLastResult = nullptr; + pDdiTable->pfnGetLastResult = urGetLastResult; pDdiTable->pfnInit = urInit; pDdiTable->pfnTearDown = urTearDown; return UR_RESULT_SUCCESS; @@ -243,10 
+243,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( } pDdiTable->pfnCreateWithNativeHandle = urDeviceCreateWithNativeHandle; pDdiTable->pfnGet = urDeviceGet; - pDdiTable->pfnGetGlobalTimestamps = nullptr; + pDdiTable->pfnGetGlobalTimestamps = urDeviceGetGlobalTimestamps; pDdiTable->pfnGetInfo = urDeviceGetInfo; pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle; - pDdiTable->pfnPartition = nullptr; + pDdiTable->pfnPartition = urDevicePartition; pDdiTable->pfnRelease = urDeviceRelease; pDdiTable->pfnRetain = urDeviceRetain; pDdiTable->pfnSelectBinary = nullptr; From 3ae23298f2869e319739518938c88a45cce96aec Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 7 Apr 2023 15:53:48 +0100 Subject: [PATCH 06/45] [SYCL][PI][UR][CUDA] Port CUDA queue and event to Unified Runtime --- sycl/plugins/cuda/CMakeLists.txt | 5 + sycl/plugins/cuda/pi_cuda.cpp | 794 ++---------------- sycl/plugins/cuda/pi_cuda.hpp | 419 ++------- sycl/plugins/unified_runtime/CMakeLists.txt | 5 + .../ur/adapters/cuda/enqueue.cpp | 110 +++ .../ur/adapters/cuda/event.cpp | 309 +++++++ .../ur/adapters/cuda/event.hpp | 191 +++++ .../ur/adapters/cuda/queue.cpp | 326 +++++++ .../ur/adapters/cuda/queue.hpp | 253 ++++++ 9 files changed, 1337 insertions(+), 1075 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index cbc5e8f9e9638..6339f1e3466ea 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -61,12 +61,17 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/context.hpp" "../unified_runtime/ur/adapters/cuda/device.cpp" "../unified_runtime/ur/adapters/cuda/device.hpp" + "../unified_runtime/ur/adapters/cuda/enqueue.cpp" + "../unified_runtime/ur/adapters/cuda/event.cpp" + "../unified_runtime/ur/adapters/cuda/event.hpp" "../unified_runtime/ur/adapters/cuda/platform.cpp" "../unified_runtime/ur/adapters/cuda/platform.hpp" "../unified_runtime/ur/adapters/cuda/program.cpp" "../unified_runtime/ur/adapters/cuda/program.hpp" "../unified_runtime/ur/adapters/cuda/kernel.cpp" "../unified_runtime/ur/adapters/cuda/kernel.hpp" + "../unified_runtime/ur/adapters/cuda/queue.hpp" + "../unified_runtime/ur/adapters/cuda/queue.cpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" "../unified_runtime/ur/adapters/cuda/tracing.cpp" # --- diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 31c85b3877091..c2c08b645b03a 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -73,6 +73,27 @@ pi_result cuda_piPluginGetBackendOption(pi_platform, return PI_ERROR_INVALID_VALUE; } +pi_result map_ur_error(ur_result_t result) { + switch (result) { + case UR_RESULT_SUCCESS: + return PI_SUCCESS; + case UR_RESULT_ERROR_INVALID_OPERATION: + return PI_ERROR_INVALID_OPERATION; + case UR_RESULT_ERROR_INVALID_CONTEXT: + return PI_ERROR_INVALID_CONTEXT; + case UR_RESULT_ERROR_INVALID_DEVICE: + return PI_ERROR_INVALID_DEVICE; + case UR_RESULT_ERROR_INVALID_VALUE: + return PI_ERROR_INVALID_VALUE; + case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: + return PI_ERROR_OUT_OF_HOST_MEMORY; + case UR_RESULT_ERROR_OUT_OF_RESOURCES: + return 
PI_ERROR_OUT_OF_RESOURCES; + default: + return PI_ERROR_UNKNOWN; + } +} + // Iterates over the event wait list, returns correct pi_result error codes. // Invokes the callback for the latest event of each queue in the wait list. // The callback must take a single pi_event argument and return a pi_result. @@ -411,257 +432,11 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); -pi_result cuda_piEventRelease(pi_event event); -pi_result cuda_piEventRetain(pi_event event); } // extern "C" /// \endcond -void _pi_queue::compute_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i) { - if (barrier_event_ && !compute_applied_barrier_[stream_i]) { - PI_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); - compute_applied_barrier_[stream_i] = true; - } -} - -void _pi_queue::transfer_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i) { - if (barrier_event_ && !transfer_applied_barrier_[stream_i]) { - PI_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); - transfer_applied_barrier_[stream_i] = true; - } -} - -CUstream _pi_queue::get_next_compute_stream(pi_uint32 *stream_token) { - pi_uint32 stream_i; - pi_uint32 token; - while (true) { - if (num_compute_streams_ < compute_streams_.size()) { - // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(compute_stream_mutex_); - // The second check is done after mutex is locked so other threads can not - // change num_compute_streams_ after that - if (num_compute_streams_ < compute_streams_.size()) { - PI_CHECK_ERROR( - cuStreamCreate(&compute_streams_[num_compute_streams_++], flags_)); - } - } - token = compute_stream_idx_++; - stream_i = token % compute_streams_.size(); - // if a stream has been reused before it was next selected round-robin - // fashion, we want to delay its next use and instead select another one - // that is more likely to have completed all the enqueued work. 
- if (delay_compute_[stream_i]) { - delay_compute_[stream_i] = false; - } else { - break; - } - } - if (stream_token) { - *stream_token = token; - } - CUstream res = compute_streams_[stream_i]; - compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; -} - -CUstream _pi_queue::get_next_compute_stream(pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - _pi_stream_guard &guard, - pi_uint32 *stream_token) { - for (pi_uint32 i = 0; i < num_events_in_wait_list; i++) { - pi_uint32 token = event_wait_list[i]->get_compute_stream_token(); - if (event_wait_list[i]->get_queue() == this && can_reuse_stream(token)) { - std::unique_lock compute_sync_guard( - compute_stream_sync_mutex_); - // redo the check after lock to avoid data races on - // last_sync_compute_streams_ - if (can_reuse_stream(token)) { - pi_uint32 stream_i = token % delay_compute_.size(); - delay_compute_[stream_i] = true; - if (stream_token) { - *stream_token = token; - } - guard = _pi_stream_guard{std::move(compute_sync_guard)}; - CUstream res = event_wait_list[i]->get_stream(); - compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; - } - } - } - guard = {}; - return get_next_compute_stream(stream_token); -} - -CUstream _pi_queue::get_next_transfer_stream() { - if (transfer_streams_.empty()) { // for example in in-order queue - return get_next_compute_stream(); - } - if (num_transfer_streams_ < transfer_streams_.size()) { - // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(transfer_stream_mutex_); - // The second check is done after mutex is locked so other threads can not - // change num_transfer_streams_ after that - if (num_transfer_streams_ < transfer_streams_.size()) { - PI_CHECK_ERROR( - cuStreamCreate(&transfer_streams_[num_transfer_streams_++], flags_)); - } - } - pi_uint32 stream_i = transfer_stream_idx_++ % transfer_streams_.size(); - CUstream res = transfer_streams_[stream_i]; - transfer_stream_wait_for_barrier_if_needed(res, stream_i); - return res; -} - -_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue, - CUstream stream, pi_uint32 stream_token) - : commandType_{type}, refCount_{1}, has_ownership_{true}, - hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, - streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, - evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { - - bool profilingEnabled = queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE; - - PI_CHECK_ERROR(cuEventCreate( - &evEnd_, profilingEnabled ? 
CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); - - if (profilingEnabled) { - PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT)); - PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT)); - } - - if (queue_ != nullptr) { - cuda_piQueueRetain(queue_); - } - pi2ur::piContextRetain(context_); -} - -_pi_event::_pi_event(pi_context context, CUevent eventNative) - : commandType_{PI_COMMAND_TYPE_USER}, refCount_{1}, has_ownership_{false}, - hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, - streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, - evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, - context_{context} { - pi2ur::piContextRetain(context_); -} - -_pi_event::~_pi_event() { - if (queue_ != nullptr) { - cuda_piQueueRelease(queue_); - } - pi2ur::piContextRelease(context_); -} - -pi_result _pi_event::start() { - assert(!is_started()); - pi_result result = PI_SUCCESS; - - try { - if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) { - // NOTE: This relies on the default stream to be unused. - result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0)); - result = PI_CHECK_ERROR(cuEventRecord(evStart_, stream_)); - } - } catch (pi_result error) { - result = error; - } - - isStarted_ = true; - return result; -} - -bool _pi_event::is_completed() const noexcept { - if (!isRecorded_) { - return false; - } - if (!hasBeenWaitedOn_) { - const CUresult ret = cuEventQuery(evEnd_); - if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_READY) { - PI_CHECK_ERROR(ret); - return false; - } - if (ret == CUDA_ERROR_NOT_READY) { - return false; - } - } - return true; -} - -pi_uint64 _pi_event::get_queued_time() const { - assert(is_started()); - return queue_->get_device()->get_elapsed_time(evQueued_); -} - -pi_uint64 _pi_event::get_start_time() const { - assert(is_started()); - return queue_->get_device()->get_elapsed_time(evStart_); -} - -pi_uint64 _pi_event::get_end_time() const { - assert(is_started() && is_recorded()); - return queue_->get_device()->get_elapsed_time(evEnd_); -} - -pi_result _pi_event::record() { - - if (is_recorded() || !is_started()) { - return PI_ERROR_INVALID_EVENT; - } - - pi_result result = PI_ERROR_INVALID_OPERATION; - - if (!queue_) { - return PI_ERROR_INVALID_QUEUE; - } - - try { - eventId_ = queue_->get_next_event_id(); - if (eventId_ == 0) { - sycl::detail::pi::die( - "Unrecoverable program state reached in event identifier overflow"); - } - result = PI_CHECK_ERROR(cuEventRecord(evEnd_, stream_)); - } catch (pi_result error) { - result = error; - } - - if (result == PI_SUCCESS) { - isRecorded_ = true; - } - - return result; -} - -pi_result _pi_event::wait() { - pi_result retErr; - try { - retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_)); - hasBeenWaitedOn_ = true; - } catch (pi_result error) { - retErr = error; - } - - return retErr; -} - -pi_result _pi_event::release() { - if (!backend_has_ownership()) - return PI_SUCCESS; - - assert(queue_ != nullptr); - - PI_CHECK_ERROR(cuEventDestroy(evEnd_)); - - if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) { - PI_CHECK_ERROR(cuEventDestroy(evQueued_)); - PI_CHECK_ERROR(cuEventDestroy(evStart_)); - } - - return PI_SUCCESS; -} - // makes all future work submitted to queue wait for all work captured in event. pi_result enqueueEventWait(pi_queue queue, pi_event event) { // for native events, the cuStreamWaitEvent call is used. 
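The enqueue entry points that remain in this file all follow the same interim pattern while the port is in progress: CUDA work is still submitted directly, but event bookkeeping now goes through _pi_event (a thin wrapper over ur_event_handle_t_), and map_ur_error translates the ur_result_t returned by its methods back into a pi_result. A minimal sketch of that pattern, using a hypothetical copy wrapper and only helpers already present in this file (error propagation trimmed for brevity):

pi_result cuda_piEnqueueExampleCopy(pi_queue command_queue, CUdeviceptr dst,
                                    CUdeviceptr src, size_t size,
                                    pi_event *event) {
  pi_result result = PI_SUCCESS;
  ScopedContext active(command_queue->get_context());
  CUstream cuStream = command_queue->get_next_compute_stream();

  std::unique_ptr<_pi_event> retImplEv{nullptr};
  if (event) {
    retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
        PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, cuStream));
    // The ported event methods return ur_result_t; map it back to PI.
    result = map_ur_error(retImplEv->start());
  }

  result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, cuStream));

  if (event) {
    result = map_ur_error(retImplEv->record());
    *event = retImplEv.release();
  }
  return result;
}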
@@ -1028,254 +803,6 @@ pi_result cuda_piextMemImageCreateWithNativeHandle(pi_native_handle, pi_context, return {}; } -/// Creates a `pi_queue` object on the CUDA backend. -/// Valid properties -/// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT -/// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING -/// \return Pi queue object mapping to a CUStream -/// -pi_result cuda_piQueueCreate(pi_context context, pi_device device, - pi_queue_properties properties, pi_queue *queue) { - try { - std::unique_ptr<_pi_queue> queueImpl{nullptr}; - - if (context->get_device() != device) { - *queue = nullptr; - return PI_ERROR_INVALID_DEVICE; - } - - unsigned int flags = 0; - if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) { - flags = CU_STREAM_DEFAULT; - } else if (properties == __SYCL_PI_CUDA_SYNC_WITH_DEFAULT) { - flags = 0; - } else { - flags = CU_STREAM_NON_BLOCKING; - } - - const bool is_out_of_order = - properties & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - - std::vector computeCuStreams( - is_out_of_order ? _pi_queue::default_num_compute_streams : 1); - std::vector transferCuStreams( - is_out_of_order ? _pi_queue::default_num_transfer_streams : 0); - - queueImpl = std::unique_ptr<_pi_queue>( - new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), - context, device, properties, flags}); - - *queue = queueImpl.release(); - - return PI_SUCCESS; - } catch (pi_result err) { - - return err; - - } catch (...) { - - return PI_ERROR_OUT_OF_RESOURCES; - } -} -pi_result cuda_piextQueueCreate(pi_context Context, pi_device Device, - pi_queue_properties *Properties, - pi_queue *Queue) { - assert(Properties); - // Expect flags mask to be passed first. - assert(Properties[0] == PI_QUEUE_FLAGS); - if (Properties[0] != PI_QUEUE_FLAGS) - return PI_ERROR_INVALID_VALUE; - pi_queue_properties Flags = Properties[1]; - // Extra data isn't supported yet. - assert(Properties[2] == 0); - if (Properties[2] != 0) - return PI_ERROR_INVALID_VALUE; - return cuda_piQueueCreate(Context, Device, Flags, Queue); -} - -pi_result cuda_piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(command_queue != nullptr); - - switch (param_name) { - case PI_QUEUE_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->context_); - case PI_QUEUE_INFO_DEVICE: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->device_); - case PI_QUEUE_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->get_reference_count()); - case PI_QUEUE_INFO_PROPERTIES: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->properties_); - case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: { - try { - bool IsReady = command_queue->all_of([](CUstream s) -> bool { - const CUresult ret = cuStreamQuery(s); - if (ret == CUDA_SUCCESS) - return true; - - if (ret == CUDA_ERROR_NOT_READY) - return false; - - PI_CHECK_ERROR(ret); - return false; - }); - return getInfo(param_value_size, param_value, param_value_size_ret, - IsReady); - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_OUT_OF_RESOURCES; - } - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Queue info request not implemented"); - return {}; -} - -pi_result cuda_piQueueRetain(pi_queue command_queue) { - assert(command_queue != nullptr); - assert(command_queue->get_reference_count() > 0); - - command_queue->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result cuda_piQueueRelease(pi_queue command_queue) { - assert(command_queue != nullptr); - - if (command_queue->decrement_reference_count() > 0) { - return PI_SUCCESS; - } - - try { - std::unique_ptr<_pi_queue> queueImpl(command_queue); - - if (!command_queue->backend_has_ownership()) - return PI_SUCCESS; - - ScopedContext active(command_queue->get_context()); - - command_queue->for_each_stream([](CUstream s) { - PI_CHECK_ERROR(cuStreamSynchronize(s)); - PI_CHECK_ERROR(cuStreamDestroy(s)); - }); - - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -pi_result cuda_piQueueFinish(pi_queue command_queue) { - pi_result result = PI_SUCCESS; - - try { - - assert(command_queue != - nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code - ScopedContext active(command_queue->get_context()); - - command_queue->sync_streams([&result](CUstream s) { - result = PI_CHECK_ERROR(cuStreamSynchronize(s)); - }); - - } catch (pi_result err) { - - result = err; - - } catch (...) { - - result = PI_ERROR_OUT_OF_RESOURCES; - } - - return result; -} - -// There is no CUDA counterpart for queue flushing and we don't run into the -// same problem of having to flush cross-queue dependencies as some of the -// other plugins, so it can be left as no-op. -pi_result cuda_piQueueFlush(pi_queue command_queue) { - (void)command_queue; - return PI_SUCCESS; -} - -/// Gets the native CUDA handle of a PI queue object -/// -/// \param[in] queue The PI queue to get the native CUDA object of. -/// \param[in] NativeHandleDesc Pointer to additional native handle info. -/// \param[out] nativeHandle Set to the native handle of the PI queue object. -/// -/// \return PI_SUCCESS -pi_result cuda_piextQueueGetNativeHandle(pi_queue queue, - pi_native_handle *nativeHandle, - int32_t *NativeHandleDesc) { - *NativeHandleDesc = 0; - ScopedContext active(queue->get_context()); - *nativeHandle = - reinterpret_cast(queue->get_next_compute_stream()); - return PI_SUCCESS; -} - -/// Created a PI queue object from a CUDA queue handle. -/// NOTE: The created PI object does not take ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI queue object from. -/// \param[in] nativeHandleDesc Info about the native handle. -/// \param[in] context is the PI context of the queue. -/// \param[out] queue Set to the PI queue object created from native handle. -/// \param ownNativeHandle tells if SYCL RT should assume the ownership of -/// the native handle, if it can. 
-/// -/// \return TBD -pi_result cuda_piextQueueCreateWithNativeHandle( - pi_native_handle nativeHandle, int32_t NativeHandleDesc, pi_context context, - pi_device device, bool ownNativeHandle, pi_queue_properties *Properties, - pi_queue *queue) { - (void)NativeHandleDesc; - (void)device; - (void)ownNativeHandle; - (void)Properties; - assert(ownNativeHandle == false); - - unsigned int flags; - CUstream cuStream = reinterpret_cast(nativeHandle); - - auto retErr = PI_CHECK_ERROR(cuStreamGetFlags(cuStream, &flags)); - - pi_queue_properties properties = 0; - if (flags == CU_STREAM_DEFAULT) - properties = __SYCL_PI_CUDA_USE_DEFAULT_STREAM; - else if (flags == CU_STREAM_NON_BLOCKING) - properties = __SYCL_PI_CUDA_SYNC_WITH_DEFAULT; - else - sycl::detail::pi::die("Unknown cuda stream"); - - std::vector computeCuStreams(1, cuStream); - std::vector transferCuStreams(0); - - // Create queue and set num_compute_streams to 1, as computeCuStreams has - // valid stream - *queue = new _pi_queue{std::move(computeCuStreams), - std::move(transferCuStreams), - context, - reinterpret_cast(context->get_device()), - properties, - flags, - /*backend_owns*/ false}; - (*queue)->num_compute_streams_ = 1; - - return retErr; -} - pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, size_t offset, size_t size, const void *ptr, @@ -1306,7 +833,7 @@ pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, PI_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, ptr, size, cuStream)); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_write) { @@ -1352,7 +879,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, PI_CHECK_ERROR(cuMemcpyDtoHAsync(ptr, devPtr + offset, size, cuStream)); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_read) { @@ -1369,41 +896,6 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, return retErr; } -pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { - - try { - assert(num_events != 0); - assert(event_list); - if (num_events == 0) { - return PI_ERROR_INVALID_VALUE; - } - - if (!event_list) { - return PI_ERROR_INVALID_EVENT; - } - - auto context = event_list[0]->get_context(); - ScopedContext active(context); - - auto waitFunc = [context](pi_event event) -> pi_result { - if (!event) { - return PI_ERROR_INVALID_EVENT; - } - - if (event->get_context() != context) { - return PI_ERROR_INVALID_CONTEXT; - } - - return event->wait(); - }; - return forLatestEvents(event_list, num_events, waitFunc); - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_OUT_OF_RESOURCES; - } -} - pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value) { @@ -1526,14 +1018,15 @@ pi_result cuda_piEnqueueKernelLaunch( kernelLocalWorkGroupSize += local_work_size[dim]; } - if (hasExceededMaxRegistersPerBlock(command_queue->device_, kernel, - kernelLocalWorkGroupSize)) { + if (hasExceededMaxRegistersPerBlock( + reinterpret_cast(command_queue->device_), kernel, + kernelLocalWorkGroupSize)) { return PI_ERROR_INVALID_WORK_GROUP_SIZE; } } else { - guessLocalWorkSize(command_queue->device_, threadsPerBlock, - global_work_size, maxThreadsPerBlock, kernel, - local_size); + guessLocalWorkSize(reinterpret_cast(command_queue->device_), + threadsPerBlock, global_work_size, + maxThreadsPerBlock, kernel, local_size); } } @@ -1554,7 +1047,9 @@ pi_result cuda_piEnqueueKernelLaunch( pi_uint32 stream_token; _pi_stream_guard guard; CUstream cuStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, event_wait_list, guard, &stream_token); + num_events_in_wait_list, + reinterpret_cast(event_wait_list), guard, + &stream_token); CUfunction cuFunc = kernel->get(); retError = enqueueEventsWait(command_queue, cuStream, @@ -1615,7 +1110,7 @@ pi_result cuda_piEnqueueKernelLaunch( kernel->clear_local_size(); if (event) { - retError = retImplEv->record(); + retError = map_ur_error(retImplEv->record()); *event = retImplEv.release(); } } catch (pi_result err) { @@ -1802,124 +1297,6 @@ pi_result cuda_piMemRetain(pi_mem mem) { return PI_SUCCESS; } -// -// Events -// -pi_result cuda_piEventCreate(pi_context, pi_event *) { - sycl::detail::pi::die("PI Event Create not implemented in CUDA backend"); -} - -pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(event != nullptr); - - switch (param_name) { - case PI_EVENT_INFO_COMMAND_QUEUE: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_queue()); - case PI_EVENT_INFO_COMMAND_TYPE: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_command_type()); - case PI_EVENT_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_reference_count()); - case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(event->get_execution_status())); - } - case PI_EVENT_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_context()); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - - return PI_ERROR_INVALID_EVENT; -} - -/// Obtain profiling information from PI CUDA events -/// \TODO Timings from CUDA are only elapsed time. 
-pi_result cuda_piEventGetProfilingInfo(pi_event event, - pi_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) { - - assert(event != nullptr); - - pi_queue queue = event->get_queue(); - if (queue == nullptr || - !(queue->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE)) { - return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE; - } - - switch (param_name) { - case PI_PROFILING_INFO_COMMAND_QUEUED: - case PI_PROFILING_INFO_COMMAND_SUBMIT: - // Note: No user for this case - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_queued_time()); - case PI_PROFILING_INFO_COMMAND_START: - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_start_time()); - case PI_PROFILING_INFO_COMMAND_END: - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_end_time()); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Event Profiling info request not implemented"); - return {}; -} - -pi_result cuda_piEventSetCallback(pi_event, pi_int32, pfn_notify, void *) { - sycl::detail::pi::die("Event Callback not implemented in CUDA backend"); - return PI_SUCCESS; -} - -pi_result cuda_piEventSetStatus(pi_event, pi_int32) { - sycl::detail::pi::die("Event Set Status not implemented in CUDA backend"); - return PI_ERROR_INVALID_VALUE; -} - -pi_result cuda_piEventRetain(pi_event event) { - assert(event != nullptr); - - const auto refCount = event->increment_reference_count(); - - sycl::detail::pi::assertion( - refCount != 0, - "Reference count overflow detected in cuda_piEventRetain."); - - return PI_SUCCESS; -} - -pi_result cuda_piEventRelease(pi_event event) { - assert(event != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - sycl::detail::pi::assertion( - event->get_reference_count() != 0, - "Reference count overflow detected in cuda_piEventRelease."); - - // decrement ref count. If it is 0, delete the event. - if (event->decrement_reference_count() == 0) { - std::unique_ptr<_pi_event> event_ptr{event}; - pi_result result = PI_ERROR_INVALID_EVENT; - try { - ScopedContext active(event->get_context()); - result = event->release(); - } catch (...) { - result = PI_ERROR_OUT_OF_RESOURCES; - } - return result; - } - - return PI_SUCCESS; -} - /// Enqueues a wait on the given CUstream for all events. /// See \ref enqueueEventWait /// TODO: Add support for multiple streams once the Event class is properly @@ -1962,7 +1339,9 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 stream_token; _pi_stream_guard guard; CUstream cuStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, event_wait_list, guard, &stream_token); + num_events_in_wait_list, + reinterpret_cast(event_wait_list), guard, + &stream_token); { std::lock_guard guard(command_queue->barrier_mutex_); if (command_queue->barrier_event_ == nullptr) { @@ -2027,41 +1406,6 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, } } -/// Gets the native CUDA handle of a PI event object -/// -/// \param[in] event The PI event to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI event object. -/// -/// \return PI_SUCCESS on success. PI_ERROR_INVALID_EVENT if given a user event. 
-pi_result cuda_piextEventGetNativeHandle(pi_event event, - pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(event->get()); - return PI_SUCCESS; -} - -/// Created a PI event object from a CUDA event handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI event object from. -/// \param[out] event Set to the PI event object created from native handle. -/// -/// \return TBD -pi_result cuda_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_context context, - bool ownNativeHandle, - pi_event *event) { - (void)ownNativeHandle; - assert(!ownNativeHandle); - - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - *event = _pi_event::make_with_native(context, - reinterpret_cast(nativeHandle)); - - return PI_SUCCESS; -} - /// Creates a PI sampler object /// /// \param[in] context The context the sampler is created for. @@ -2297,7 +1641,7 @@ pi_result cuda_piEnqueueMemBufferReadRect( host_offset, host_row_pitch, host_slice_pitch); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_read) { @@ -2347,7 +1691,7 @@ pi_result cuda_piEnqueueMemBufferWriteRect( buffer_row_pitch, buffer_slice_pitch); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_write) { @@ -2387,7 +1731,7 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream)); - result = retImplEv->start(); + result = map_ur_error(retImplEv->start()); } auto src = src_buffer->mem_.buffer_mem_.get() + src_offset; @@ -2396,7 +1740,7 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream)); if (event) { - result = retImplEv->record(); + result = map_ur_error(retImplEv->record()); *event = retImplEv.release(); } @@ -2489,7 +1833,7 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream)); - result = retImplEv->start(); + result = map_ur_error(retImplEv->start()); } auto dstDevice = buffer->mem_.buffer_mem_.get() + offset; @@ -2541,7 +1885,7 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, } if (event) { - result = retImplEv->record(); + result = map_ur_error(retImplEv->record()); *event = retImplEv.release(); } @@ -3088,7 +2432,9 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, pi_uint32 stream_token; _pi_stream_guard guard; CUstream cuStream = queue->get_next_compute_stream( - num_events_in_waitlist, events_waitlist, guard, &stream_token); + num_events_in_waitlist, + reinterpret_cast(events_waitlist), guard, + &stream_token); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); if (event) { @@ -3099,7 +2445,7 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, result = PI_CHECK_ERROR(cuMemsetD8Async( (CUdeviceptr)ptr, (unsigned char)value & 0xFF, count, cuStream)); if (event) { - result = event_ptr->record(); + result = map_ur_error(event_ptr->record()); *event = event_ptr.release(); } } catch (pi_result err) { @@ -3134,7 +2480,7 @@ pi_result 
cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, result = PI_CHECK_ERROR(cuMemcpyAsync( (CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size, cuStream)); if (event) { - result = event_ptr->record(); + result = map_ur_error(event_ptr->record()); } if (blocking) { result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); @@ -3198,7 +2544,7 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, result = PI_CHECK_ERROR( cuMemPrefetchAsync((CUdeviceptr)ptr, size, device->get(), cuStream)); if (event) { - result = event_ptr->record(); + result = map_ur_error(event_ptr->record()); *event = event_ptr.release(); } } catch (pi_result err) { @@ -3299,7 +2645,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, sycl::detail::pi::die("Unknown advice"); } if (event) { - result = event_ptr->record(); + result = map_ur_error(event_ptr->record()); *event = event_ptr.release(); } } catch (pi_result err) { @@ -3670,16 +3016,20 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextContextCreateWithNativeHandle, pi2ur::piextContextCreateWithNativeHandle) // Queue - _PI_CL(piQueueCreate, cuda_piQueueCreate) - _PI_CL(piextQueueCreate, cuda_piextQueueCreate) - _PI_CL(piQueueGetInfo, cuda_piQueueGetInfo) - _PI_CL(piQueueFinish, cuda_piQueueFinish) - _PI_CL(piQueueFlush, cuda_piQueueFlush) - _PI_CL(piQueueRetain, cuda_piQueueRetain) - _PI_CL(piQueueRelease, cuda_piQueueRelease) - _PI_CL(piextQueueGetNativeHandle, cuda_piextQueueGetNativeHandle) + _PI_CL(piQueueCreate, pi2ur::piQueueCreate) + _PI_CL(piextQueueCreate, pi2ur::piextQueueCreate) + _PI_CL(piextQueueCreate2, pi2ur::piextQueueCreate2) + _PI_CL(piQueueGetInfo, pi2ur::piQueueGetInfo) + _PI_CL(piQueueFinish, pi2ur::piQueueFinish) + _PI_CL(piQueueFlush, pi2ur::piQueueFlush) + _PI_CL(piQueueRetain, pi2ur::piQueueRetain) + _PI_CL(piQueueRelease, pi2ur::piQueueRelease) + _PI_CL(piextQueueGetNativeHandle, pi2ur::piextQueueGetNativeHandle) + _PI_CL(piextQueueGetNativeHandle2, pi2ur::piextQueueGetNativeHandle2) _PI_CL(piextQueueCreateWithNativeHandle, - cuda_piextQueueCreateWithNativeHandle) + pi2ur::piextQueueCreateWithNativeHandle) + _PI_CL(piextQueueCreateWithNativeHandle2, + pi2ur::piextQueueCreateWithNativeHandle2) // Memory _PI_CL(piMemBufferCreate, cuda_piMemBufferCreate) _PI_CL(piMemImageCreate, cuda_piMemImageCreate) @@ -3721,17 +3071,17 @@ pi_result piPluginInit(pi_plugin *PluginInit) { pi2ur::piextKernelCreateWithNativeHandle) // Event - _PI_CL(piEventCreate, cuda_piEventCreate) - _PI_CL(piEventGetInfo, cuda_piEventGetInfo) - _PI_CL(piEventGetProfilingInfo, cuda_piEventGetProfilingInfo) - _PI_CL(piEventsWait, cuda_piEventsWait) - _PI_CL(piEventSetCallback, cuda_piEventSetCallback) - _PI_CL(piEventSetStatus, cuda_piEventSetStatus) - _PI_CL(piEventRetain, cuda_piEventRetain) - _PI_CL(piEventRelease, cuda_piEventRelease) - _PI_CL(piextEventGetNativeHandle, cuda_piextEventGetNativeHandle) + _PI_CL(piEventCreate, pi2ur::piEventCreate) + _PI_CL(piEventGetInfo, pi2ur::piEventGetInfo) + _PI_CL(piEventGetProfilingInfo, pi2ur::piEventGetProfilingInfo) + _PI_CL(piEventsWait, pi2ur::piEventsWait) + _PI_CL(piEventSetCallback, pi2ur::piEventSetCallback) + _PI_CL(piEventSetStatus, pi2ur::piEventSetStatus) + _PI_CL(piEventRetain, pi2ur::piEventRetain) + _PI_CL(piEventRelease, pi2ur::piEventRelease) + _PI_CL(piextEventGetNativeHandle, pi2ur::piextEventGetNativeHandle) _PI_CL(piextEventCreateWithNativeHandle, - cuda_piextEventCreateWithNativeHandle) + pi2ur::piextEventCreateWithNativeHandle) // Sampler 
_PI_CL(piSamplerCreate, cuda_piSamplerCreate) _PI_CL(piSamplerGetInfo, cuda_piSamplerGetInfo) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 51f6b7f2a34b4..1a8c7e64537cd 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -47,6 +47,8 @@ #include #include #include +#include +#include // Share code between the PI Plugin and UR Adapter #include @@ -54,8 +56,6 @@ extern "C" { /// \cond IGNORE_BLOCK_IN_DOXYGEN -pi_result cuda_piQueueRelease(pi_queue); -pi_result cuda_piQueueRetain(pi_queue); pi_result cuda_piMemRetain(pi_mem); pi_result cuda_piMemRelease(pi_mem); /// \endcond @@ -298,368 +298,81 @@ struct _pi_mem { /// PI queue mapping on to CUstream objects. /// -struct _pi_queue { - using native_type = CUstream; - static constexpr int default_num_compute_streams = 128; - static constexpr int default_num_transfer_streams = 64; - - std::vector compute_streams_; - std::vector transfer_streams_; - // delay_compute_ keeps track of which streams have been recently reused and - // their next use should be delayed. If a stream has been recently reused it - // will be skipped the next time it would be selected round-robin style. When - // skipped, its delay flag is cleared. - std::vector delay_compute_; - // keep track of which streams have applied barrier - std::vector compute_applied_barrier_; - std::vector transfer_applied_barrier_; - _pi_context *context_; - _pi_device *device_; - pi_queue_properties properties_; - CUevent barrier_event_ = nullptr; - CUevent barrier_tmp_event_ = nullptr; - std::atomic_uint32_t refCount_; - std::atomic_uint32_t eventCount_; - std::atomic_uint32_t compute_stream_idx_; - std::atomic_uint32_t transfer_stream_idx_; - unsigned int num_compute_streams_; - unsigned int num_transfer_streams_; - unsigned int last_sync_compute_streams_; - unsigned int last_sync_transfer_streams_; - unsigned int flags_; - // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be - // locked at the same time, compute_stream_sync_mutex_ should be locked first - // to avoid deadlocks - std::mutex compute_stream_sync_mutex_; - std::mutex compute_stream_mutex_; - std::mutex transfer_stream_mutex_; - std::mutex barrier_mutex_; - bool has_ownership_; - - _pi_queue(std::vector &&compute_streams, - std::vector &&transfer_streams, _pi_context *context, - _pi_device *device, pi_queue_properties properties, - unsigned int flags, bool backend_owns = true) - : compute_streams_{std::move(compute_streams)}, - transfer_streams_{std::move(transfer_streams)}, - delay_compute_(compute_streams_.size(), false), - compute_applied_barrier_(compute_streams_.size()), - transfer_applied_barrier_(transfer_streams_.size()), context_{context}, - device_{device}, properties_{properties}, refCount_{1}, eventCount_{0}, - compute_stream_idx_{0}, transfer_stream_idx_{0}, - num_compute_streams_{0}, num_transfer_streams_{0}, - last_sync_compute_streams_{0}, last_sync_transfer_streams_{0}, - flags_(flags), has_ownership_{backend_owns} { - pi2ur::piContextRetain(context_); - pi2ur::piDeviceRetain(device_); - } - - ~_pi_queue() { - pi2ur::piContextRelease(context_); - pi2ur::piDeviceRelease(device_); - } - - void compute_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i); - void transfer_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i); - - // get_next_compute/transfer_stream() functions return streams from - // appropriate pools in round-robin fashion - native_type get_next_compute_stream(pi_uint32 
*stream_token = nullptr); - // this overload tries select a stream that was used by one of dependancies. - // If that is not possible returns a new stream. If a stream is reused it - // returns a lock that needs to remain locked as long as the stream is in use - native_type get_next_compute_stream(pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - _pi_stream_guard &guard, - pi_uint32 *stream_token = nullptr); - native_type get_next_transfer_stream(); - native_type get() { return get_next_compute_stream(); }; - - bool has_been_synchronized(pi_uint32 stream_token) { - // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { - return false; - } - return last_sync_compute_streams_ > stream_token; - } - - bool can_reuse_stream(pi_uint32 stream_token) { - // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { - return false; - } - // If the command represented by the stream token was not the last command - // enqueued to the stream we can not reuse the stream - we need to allow for - // commands enqueued after it and the one we are about to enqueue to run - // concurrently - bool is_last_command = - (compute_stream_idx_ - stream_token) <= compute_streams_.size(); - // If there was a barrier enqueued to the queue after the command - // represented by the stream token we should not reuse the stream, as we can - // not take that stream into account for the bookkeeping for the next - // barrier - such a stream would not be synchronized with. Performance-wise - // it does not matter that we do not reuse the stream, as the work - // represented by the stream token is guaranteed to be complete by the - // barrier before any work we are about to enqueue to the stream will start, - // so the event does not need to be synchronized with. 
- return is_last_command && !has_been_synchronized(stream_token); - } - - template bool all_of(T &&f) { - { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - if (!std::all_of(compute_streams_.begin(), compute_streams_.begin() + end, - f)) - return false; - } - { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - if (!std::all_of(transfer_streams_.begin(), - transfer_streams_.begin() + end, f)) - return false; - } - return true; - } - - template void for_each_stream(T &&f) { - { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - for (unsigned int i = 0; i < end; i++) { - f(compute_streams_[i]); - } - } - { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - for (unsigned int i = 0; i < end; i++) { - f(transfer_streams_[i]); - } - } - } - - template void sync_streams(T &&f) { - auto sync_compute = [&f, &streams = compute_streams_, - &delay = delay_compute_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); - delay[i] = false; - } - }; - auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); - } - }; - { - unsigned int size = static_cast(compute_streams_.size()); - std::lock_guard compute_sync_guard( - compute_stream_sync_mutex_); - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int start = last_sync_compute_streams_; - unsigned int end = num_compute_streams_ < size - ? num_compute_streams_ - : compute_stream_idx_.load(); - if (end - start >= size) { - sync_compute(0, size); - } else { - start %= size; - end %= size; - if (start <= end) { - sync_compute(start, end); - } else { - sync_compute(start, size); - sync_compute(0, end); - } - } - if (ResetUsed) { - last_sync_compute_streams_ = end; - } - } - { - unsigned int size = static_cast(transfer_streams_.size()); - if (size > 0) { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int start = last_sync_transfer_streams_; - unsigned int end = num_transfer_streams_ < size - ? 
num_transfer_streams_ - : transfer_stream_idx_.load(); - if (end - start >= size) { - sync_transfer(0, size); - } else { - start %= size; - end %= size; - if (start <= end) { - sync_transfer(start, end); - } else { - sync_transfer(start, size); - sync_transfer(0, end); - } - } - if (ResetUsed) { - last_sync_transfer_streams_ = end; - } - } - } - } - - _pi_context *get_context() const { return context_; }; - - _pi_device *get_device() const { return device_; }; - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - pi_uint32 get_next_event_id() noexcept { return ++eventCount_; } - - bool backend_has_ownership() const noexcept { return has_ownership_; } +struct _pi_queue : ur_queue_handle_t_ { + using ur_queue_handle_t_::ur_queue_handle_t_; }; typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus, void *userData); -/// PI Event mapping to CUevent -/// -struct _pi_event { -public: - using native_type = CUevent; - - pi_result record(); - - pi_result wait(); - - pi_result start(); - - native_type get() const noexcept { return evEnd_; }; - - pi_queue get_queue() const noexcept { return queue_; } - - CUstream get_stream() const noexcept { return stream_; } - - pi_uint32 get_compute_stream_token() const noexcept { return streamToken_; } - - pi_command_type get_command_type() const noexcept { return commandType_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - bool is_recorded() const noexcept { return isRecorded_; } - - bool is_started() const noexcept { return isStarted_; } - - bool is_completed() const noexcept; - - pi_int32 get_execution_status() const noexcept { - - if (!is_recorded()) { - return PI_EVENT_SUBMITTED; - } - - if (!is_completed()) { - return PI_EVENT_RUNNING; - } - return PI_EVENT_COMPLETE; - } - - pi_context get_context() const noexcept { return context_; }; - - pi_uint32 increment_reference_count() { return ++refCount_; } - - pi_uint32 decrement_reference_count() { return --refCount_; } - pi_uint32 get_event_id() const noexcept { return eventId_; } +struct _pi_event : ur_event_handle_t_ { + using ur_event_handle_t_::ur_event_handle_t_; - bool backend_has_ownership() const noexcept { return has_ownership_; } - - // Returns the counter time when the associated command(s) were enqueued - // - pi_uint64 get_queued_time() const; - - // Returns the counter time when the associated command(s) started execution - // - pi_uint64 get_start_time() const; - - // Returns the counter time when the associated command(s) completed - // - pi_uint64 get_end_time() const; - - // construct a native CUDA. This maps closely to the underlying CUDA event. 
+ // Helpers for queue command implementations until they also get ported to UR static pi_event make_native(pi_command_type type, pi_queue queue, CUstream stream, - pi_uint32 stream_token = std::numeric_limits::max()) { - return new _pi_event(type, queue->get_context(), queue, stream, - stream_token); - } + uint32_t stream_token = std::numeric_limits::max()) { + auto urQueue = reinterpret_cast(queue); + static std::unordered_map<_pi_command_type, ur_command_t> cmdMap = { + {PI_COMMAND_TYPE_NDRANGE_KERNEL, UR_COMMAND_KERNEL_LAUNCH}, + {PI_COMMAND_TYPE_MEM_BUFFER_READ, UR_COMMAND_MEM_BUFFER_READ}, + {PI_COMMAND_TYPE_MEM_BUFFER_WRITE, UR_COMMAND_MEM_BUFFER_WRITE}, + {PI_COMMAND_TYPE_MEM_BUFFER_COPY, UR_COMMAND_MEM_BUFFER_COPY}, + {PI_COMMAND_TYPE_MEM_BUFFER_MAP, UR_COMMAND_MEM_BUFFER_MAP}, + {PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, UR_COMMAND_MEM_UNMAP}, + {PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, UR_COMMAND_MEM_BUFFER_READ_RECT}, + {PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, + UR_COMMAND_MEM_BUFFER_WRITE_RECT}, + {PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, UR_COMMAND_MEM_BUFFER_COPY_RECT}, + {PI_COMMAND_TYPE_MEM_BUFFER_FILL, UR_COMMAND_MEM_BUFFER_FILL}, + {PI_COMMAND_TYPE_IMAGE_READ, UR_COMMAND_MEM_IMAGE_READ}, + {PI_COMMAND_TYPE_IMAGE_WRITE, UR_COMMAND_MEM_IMAGE_WRITE}, + {PI_COMMAND_TYPE_IMAGE_COPY, UR_COMMAND_MEM_IMAGE_COPY}, + {PI_COMMAND_TYPE_BARRIER, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER}, + {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_READ, + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ}, + {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_WRITE, + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE}, + }; - static pi_event make_with_native(pi_context context, CUevent eventNative) { - return new _pi_event(context, eventNative); + // TODO(ur): There is no exact mapping for the following commands. Just + // default to KERNEL_LAUNCH for now. + // PI_COMMAND_TYPE_USER + // PI_COMMAND_TYPE_MEM_BUFFER_FILL, + // PI_COMMAND_TYPE_IMAGE_READ, + // PI_COMMAND_TYPE_IMAGE_WRITE, + // PI_COMMAND_TYPE_IMAGE_COPY, + // PI_COMMAND_TYPE_NATIVE_KERNEL, + // PI_COMMAND_TYPE_COPY_BUFFER_TO_IMAGE, + // PI_COMMAND_TYPE_COPY_IMAGE_TO_BUFFER, + // PI_COMMAND_TYPE_MAP_IMAGE, + // PI_COMMAND_TYPE_MARKER, + // PI_COMMAND_TYPE_ACQUIRE_GL_OBJECTS, + // PI_COMMAND_TYPE_RELEASE_GL_OBJECTS, + // PI_COMMAND_TYPE_BARRIER, + // PI_COMMAND_TYPE_MIGRATE_MEM_OBJECTS, + // PI_COMMAND_TYPE_FILL_IMAGE + // PI_COMMAND_TYPE_SVM_FREE + // PI_COMMAND_TYPE_SVM_MEMCPY + // PI_COMMAND_TYPE_SVM_MEMFILL + // PI_COMMAND_TYPE_SVM_MAP + // PI_COMMAND_TYPE_SVM_UNMAP + + ur_command_t urCmd = UR_COMMAND_KERNEL_LAUNCH; + auto cmdIt = cmdMap.find(type); + if (cmdIt != cmdMap.end()) { + urCmd = cmdIt->second; + } + return reinterpret_cast( + ur_event_handle_t_::make_native(urCmd, urQueue, stream, stream_token)); } - pi_result release(); - - ~_pi_event(); - -private: - // This constructor is private to force programmers to use the make_native / - // make_user static members in order to create a pi_event for CUDA. - _pi_event(pi_command_type type, pi_context context, pi_queue queue, - CUstream stream, pi_uint32 stream_token); - - // This constructor is private to force programmers to use the - // make_with_native for event introp - _pi_event(pi_context context, CUevent eventNative); - - pi_command_type commandType_; // The type of command associated with event. - - std::atomic_uint32_t refCount_; // Event reference count. - - bool has_ownership_; // Signifies if event owns the native type. 
- - bool hasBeenWaitedOn_; // Signifies whether the event has been waited - // on through a call to wait(), which implies - // that it has completed. - - bool isRecorded_; // Signifies wether a native CUDA event has been recorded - // yet. - bool isStarted_; // Signifies wether the operation associated with the - // PI event has started or not - // - - pi_uint32 streamToken_; - pi_uint32 eventId_; // Queue identifier of the event. - - native_type evEnd_; // CUDA event handle. If this _pi_event represents a user - // event, this will be nullptr. - - native_type evStart_; // CUDA event handle associated with the start - - native_type evQueued_; // CUDA event handle associated with the time - // the command was enqueued - - pi_queue queue_; // pi_queue associated with the event. If this is a user - // event, this will be nullptr. - - CUstream stream_; // CUstream associated with the event. If this is a user - // event, this will be uninitialized. - - pi_context context_; // pi_context associated with the event. If this is a - // native event, this will be the same context associated - // with the queue_ member. + static pi_event make_with_native(ur_context_handle_t context, + CUevent eventNative) { + auto urContext = reinterpret_cast(context); + return reinterpret_cast( + ur_event_handle_t_::make_with_native(urContext, eventNative)); + } }; /// Implementation of PI Program on CUDA Module object diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 014938c9ba542..dc572bd5e7e9c 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -137,12 +137,17 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/context.hpp" "ur/adapters/cuda/device.cpp" "ur/adapters/cuda/device.hpp" + "ur/adapters/cuda/enqueue.cpp" + "ur/adapters/cuda/event.cpp" + "ur/adapters/cuda/event.hpp" "ur/adapters/cuda/platform.cpp" "ur/adapters/cuda/platform.hpp" "ur/adapters/cuda/program.cpp" "ur/adapters/cuda/program.hpp" "ur/adapters/cuda/kernel.cpp" "ur/adapters/cuda/kernel.hpp" + "ur/adapters/cuda/queue.cpp" + "ur/adapters/cuda/queue.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" "ur/adapters/cuda/tracing.cpp" INCLUDE_DIRS diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp new file mode 100644 index 0000000000000..3dfa1ba1dbd5c --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -0,0 +1,110 @@ +//===--------- enqueue.cpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "common.hpp" +#include "context.hpp" +#include "event.hpp" +#include "queue.hpp" + +#include + +/// Enqueues a wait on the given CUstream for all specified events (See +/// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued +/// wait will wait on all previous events in the queue. +/// +ur_result_t urEnqueueEventsWaitWithBarrier( + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + // This function makes one stream work on the previous work (or work + // represented by input events) and then all future work waits on that stream. 
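+  // Concretely: one compute stream is chosen and made to wait on the given
+  // events (or, when the wait list is empty, on a temporary event recorded on
+  // every other in-use stream), barrier_event_ is then recorded on that
+  // stream, and the applied-barrier flags are cleared so that every stream
+  // handed out afterwards first waits on barrier_event_ before accepting new
+  // work.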
+ if (!hQueue) { + return UR_RESULT_ERROR_INVALID_QUEUE; + } + + ur_result_t result; + + try { + ScopedContext active(hQueue->get_context()); + uint32_t stream_token; + ur_stream_guard_ guard; + CUstream cuStream = hQueue->get_next_compute_stream( + numEventsInWaitList, phEventWaitList, guard, &stream_token); + { + std::lock_guard guard(hQueue->barrier_mutex_); + if (hQueue->barrier_event_ == nullptr) { + UR_CHECK_ERROR( + cuEventCreate(&hQueue->barrier_event_, CU_EVENT_DISABLE_TIMING)); + } + if (numEventsInWaitList == 0) { // wait on all work + if (hQueue->barrier_tmp_event_ == nullptr) { + UR_CHECK_ERROR(cuEventCreate(&hQueue->barrier_tmp_event_, + CU_EVENT_DISABLE_TIMING)); + } + hQueue->sync_streams( + [cuStream, tmp_event = hQueue->barrier_tmp_event_](CUstream s) { + if (cuStream != s) { + // record a new CUDA event on every stream and make one stream + // wait for these events + UR_CHECK_ERROR(cuEventRecord(tmp_event, s)); + UR_CHECK_ERROR(cuStreamWaitEvent(cuStream, tmp_event, 0)); + } + }); + } else { // wait just on given events + forLatestEvents(phEventWaitList, numEventsInWaitList, + [cuStream](ur_event_handle_t event) -> ur_result_t { + if (event->get_queue()->has_been_synchronized( + event->get_compute_stream_token())) { + return UR_RESULT_SUCCESS; + } else { + return UR_CHECK_ERROR( + cuStreamWaitEvent(cuStream, event->get(), 0)); + } + }); + } + + result = UR_CHECK_ERROR(cuEventRecord(hQueue->barrier_event_, cuStream)); + for (unsigned int i = 0; i < hQueue->compute_applied_barrier_.size(); + i++) { + hQueue->compute_applied_barrier_[i] = false; + } + for (unsigned int i = 0; i < hQueue->transfer_applied_barrier_.size(); + i++) { + hQueue->transfer_applied_barrier_[i] = false; + } + } + if (result != UR_RESULT_SUCCESS) { + return result; + } + + if (phEvent) { + *phEvent = ur_event_handle_t_::make_native( + UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, cuStream, stream_token); + (*phEvent)->start(); + (*phEvent)->record(); + } + + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +/// Enqueues a wait on the given CUstream for all events. +/// See \ref enqueueEventWait +/// TODO: Add support for multiple streams once the Event class is properly +/// refactored. +/// +ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp new file mode 100644 index 0000000000000..6788de883e971 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp @@ -0,0 +1,309 @@ +//===--------- event.cpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "event.hpp" +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" +#include "queue.hpp" + +#include +#include + +ur_event_handle_t_::ur_event_handle_t_(ur_command_t type, + ur_context_handle_t context, + ur_queue_handle_t queue, CUstream stream, + uint32_t stream_token) + : commandType_{type}, refCount_{1}, has_ownership_{true}, + hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, + streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, + evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { + + bool profilingEnabled = queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE; + + UR_CHECK_ERROR(cuEventCreate( + &evEnd_, profilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); + + if (profilingEnabled) { + UR_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT)); + } + + if (queue_ != nullptr) { + urQueueRetain(queue_); + } + urContextRetain(context_); +} + +ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t context, + CUevent eventNative) + // TODO(ur): Missing user command type + : commandType_{UR_COMMAND_EVENTS_WAIT}, refCount_{1}, has_ownership_{false}, + hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, + streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, + evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, context_{ + context} { + urContextRetain(context_); +} + +ur_event_handle_t_::~ur_event_handle_t_() { + if (queue_ != nullptr) { + urQueueRelease(queue_); + } + urContextRelease(context_); +} + +ur_result_t ur_event_handle_t_::start() { + assert(!is_started()); + ur_result_t result = UR_RESULT_SUCCESS; + + try { + if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { + // NOTE: This relies on the default stream to be unused. 
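+      // evQueued_ is recorded on the (assumed idle) default stream, so it
+      // completes right away and captures the submission time; evStart_ is
+      // recorded on the command's own stream, so it captures the point at
+      // which earlier work on that stream has drained and this command can
+      // begin.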
+ result = UR_CHECK_ERROR(cuEventRecord(evQueued_, 0)); + result = UR_CHECK_ERROR(cuEventRecord(evStart_, stream_)); + } + } catch (ur_result_t error) { + result = error; + } + + isStarted_ = true; + return result; +} + +bool ur_event_handle_t_::is_completed() const noexcept { + if (!isRecorded_) { + return false; + } + if (!hasBeenWaitedOn_) { + const CUresult ret = cuEventQuery(evEnd_); + if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_READY) { + UR_CHECK_ERROR(ret); + return false; + } + if (ret == CUDA_ERROR_NOT_READY) { + return false; + } + } + return true; +} + +uint64_t ur_event_handle_t_::get_queued_time() const { + assert(is_started()); + return queue_->get_device()->get_elapsed_time(evQueued_); +} + +uint64_t ur_event_handle_t_::get_start_time() const { + assert(is_started()); + return queue_->get_device()->get_elapsed_time(evStart_); +} + +uint64_t ur_event_handle_t_::get_end_time() const { + assert(is_started() && is_recorded()); + return queue_->get_device()->get_elapsed_time(evEnd_); +} + +ur_result_t ur_event_handle_t_::record() { + + if (is_recorded() || !is_started()) { + return UR_RESULT_ERROR_INVALID_EVENT; + } + + ur_result_t result = UR_RESULT_ERROR_INVALID_OPERATION; + + UR_ASSERT(queue_, UR_RESULT_ERROR_INVALID_QUEUE); + + try { + eventId_ = queue_->get_next_event_id(); + if (eventId_ == 0) { + sycl::detail::ur::die( + "Unrecoverable program state reached in event identifier overflow"); + } + result = UR_CHECK_ERROR(cuEventRecord(evEnd_, stream_)); + } catch (ur_result_t error) { + result = error; + } + + if (result == UR_RESULT_SUCCESS) { + isRecorded_ = true; + } + + return result; +} + +ur_result_t ur_event_handle_t_::wait() { + ur_result_t retErr; + try { + retErr = UR_CHECK_ERROR(cuEventSynchronize(evEnd_)); + hasBeenWaitedOn_ = true; + } catch (ur_result_t error) { + retErr = error; + } + + return retErr; +} + +ur_result_t ur_event_handle_t_::release() { + if (!backend_has_ownership()) + return UR_RESULT_SUCCESS; + + assert(queue_ != nullptr); + + UR_CHECK_ERROR(cuEventDestroy(evEnd_)); + + if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { + UR_CHECK_ERROR(cuEventDestroy(evQueued_)); + UR_CHECK_ERROR(cuEventDestroy(evStart_)); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, + ur_event_info_t propName, + size_t propValueSize, + void *pPropValue, + size_t *pPropValueSizeRet) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + switch (propName) { + case UR_EVENT_INFO_COMMAND_QUEUE: + return ReturnValue(hEvent->get_queue()); + case UR_EVENT_INFO_COMMAND_TYPE: + return ReturnValue(hEvent->get_command_type()); + case UR_EVENT_INFO_REFERENCE_COUNT: + return ReturnValue(hEvent->get_reference_count()); + case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: + return ReturnValue(hEvent->get_execution_status()); + case UR_EVENT_INFO_CONTEXT: + return ReturnValue(hEvent->get_context()); + default: + sycl::detail::ur::die("Event info request not implemented"); + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +/// Obtain profiling information from PI CUDA events +/// \TODO Timings from CUDA are only elapsed time. 
+UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( + ur_event_handle_t hEvent, ur_profiling_info_t propName, + size_t propValueSize, void *pPropValue, size_t *pPropValueSizeRet) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + ur_queue_handle_t queue = hEvent->get_queue(); + if (queue == nullptr || + !(queue->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; + } + + switch (propName) { + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + // Note: No user for this case + return ReturnValue(static_cast(hEvent->get_queued_time())); + case UR_PROFILING_INFO_COMMAND_START: + return ReturnValue(static_cast(hEvent->get_start_time())); + case UR_PROFILING_INFO_COMMAND_END: + return ReturnValue(static_cast(hEvent->get_end_time())); + default: + break; + } + sycl::detail::ur::die("Event Profiling info request not implemented"); + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, + ur_execution_info_t, + ur_event_callback_t, + void *) { + sycl::detail::ur::die("Event Callback not implemented in CUDA adapter"); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { + try { + UR_ASSERT(phEventWaitList, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + UR_ASSERT(numEvents > 0, UR_RESULT_ERROR_INVALID_VALUE); + + auto context = phEventWaitList[0]->get_context(); + ScopedContext active(context); + + auto waitFunc = [context](ur_event_handle_t event) -> ur_result_t { + UR_ASSERT(event, UR_RESULT_ERROR_INVALID_EVENT); + UR_ASSERT(event->get_context() == context, + UR_RESULT_ERROR_INVALID_CONTEXT); + + return event->wait(); + }; + return forLatestEvents(phEventWaitList, numEvents, waitFunc); + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + const auto refCount = hEvent->increment_reference_count(); + + sycl::detail::ur::assertion( + refCount != 0, "Reference count overflow detected in urEventRetain."); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + sycl::detail::ur::assertion( + hEvent->get_reference_count() != 0, + "Reference count overflow detected in urEventRelease."); + + // decrement ref count. If it is 0, delete the event. + if (hEvent->decrement_reference_count() == 0) { + std::unique_ptr event_ptr{hEvent}; + ur_result_t result = UR_RESULT_ERROR_INVALID_EVENT; + try { + ScopedContext active(hEvent->get_context()); + result = hEvent->release(); + } catch (...) 
{ + result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + return result; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( + ur_event_handle_t hEvent, ur_native_handle_t *phNativeEvent) { + *phNativeEvent = reinterpret_cast(hEvent->get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( + ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, + const ur_event_native_properties_t *pProperties, + ur_event_handle_t *phEvent) { + (void)pProperties; + + std::unique_ptr event_ptr{nullptr}; + + *phEvent = ur_event_handle_t_::make_with_native( + hContext, reinterpret_cast(hNativeEvent)); + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp new file mode 100644 index 0000000000000..d0c7fef8a2b48 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp @@ -0,0 +1,191 @@ +//===--------- event.hpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +#include "queue.hpp" + +/// UR Event mapping to CUevent +/// +struct ur_event_handle_t_ { +public: + using native_type = CUevent; + + ur_result_t record(); + + ur_result_t wait(); + + ur_result_t start(); + + native_type get() const noexcept { return evEnd_; }; + + ur_queue_handle_t get_queue() const noexcept { return queue_; } + + CUstream get_stream() const noexcept { return stream_; } + + uint32_t get_compute_stream_token() const noexcept { return streamToken_; } + + ur_command_t get_command_type() const noexcept { return commandType_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } + + bool is_recorded() const noexcept { return isRecorded_; } + + bool is_started() const noexcept { return isStarted_; } + + bool is_completed() const noexcept; + + uint32_t get_execution_status() const noexcept { + + if (!is_recorded()) { + return UR_EVENT_STATUS_SUBMITTED; + } + + if (!is_completed()) { + return UR_EVENT_STATUS_RUNNING; + } + return UR_EVENT_STATUS_COMPLETE; + } + + ur_context_handle_t get_context() const noexcept { return context_; }; + + uint32_t increment_reference_count() { return ++refCount_; } + + uint32_t decrement_reference_count() { return --refCount_; } + + uint32_t get_event_id() const noexcept { return eventId_; } + + bool backend_has_ownership() const noexcept { return has_ownership_; } + + // Returns the counter time when the associated command(s) were enqueued + // + uint64_t get_queued_time() const; + + // Returns the counter time when the associated command(s) started execution + // + uint64_t get_start_time() const; + + // Returns the counter time when the associated command(s) completed + // + uint64_t get_end_time() const; + + // construct a native CUDA. This maps closely to the underlying CUDA event. 
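+  // The adapter keeps ownership of the created CUevent (has_ownership_ is
+  // true), so it is destroyed in release().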
+ static ur_event_handle_t + make_native(ur_command_t type, ur_queue_handle_t queue, CUstream stream, + uint32_t stream_token = std::numeric_limits::max()) { + // TODO(ur): Remove cast when pi_event is ported to UR + return new ur_event_handle_t_(type, queue->get_context(), queue, stream, + stream_token); + } + + static ur_event_handle_t make_with_native(ur_context_handle_t context, + CUevent eventNative) { + return new ur_event_handle_t_(context, eventNative); + } + + ur_result_t release(); + + ~ur_event_handle_t_(); + +private: + // This constructor is private to force programmers to use the make_native / + // make_user static members in order to create a pi_event for CUDA. + ur_event_handle_t_(ur_command_t type, ur_context_handle_t context, + ur_queue_handle_t queue, CUstream stream, + uint32_t stream_token); + + // This constructor is private to force programmers to use the + // make_with_native for event introp + ur_event_handle_t_(ur_context_handle_t context, CUevent eventNative); + + ur_command_t commandType_; // The type of command associated with event. + + std::atomic_uint32_t refCount_; // Event reference count. + + bool has_ownership_; // Signifies if event owns the native type. + + bool hasBeenWaitedOn_; // Signifies whether the event has been waited + // on through a call to wait(), which implies + // that it has completed. + + bool isRecorded_; // Signifies wether a native CUDA event has been recorded + // yet. + bool isStarted_; // Signifies wether the operation associated with the + // PI event has started or not + // + + uint32_t streamToken_; + uint32_t eventId_; // Queue identifier of the event. + + native_type evEnd_; // CUDA event handle. If this _pi_event represents a user + // event, this will be nullptr. + + native_type evStart_; // CUDA event handle associated with the start + + native_type evQueued_; // CUDA event handle associated with the time + // the command was enqueued + + ur_queue_handle_t queue_; // pi_queue associated with the event. If this is a + // user event, this will be nullptr. + + CUstream stream_; // CUstream associated with the event. If this is a user + // event, this will be uninitialized. + + ur_context_handle_t context_; // pi_context associated with the event. If this + // is a native event, this will be the same + // context associated with the queue_ member. +}; + +// Iterates over the event wait list, returns correct ur_result_t error codes. +// Invokes the callback for the latest event of each queue in the wait list. +// The callback must take a single pi_event argument and return a ur_result_t. +template +ur_result_t forLatestEvents(const ur_event_handle_t *event_wait_list, + std::size_t num_events_in_wait_list, Func &&f) { + + if (event_wait_list == nullptr || num_events_in_wait_list == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + // Fast path if we only have a single event + if (num_events_in_wait_list == 1) { + return f(event_wait_list[0]); + } + + std::vector events{ + event_wait_list, event_wait_list + num_events_in_wait_list}; + std::sort(events.begin(), events.end(), + [](ur_event_handle_t e0, ur_event_handle_t e1) { + // Tiered sort creating sublists of streams (smallest value first) + // in which the corresponding events are sorted into a sequence of + // newest first. 
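+              // For example, events tagged (stream, id) as (A,3) (B,1) (A,5)
+              // sort to (A,5) (A,3) (B,1) when stream A orders before B, so
+              // the first event visited for each stream is its newest one.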
+ return e0->get_stream() < e1->get_stream() || + (e0->get_stream() == e1->get_stream() && + e0->get_event_id() > e1->get_event_id()); + }); + + bool first = true; + CUstream lastSeenStream = 0; + for (ur_event_handle_t event : events) { + if (!event || (!first && event->get_stream() == lastSeenStream)) { + continue; + } + + first = false; + lastSeenStream = event->get_stream(); + + auto result = f(event); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp new file mode 100644 index 0000000000000..1d10cedd82c91 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -0,0 +1,326 @@ +//===--------- queue.cpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "queue.hpp" +#include "common.hpp" +#include "context.hpp" +#include "event.hpp" + +#include +#include + +void ur_queue_handle_t_::compute_stream_wait_for_barrier_if_needed( + CUstream stream, uint32_t stream_i) { + if (barrier_event_ && !compute_applied_barrier_[stream_i]) { + UR_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); + compute_applied_barrier_[stream_i] = true; + } +} + +void ur_queue_handle_t_::transfer_stream_wait_for_barrier_if_needed( + CUstream stream, uint32_t stream_i) { + if (barrier_event_ && !transfer_applied_barrier_[stream_i]) { + UR_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); + transfer_applied_barrier_[stream_i] = true; + } +} + +CUstream ur_queue_handle_t_::get_next_compute_stream(uint32_t *stream_token) { + uint32_t stream_i; + uint32_t token; + while (true) { + if (num_compute_streams_ < compute_streams_.size()) { + // the check above is for performance - so as not to lock mutex every time + std::lock_guard guard(compute_stream_mutex_); + // The second check is done after mutex is locked so other threads can not + // change num_compute_streams_ after that + if (num_compute_streams_ < compute_streams_.size()) { + UR_CHECK_ERROR( + cuStreamCreate(&compute_streams_[num_compute_streams_++], flags_)); + } + } + token = compute_stream_idx_++; + stream_i = token % compute_streams_.size(); + // if a stream has been reused before it was next selected round-robin + // fashion, we want to delay its next use and instead select another one + // that is more likely to have completed all the enqueued work. 
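+    // A delayed stream is only skipped once: clear the flag and move on to
+    // the next round-robin candidate.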
+ if (delay_compute_[stream_i]) { + delay_compute_[stream_i] = false; + } else { + break; + } + } + if (stream_token) { + *stream_token = token; + } + CUstream res = compute_streams_[stream_i]; + compute_stream_wait_for_barrier_if_needed(res, stream_i); + return res; +} + +CUstream ur_queue_handle_t_::get_next_compute_stream( + uint32_t num_events_in_wait_list, const ur_event_handle_t *event_wait_list, + ur_stream_guard_ &guard, uint32_t *stream_token) { + for (uint32_t i = 0; i < num_events_in_wait_list; i++) { + uint32_t token = event_wait_list[i]->get_compute_stream_token(); + if (reinterpret_cast(event_wait_list[i]->get_queue()) == + this && + can_reuse_stream(token)) { + std::unique_lock compute_sync_guard( + compute_stream_sync_mutex_); + // redo the check after lock to avoid data races on + // last_sync_compute_streams_ + if (can_reuse_stream(token)) { + uint32_t stream_i = token % delay_compute_.size(); + delay_compute_[stream_i] = true; + if (stream_token) { + *stream_token = token; + } + guard = ur_stream_guard_{std::move(compute_sync_guard)}; + CUstream res = event_wait_list[i]->get_stream(); + compute_stream_wait_for_barrier_if_needed(res, stream_i); + return res; + } + } + } + guard = {}; + return get_next_compute_stream(stream_token); +} + +CUstream ur_queue_handle_t_::get_next_transfer_stream() { + if (transfer_streams_.empty()) { // for example in in-order queue + return get_next_compute_stream(); + } + if (num_transfer_streams_ < transfer_streams_.size()) { + // the check above is for performance - so as not to lock mutex every time + std::lock_guard guard(transfer_stream_mutex_); + // The second check is done after mutex is locked so other threads can not + // change num_transfer_streams_ after that + if (num_transfer_streams_ < transfer_streams_.size()) { + UR_CHECK_ERROR( + cuStreamCreate(&transfer_streams_[num_transfer_streams_++], flags_)); + } + } + uint32_t stream_i = transfer_stream_idx_++ % transfer_streams_.size(); + CUstream res = transfer_streams_[stream_i]; + transfer_stream_wait_for_barrier_if_needed(res, stream_i); + return res; +} + +/// Creates a `ur_queue_handle_t` object on the CUDA backend. +/// Valid properties +/// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT +/// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING +/// +UR_APIEXPORT ur_result_t UR_APICALL +urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { + try { + std::unique_ptr queueImpl{nullptr}; + + if (hContext->get_device() != hDevice) { + *phQueue = nullptr; + return UR_RESULT_ERROR_INVALID_DEVICE; + } + + unsigned int flags = CU_STREAM_NON_BLOCKING; + ur_queue_flags_t urFlags = 0; + bool is_out_of_order = false; + if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { + urFlags = pProps->flags; + if (urFlags == __SYCL_UR_CUDA_USE_DEFAULT_STREAM) { + flags = CU_STREAM_DEFAULT; + } else if (urFlags == __SYCL_UR_CUDA_SYNC_WITH_DEFAULT) { + flags = 0; + } + + if (urFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + is_out_of_order = true; + } + } + + std::vector computeCuStreams( + is_out_of_order ? ur_queue_handle_t_::default_num_compute_streams : 1); + std::vector transferCuStreams( + is_out_of_order ? 
ur_queue_handle_t_::default_num_transfer_streams : 0); + + queueImpl = std::unique_ptr(new ur_queue_handle_t_{ + std::move(computeCuStreams), std::move(transferCuStreams), hContext, + hDevice, flags, urFlags}); + + *phQueue = queueImpl.release(); + + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + + return err; + + } catch (...) { + + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { + assert(hQueue != nullptr); + assert(hQueue->get_reference_count() > 0); + + hQueue->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { + assert(hQueue != nullptr); + + if (hQueue->decrement_reference_count() > 0) { + return UR_RESULT_SUCCESS; + } + + try { + std::unique_ptr queueImpl(hQueue); + + if (!hQueue->backend_has_ownership()) + return UR_RESULT_SUCCESS; + + ScopedContext active(hQueue->get_context()); + + hQueue->for_each_stream([](CUstream s) { + UR_CHECK_ERROR(cuStreamSynchronize(s)); + UR_CHECK_ERROR(cuStreamDestroy(s)); + }); + + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { + ur_result_t result = UR_RESULT_SUCCESS; + + try { + + assert(hQueue != + nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code + ScopedContext active(hQueue->get_context()); + + hQueue->sync_streams([&result](CUstream s) { + result = UR_CHECK_ERROR(cuStreamSynchronize(s)); + }); + + } catch (ur_result_t err) { + + result = err; + + } catch (...) { + + result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + return result; +} + +// There is no CUDA counterpart for queue flushing and we don't run into the +// same problem of having to flush cross-queue dependencies as some of the +// other plugins, so it can be left as no-op. 
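+// Work is dispatched to the CUDA streams as soon as it is enqueued, so this
+// entry point simply reports success.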
+UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { + (void)hQueue; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( + ur_queue_handle_t hQueue, ur_native_handle_t *phNativeQueue) { + ScopedContext active(hQueue->get_context()); + *phNativeQueue = + reinterpret_cast(hQueue->get_next_compute_stream()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( + ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, + ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, + ur_queue_handle_t *phQueue) { + (void)pProperties; + + unsigned int cuFlags; + CUstream cuStream = reinterpret_cast(hNativeQueue); + UR_ASSERT(hContext->get_device() == hDevice, UR_RESULT_ERROR_INVALID_DEVICE); + + auto retErr = UR_CHECK_ERROR(cuStreamGetFlags(cuStream, &cuFlags)); + + ur_queue_flags_t flags = 0; + if (cuFlags == CU_STREAM_DEFAULT) + flags = __SYCL_UR_CUDA_USE_DEFAULT_STREAM; + else if (cuFlags == CU_STREAM_NON_BLOCKING) + flags = __SYCL_UR_CUDA_SYNC_WITH_DEFAULT; + else + sycl::detail::ur::die("Unknown cuda stream"); + + std::vector computeCuStreams(1, cuStream); + std::vector transferCuStreams(0); + + // Create queue and set num_compute_streams to 1, as computeCuStreams has + // valid stream + *phQueue = new ur_queue_handle_t_{std::move(computeCuStreams), + std::move(transferCuStreams), + hContext, + hDevice, + cuFlags, + flags, + /*backend_owns*/ false}; + (*phQueue)->num_compute_streams_ = 1; + + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, + ur_queue_info_t propName, + size_t propValueSize, + void *pPropValue, + size_t *pPropSizeRet) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); + + switch (uint32_t{propName}) { + case UR_QUEUE_INFO_CONTEXT: + return ReturnValue(hQueue->context_); + case UR_QUEUE_INFO_DEVICE: + return ReturnValue(hQueue->device_); + case UR_QUEUE_INFO_REFERENCE_COUNT: + return ReturnValue(hQueue->get_reference_count()); + case UR_QUEUE_INFO_FLAGS: + return ReturnValue(hQueue->ur_flags_); + case UR_QUEUE_INFO_EMPTY: { + try { + bool IsReady = hQueue->all_of([](CUstream s) -> bool { + const CUresult ret = cuStreamQuery(s); + if (ret == CUDA_SUCCESS) + return true; + + if (ret == CUDA_ERROR_NOT_READY) + return false; + + UR_CHECK_ERROR(ret); + return false; + }); + return ReturnValue(IsReady); + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + } + default: + break; + } + sycl::detail::ur::die("Queue info request not implemented"); + return {}; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp new file mode 100644 index 0000000000000..99a7904b82b7e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp @@ -0,0 +1,253 @@ +//===--------- queue.hpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include + +#include +#include + +using ur_stream_guard_ = std::unique_lock; + +/// UR queue mapping on to CUstream objects. 
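+/// Each queue owns a pool of compute streams and, for out-of-order queues, a
+/// pool of transfer streams handed out in round-robin fashion. A rough usage
+/// sketch (names as declared below):
+///   uint32_t token;
+///   ur_stream_guard_ guard;
+///   CUstream s = q->get_next_compute_stream(numEvents, waitList, guard, &token);
+///   // launch work on s and record the resulting event with this token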
+/// +struct ur_queue_handle_t_ { + + using native_type = CUstream; + static constexpr int default_num_compute_streams = 128; + static constexpr int default_num_transfer_streams = 64; + + std::vector compute_streams_; + std::vector transfer_streams_; + // delay_compute_ keeps track of which streams have been recently reused and + // their next use should be delayed. If a stream has been recently reused it + // will be skipped the next time it would be selected round-robin style. When + // skipped, its delay flag is cleared. + std::vector delay_compute_; + // keep track of which streams have applied barrier + std::vector compute_applied_barrier_; + std::vector transfer_applied_barrier_; + ur_context_handle_t_ *context_; + ur_device_handle_t_ *device_; + // ur_queue_properties_t properties_; + CUevent barrier_event_ = nullptr; + CUevent barrier_tmp_event_ = nullptr; + std::atomic_uint32_t refCount_; + std::atomic_uint32_t eventCount_; + std::atomic_uint32_t compute_stream_idx_; + std::atomic_uint32_t transfer_stream_idx_; + unsigned int num_compute_streams_; + unsigned int num_transfer_streams_; + unsigned int last_sync_compute_streams_; + unsigned int last_sync_transfer_streams_; + unsigned int flags_; + ur_queue_flags_t ur_flags_; + // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be + // locked at the same time, compute_stream_sync_mutex_ should be locked first + // to avoid deadlocks + std::mutex compute_stream_sync_mutex_; + std::mutex compute_stream_mutex_; + std::mutex transfer_stream_mutex_; + std::mutex barrier_mutex_; + bool has_ownership_; + + ur_queue_handle_t_(std::vector &&compute_streams, + std::vector &&transfer_streams, + ur_context_handle_t_ *context, ur_device_handle_t_ *device, + unsigned int flags, ur_queue_flags_t ur_flags, + bool backend_owns = true) + : compute_streams_{std::move(compute_streams)}, + transfer_streams_{std::move(transfer_streams)}, + delay_compute_(compute_streams_.size(), false), + compute_applied_barrier_(compute_streams_.size()), + transfer_applied_barrier_(transfer_streams_.size()), context_{context}, + device_{device}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, + transfer_stream_idx_{0}, num_compute_streams_{0}, + num_transfer_streams_{0}, last_sync_compute_streams_{0}, + last_sync_transfer_streams_{0}, flags_(flags), + ur_flags_(ur_flags), has_ownership_{backend_owns} { + urContextRetain(context_); + urDeviceRetain(device_); + } + + ~ur_queue_handle_t_() { + urContextRelease(context_); + urDeviceRelease(device_); + } + + void compute_stream_wait_for_barrier_if_needed(CUstream stream, + uint32_t stream_i); + void transfer_stream_wait_for_barrier_if_needed(CUstream stream, + uint32_t stream_i); + + // get_next_compute/transfer_stream() functions return streams from + // appropriate pools in round-robin fashion + native_type get_next_compute_stream(uint32_t *stream_token = nullptr); + // this overload tries select a stream that was used by one of dependancies. + // If that is not possible returns a new stream. 
If a stream is reused it + // returns a lock that needs to remain locked as long as the stream is in use + native_type get_next_compute_stream(uint32_t num_events_in_wait_list, + const ur_event_handle_t *event_wait_list, + ur_stream_guard_ &guard, + uint32_t *stream_token = nullptr); + native_type get_next_transfer_stream(); + native_type get() { return get_next_compute_stream(); }; + + bool has_been_synchronized(uint32_t stream_token) { + // stream token not associated with one of the compute streams + if (stream_token == std::numeric_limits::max()) { + return false; + } + return last_sync_compute_streams_ >= stream_token; + } + + bool can_reuse_stream(uint32_t stream_token) { + // stream token not associated with one of the compute streams + if (stream_token == std::numeric_limits::max()) { + return false; + } + // If the command represented by the stream token was not the last command + // enqueued to the stream we can not reuse the stream - we need to allow for + // commands enqueued after it and the one we are about to enqueue to run + // concurrently + bool is_last_command = + (compute_stream_idx_ - stream_token) <= compute_streams_.size(); + // If there was a barrier enqueued to the queue after the command + // represented by the stream token we should not reuse the stream, as we can + // not take that stream into account for the bookkeeping for the next + // barrier - such a stream would not be synchronized with. Performance-wise + // it does not matter that we do not reuse the stream, as the work + // represented by the stream token is guaranteed to be complete by the + // barrier before any work we are about to enqueue to the stream will start, + // so the event does not need to be synchronized with. + return is_last_command && !has_been_synchronized(stream_token); + } + + template bool all_of(T &&f) { + { + std::lock_guard compute_guard(compute_stream_mutex_); + unsigned int end = + std::min(static_cast(compute_streams_.size()), + num_compute_streams_); + if (!std::all_of(compute_streams_.begin(), compute_streams_.begin() + end, + f)) + return false; + } + { + std::lock_guard transfer_guard(transfer_stream_mutex_); + unsigned int end = + std::min(static_cast(transfer_streams_.size()), + num_transfer_streams_); + if (!std::all_of(transfer_streams_.begin(), + transfer_streams_.begin() + end, f)) + return false; + } + return true; + } + + template void for_each_stream(T &&f) { + { + std::lock_guard compute_guard(compute_stream_mutex_); + unsigned int end = + std::min(static_cast(compute_streams_.size()), + num_compute_streams_); + for (unsigned int i = 0; i < end; i++) { + f(compute_streams_[i]); + } + } + { + std::lock_guard transfer_guard(transfer_stream_mutex_); + unsigned int end = + std::min(static_cast(transfer_streams_.size()), + num_transfer_streams_); + for (unsigned int i = 0; i < end; i++) { + f(transfer_streams_[i]); + } + } + } + + template void sync_streams(T &&f) { + auto sync_compute = [&f, &streams = compute_streams_, + &delay = delay_compute_](unsigned int start, + unsigned int stop) { + for (unsigned int i = start; i < stop; i++) { + f(streams[i]); + delay[i] = false; + } + }; + auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start, + unsigned int stop) { + for (unsigned int i = start; i < stop; i++) { + f(streams[i]); + } + }; + { + unsigned int size = static_cast(compute_streams_.size()); + std::lock_guard compute_sync_guard(compute_stream_sync_mutex_); + std::lock_guard compute_guard(compute_stream_mutex_); + unsigned int start = 
last_sync_compute_streams_; + unsigned int end = num_compute_streams_ < size + ? num_compute_streams_ + : compute_stream_idx_.load(); + if (ResetUsed) { + last_sync_compute_streams_ = end; + } + if (end - start >= size) { + sync_compute(0, size); + } else { + start %= size; + end %= size; + if (start <= end) { + sync_compute(start, end); + } else { + sync_compute(start, size); + sync_compute(0, end); + } + } + } + { + unsigned int size = static_cast(transfer_streams_.size()); + if (size > 0) { + std::lock_guard transfer_guard(transfer_stream_mutex_); + unsigned int start = last_sync_transfer_streams_; + unsigned int end = num_transfer_streams_ < size + ? num_transfer_streams_ + : transfer_stream_idx_.load(); + if (ResetUsed) { + last_sync_transfer_streams_ = end; + } + if (end - start >= size) { + sync_transfer(0, size); + } else { + start %= size; + end %= size; + if (start <= end) { + sync_transfer(start, end); + } else { + sync_transfer(start, size); + sync_transfer(0, end); + } + } + } + } + } + + ur_context_handle_t_ *get_context() const { return context_; }; + + ur_device_handle_t_ *get_device() const { return device_; }; + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } + + uint32_t get_next_event_id() noexcept { return ++eventCount_; } + + bool backend_has_ownership() const noexcept { return has_ownership_; } +}; From 103cec35c6b58abace79d0b662404e4d8ba0fc90 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Tue, 18 Apr 2023 14:40:56 +0100 Subject: [PATCH 07/45] AAdd program and kernel ddi tables --- sycl/plugins/cuda/pi_cuda.cpp | 4 +- .../ur/adapters/cuda/kernel.cpp | 10 +--- .../ur/adapters/cuda/program.cpp | 18 ++++--- .../ur/adapters/cuda/ur_interface_loader.cpp | 47 ++++++++++--------- 4 files changed, 38 insertions(+), 41 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index c2c08b645b03a..e9dfa9c74ab35 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -3054,6 +3054,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextProgramGetNativeHandle, pi2ur::piextProgramGetNativeHandle) _PI_CL(piextProgramCreateWithNativeHandle, pi2ur::piextProgramCreateWithNativeHandle) + _PI_CL(piextProgramSetSpecializationConstant, + pi2ur::piextProgramSetSpecializationConstant) // Kernel _PI_CL(piKernelCreate, pi2ur::piKernelCreate) _PI_CL(piKernelSetArg, pi2ur::piKernelSetArg) @@ -3064,8 +3066,6 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piKernelRelease, pi2ur::piKernelRelease) _PI_CL(piextKernelGetNativeHandle, pi2ur::piextKernelGetNativeHandle) _PI_CL(piKernelSetExecInfo, pi2ur::piKernelSetExecInfo) - _PI_CL(piextProgramSetSpecializationConstant, - pi2ur::piextProgramSetSpecializationConstant) _PI_CL(piextKernelSetArgPointer, pi2ur::piKernelSetArgPointer) _PI_CL(piextKernelCreateWithNativeHandle, pi2ur::piextKernelCreateWithNativeHandle) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index e34976394c5ff..e80960f7ceb3c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -226,7 +226,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, sycl::detail::ur::assertion( cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, hKernel->get()) == 
CUDA_SUCCESS); - return ReturnValue(uint32_t{numRegs}); + return ReturnValue(static_cast(numRegs)); } default: break; @@ -297,14 +297,6 @@ urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( - ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { - // This entry point is only used for native specialization constants (SPIR-V), - // and the CUDA plugin is AOT only so this entry point is not supported. - sycl::detail::ur::die("Native specialization constants are not supported"); - return {}; -} - UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_native_handle_t hNativeKernel, ur_context_handle_t hContext, ur_program_handle_t hProgram, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 7a56620180fef..bca41b4c0b5ba 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -163,9 +163,8 @@ ur_result_t ur_program_handle_t_::build_program(const char *build_options) { /// query to PI and use cuModuleGetFunction to check for a kernel. /// Note: Another alternative is to add kernel names as metadata, like with /// reqd_work_group_size. -std::string getKernelNames(ur_program_handle_t) { - sycl::detail::ur::die("getKernelNames not implemented"); - return {}; +ur_result_t getKernelNames(ur_program_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } /// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object. @@ -282,9 +281,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, ur_program_handle_t *phProgram) { - sycl::detail::ur::die( - "Creation of UR program from native handle not implemented"); - return {}; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL @@ -335,7 +332,7 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, case UR_PROGRAM_INFO_BINARIES: return ReturnValue(&hProgram->binary_, 1); case UR_PROGRAM_INFO_NUM_KERNELS: - return ReturnValue(getKernelNames(hProgram).c_str()); + return getKernelNames(hProgram); default: break; } @@ -437,3 +434,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( return retError; } + +// This entry point is only used for native specialization constants (SPIR-V), +// and the CUDA plugin is AOT only so this entry point is not supported. 
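+// Callers receive UR_RESULT_ERROR_UNSUPPORTED_FEATURE rather than an abort.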
+UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( + ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index d7f9ad75d38cd..9d408ff9d939f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -83,19 +83,20 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnBuild = nullptr; - pDdiTable->pfnCompile = nullptr; - pDdiTable->pfnCreateWithBinary = nullptr; - pDdiTable->pfnCreateWithIL = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetBuildInfo = nullptr; + pDdiTable->pfnBuild = urProgramBuild; + pDdiTable->pfnCompile = urProgramCompile; + pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary; + pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; + pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; + pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; pDdiTable->pfnGetFunctionPointer = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnLink = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnSetSpecializationConstants = nullptr; + pDdiTable->pfnGetInfo = urProgramGetInfo; + pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; + pDdiTable->pfnLink = urProgramLink; + pDdiTable->pfnRelease = urProgramRelease; + pDdiTable->pfnRetain = urProgramRetain; + pDdiTable->pfnSetSpecializationConstants = + urProgramSetSpecializationConstants; return UR_RESULT_SUCCESS; } @@ -105,20 +106,20 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetGroupInfo = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnGetSubGroupInfo = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnCreate = urKernelCreate; + pDdiTable->pfnCreateWithNativeHandle = urKernelCreateWithNativeHandle; + pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo; + pDdiTable->pfnGetInfo = urKernelGetInfo; + pDdiTable->pfnGetNativeHandle = urKernelGetNativeHandle; + pDdiTable->pfnGetSubGroupInfo = urKernelGetSubGroupInfo; + pDdiTable->pfnRelease = urKernelRelease; + pDdiTable->pfnRetain = urKernelRetain; pDdiTable->pfnSetArgLocal = nullptr; pDdiTable->pfnSetArgMemObj = nullptr; - pDdiTable->pfnSetArgPointer = nullptr; + pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; pDdiTable->pfnSetArgSampler = nullptr; - pDdiTable->pfnSetArgValue = nullptr; - pDdiTable->pfnSetExecInfo = nullptr; + pDdiTable->pfnSetArgValue = urKernelSetArgValue; + pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = nullptr; return UR_RESULT_SUCCESS; } From c64033957addac8e4255ba8d60398a5162911011 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Mon, 17 Apr 2023 14:41:26 +0100 Subject: [PATCH 08/45] [SYCL][PI][UR][CUDA] Port piEnqueueKernelLaunch to UR --- sycl/plugins/cuda/pi_cuda.cpp | 392 +----------------- .../ur/adapters/cuda/enqueue.cpp | 282 +++++++++++++ 
.../ur/adapters/cuda/ur_interface_loader.cpp | 6 +- 3 files changed, 292 insertions(+), 388 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index e9dfa9c74ab35..baf4f4a4983d1 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -239,74 +239,6 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) { } /// \endcond -// Determine local work sizes that result in uniform work groups. -// The default threadsPerBlock only require handling the first work_dim -// dimension. -void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, - const size_t *global_work_size, - const size_t maxThreadsPerBlock[3], pi_kernel kernel, - pi_uint32 local_size) { - assert(threadsPerBlock != nullptr); - assert(global_work_size != nullptr); - assert(kernel != nullptr); - int minGrid, maxBlockSize, maxBlockDim[3]; - - static auto isPrime = [](size_t number) -> bool { - auto lastNumToCheck = ceil(sqrt(number)); - if (number < 2) - return false; - if (number == 2) - return true; - if (number % 2 == 0) - return false; - for (int i = 3; i <= lastNumToCheck; i += 2) { - if (number % i == 0) - return false; - } - return true; - }; - - cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()); - cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()); - - PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( - &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, - maxThreadsPerBlock[0])); - - threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2])); - threadsPerBlock[1] = - std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], - size_t(maxBlockDim[1]))); - maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); - threadsPerBlock[0] = - std::min(maxThreadsPerBlock[0], - std::min(global_work_size[0], size_t(maxBlockDim[0]))); - - // When global_work_size[0] is prime threadPerBlock[0] will later computed as - // 1, which is not efficient configuration. In such case we use - // global_work_size[0] + 1 to compute threadPerBlock[0]. - int adjusted_0_dim_global_work_size = - (isPrime(global_work_size[0]) && - (threadsPerBlock[0] != global_work_size[0])) - ? global_work_size[0] + 1 - : global_work_size[0]; - - static auto isPowerOf2 = [](size_t value) -> bool { - return value && !(value & (value - 1)); - }; - - // Find a local work group size that is a divisor of the global - // work group size to produce uniform work groups. - // Additionally, for best compute utilisation, the local size has - // to be a power of two. - while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) || - !isPowerOf2(threadsPerBlock[0])) { - --threadsPerBlock[0]; - } -} - pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list) { @@ -365,27 +297,6 @@ void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type, } } -// Helper to verify out-of-registers case (exceeded block max registers). -// If the kernel requires a number of registers for the entire thread -// block exceeds the hardware limitations, then the cuLaunchKernel call -// will fail to launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error. 
-bool hasExceededMaxRegistersPerBlock(pi_device device, pi_kernel kernel, - size_t blockSize) { - assert(device); - assert(kernel); - - int maxRegsPerBlock{0}; - PI_CHECK_ERROR(cuDeviceGetAttribute( - &maxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - device->get())); - - int regsPerThread{0}; - PI_CHECK_ERROR(cuFuncGetAttribute(®sPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get())); - - return blockSize * regsPerThread > size_t(maxRegsPerBlock); -} - } // anonymous namespace /// ------ Error handling, matching OpenCL plugin semantics. @@ -421,20 +332,6 @@ void assertion(bool Condition, const char *Message) { //-------------- // PI object implementation -extern "C" { - -// Required in a number of functions, so forward declare here -pi_result cuda_piEnqueueEventsWait(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event); -pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event); - -} // extern "C" - /// \endcond // makes all future work submitted to queue wait for all work captured in event. @@ -953,172 +850,6 @@ pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, return retErr; } -pi_result cuda_piEnqueueKernelLaunch( - pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim, - const size_t *global_work_offset, const size_t *global_work_size, - const size_t *local_work_size, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - // Preconditions - assert(command_queue != nullptr); - assert(command_queue->get_context() == kernel->get_context()); - assert(kernel != nullptr); - assert(global_work_offset != nullptr); - assert(work_dim > 0); - assert(work_dim < 4); - - if (*global_work_size == 0) { - return cuda_piEnqueueEventsWaitWithBarrier( - command_queue, num_events_in_wait_list, event_wait_list, event); - } - - // Set the number of threads per block to the number of threads per warp - // by default unless user has provided a better number - size_t threadsPerBlock[3] = {32u, 1u, 1u}; - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - bool providedLocalWorkGroupSize = (local_work_size != nullptr); - pi_uint32 local_size = kernel->get_local_size(); - pi_result retError = PI_SUCCESS; - - try { - // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext active(command_queue->get_context()); - { - size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_; - maxWorkGroupSize = command_queue->device_->get_max_work_group_size(); - command_queue->device_->get_max_work_item_sizes( - sizeof(maxThreadsPerBlock), maxThreadsPerBlock); - - if (providedLocalWorkGroupSize) { - auto isValid = [&](int dim) { - if (reqdThreadsPerBlock[dim] != 0 && - local_work_size[dim] != reqdThreadsPerBlock[dim]) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - - if (local_work_size[dim] > maxThreadsPerBlock[dim]) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than - // the global work sizes and not 0. 
- if (0u == local_work_size[dim]) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - if (0u != (global_work_size[dim] % local_work_size[dim])) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - threadsPerBlock[dim] = local_work_size[dim]; - return PI_SUCCESS; - }; - - size_t kernelLocalWorkGroupSize = 0; - for (size_t dim = 0; dim < work_dim; dim++) { - auto err = isValid(dim); - if (err != PI_SUCCESS) - return err; - // If no error then sum the total local work size per dim. - kernelLocalWorkGroupSize += local_work_size[dim]; - } - - if (hasExceededMaxRegistersPerBlock( - reinterpret_cast(command_queue->device_), kernel, - kernelLocalWorkGroupSize)) { - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - } else { - guessLocalWorkSize(reinterpret_cast(command_queue->device_), - threadsPerBlock, global_work_size, - maxThreadsPerBlock, kernel, local_size); - } - } - - if (maxWorkGroupSize < - size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - - size_t blocksPerGrid[3] = {1u, 1u, 1u}; - - for (size_t i = 0; i < work_dim; i++) { - blocksPerGrid[i] = - (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i]; - } - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - pi_uint32 stream_token; - _pi_stream_guard guard; - CUstream cuStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, - reinterpret_cast(event_wait_list), guard, - &stream_token); - CUfunction cuFunc = kernel->get(); - - retError = enqueueEventsWait(command_queue, cuStream, - num_events_in_wait_list, event_wait_list); - - // Set the implicit global offset parameter if kernel has offset variant - if (kernel->get_with_offset_parameter()) { - std::uint32_t cuda_implicit_offset[3] = {0, 0, 0}; - if (global_work_offset) { - for (size_t i = 0; i < work_dim; i++) { - cuda_implicit_offset[i] = - static_cast(global_work_offset[i]); - if (global_work_offset[i] != 0) { - cuFunc = kernel->get_with_offset_parameter(); - } - } - } - kernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset), - cuda_implicit_offset); - } - - auto &argIndices = kernel->get_arg_indices(); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue, - cuStream, stream_token)); - retImplEv->start(); - } - - // Set local mem max size if env var is present - static const char *local_mem_sz_ptr = - std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE"); - - if (local_mem_sz_ptr) { - int device_max_local_mem = 0; - cuDeviceGetAttribute( - &device_max_local_mem, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, - command_queue->get_device()->get()); - - static const int env_val = std::atoi(local_mem_sz_ptr); - if (env_val <= 0 || env_val > device_max_local_mem) { - setErrorMessage("Invalid value specified for " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - PI_CHECK_ERROR(cuFuncSetAttribute( - cuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, env_val)); - } - - retError = PI_CHECK_ERROR(cuLaunchKernel( - cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], - threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size, - cuStream, const_cast(argIndices.data()), nullptr)); - if (local_size != 0) - kernel->clear_local_size(); - - if (event) { - retError = map_ur_error(retImplEv->record()); - *event = retImplEv.release(); - } - } catch (pi_result err) { - retError = err; - } - return retError; -} - /// \TODO Not implemented 
pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, pi_uint32, const pi_mem *, const void **, @@ -1297,115 +1028,6 @@ pi_result cuda_piMemRetain(pi_mem mem) { return PI_SUCCESS; } -/// Enqueues a wait on the given CUstream for all events. -/// See \ref enqueueEventWait -/// TODO: Add support for multiple streams once the Event class is properly -/// refactored. -/// -pi_result cuda_piEnqueueEventsWait(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - return cuda_piEnqueueEventsWaitWithBarrier( - command_queue, num_events_in_wait_list, event_wait_list, event); -} - -/// Enqueues a wait on the given CUstream for all specified events (See -/// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued -/// wait will wait on all previous events in the queue. -/// -/// \param[in] command_queue A valid PI queue. -/// \param[in] num_events_in_wait_list Number of events in event_wait_list. -/// \param[in] event_wait_list Events to wait on. -/// \param[out] event Event for when all events in event_wait_list have finished -/// or, if event_wait_list is empty, when all previous events in the queue have -/// finished. -/// -/// \return TBD -pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - // This function makes one stream work on the previous work (or work - // represented by input events) and then all future work waits on that stream. - if (!command_queue) { - return PI_ERROR_INVALID_QUEUE; - } - - pi_result result; - - try { - ScopedContext active(command_queue->get_context()); - pi_uint32 stream_token; - _pi_stream_guard guard; - CUstream cuStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, - reinterpret_cast(event_wait_list), guard, - &stream_token); - { - std::lock_guard guard(command_queue->barrier_mutex_); - if (command_queue->barrier_event_ == nullptr) { - PI_CHECK_ERROR(cuEventCreate(&command_queue->barrier_event_, - CU_EVENT_DISABLE_TIMING)); - } - if (num_events_in_wait_list == 0) { // wait on all work - if (command_queue->barrier_tmp_event_ == nullptr) { - PI_CHECK_ERROR(cuEventCreate(&command_queue->barrier_tmp_event_, - CU_EVENT_DISABLE_TIMING)); - } - command_queue->sync_streams( - [cuStream, - tmp_event = command_queue->barrier_tmp_event_](CUstream s) { - if (cuStream != s) { - // record a new CUDA event on every stream and make one stream - // wait for these events - PI_CHECK_ERROR(cuEventRecord(tmp_event, s)); - PI_CHECK_ERROR(cuStreamWaitEvent(cuStream, tmp_event, 0)); - } - }); - } else { // wait just on given events - forLatestEvents(event_wait_list, num_events_in_wait_list, - [cuStream](pi_event event) -> pi_result { - if (event->get_queue()->has_been_synchronized( - event->get_compute_stream_token())) { - return PI_SUCCESS; - } else { - return PI_CHECK_ERROR( - cuStreamWaitEvent(cuStream, event->get(), 0)); - } - }); - } - - result = PI_CHECK_ERROR( - cuEventRecord(command_queue->barrier_event_, cuStream)); - for (unsigned int i = 0; - i < command_queue->compute_applied_barrier_.size(); i++) { - command_queue->compute_applied_barrier_[i] = false; - } - for (unsigned int i = 0; - i < command_queue->transfer_applied_barrier_.size(); i++) { - command_queue->transfer_applied_barrier_[i] = false; - } - } - if (result != PI_SUCCESS) { - return result; - } - - if (event) { - *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, 
command_queue, - cuStream, stream_token); - (*event)->start(); - (*event)->record(); - } - - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - /// Creates a PI sampler object /// /// \param[in] context The context the sampler is created for. @@ -2238,8 +1860,8 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, ScopedContext active(command_queue->get_context()); if (is_pinned) { - ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); + ret_err = pi2ur::piEnqueueEventsWait( + command_queue, num_events_in_wait_list, event_wait_list, nullptr); } if (event) { @@ -2293,8 +1915,8 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, ScopedContext active(command_queue->get_context()); if (is_pinned) { - ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); + ret_err = pi2ur::piEnqueueEventsWait( + command_queue, num_events_in_wait_list, event_wait_list, nullptr); } if (event) { @@ -3088,10 +2710,10 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piSamplerRetain, cuda_piSamplerRetain) _PI_CL(piSamplerRelease, cuda_piSamplerRelease) // Queue commands - _PI_CL(piEnqueueKernelLaunch, cuda_piEnqueueKernelLaunch) + _PI_CL(piEnqueueKernelLaunch, pi2ur::piEnqueueKernelLaunch) _PI_CL(piEnqueueNativeKernel, cuda_piEnqueueNativeKernel) - _PI_CL(piEnqueueEventsWait, cuda_piEnqueueEventsWait) - _PI_CL(piEnqueueEventsWaitWithBarrier, cuda_piEnqueueEventsWaitWithBarrier) + _PI_CL(piEnqueueEventsWait, pi2ur::piEnqueueEventsWait) + _PI_CL(piEnqueueEventsWaitWithBarrier, pi2ur::piEnqueueEventsWaitWithBarrier) _PI_CL(piEnqueueMemBufferRead, cuda_piEnqueueMemBufferRead) _PI_CL(piEnqueueMemBufferReadRect, cuda_piEnqueueMemBufferReadRect) _PI_CL(piEnqueueMemBufferWrite, cuda_piEnqueueMemBufferWrite) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 3dfa1ba1dbd5c..8b732a58fc7a1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -9,10 +9,129 @@ #include "common.hpp" #include "context.hpp" #include "event.hpp" +#include "kernel.hpp" #include "queue.hpp" +#include #include +ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, CUstream stream, + uint32_t num_events_in_wait_list, + const ur_event_handle_t *event_wait_list) { + if (!event_wait_list) { + return UR_RESULT_SUCCESS; + } + try { + ScopedContext active(command_queue->get_context()); + + auto result = forLatestEvents( + event_wait_list, num_events_in_wait_list, + [stream](ur_event_handle_t event) -> ur_result_t { + if (event->get_stream() == stream) { + return UR_RESULT_SUCCESS; + } else { + return UR_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); + } + }); + + if (result != UR_RESULT_SUCCESS) { + return result; + } + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +// Determine local work sizes that result in uniform work groups. +// The default threadsPerBlock only require handling the first work_dim +// dimension. 
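+// Dimensions 1 and 2 are capped by the device block-dimension limits first;
+// the remaining budget for dimension 0 is then shrunk until it is a power of
+// two that divides the (possibly adjusted) global size.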
+void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, + const size_t *global_work_size, + const size_t maxThreadsPerBlock[3], + ur_kernel_handle_t kernel, uint32_t local_size) { + assert(threadsPerBlock != nullptr); + assert(global_work_size != nullptr); + assert(kernel != nullptr); + int minGrid, maxBlockSize, maxBlockDim[3]; + + static auto isPrime = [](size_t number) -> bool { + auto lastNumToCheck = ceil(sqrt(number)); + if (number < 2) + return false; + if (number == 2) + return true; + if (number % 2 == 0) + return false; + for (int i = 3; i <= lastNumToCheck; i += 2) { + if (number % i == 0) + return false; + } + return true; + }; + + cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + device->get()); + cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + device->get()); + + UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( + &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, + maxThreadsPerBlock[0])); + + threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2])); + threadsPerBlock[1] = + std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], + size_t(maxBlockDim[1]))); + maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); + threadsPerBlock[0] = + std::min(maxThreadsPerBlock[0], + std::min(global_work_size[0], size_t(maxBlockDim[0]))); + + // When global_work_size[0] is prime threadPerBlock[0] will later computed as + // 1, which is not efficient configuration. In such case we use + // global_work_size[0] + 1 to compute threadPerBlock[0]. + int adjusted_0_dim_global_work_size = + (isPrime(global_work_size[0]) && + (threadsPerBlock[0] != global_work_size[0])) + ? global_work_size[0] + 1 + : global_work_size[0]; + + static auto isPowerOf2 = [](size_t value) -> bool { + return value && !(value & (value - 1)); + }; + + // Find a local work group size that is a divisor of the global + // work group size to produce uniform work groups. + // Additionally, for best compute utilisation, the local size has + // to be a power of two. + while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) || + !isPowerOf2(threadsPerBlock[0])) { + --threadsPerBlock[0]; + } +} + +// Helper to verify out-of-registers case (exceeded block max registers). +// If the kernel requires a number of registers for the entire thread +// block exceeds the hardware limitations, then the cuLaunchKernel call +// will fail to launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error. +bool hasExceededMaxRegistersPerBlock(ur_device_handle_t device, + ur_kernel_handle_t kernel, + size_t blockSize) { + int maxRegsPerBlock{0}; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &maxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + device->get())); + + int regsPerThread{0}; + UR_CHECK_ERROR(cuFuncGetAttribute(®sPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, + kernel->get())); + + return blockSize * regsPerThread > size_t(maxRegsPerBlock); +}; + /// Enqueues a wait on the given CUstream for all specified events (See /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued /// wait will wait on all previous events in the queue. 
@@ -108,3 +227,166 @@ ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, phEventWaitList, phEvent); } + +ur_result_t urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + // Preconditions + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hQueue->get_context() == hKernel->get_context(), + UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pGlobalWorkOffset, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + if (*pGlobalWorkSize == 0) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); + } + + // Set the number of threads per block to the number of threads per warp + // by default unless user has provided a better number + size_t threadsPerBlock[3] = {32u, 1u, 1u}; + size_t maxWorkGroupSize = 0u; + size_t maxThreadsPerBlock[3] = {}; + bool providedLocalWorkGroupSize = (pLocalWorkSize != nullptr); + int32_t local_size = hKernel->get_local_size(); + ur_result_t retError = UR_RESULT_SUCCESS; + + try { + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext active(hQueue->get_context()); + { + size_t *reqdThreadsPerBlock = hKernel->reqdThreadsPerBlock_; + maxWorkGroupSize = hQueue->device_->get_max_work_group_size(); + hQueue->device_->get_max_work_item_sizes(sizeof(maxThreadsPerBlock), + maxThreadsPerBlock); + + if (providedLocalWorkGroupSize) { + auto isValid = [&](int dim) { + if (reqdThreadsPerBlock[dim] != 0 && + pLocalWorkSize[dim] != reqdThreadsPerBlock[dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + + if (pLocalWorkSize[dim] > maxThreadsPerBlock[dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + // Checks that local work sizes are a divisor of the global work sizes + // which includes that the local work sizes are neither larger than + // the global work sizes and not 0. + if (0u == pLocalWorkSize[dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + if (0u != (pGlobalWorkSize[dim] % pLocalWorkSize[dim])) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + threadsPerBlock[dim] = pLocalWorkSize[dim]; + return UR_RESULT_SUCCESS; + }; + + size_t kernelLocalWorkGroupSize = 0; + for (size_t dim = 0; dim < workDim; dim++) { + auto err = isValid(dim); + if (err != UR_RESULT_SUCCESS) + return err; + // If no error then sum the total local work size per dim. 
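+        // The summed size feeds hasExceededMaxRegistersPerBlock below to
+        // check the per-block register budget before launching.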
+ kernelLocalWorkGroupSize += pLocalWorkSize[dim]; + } + + if (hasExceededMaxRegistersPerBlock(hQueue->device_, hKernel, + kernelLocalWorkGroupSize)) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + } else { + guessLocalWorkSize(hQueue->device_, threadsPerBlock, pGlobalWorkSize, + maxThreadsPerBlock, hKernel, local_size); + } + } + + if (maxWorkGroupSize < + size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + size_t blocksPerGrid[3] = {1u, 1u, 1u}; + + for (size_t i = 0; i < workDim; i++) { + blocksPerGrid[i] = + (pGlobalWorkSize[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i]; + } + + std::unique_ptr retImplEv{nullptr}; + + uint32_t stream_token; + ur_stream_guard_ guard; + CUstream cuStream = hQueue->get_next_compute_stream( + numEventsInWaitList, phEventWaitList, guard, &stream_token); + CUfunction cuFunc = hKernel->get(); + + retError = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + // Set the implicit global offset parameter if kernel has offset variant + if (hKernel->get_with_offset_parameter()) { + std::uint32_t cuda_implicit_offset[3] = {0, 0, 0}; + if (pGlobalWorkOffset) { + for (size_t i = 0; i < workDim; i++) { + cuda_implicit_offset[i] = + static_cast(pGlobalWorkOffset[i]); + if (pGlobalWorkOffset[i] != 0) { + cuFunc = hKernel->get_with_offset_parameter(); + } + } + } + hKernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset), + cuda_implicit_offset); + } + + auto &argIndices = hKernel->get_arg_indices(); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_KERNEL_LAUNCH, hQueue, cuStream, stream_token)); + retImplEv->start(); + } + + // Set local mem max size if env var is present + static const char *local_mem_sz_ptr = + std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE"); + + if (local_mem_sz_ptr) { + int device_max_local_mem = 0; + cuDeviceGetAttribute( + &device_max_local_mem, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + hQueue->get_device()->get()); + + static const int env_val = std::atoi(local_mem_sz_ptr); + if (env_val <= 0 || env_val > device_max_local_mem) { + setErrorMessage("Invalid value specified for " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + UR_CHECK_ERROR(cuFuncSetAttribute( + cuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, env_val)); + } + + retError = UR_CHECK_ERROR(cuLaunchKernel( + cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], + threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size, + cuStream, const_cast(argIndices.data()), nullptr)); + if (local_size != 0) + hKernel->clear_local_size(); + + if (phEvent) { + retError = retImplEv->record(); + *phEvent = retImplEv.release(); + } + } catch (ur_result_t err) { + retError = err; + } + return retError; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 9d408ff9d939f..c77184d5f226f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -166,9 +166,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( } pDdiTable->pfnDeviceGlobalVariableRead = nullptr; pDdiTable->pfnDeviceGlobalVariableWrite = nullptr; - pDdiTable->pfnEventsWait = nullptr; - pDdiTable->pfnEventsWaitWithBarrier = 
nullptr; - pDdiTable->pfnKernelLaunch = nullptr; + pDdiTable->pfnEventsWait = urEnqueueEventsWait; + pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; pDdiTable->pfnMemBufferCopy = nullptr; pDdiTable->pfnMemBufferCopyRect = nullptr; pDdiTable->pfnMemBufferFill = nullptr; From 8c632473787ec896c98649f9c851704d1b70bc31 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 19 Apr 2023 10:37:02 +0100 Subject: [PATCH 09/45] [SYCL][CUDA][UR] Add missing queue/event entry points to DDI table --- .../ur/adapters/cuda/ur_interface_loader.cpp | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index c77184d5f226f..0ffa5dd53e2f6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -66,14 +66,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnGetProfilingInfo = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnSetCallback = nullptr; - pDdiTable->pfnWait = nullptr; + pDdiTable->pfnCreateWithNativeHandle = urEventCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urEventGetInfo; + pDdiTable->pfnGetNativeHandle = urEventGetNativeHandle; + pDdiTable->pfnGetProfilingInfo = urEventGetProfilingInfo; + pDdiTable->pfnRelease = urEventRelease; + pDdiTable->pfnRetain = urEventRetain; + pDdiTable->pfnSetCallback = urEventSetCallback; + pDdiTable->pfnWait = urEventWait; return UR_RESULT_SUCCESS; } @@ -208,14 +208,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnFinish = nullptr; - pDdiTable->pfnFlush = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnCreate = urQueueCreate; + pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; + pDdiTable->pfnFinish = urQueueFinish; + pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnGetInfo = urQueueGetInfo; + pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; + pDdiTable->pfnRelease = urQueueRelease; + pDdiTable->pfnRetain = urQueueRetain; return UR_RESULT_SUCCESS; } From 76d4c5f9197c165ea1562edabbf2f1b71c0906b7 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 19 Apr 2023 10:57:45 +0100 Subject: [PATCH 10/45] [SYCL][CUDA] Remove unused function from pi_cuda --- sycl/plugins/cuda/pi_cuda.cpp | 38 ----------------------------------- 1 file changed, 38 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index baf4f4a4983d1..af6b5759922d2 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -347,44 +347,6 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { //-- PI API implementation extern "C" { -pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - switch (param_name) { - case 
PI_CONTEXT_INFO_NUM_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, 1); - case PI_CONTEXT_INFO_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, - context->get_device()); - case PI_CONTEXT_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - context->get_reference_count()); - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // These queries should be dealt with in context_impl.cpp by calling the - // queries of each device separately and building the intersection set. - setErrorMessage("These queries should have never come here.", - UR_RESULT_ERROR_INVALID_ARGUMENT); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: - return getInfo(param_value_size, param_value, param_value_size_ret, - true); - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT: - // 2D USM operations currently not supported. - return getInfo(param_value_size, param_value, param_value_size_ret, - false); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - - return PI_ERROR_OUT_OF_RESOURCES; -} - /// \return If available, the first binary that is PTX /// pi_result cuda_piextDeviceSelectBinary(pi_device device, From 3742495c95c15a2dd181a19de7ced6cddad2f364 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 19 Apr 2023 11:26:08 +0100 Subject: [PATCH 11/45] [SYCL][CUDA] Add missing UR_APICALL, UR_APIEXPORT to entry points --- .../unified_runtime/ur/adapters/cuda/device.cpp | 2 +- .../ur/adapters/cuda/enqueue.cpp | 11 +++++------ .../ur/adapters/cuda/platform.cpp | 17 ++++++++--------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index ae987ab4a7c6e..567377be8796f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -997,7 +997,7 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, /// \return UR_RESULT_SUCCESS always since CUDA devices are always root /// devices. -ur_result_t urDeviceRelease(ur_device_handle_t device) { +UR_DLLEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t device) { UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 8b732a58fc7a1..68c70aa1ae9ec 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -136,7 +136,7 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t device, /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued /// wait will wait on all previous events in the queue. 
/// -ur_result_t urEnqueueEventsWaitWithBarrier( +UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { // This function makes one stream work on the previous work (or work @@ -220,15 +220,14 @@ ur_result_t urEnqueueEventsWaitWithBarrier( /// TODO: Add support for multiple streams once the Event class is properly /// refactored. /// -ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { +UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, phEventWaitList, phEvent); } -ur_result_t urEnqueueKernelLaunch( +UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index 5a4e43c320af0..2ca8c516c08e3 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -18,9 +18,9 @@ void enableCUDATracing(); void disableCUDATracing(); -ur_result_t urPlatformGetInfo(ur_platform_handle_t hPlatform, - ur_platform_info_t PlatformInfoType, size_t Size, - void *pPlatformInfo, size_t *pSizeRet) { +UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo( + ur_platform_handle_t hPlatform, ur_platform_info_t PlatformInfoType, + size_t Size, void *pPlatformInfo, size_t *pSizeRet) { UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UrReturnHelper ReturnValue(Size, pPlatformInfo, pSizeRet); @@ -57,7 +57,7 @@ ur_result_t urPlatformGetInfo(ur_platform_handle_t hPlatform, /// However because multiple devices in a context is not currently supported, /// place each device in a separate platform. 
/// -ur_result_t urPlatformGet(uint32_t NumEntries, +UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { @@ -163,8 +163,8 @@ ur_result_t urPlatformGet(uint32_t NumEntries, } } -ur_result_t urPlatformGetApiVersion(ur_platform_handle_t hDriver, - ur_api_version_t *pVersion) { +UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( + ur_platform_handle_t hDriver, ur_api_version_t *pVersion) { UR_ASSERT(hDriver, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pVersion, UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -172,13 +172,12 @@ ur_result_t urPlatformGetApiVersion(ur_platform_handle_t hDriver, return UR_RESULT_SUCCESS; } -ur_result_t urInit(ur_device_init_flags_t) { +UR_DLLEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t) { enableCUDATracing(); return UR_RESULT_SUCCESS; } -ur_result_t urTearDown(void *) { +UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { disableCUDATracing(); return UR_RESULT_SUCCESS; } - From 7e0f0ecd636839e820babea0ceaa38e92c1fd697 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Wed, 19 Apr 2023 11:34:37 +0100 Subject: [PATCH 12/45] Small fixes --- sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index bca41b4c0b5ba..129f4eb06b81e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -113,8 +113,9 @@ ur_result_t ur_program_handle_t_::set_binary(const char *source, } ur_result_t ur_program_handle_t_::build_program(const char *build_options) { - - this->buildOptions_ = build_options; + if (build_options) { + this->buildOptions_ = build_options; + } constexpr const unsigned int numberOfOptions = 4u; From 17f91fc331e90b29065db3b2c4c7f5d170bb9ab3 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Tue, 18 Apr 2023 11:48:29 +0100 Subject: [PATCH 13/45] [SYCL][PI][UR][CUDA] Port CUDA sampler to UR --- sycl/plugins/cuda/CMakeLists.txt | 2 + sycl/plugins/cuda/pi_cuda.cpp | 146 +----------------- sycl/plugins/cuda/pi_cuda.hpp | 16 +- sycl/plugins/unified_runtime/CMakeLists.txt | 2 + .../ur/adapters/cuda/sampler.cpp | 84 ++++++++++ .../ur/adapters/cuda/sampler.hpp | 29 ++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 8 +- 7 files changed, 128 insertions(+), 159 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 6339f1e3466ea..7b8bb0377684e 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -72,6 +72,8 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/kernel.hpp" "../unified_runtime/ur/adapters/cuda/queue.hpp" "../unified_runtime/ur/adapters/cuda/queue.cpp" + "../unified_runtime/ur/adapters/cuda/sampler.cpp" + "../unified_runtime/ur/adapters/cuda/sampler.hpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" "../unified_runtime/ur/adapters/cuda/tracing.cpp" # --- diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index af6b5759922d2..0c2cc178eeec6 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -990,144 +990,6 @@ pi_result cuda_piMemRetain(pi_mem mem) { return PI_SUCCESS; } -/// Creates a 
PI sampler object -/// -/// \param[in] context The context the sampler is created for. -/// \param[in] sampler_properties The properties for the sampler. -/// \param[out] result_sampler Set to the resulting sampler object. -/// -/// \return PI_SUCCESS on success. PI_ERROR_INVALID_VALUE if given an invalid -/// property -/// or if there is multiple of properties from the same category. -pi_result cuda_piSamplerCreate(pi_context context, - const pi_sampler_properties *sampler_properties, - pi_sampler *result_sampler) { - std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)}; - - bool propSeen[3] = {false, false, false}; - for (size_t i = 0; sampler_properties[i] != 0; i += 2) { - switch (sampler_properties[i]) { - case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: - if (propSeen[0]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[0] = true; - retImplSampl->props_ |= sampler_properties[i + 1]; - break; - case PI_SAMPLER_PROPERTIES_FILTER_MODE: - if (propSeen[1]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[1] = true; - retImplSampl->props_ |= - (sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1; - break; - case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: - if (propSeen[2]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[2] = true; - retImplSampl->props_ |= - (sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2; - break; - default: - return PI_ERROR_INVALID_VALUE; - } - } - - if (!propSeen[0]) { - retImplSampl->props_ |= PI_TRUE; - } - // Default filter mode to PI_SAMPLER_FILTER_MODE_NEAREST - if (!propSeen[2]) { - retImplSampl->props_ |= - (PI_SAMPLER_ADDRESSING_MODE_CLAMP % PI_SAMPLER_ADDRESSING_MODE_NONE) - << 2; - } - - *result_sampler = retImplSampl.release(); - return PI_SUCCESS; -} - -/// Gets information from a PI sampler object -/// -/// \param[in] sampler The sampler to get the information from. -/// \param[in] param_name The name of the information to get. -/// \param[in] param_value_size The size of the param_value. -/// \param[out] param_value Set to information value. -/// \param[out] param_value_size_ret Set to the size of the information value. -/// -/// \return PI_SUCCESS on success. -pi_result cuda_piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(sampler != nullptr); - - switch (param_name) { - case PI_SAMPLER_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - sampler->get_reference_count()); - case PI_SAMPLER_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - sampler->context_); - case PI_SAMPLER_INFO_NORMALIZED_COORDS: { - pi_bool norm_coords_prop = static_cast(sampler->props_ & 0x1); - return getInfo(param_value_size, param_value, param_value_size_ret, - norm_coords_prop); - } - case PI_SAMPLER_INFO_FILTER_MODE: { - pi_sampler_filter_mode filter_prop = static_cast( - ((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST); - return getInfo(param_value_size, param_value, param_value_size_ret, - filter_prop); - } - case PI_SAMPLER_INFO_ADDRESSING_MODE: { - pi_sampler_addressing_mode addressing_prop = - static_cast( - (sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE); - return getInfo(param_value_size, param_value, param_value_size_ret, - addressing_prop); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - return {}; -} - -/// Retains a PI sampler object, incrementing its reference count. 
-/// -/// \param[in] sampler The sampler to increment the reference count of. -/// -/// \return PI_SUCCESS. -pi_result cuda_piSamplerRetain(pi_sampler sampler) { - assert(sampler != nullptr); - sampler->increment_reference_count(); - return PI_SUCCESS; -} - -/// Releases a PI sampler object, decrementing its reference count. If the -/// reference count reaches zero, the sampler object is destroyed. -/// -/// \param[in] sampler The sampler to decrement the reference count of. -/// -/// \return PI_SUCCESS. -pi_result cuda_piSamplerRelease(pi_sampler sampler) { - assert(sampler != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - sycl::detail::pi::assertion( - sampler->get_reference_count() != 0, - "Reference count overflow detected in cuda_piSamplerRelease."); - - // decrement ref count. If it is 0, delete the sampler. - if (sampler->decrement_reference_count() == 0) { - delete sampler; - } - - return PI_SUCCESS; -} - /// General 3D memory copy operation. /// This function requires the corresponding CUDA context to be at the top of /// the context stack @@ -2667,10 +2529,10 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEventCreateWithNativeHandle, pi2ur::piextEventCreateWithNativeHandle) // Sampler - _PI_CL(piSamplerCreate, cuda_piSamplerCreate) - _PI_CL(piSamplerGetInfo, cuda_piSamplerGetInfo) - _PI_CL(piSamplerRetain, cuda_piSamplerRetain) - _PI_CL(piSamplerRelease, cuda_piSamplerRelease) + _PI_CL(piSamplerCreate, pi2ur::piSamplerCreate) + _PI_CL(piSamplerGetInfo, pi2ur::piSamplerGetInfo) + _PI_CL(piSamplerRetain, pi2ur::piSamplerRetain) + _PI_CL(piSamplerRelease, pi2ur::piSamplerRelease) // Queue commands _PI_CL(piEnqueueKernelLaunch, pi2ur::piEnqueueKernelLaunch) _PI_CL(piEnqueueNativeKernel, cuda_piEnqueueNativeKernel) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 1a8c7e64537cd..0df35e53c2d27 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -49,6 +49,7 @@ #include #include #include +#include // Share code between the PI Plugin and UR Adapter #include @@ -406,19 +407,8 @@ struct _pi_kernel : ur_kernel_handle_t_ { /// Sampler property layout: /// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | /// | N/A | addressing mode | fiter mode | normalize coords | -struct _pi_sampler { - std::atomic_uint32_t refCount_; - pi_uint32 props_; - pi_context context_; - - _pi_sampler(pi_context context) - : refCount_(1), props_(0), context_(context) {} - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_sampler : ur_sampler_handle_t_ { + using ur_sampler_handle_t_::ur_sampler_handle_t_; }; // ------------------------------------------------------------- diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index dc572bd5e7e9c..13ac8a5a1e138 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -148,6 +148,8 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/kernel.hpp" "ur/adapters/cuda/queue.cpp" "ur/adapters/cuda/queue.hpp" + "ur/adapters/cuda/sampler.cpp" + "ur/adapters/cuda/sampler.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" "ur/adapters/cuda/tracing.cpp" INCLUDE_DIRS diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp new file mode 100644 index 0000000000000..c07f548c92a26 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp @@ -0,0 +1,84 @@ +//===--------- sampler.cpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "sampler.hpp" +#include "common.hpp" + +ur_result_t urSamplerCreate(ur_context_handle_t hContext, + const ur_sampler_desc_t *pDesc, + ur_sampler_handle_t *phSampler) { + std::unique_ptr retImplSampl{ + new ur_sampler_handle_t_(hContext)}; + + if (pDesc && pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { + retImplSampl->props_ |= pDesc->normalizedCoords; + retImplSampl->props_ |= (pDesc->filterMode << 1); + retImplSampl->props_ |= (pDesc->addressingMode << 2); + } else { + // Set default values + retImplSampl->props_ |= true; // Normalized Coords + retImplSampl->props_ |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; + } + + *phSampler = retImplSampl.release(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, + ur_sampler_info_t propName, size_t propValueSize, + void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_SAMPLER_INFO_REFERENCE_COUNT: + return ReturnValue(hSampler->get_reference_count()); + case UR_SAMPLER_INFO_CONTEXT: + return ReturnValue(hSampler->context_); + case UR_SAMPLER_INFO_NORMALIZED_COORDS: { + bool norm_coords_prop = static_cast(hSampler->props_); + return ReturnValue(norm_coords_prop); + } + case UR_SAMPLER_INFO_FILTER_MODE: { + auto filter_prop = + static_cast(((hSampler->props_ >> 1) & 0x1)); + return ReturnValue(filter_prop); + } + case UR_SAMPLER_INFO_ADDRESSING_MODE: { + auto addressing_prop = + static_cast(hSampler->props_ >> 2); + return ReturnValue(addressing_prop); + } + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + return 
{}; +} + +ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + hSampler->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + sycl::detail::ur::assertion( + hSampler->get_reference_count() != 0, + "Reference count overflow detected in urSamplerRelease."); + + // decrement ref count. If it is 0, delete the sampler. + if (hSampler->decrement_reference_count() == 0) { + delete hSampler; + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp new file mode 100644 index 0000000000000..61ed98325a5ed --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp @@ -0,0 +1,29 @@ +//===--------- sampler.hpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +/// Implementation of samplers for CUDA +/// +/// Sampler property layout: +/// | 31 30 ... 6 5 | 4 3 2 | 1 | 0 | +/// | N/A | addressing mode | fiter mode | normalize coords | +struct ur_sampler_handle_t_ { + std::atomic_uint32_t refCount_; + uint32_t props_; + ur_context_handle_t context_; + + ur_sampler_handle_t_(ur_context_handle_t context) + : refCount_(1), props_(0), context_(context) {} + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 0ffa5dd53e2f6..06ae75db02dec 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -130,12 +130,12 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreate = urSamplerCreate; pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetInfo = urSamplerGetInfo; pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnRelease = urSamplerRelease; + pDdiTable->pfnRetain = urSamplerRetain; return UR_RESULT_SUCCESS; } From 6489ce147eb74d29930bfeb86d958dcd0b76344f Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 21 Apr 2023 14:51:21 +0100 Subject: [PATCH 14/45] [SYCL][CUDA] Fix missing input validation for various queue entry points --- .../ur/adapters/cuda/queue.cpp | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp index 1d10cedd82c91..371c3363b4e75 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ 
-122,6 +122,9 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { try { std::unique_ptr queueImpl{nullptr}; + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); if (hContext->get_device() != hDevice) { *phQueue = nullptr; @@ -167,7 +170,7 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, } UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { - assert(hQueue != nullptr); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); assert(hQueue->get_reference_count() > 0); hQueue->increment_reference_count(); @@ -175,7 +178,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { } UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { - assert(hQueue != nullptr); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); if (hQueue->decrement_reference_count() > 0) { return UR_RESULT_SUCCESS; @@ -206,9 +209,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { ur_result_t result = UR_RESULT_SUCCESS; try { - - assert(hQueue != - nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); ScopedContext active(hQueue->get_context()); hQueue->sync_streams([&result](CUstream s) { @@ -231,12 +232,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { // same problem of having to flush cross-queue dependencies as some of the // other plugins, so it can be left as no-op. UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { - (void)hQueue; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( ur_queue_handle_t hQueue, ur_native_handle_t *phNativeQueue) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ScopedContext active(hQueue->get_context()); *phNativeQueue = reinterpret_cast(hQueue->get_next_compute_stream()); @@ -285,7 +289,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, size_t propValueSize, void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pPropValue || pPropSizeRet, UR_RESULT_ERROR_INVALID_NULL_POINTER); UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); @@ -321,6 +326,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, default: break; } - sycl::detail::ur::die("Queue info request not implemented"); - return {}; + + return UR_RESULT_ERROR_INVALID_ENUMERATION; } From ebe90a2117c1fcd5e70c41626c8acd6c49b636f5 Mon Sep 17 00:00:00 2001 From: Martin Morrison-Grant Date: Fri, 21 Apr 2023 10:40:49 +0100 Subject: [PATCH 15/45] Refactor memory object and entry points into new memory.hpp/cpp files. Add entry points to DDI table. 
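
The piMem* entry points in pi_cuda.cpp now forward through pi2ur, and small
helpers (map_ur_mem_type, pi2urMemFlags, pi2urMapFlags) translate memory types
and flag bitfields between the PI and UR representations for the image and
buffer-map paths that still live in the PI plugin.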
--- sycl/plugins/cuda/CMakeLists.txt | 2 + sycl/plugins/cuda/pi_cuda.cpp | 498 +++-------------- sycl/plugins/cuda/pi_cuda.hpp | 170 +----- sycl/plugins/unified_runtime/CMakeLists.txt | 2 + .../ur/adapters/cuda/memory.cpp | 513 ++++++++++++++++++ .../ur/adapters/cuda/memory.hpp | 195 +++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 20 +- 7 files changed, 801 insertions(+), 599 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 7b8bb0377684e..da4ce9476ee91 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -76,6 +76,8 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/sampler.hpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" "../unified_runtime/ur/adapters/cuda/tracing.cpp" + "../unified_runtime/ur/adapters/cuda/memory.cpp" + "../unified_runtime/ur/adapters/cuda/memory.hpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 0c2cc178eeec6..ab0d428e3613a 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -94,6 +94,62 @@ pi_result map_ur_error(ur_result_t result) { } } +pi_mem_type map_ur_mem_type(ur_mem_type_t mem_type) { + switch (mem_type) { + case UR_MEM_TYPE_BUFFER: + default: + return PI_MEM_TYPE_BUFFER; + case UR_MEM_TYPE_IMAGE2D: + return PI_MEM_TYPE_IMAGE2D; + case UR_MEM_TYPE_IMAGE3D: + return PI_MEM_TYPE_IMAGE3D; + case UR_MEM_TYPE_IMAGE2D_ARRAY: + return PI_MEM_TYPE_IMAGE2D_ARRAY; + case UR_MEM_TYPE_IMAGE1D: + return PI_MEM_TYPE_IMAGE1D; + case UR_MEM_TYPE_IMAGE1D_ARRAY: + return PI_MEM_TYPE_IMAGE1D_ARRAY; + case UR_MEM_TYPE_IMAGE1D_BUFFER: + return PI_MEM_TYPE_IMAGE1D_BUFFER; + } +} + +template +inline pi_result +ConvertInputBitfield(pi_bitfield in, TypeOut *out, + const std::unordered_map &map) { + *out = 0; + for (auto &[FlagPI, FlagUR] : map) { + if (in & FlagPI) { + *out |= FlagUR; + } + } + + return PI_SUCCESS; +} + +// Convert bitfield flags from PI to UR for MemFlags +inline pi_result pi2urMemFlags(pi_mem_flags piFlags, ur_mem_flags_t *urFlags) { + static const std::unordered_map MemFlagsMap = { + {PI_MEM_FLAGS_ACCESS_RW, UR_MEM_FLAG_READ_WRITE}, + {PI_MEM_ACCESS_READ_ONLY, UR_MEM_FLAG_READ_ONLY}, + {PI_MEM_FLAGS_HOST_PTR_USE, UR_MEM_FLAG_USE_HOST_POINTER}, + {PI_MEM_FLAGS_HOST_PTR_COPY, UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER}, + {PI_MEM_FLAGS_HOST_PTR_ALLOC, UR_MEM_FLAG_ALLOC_HOST_POINTER}, + }; + + return ConvertInputBitfield(piFlags, urFlags, MemFlagsMap); +} + +// Convert bitfield flags from PI to UR for MapFlags +inline pi_result pi2urMapFlags(pi_mem_flags piFlags, ur_mem_flags_t *urFlags) { + static const std::unordered_map MapFlagsMap = { + {PI_MAP_READ, UR_MAP_FLAG_READ}, + {PI_MAP_WRITE, UR_MAP_FLAG_WRITE}, + }; + return ConvertInputBitfield(piFlags, urFlags, MapFlagsMap); +} + // Iterates over the event wait list, returns correct pi_result error codes. // Invokes the callback for the latest event of each queue in the wait list. // The callback must take a single pi_event argument and return a pi_result. @@ -400,245 +456,6 @@ pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -/// Creates a PI Memory object using a CUDA memory allocation. -/// Can trigger a manual copy depending on the mode. 
-/// \TODO Implement USE_HOST_PTR using cuHostRegister -/// -pi_result -cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, - void *host_ptr, pi_mem *ret_mem, - [[maybe_unused]] const pi_mem_properties *properties) { - // Need input memory object - assert(ret_mem != nullptr); - assert((properties == nullptr || *properties == 0) && - "no mem properties goes to cuda RT yet"); - // Currently, USE_HOST_PTR is not implemented using host register - // since this triggers a weird segfault after program ends. - // Setting this constant to true enables testing that behavior. - const bool enableUseHostPtr = false; - const bool performInitialCopy = - (flags & PI_MEM_FLAGS_HOST_PTR_COPY) || - ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && !enableUseHostPtr); - pi_result retErr = PI_SUCCESS; - pi_mem retMemObj = nullptr; - - try { - ScopedContext active(context); - CUdeviceptr ptr; - _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = - _pi_mem::mem_::buffer_mem_::alloc_mode::classic; - - if ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && enableUseHostPtr) { - retErr = PI_CHECK_ERROR( - cuMemHostRegister(host_ptr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); - retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0)); - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr; - } else if (flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { - retErr = PI_CHECK_ERROR(cuMemAllocHost(&host_ptr, size)); - retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0)); - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - } else { - retErr = PI_CHECK_ERROR(cuMemAlloc(&ptr, size)); - if (flags & PI_MEM_FLAGS_HOST_PTR_COPY) { - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in; - } - } - - if (retErr == PI_SUCCESS) { - pi_mem parentBuffer = nullptr; - - auto piMemObj = std::unique_ptr<_pi_mem>( - new _pi_mem{context, parentBuffer, allocMode, ptr, host_ptr, size}); - if (piMemObj != nullptr) { - retMemObj = piMemObj.release(); - if (performInitialCopy) { - // Operates on the default stream of the current CUDA context. - retErr = PI_CHECK_ERROR(cuMemcpyHtoD(ptr, host_ptr, size)); - // Synchronize with default stream implicitly used by cuMemcpyHtoD - // to make buffer data available on device before any other PI call - // uses it. - if (retErr == PI_SUCCESS) { - CUstream defaultStream = 0; - retErr = PI_CHECK_ERROR(cuStreamSynchronize(defaultStream)); - } - } - } else { - retErr = PI_ERROR_OUT_OF_HOST_MEMORY; - } - } - } catch (pi_result err) { - retErr = err; - } catch (...) { - retErr = PI_ERROR_OUT_OF_RESOURCES; - } - - *ret_mem = retMemObj; - - return retErr; -} - -/// Decreases the reference count of the Mem object. 
-/// If this is zero, calls the relevant CUDA Free function -/// \return PI_SUCCESS unless deallocation error -/// -pi_result cuda_piMemRelease(pi_mem memObj) { - assert((memObj != nullptr) && "PI_ERROR_INVALID_MEM_OBJECTS"); - - pi_result ret = PI_SUCCESS; - - try { - - // Do nothing if there are other references - if (memObj->decrement_reference_count() > 0) { - return PI_SUCCESS; - } - - // make sure memObj is released in case PI_CHECK_ERROR throws - std::unique_ptr<_pi_mem> uniqueMemObj(memObj); - - if (memObj->is_sub_buffer()) { - return PI_SUCCESS; - } - - ScopedContext active(uniqueMemObj->get_context()); - - if (memObj->mem_type_ == _pi_mem::mem_type::buffer) { - switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { - case _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in: - case _pi_mem::mem_::buffer_mem_::alloc_mode::classic: - ret = PI_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_)); - break; - case _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr: - ret = PI_CHECK_ERROR( - cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); - break; - case _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr: - ret = PI_CHECK_ERROR( - cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); - }; - } else if (memObj->mem_type_ == _pi_mem::mem_type::surface) { - ret = PI_CHECK_ERROR( - cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface())); - ret = PI_CHECK_ERROR( - cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array())); - } - - } catch (pi_result err) { - ret = err; - } catch (...) { - ret = PI_ERROR_OUT_OF_RESOURCES; - } - - if (ret != PI_SUCCESS) { - // A reported CUDA error is either an implementation or an asynchronous CUDA - // error for which it is unclear if the function that reported it succeeded - // or not. Either way, the state of the program is compromised and likely - // unrecoverable. - sycl::detail::pi::die( - "Unrecoverable program state reached in cuda_piMemRelease"); - } - - return PI_SUCCESS; -} - -/// Implements a buffer partition in the CUDA backend. -/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented -/// as an offset over an existing CUDA allocation. -/// -pi_result cuda_piMemBufferPartition( - pi_mem parent_buffer, pi_mem_flags flags, - [[maybe_unused]] pi_buffer_create_type buffer_create_type, - void *buffer_create_info, pi_mem *memObj) { - assert((parent_buffer != nullptr) && "PI_ERROR_INVALID_MEM_OBJECT"); - assert(parent_buffer->is_buffer() && "PI_ERROR_INVALID_MEM_OBJECTS"); - assert(!parent_buffer->is_sub_buffer() && "PI_ERROR_INVALID_MEM_OBJECT"); - - // Default value for flags means PI_MEM_FLAGS_ACCCESS_RW. - if (flags == 0) { - flags = PI_MEM_FLAGS_ACCESS_RW; - } - - assert((flags == PI_MEM_FLAGS_ACCESS_RW) && "PI_ERROR_INVALID_VALUE"); - assert((buffer_create_type == PI_BUFFER_CREATE_TYPE_REGION) && - "PI_ERROR_INVALID_VALUE"); - assert((buffer_create_info != nullptr) && "PI_ERROR_INVALID_VALUE"); - assert(memObj != nullptr); - - const auto bufferRegion = - *reinterpret_cast(buffer_create_info); - assert((bufferRegion.size != 0u) && "PI_ERROR_INVALID_BUFFER_SIZE"); - - assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) && - "Overflow"); - assert(((bufferRegion.origin + bufferRegion.size) <= - parent_buffer->mem_.buffer_mem_.get_size()) && - "PI_ERROR_INVALID_BUFFER_SIZE"); - // Retained indirectly due to retaining parent buffer below. 
- pi_context context = parent_buffer->context_; - _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = - _pi_mem::mem_::buffer_mem_::alloc_mode::classic; - - assert(parent_buffer->mem_.buffer_mem_.ptr_ != - _pi_mem::mem_::buffer_mem_::native_type{0}); - _pi_mem::mem_::buffer_mem_::native_type ptr = - parent_buffer->mem_.buffer_mem_.ptr_ + bufferRegion.origin; - - void *hostPtr = nullptr; - if (parent_buffer->mem_.buffer_mem_.hostPtr_) { - hostPtr = static_cast(parent_buffer->mem_.buffer_mem_.hostPtr_) + - bufferRegion.origin; - } - - std::unique_ptr<_pi_mem> retMemObj{nullptr}; - try { - retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{ - context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}}; - } catch (pi_result err) { - *memObj = nullptr; - return err; - } catch (...) { - *memObj = nullptr; - return PI_ERROR_OUT_OF_HOST_MEMORY; - } - - *memObj = retMemObj.release(); - return PI_SUCCESS; -} - -pi_result cuda_piMemGetInfo(pi_mem, pi_mem_info, size_t, void *, size_t *) { - sycl::detail::pi::die("cuda_piMemGetInfo not implemented"); -} - -/// Gets the native CUDA handle of a PI mem object -/// -/// \param[in] mem The PI mem to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI mem object. -/// -/// \return PI_SUCCESS -pi_result cuda_piextMemGetNativeHandle(pi_mem mem, - pi_native_handle *nativeHandle) { - *nativeHandle = static_cast(mem->mem_.buffer_mem_.get()); - return PI_SUCCESS; -} - -/// Created a PI mem object from a CUDA mem handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI mem object from. -/// \param[in] context The PI context of the memory allocation. -/// \param[in] ownNativeHandle Indicates if we own the native memory handle or -/// it came from interop that asked to not transfer the ownership to SYCL RT. -/// \param[out] mem Set to the PI mem object created from native handle. -/// -/// \return TBD -pi_result cuda_piextMemCreateWithNativeHandle(pi_native_handle, pi_context, - bool, pi_mem *) { - sycl::detail::pi::die( - "Creation of PI mem from native handle not implemented"); - return {}; -} - /// Created a PI image mem object from a CUDA image mem handle. /// TODO: Implement this. /// NOTE: The created PI object takes ownership of the native handle. @@ -820,176 +637,6 @@ pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, return {}; } -/// \TODO Not implemented -pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags, - const pi_image_format *image_format, - const pi_image_desc *image_desc, void *host_ptr, - pi_mem *ret_mem) { - // Need input memory object - assert(ret_mem != nullptr); - const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) || - ((flags & PI_MEM_FLAGS_HOST_PTR_USE)); - pi_result retErr = PI_SUCCESS; - - // We only support RBGA channel order - // TODO: check SYCL CTS and spec. May also have to support BGRA - if (image_format->image_channel_order != - pi_image_channel_order::PI_IMAGE_CHANNEL_ORDER_RGBA) { - sycl::detail::pi::die( - "cuda_piMemImageCreate only supports RGBA channel order"); - } - - // We have to use cuArray3DCreate, which has some caveats. The height and - // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc gives - // a minimum value of 1, so we need to convert the answer. 
- CUDA_ARRAY3D_DESCRIPTOR array_desc; - array_desc.NumChannels = 4; // Only support 4 channel image - array_desc.Flags = 0; // No flags required - array_desc.Width = image_desc->image_width; - if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) { - array_desc.Height = 0; - array_desc.Depth = 0; - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) { - array_desc.Height = image_desc->image_height; - array_desc.Depth = 0; - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) { - array_desc.Height = image_desc->image_height; - array_desc.Depth = image_desc->image_depth; - } - - // We need to get this now in bytes for calculating the total image size later - size_t pixel_type_size_bytes; - - switch (image_format->image_channel_data_type) { - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - pixel_type_size_bytes = 1; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT8; - pixel_type_size_bytes = 1; - break; - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16: - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT16; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - array_desc.Format = CU_AD_FORMAT_HALF; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32; - pixel_type_size_bytes = 4; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT32; - pixel_type_size_bytes = 4; - break; - case PI_IMAGE_CHANNEL_TYPE_FLOAT: - array_desc.Format = CU_AD_FORMAT_FLOAT; - pixel_type_size_bytes = 4; - break; - default: - sycl::detail::pi::die( - "cuda_piMemImageCreate given unsupported image_channel_data_type"); - } - - // When a dimension isn't used image_desc has the size set to 1 - size_t pixel_size_bytes = - pixel_type_size_bytes * 4; // 4 is the only number of channels we support - size_t image_size_bytes = pixel_size_bytes * image_desc->image_width * - image_desc->image_height * image_desc->image_depth; - - ScopedContext active(context); - CUarray image_array; - retErr = PI_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc)); - - try { - if (performInitialCopy) { - // We have to use a different copy function for each image dimensionality - if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - cuMemcpyHtoA(image_array, 0, host_ptr, image_size_bytes)); - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = host_ptr; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = image_array; - cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; - cpy_desc.Height = image_desc->image_height; - retErr = PI_CHECK_ERROR(cuMemcpy2D(&cpy_desc)); - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = host_ptr; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = image_array; - cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; - 
cpy_desc.Height = image_desc->image_height; - cpy_desc.Depth = image_desc->image_depth; - retErr = PI_CHECK_ERROR(cuMemcpy3D(&cpy_desc)); - } - } - - // CUDA_RESOURCE_DESC is a union of different structs, shown here - // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html - // We need to fill it as described here to use it for a surface or texture - // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html - // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and - // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array - // handle. - // CUDA_RESOURCE_DESC::flags must be set to zero - - CUDA_RESOURCE_DESC image_res_desc; - image_res_desc.res.array.hArray = image_array; - image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY; - image_res_desc.flags = 0; - - CUsurfObject surface; - retErr = PI_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); - - auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ - context, image_array, surface, image_desc->image_type, host_ptr}); - - if (piMemObj == nullptr) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } - - *ret_mem = piMemObj.release(); - } catch (pi_result err) { - cuArrayDestroy(image_array); - return err; - } catch (...) { - cuArrayDestroy(image_array); - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -/// \TODO Not implemented -pi_result cuda_piMemImageGetInfo(pi_mem, pi_image_info, size_t, void *, - size_t *) { - sycl::detail::pi::die("cuda_piMemImageGetInfo not implemented"); - return {}; -} - -pi_result cuda_piMemRetain(pi_mem mem) { - assert(mem != nullptr); - assert(mem->get_reference_count() > 0); - mem->increment_reference_count(); - return PI_SUCCESS; -} - /// General 3D memory copy operation. /// This function requires the corresponding CUDA context to be at the top of /// the context stack @@ -1460,7 +1107,8 @@ pi_result cuda_piEnqueueMemImageRead( size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels; size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0]; - pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); + pi_mem_type imgType = + map_ur_mem_type(image->mem_.surface_mem_.get_image_type()); if (imgType == PI_MEM_TYPE_IMAGE1D) { retErr = PI_CHECK_ERROR( cuMemcpyAtoHAsync(ptr, array, byteOffsetX, bytesToCopy, cuStream)); @@ -1530,7 +1178,8 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels; size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0]; - pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); + pi_mem_type imgType = + map_ur_mem_type(image->mem_.surface_mem_.get_image_type()); if (imgType == PI_MEM_TYPE_IMAGE1D) { retErr = PI_CHECK_ERROR( cuMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, cuStream)); @@ -1601,7 +1250,8 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, src_origin[0] * elementByteSize * dstArrayDesc.NumChannels; size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0]; - pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type(); + pi_mem_type imgType = + map_ur_mem_type(src_image->mem_.surface_mem_.get_image_type()); if (imgType == PI_MEM_TYPE_IMAGE1D) { retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, srcByteOffsetX, bytesToCopy)); @@ -1669,7 +1319,10 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, } // Allocate a pointer in the host to store the 
mapped information - auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags); + // TODO(ur): Remove conversion when this is ported to UR. + ur_map_flags_t map_flags_ur; + pi2urMapFlags(map_flags, &map_flags_ur); + auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags_ur); *ret_map = buffer->mem_.buffer_mem_.get_map_ptr(); if (hostPtr) { ret_err = PI_SUCCESS; @@ -2477,15 +2130,16 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextQueueCreateWithNativeHandle2, pi2ur::piextQueueCreateWithNativeHandle2) // Memory - _PI_CL(piMemBufferCreate, cuda_piMemBufferCreate) - _PI_CL(piMemImageCreate, cuda_piMemImageCreate) - _PI_CL(piMemGetInfo, cuda_piMemGetInfo) - _PI_CL(piMemImageGetInfo, cuda_piMemImageGetInfo) - _PI_CL(piMemRetain, cuda_piMemRetain) - _PI_CL(piMemRelease, cuda_piMemRelease) - _PI_CL(piMemBufferPartition, cuda_piMemBufferPartition) - _PI_CL(piextMemGetNativeHandle, cuda_piextMemGetNativeHandle) - _PI_CL(piextMemCreateWithNativeHandle, cuda_piextMemCreateWithNativeHandle) + _PI_CL(piMemBufferCreate, pi2ur::piMemBufferCreate) + _PI_CL(piMemImageCreate, pi2ur::piMemImageCreate) + _PI_CL(piMemGetInfo, pi2ur::piMemGetInfo) + _PI_CL(piMemImageGetInfo, pi2ur::piMemImageGetInfo) + _PI_CL(piMemRetain, pi2ur::piMemRetain) + _PI_CL(piMemRelease, pi2ur::piMemRelease) + _PI_CL(piMemBufferPartition, pi2ur::piMemBufferPartition) + _PI_CL(piextMemGetNativeHandle, pi2ur::piextMemGetNativeHandle) + _PI_CL(piextMemCreateWithNativeHandle, pi2ur::piextMemCreateWithNativeHandle) + // Program _PI_CL(piProgramCreate, pi2ur::piProgramCreate) _PI_CL(piclProgramCreateWithSource, pi2ur::piclProgramCreateWithSource) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 0df35e53c2d27..c1c84fa2a4557 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -50,6 +50,7 @@ #include #include #include +#include // Share code between the PI Plugin and UR Adapter #include @@ -128,173 +129,8 @@ struct _pi_context : ur_context_handle_t_ { /// \brief Represents non-SVM allocations on the CUDA backend. /// Keeps tracks of all mapped regions used for Map/Unmap calls. /// Only one region can be active at the same time per allocation. -struct _pi_mem { - - // TODO: Move as much shared data up as possible - using pi_context = _pi_context *; - - // Context where the memory object is accessibles - pi_context context_; - - /// Reference counting of the handler - std::atomic_uint32_t refCount_; - enum class mem_type { buffer, surface } mem_type_; - - /// A PI Memory object represents either plain memory allocations ("Buffers" - /// in OpenCL) or typed allocations ("Images" in OpenCL). - /// In CUDA their API handlers are different. Whereas "Buffers" are allocated - /// as pointer-like structs, "Images" are stored in Textures or Surfaces - /// This union allows implementation to use either from the same handler. - union mem_ { - // Handler for plain, pointer-based CUDA allocations - struct buffer_mem_ { - using native_type = CUdeviceptr; - - // If this allocation is a sub-buffer (i.e., a view on an existing - // allocation), this is the pointer to the parent handler structure - pi_mem parent_; - // CUDA handler for the pointer - native_type ptr_; - - /// Pointer associated with this device on the host - void *hostPtr_; - /// Size of the allocation in bytes - size_t size_; - /// Offset of the active mapped region. 
- size_t mapOffset_; - /// Pointer to the active mapped region, if any - void *mapPtr_; - /// Original flags for the mapped region - pi_map_flags mapFlags_; - - /** alloc_mode - * classic: Just a normal buffer allocated on the device via cuda malloc - * use_host_ptr: Use an address on the host for the device - * copy_in: The data for the device comes from the host but the host - pointer is not available later for re-use - * alloc_host_ptr: Uses pinned-memory allocation - */ - enum class alloc_mode { - classic, - use_host_ptr, - copy_in, - alloc_host_ptr - } allocMode_; - - native_type get() const noexcept { return ptr_; } - - size_t get_size() const noexcept { return size_; } - - void *get_map_ptr() const noexcept { return mapPtr_; } - - size_t get_map_offset(void *) const noexcept { return mapOffset_; } - - /// Returns a pointer to data visible on the host that contains - /// the data on the device associated with this allocation. - /// The offset is used to index into the CUDA allocation. - /// - void *map_to_ptr(size_t offset, pi_map_flags flags) noexcept { - assert(mapPtr_ == nullptr); - mapOffset_ = offset; - mapFlags_ = flags; - if (hostPtr_) { - mapPtr_ = static_cast(hostPtr_) + offset; - } else { - // TODO: Allocate only what is needed based on the offset - mapPtr_ = static_cast(malloc(this->get_size())); - } - return mapPtr_; - } - - /// Detach the allocation from the host memory. - void unmap(void *) noexcept { - assert(mapPtr_ != nullptr); - - if (mapPtr_ != hostPtr_) { - free(mapPtr_); - } - mapPtr_ = nullptr; - mapOffset_ = 0; - } - - pi_map_flags get_map_flags() const noexcept { - assert(mapPtr_ != nullptr); - return mapFlags_; - } - } buffer_mem_; - - // Handler data for surface object (i.e. Images) - struct surface_mem_ { - CUarray array_; - CUsurfObject surfObj_; - pi_mem_type imageType_; - - CUarray get_array() const noexcept { return array_; } - - CUsurfObject get_surface() const noexcept { return surfObj_; } - - pi_mem_type get_image_type() const noexcept { return imageType_; } - } surface_mem_; - } mem_; - - /// Constructs the PI MEM handler for a non-typed allocation ("buffer") - _pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, - CUdeviceptr ptr, void *host_ptr, size_t size) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer} { - mem_.buffer_mem_.ptr_ = ptr; - mem_.buffer_mem_.parent_ = parent; - mem_.buffer_mem_.hostPtr_ = host_ptr; - mem_.buffer_mem_.size_ = size; - mem_.buffer_mem_.mapOffset_ = 0; - mem_.buffer_mem_.mapPtr_ = nullptr; - mem_.buffer_mem_.mapFlags_ = PI_MAP_WRITE; - mem_.buffer_mem_.allocMode_ = mode; - if (is_sub_buffer()) { - cuda_piMemRetain(mem_.buffer_mem_.parent_); - } else { - pi2ur::piContextRetain(context_); - } - }; - - /// Constructs the PI allocation for an Image object (surface in CUDA) - _pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf, - pi_mem_type image_type, void *host_ptr) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface} { - // Ignore unused parameter - (void)host_ptr; - - mem_.surface_mem_.array_ = array; - mem_.surface_mem_.surfObj_ = surf; - mem_.surface_mem_.imageType_ = image_type; - pi2ur::piContextRetain(context_); - } - - ~_pi_mem() { - if (mem_type_ == mem_type::buffer) { - if (is_sub_buffer()) { - cuda_piMemRelease(mem_.buffer_mem_.parent_); - return; - } - } - pi2ur::piContextRelease(context_); - } - - // TODO: Move as many shared funcs up as possible - bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; } - - bool is_sub_buffer() const 
noexcept { - return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr)); - } - - bool is_image() const noexcept { return mem_type_ == mem_type::surface; } - - pi_context get_context() const noexcept { return context_; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_mem : ur_mem_handle_t_ { + using ur_mem_handle_t_::ur_mem_handle_t_; }; /// PI queue mapping on to CUstream objects. diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 13ac8a5a1e138..86f3049697cf3 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -152,6 +152,8 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/sampler.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" "ur/adapters/cuda/tracing.cpp" + "ur/adapters/cuda/memory.cpp" + "ur/adapters/cuda/memory.hpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp new file mode 100644 index 0000000000000..0827f09c79a9e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp @@ -0,0 +1,513 @@ +//===--------- memory.cpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +#include "common.hpp" +#include "context.hpp" +#include "memory.hpp" + +/// Creates a UR Memory object using a CUDA memory allocation. +/// Can trigger a manual copy depending on the mode. +/// \TODO Implement USE_HOST_PTR using cuHostRegister +/// +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( + ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, + const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + // Validate flags + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + if (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_HOST_POINTER | + UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { + UR_ASSERT(pProperties && pProperties->pHost, + UR_RESULT_ERROR_INVALID_HOST_PTR); + } + // Need input memory object + UR_ASSERT(phBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(size != 0, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + uint64_t maxAlloc = 0; + urDeviceGetInfo(hContext->get_device(), UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, + sizeof(maxAlloc), &maxAlloc, nullptr); + UR_ASSERT(size <= maxAlloc, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + + // Currently, USE_HOST_PTR is not implemented using host register + // since this triggers a weird segfault after program ends. + // Setting this constant to true enables testing that behavior. + const bool enableUseHostPtr = false; + const bool performInitialCopy = + (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || + ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !enableUseHostPtr); + ur_result_t retErr = UR_RESULT_SUCCESS; + ur_mem_handle_t retMemObj = nullptr; + + try { + ScopedContext active(hContext); + CUdeviceptr ptr; + auto pHost = pProperties ? 
pProperties->pHost : nullptr; + + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; + + if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && enableUseHostPtr) { + retErr = UR_CHECK_ERROR( + cuMemHostRegister(pHost, size, CU_MEMHOSTREGISTER_DEVICEMAP)); + retErr = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, pHost, 0)); + allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr; + } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { + retErr = UR_CHECK_ERROR(cuMemAllocHost(&pHost, size)); + retErr = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, pHost, 0)); + allocMode = + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + } else { + retErr = UR_CHECK_ERROR(cuMemAlloc(&ptr, size)); + if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { + allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in; + } + } + + if (retErr == UR_RESULT_SUCCESS) { + ur_mem_handle_t parentBuffer = nullptr; + + auto piMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, parentBuffer, flags, allocMode, ptr, pHost, size}); + if (piMemObj != nullptr) { + retMemObj = piMemObj.release(); + if (performInitialCopy) { + // Operates on the default stream of the current CUDA context. + retErr = UR_CHECK_ERROR(cuMemcpyHtoD(ptr, pHost, size)); + // Synchronize with default stream implicitly used by cuMemcpyHtoD + // to make buffer data available on device before any other UR call + // uses it. + if (retErr == UR_RESULT_SUCCESS) { + CUstream defaultStream = 0; + retErr = UR_CHECK_ERROR(cuStreamSynchronize(defaultStream)); + } + } + } else { + retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + } + } catch (ur_result_t err) { + retErr = err; + } catch (...) { + retErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + *phBuffer = retMemObj; + + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hMem->get_reference_count() > 0, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + hMem->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +/// Decreases the reference count of the Mem object. 
+/// If this is zero, calls the relevant CUDA Free function +/// \return UR_RESULT_SUCCESS unless deallocation error +/// +UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t ret = UR_RESULT_SUCCESS; + + try { + + // Do nothing if there are other references + if (hMem->decrement_reference_count() > 0) { + return UR_RESULT_SUCCESS; + } + + // make sure hMem is released in case check_error_ur throws + std::unique_ptr uniqueMemObj(hMem); + + if (hMem->is_sub_buffer()) { + return UR_RESULT_SUCCESS; + } + + ScopedContext active(uniqueMemObj->get_context()); + + if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer) { + switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in: + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic: + ret = UR_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_)); + break; + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr: + ret = UR_CHECK_ERROR( + cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + break; + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr: + ret = UR_CHECK_ERROR( + cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + }; + } else if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::surface) { + ret = UR_CHECK_ERROR( + cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface())); + ret = UR_CHECK_ERROR( + cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array())); + } + + } catch (ur_result_t err) { + ret = err; + } catch (...) { + ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + if (ret != UR_RESULT_SUCCESS) { + // A reported CUDA error is either an implementation or an asynchronous CUDA + // error for which it is unclear if the function that reported it succeeded + // or not. Either way, the state of the program is compromised and likely + // unrecoverable. + sycl::detail::ur::die( + "Unrecoverable program state reached in urMemRelease"); + } + + return UR_RESULT_SUCCESS; +} + +/// Gets the native CUDA handle of a UR mem object +/// +/// \param[in] hMem The UR mem to get the native CUDA object of. +/// \param[out] phNativeMem Set to the native handle of the UR mem object. +/// +/// \return UR_RESULT_SUCCESS +UR_APIEXPORT ur_result_t UR_APICALL +urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + *phNativeMem = + reinterpret_cast(hMem->mem_.buffer_mem_.get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, + ur_mem_info_t MemInfoType, + size_t propSize, + void *pMemInfo, + size_t *pPropSizeRet) { + UR_ASSERT(hMemory, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(MemInfoType <= UR_MEM_INFO_CONTEXT, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(hMemory->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); + + ScopedContext active(hMemory->get_context()); + + switch (MemInfoType) { + case UR_MEM_INFO_SIZE: { + try { + size_t allocSize = 0; + UR_CHECK_ERROR(cuMemGetAddressRange(nullptr, &allocSize, + hMemory->mem_.buffer_mem_.ptr_)); + return ReturnValue(allocSize); + } catch (ur_result_t err) { + return err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + } + case UR_MEM_INFO_CONTEXT: { + return ReturnValue(hMemory->get_context()); + } + + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( + ur_native_handle_t hNativeMem, ur_context_handle_t hContext, + const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( + ur_native_handle_t hNativeMem, ur_context_handle_t hContext, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// \TODO Not implemented +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + ur_context_handle_t hContext, ur_mem_flags_t flags, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + void *pHost, ur_mem_handle_t *phMem) { + // Need input memory object + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pImageDesc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + if (flags & (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | + UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) { + UR_ASSERT(pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); + } + const bool performInitialCopy = + (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || + ((flags & UR_MEM_FLAG_USE_HOST_POINTER)); + + UR_ASSERT(pImageDesc->stype == UR_STRUCTURE_TYPE_IMAGE_DESC, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_BUFFER, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->numMipLevel == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->numSamples == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pHost == nullptr && pImageDesc->rowPitch == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pHost == nullptr && pImageDesc->slicePitch == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + + ur_result_t retErr = UR_RESULT_SUCCESS; + + // We only support RBGA channel order + // TODO: check SYCL CTS and spec. May also have to support BGRA + UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA, + UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + + // We have to use cuArray3DCreate, which has some caveats. The height and + // depth parameters must be set to 0 produce 1D or 2D arrays. pImageDesc gives + // a minimum value of 1, so we need to convert the answer. 
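+  // As a worked example (values are purely illustrative): a 2D RGBA float
+  // image of width 1024 and height 768 is described here with Width = 1024,
+  // Height = 768 and Depth = 0, even though pImageDesc reports depth = 1;
+  // a 1D image sets both Height and Depth to 0.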
+ CUDA_ARRAY3D_DESCRIPTOR array_desc; + array_desc.NumChannels = 4; // Only support 4 channel image + array_desc.Flags = 0; // No flags required + array_desc.Width = pImageDesc->width; + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + array_desc.Height = 0; + array_desc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + array_desc.Height = pImageDesc->height; + array_desc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + array_desc.Height = pImageDesc->height; + array_desc.Depth = pImageDesc->depth; + } + + // We need to get this now in bytes for calculating the total image size later + size_t pixel_type_size_bytes; + + switch (pImageFormat->channelType) { + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; + pixel_type_size_bytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: + array_desc.Format = CU_AD_FORMAT_SIGNED_INT8; + pixel_type_size_bytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: + array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; + pixel_type_size_bytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: + array_desc.Format = CU_AD_FORMAT_SIGNED_INT16; + pixel_type_size_bytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: + array_desc.Format = CU_AD_FORMAT_HALF; + pixel_type_size_bytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32; + pixel_type_size_bytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: + array_desc.Format = CU_AD_FORMAT_SIGNED_INT32; + pixel_type_size_bytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_FLOAT: + array_desc.Format = CU_AD_FORMAT_FLOAT; + pixel_type_size_bytes = 4; + break; + default: + sycl::detail::ur::die( + "urMemImageCreate given unsupported image_channel_data_type"); + } + + // When a dimension isn't used pImageDesc has the size set to 1 + size_t pixel_size_bytes = + pixel_type_size_bytes * 4; // 4 is the only number of channels we support + size_t image_size_bytes = pixel_size_bytes * pImageDesc->width * + pImageDesc->height * pImageDesc->depth; + + ScopedContext active(hContext); + CUarray image_array = nullptr; + try { + retErr = UR_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc)); + } catch (ur_result_t err) { + if (err == UR_RESULT_ERROR_INVALID_VALUE) { + return UR_RESULT_ERROR_INVALID_IMAGE_SIZE; + } + return err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + try { + if (performInitialCopy) { + // We have to use a different copy function for each image dimensionality + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + retErr = UR_CHECK_ERROR( + cuMemcpyHtoA(image_array, 0, pHost, image_size_bytes)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + CUDA_MEMCPY2D cpy_desc; + memset(&cpy_desc, 0, sizeof(cpy_desc)); + cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + cpy_desc.srcHost = pHost; + cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + cpy_desc.dstArray = image_array; + cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; + cpy_desc.Height = pImageDesc->height; + retErr = UR_CHECK_ERROR(cuMemcpy2D(&cpy_desc)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + CUDA_MEMCPY3D cpy_desc; + memset(&cpy_desc, 0, sizeof(cpy_desc)); + cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + cpy_desc.srcHost = pHost; + cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + cpy_desc.dstArray = image_array; + cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; + cpy_desc.Height = pImageDesc->height; + cpy_desc.Depth = pImageDesc->depth; + retErr = UR_CHECK_ERROR(cuMemcpy3D(&cpy_desc)); + } + } + + // CUDA_RESOURCE_DESC is a union of different structs, shown here + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html + // We need to fill it as described here to use it for a surface or texture + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html + // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and + // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array + // handle. + // CUDA_RESOURCE_DESC::flags must be set to zero + + CUDA_RESOURCE_DESC image_res_desc; + image_res_desc.res.array.hArray = image_array; + image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY; + image_res_desc.flags = 0; + + CUsurfObject surface; + retErr = UR_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); + + auto urMemObj = std::unique_ptr(new ur_mem_handle_t_( + hContext, image_array, surface, flags, pImageDesc->type, phMem)); + + if (urMemObj == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phMem = urMemObj.release(); + } catch (ur_result_t err) { + if (image_array) { + cuArrayDestroy(image_array); + } + return err; + } catch (...) { + if (image_array) { + cuArrayDestroy(image_array); + } + return UR_RESULT_ERROR_UNKNOWN; + } + + return retErr; +} + +/// \TODO Not implemented +UR_APIEXPORT ur_result_t UR_APICALL +urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType, + size_t propSize, void *pImgInfo, size_t *pPropSizeRet) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// Implements a buffer partition in the CUDA backend. +/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented +/// as an offset over an existing CUDA allocation. +/// +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( + ur_mem_handle_t hBuffer, ur_mem_flags_t flags, + ur_buffer_create_type_t bufferCreateType, const ur_buffer_region_t *pRegion, + ur_mem_handle_t *phMem) { + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(hBuffer->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->is_sub_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + // Default value for flags means UR_MEM_FLAG_READ_WRITE. 
+ if (flags == 0) { + flags = UR_MEM_FLAG_READ_WRITE; + } + + UR_ASSERT(!(flags & + (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | + UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)), + UR_RESULT_ERROR_INVALID_VALUE); + if (hBuffer->memFlags_ & UR_MEM_FLAG_WRITE_ONLY) { + UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_READ_ONLY)), + UR_RESULT_ERROR_INVALID_VALUE); + } + if (hBuffer->memFlags_ & UR_MEM_FLAG_READ_ONLY) { + UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)), + UR_RESULT_ERROR_INVALID_VALUE); + } + + UR_ASSERT(bufferCreateType == UR_BUFFER_CREATE_TYPE_REGION, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(pRegion != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + + assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow"); + UR_ASSERT(((pRegion->origin + pRegion->size) <= + hBuffer->mem_.buffer_mem_.get_size()), + UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + // Retained indirectly due to retaining parent buffer below. + ur_context_handle_t context = hBuffer->context_; + + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; + + assert(hBuffer->mem_.buffer_mem_.ptr_ != + ur_mem_handle_t_::mem_::buffer_mem_::native_type{0}); + ur_mem_handle_t_::mem_::buffer_mem_::native_type ptr = + hBuffer->mem_.buffer_mem_.ptr_ + pRegion->origin; + + void *hostPtr = nullptr; + if (hBuffer->mem_.buffer_mem_.hostPtr_) { + hostPtr = static_cast(hBuffer->mem_.buffer_mem_.hostPtr_) + + pRegion->origin; + } + + std::unique_ptr retMemObj{nullptr}; + try { + retMemObj = std::unique_ptr{new ur_mem_handle_t_{ + context, hBuffer, flags, allocMode, ptr, hostPtr, pRegion->size}}; + } catch (ur_result_t err) { + *phMem = nullptr; + return err; + } catch (...) { + *phMem = nullptr; + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phMem = retMemObj.release(); + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp new file mode 100644 index 0000000000000..44484250f062b --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp @@ -0,0 +1,195 @@ +//===--------- memory.hpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include +#include + +#include "common.hpp" + +/// UR Mem mapping to CUDA memory allocations, both data and texture/surface. +/// \brief Represents non-SVM allocations on the CUDA backend. +/// Keeps tracks of all mapped regions used for Map/Unmap calls. +/// Only one region can be active at the same time per allocation. 
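+///
+/// A minimal buffer lifetime built on this handle, as a sketch (the context
+/// handle and size are illustrative):
+///   ur_mem_handle_t buf = nullptr;
+///   urMemBufferCreate(hContext, UR_MEM_FLAG_READ_WRITE, size, nullptr, &buf);
+///   urMemRetain(buf);   // refCount_ 1 -> 2
+///   urMemRelease(buf);  // refCount_ 2 -> 1
+///   urMemRelease(buf);  // refCount_ reaches 0, the CUDA allocation is freed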
+struct ur_mem_handle_t_ { + + // TODO: Move as much shared data up as possible + using ur_context = ur_context_handle_t_ *; + using ur_mem = ur_mem_handle_t_ *; + + // Context where the memory object is accessibles + ur_context context_; + + /// Reference counting of the handler + std::atomic_uint32_t refCount_; + enum class mem_type { buffer, surface } mem_type_; + + // Original mem flags passed + ur_mem_flags_t memFlags_; + + /// A UR Memory object represents either plain memory allocations ("Buffers" + /// in OpenCL) or typed allocations ("Images" in OpenCL). + /// In CUDA their API handlers are different. Whereas "Buffers" are allocated + /// as pointer-like structs, "Images" are stored in Textures or Surfaces + /// This union allows implementation to use either from the same handler. + union mem_ { + // Handler for plain, pointer-based CUDA allocations + struct buffer_mem_ { + using native_type = CUdeviceptr; + + // If this allocation is a sub-buffer (i.e., a view on an existing + // allocation), this is the pointer to the parent handler structure + ur_mem parent_; + // CUDA handler for the pointer + native_type ptr_; + + /// Pointer associated with this device on the host + void *hostPtr_; + /// Size of the allocation in bytes + size_t size_; + /// Offset of the active mapped region. + size_t mapOffset_; + /// Pointer to the active mapped region, if any + void *mapPtr_; + /// Original flags for the mapped region + ur_map_flags_t mapFlags_; + + /** alloc_mode + * classic: Just a normal buffer allocated on the device via cuda malloc + * use_host_ptr: Use an address on the host for the device + * copy_in: The data for the device comes from the host but the host + pointer is not available later for re-use + * alloc_host_ptr: Uses pinned-memory allocation + */ + enum class alloc_mode { + classic, + use_host_ptr, + copy_in, + alloc_host_ptr + } allocMode_; + + native_type get() const noexcept { return ptr_; } + + size_t get_size() const noexcept { return size_; } + + void *get_map_ptr() const noexcept { return mapPtr_; } + + size_t get_map_offset(void *) const noexcept { return mapOffset_; } + + /// Returns a pointer to data visible on the host that contains + /// the data on the device associated with this allocation. + /// The offset is used to index into the CUDA allocation. + /// + void *map_to_ptr(size_t offset, ur_map_flags_t flags) noexcept { + assert(mapPtr_ == nullptr); + mapOffset_ = offset; + mapFlags_ = flags; + if (hostPtr_) { + mapPtr_ = static_cast(hostPtr_) + offset; + } else { + // TODO: Allocate only what is needed based on the offset + mapPtr_ = static_cast(malloc(this->get_size())); + } + return mapPtr_; + } + + /// Detach the allocation from the host memory. + void unmap(void *) noexcept { + assert(mapPtr_ != nullptr); + + if (mapPtr_ != hostPtr_) { + free(mapPtr_); + } + mapPtr_ = nullptr; + mapOffset_ = 0; + } + + ur_map_flags_t get_map_flags() const noexcept { + assert(mapPtr_ != nullptr); + return mapFlags_; + } + } buffer_mem_; + + // Handler data for surface object (i.e. 
Images) + struct surface_mem_ { + CUarray array_; + CUsurfObject surfObj_; + ur_mem_type_t imageType_; + + CUarray get_array() const noexcept { return array_; } + + CUsurfObject get_surface() const noexcept { return surfObj_; } + + ur_mem_type_t get_image_type() const noexcept { return imageType_; } + } surface_mem_; + } mem_; + + /// Constructs the UR mem handler for a non-typed allocation ("buffer") + ur_mem_handle_t_(ur_context ctxt, ur_mem parent, ur_mem_flags_t mem_flags, + mem_::buffer_mem_::alloc_mode mode, CUdeviceptr ptr, + void *host_ptr, size_t size) + : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer}, + memFlags_{mem_flags} { + mem_.buffer_mem_.ptr_ = ptr; + mem_.buffer_mem_.parent_ = parent; + mem_.buffer_mem_.hostPtr_ = host_ptr; + mem_.buffer_mem_.size_ = size; + mem_.buffer_mem_.mapOffset_ = 0; + mem_.buffer_mem_.mapPtr_ = nullptr; + mem_.buffer_mem_.mapFlags_ = UR_MAP_FLAG_WRITE; + mem_.buffer_mem_.allocMode_ = mode; + if (is_sub_buffer()) { + urMemRetain(mem_.buffer_mem_.parent_); + } else { + urContextRetain(context_); + } + }; + + /// Constructs the UR allocation for an Image object (surface in CUDA) + ur_mem_handle_t_(ur_context ctxt, CUarray array, CUsurfObject surf, + ur_mem_flags_t mem_flags, ur_mem_type_t image_type, + void *host_ptr) + : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface}, + memFlags_{mem_flags} { + // Ignore unused parameter + (void)host_ptr; + + mem_.surface_mem_.array_ = array; + mem_.surface_mem_.surfObj_ = surf; + mem_.surface_mem_.imageType_ = image_type; + urContextRetain(context_); + } + + ~ur_mem_handle_t_() { + if (mem_type_ == mem_type::buffer) { + if (is_sub_buffer()) { + urMemRelease(mem_.buffer_mem_.parent_); + return; + } + } + urContextRelease(context_); + } + + // TODO: Move as many shared funcs up as possible + bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; } + + bool is_sub_buffer() const noexcept { + return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr)); + } + + bool is_image() const noexcept { return mem_type_ == mem_type::surface; } + + ur_context get_context() const noexcept { return context_; } + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 06ae75db02dec..35d807ffb6db4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -145,16 +145,16 @@ urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnBufferCreate = nullptr; - pDdiTable->pfnBufferPartition = nullptr; - pDdiTable->pfnBufferCreateWithNativeHandle = nullptr; - pDdiTable->pfnImageCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnImageCreate = nullptr; - pDdiTable->pfnImageGetInfo = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnBufferCreate = urMemBufferCreate; + pDdiTable->pfnBufferPartition = urMemBufferPartition; + pDdiTable->pfnBufferCreateWithNativeHandle = urMemBufferCreateWithNativeHandle; + pDdiTable->pfnImageCreateWithNativeHandle = 
urMemImageCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urMemGetInfo; + pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; + pDdiTable->pfnImageCreate = urMemImageCreate; + pDdiTable->pfnImageGetInfo = urMemImageGetInfo; + pDdiTable->pfnRelease = urMemRelease; + pDdiTable->pfnRetain = urMemRetain; return UR_RESULT_SUCCESS; } From ef9f2243c146f19f52a1f4963cde6d5e74867a87 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Tue, 18 Apr 2023 14:16:38 +0100 Subject: [PATCH 16/45] Port USM entry points --- sycl/plugins/cuda/CMakeLists.txt | 1 + sycl/plugins/cuda/pi_cuda.cpp | 613 +----------------- sycl/plugins/unified_runtime/CMakeLists.txt | 5 +- .../ur/adapters/cuda/device.hpp | 2 + .../ur/adapters/cuda/enqueue.cpp | 372 +++++++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 22 +- .../unified_runtime/ur/adapters/cuda/usm.cpp | 256 ++++++++ 7 files changed, 659 insertions(+), 612 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index da4ce9476ee91..70e4e1a200e1a 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -74,6 +74,7 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/queue.cpp" "../unified_runtime/ur/adapters/cuda/sampler.cpp" "../unified_runtime/ur/adapters/cuda/sampler.hpp" + "../unified_runtime/ur/adapters/cuda/usm.cpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" "../unified_runtime/ur/adapters/cuda/tracing.cpp" "../unified_runtime/ur/adapters/cuda/memory.cpp" diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index ab0d428e3613a..ed10a030b665c 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -234,14 +234,6 @@ pi_result check_error(CUresult result, const char *function, int line, /// \cond NODOXY #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__) -ScopedContext::ScopedContext(pi_context ctxt) { - if (!ctxt) { - throw PI_ERROR_INVALID_CONTEXT; - } - - set_context(ctxt->get()); -} - /// \cond NODOXY template pi_result getInfoImpl(size_t param_value_size, void *param_value, @@ -286,13 +278,6 @@ pi_result getInfoArray(size_t array_length, size_t param_value_size, return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, array_length * sizeof(T), memcpy); } - -int getAttribute(pi_device device, CUdevice_attribute attribute) { - int value; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS); - return value; -} /// \endcond pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, @@ -325,34 +310,6 @@ pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, } } -template -void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type, - CUdeviceptr *out_dev_ptr, PtrT *out_host_ptr) { - // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE - // checks with PI_CHECK_ERROR are not suggested - CUresult ret = cuPointerGetAttribute( - out_mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)usm_ptr); - assert((*out_mem_type != CU_MEMORYTYPE_ARRAY && - *out_mem_type != CU_MEMORYTYPE_UNIFIED) && - "ARRAY, UNIFIED types are not supported!"); - - // pointer not known to the CUDA subsystem (possibly a system allocated ptr) - if (ret == CUDA_ERROR_INVALID_VALUE) { - *out_mem_type = CU_MEMORYTYPE_HOST; - *out_dev_ptr = 0; - *out_host_ptr = usm_ptr; - - // todo: resets the above "non-stick" error - } else if 
(ret == CUDA_SUCCESS) { - *out_dev_ptr = (*out_mem_type == CU_MEMORYTYPE_DEVICE) - ? reinterpret_cast(usm_ptr) - : 0; - *out_host_ptr = (*out_mem_type == CU_MEMORYTYPE_HOST) ? usm_ptr : nullptr; - } else { - PI_CHECK_ERROR(ret); - } -} - } // anonymous namespace /// ------ Error handling, matching OpenCL plugin semantics. @@ -1413,548 +1370,6 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, return ret_err; } -/// USM: Implements USM Host allocations using CUDA Pinned Memory -/// -pi_result -cuda_piextUSMHostAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = PI_CHECK_ERROR(cuMemAllocHost(result_ptr, size)); - } catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Implements USM device allocations using a normal CUDA device pointer -/// -pi_result -cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_device device, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(device != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = PI_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)result_ptr, size)); - } catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Implements USM Shared allocations using CUDA Managed Memory -/// -pi_result -cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_device device, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(device != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = PI_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)result_ptr, size, - CU_MEM_ATTACH_GLOBAL)); - } catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Frees the given USM pointer associated with the context. 
-/// -pi_result cuda_piextUSMFree(pi_context context, void *ptr) { - assert(context != nullptr); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - bool is_managed; - unsigned int type; - void *attribute_values[2] = {&is_managed, &type}; - CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, - CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; - result = PI_CHECK_ERROR(cuPointerGetAttributes( - 2, attributes, attribute_values, (CUdeviceptr)ptr)); - assert(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST); - if (is_managed || type == CU_MEMORYTYPE_DEVICE) { - // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed - // with cuMemFree - result = PI_CHECK_ERROR(cuMemFree((CUdeviceptr)ptr)); - } else { - // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost - result = PI_CHECK_ERROR(cuMemFreeHost(ptr)); - } - } catch (pi_result error) { - result = error; - } - return result; -} - -pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, - size_t count, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - assert(queue != nullptr); - assert(ptr != nullptr); - pi_result result = PI_SUCCESS; - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - pi_uint32 stream_token; - _pi_stream_guard guard; - CUstream cuStream = queue->get_next_compute_stream( - num_events_in_waitlist, - reinterpret_cast(events_waitlist), guard, - &stream_token); - result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, - events_waitlist); - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, cuStream, stream_token)); - event_ptr->start(); - } - result = PI_CHECK_ERROR(cuMemsetD8Async( - (CUdeviceptr)ptr, (unsigned char)value & 0xFF, count, cuStream)); - if (event) { - result = map_ur_error(event_ptr->record()); - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } - return result; -} - -pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, - void *dst_ptr, const void *src_ptr, - size_t size, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - assert(queue != nullptr); - assert(dst_ptr != nullptr); - assert(src_ptr != nullptr); - pi_result result = PI_SUCCESS; - - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - CUstream cuStream = queue->get_next_transfer_stream(); - result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, - events_waitlist); - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream)); - event_ptr->start(); - } - result = PI_CHECK_ERROR(cuMemcpyAsync( - (CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size, cuStream)); - if (event) { - result = map_ur_error(event_ptr->record()); - } - if (blocking) { - result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - if (event) { - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } - return result; -} - -pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, - size_t size, - pi_usm_migration_flags flags, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - pi_device device = - reinterpret_cast(queue->get_context()->get_device()); - - // Certain cuda devices and Windows do not have support for 
some Unified - // Memory features. cuMemPrefetchAsync requires concurrent memory access - // for managed memory. Therfore, ignore prefetch hint if concurrent managed - // memory access is not available. - if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - setErrorMessage("Prefetch hint ignored as device does not support " - "concurrent managed access", - UR_RESULT_SUCCESS); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - - unsigned int is_managed; - PI_CHECK_ERROR(cuPointerGetAttribute( - &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr)); - if (!is_managed) { - setErrorMessage("Prefetch hint ignored as prefetch only works with USM", - UR_RESULT_SUCCESS); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - - // flags is currently unused so fail if set - if (flags != 0) - return PI_ERROR_INVALID_VALUE; - assert(queue != nullptr); - assert(ptr != nullptr); - pi_result result = PI_SUCCESS; - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - CUstream cuStream = queue->get_next_transfer_stream(); - result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, - events_waitlist); - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream)); - event_ptr->start(); - } - result = PI_CHECK_ERROR( - cuMemPrefetchAsync((CUdeviceptr)ptr, size, device->get(), cuStream)); - if (event) { - result = map_ur_error(event_ptr->record()); - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } - return result; -} - -/// USM: memadvise API to govern behavior of automatic migration mechanisms -pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, - size_t length, pi_mem_advice advice, - pi_event *event) { - assert(queue != nullptr); - assert(ptr != nullptr); - - // Certain cuda devices and Windows do not have support for some Unified - // Memory features. Passing CU_MEM_ADVISE_[UN]SET_PREFERRED_LOCATION and - // CU_MEM_ADVISE_[UN]SET_ACCESSED_BY to cuMemAdvise on a GPU device requires - // the GPU device to report a non-zero value for - // CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therfore, ignore memory - // advise if concurrent managed memory access is not available. - if (advice == PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION || - advice == PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION || - advice == PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY || - advice == PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY || - advice == PI_MEM_ADVICE_RESET) { - pi_device device = - reinterpret_cast(queue->get_context()->get_device()); - if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - setErrorMessage("Mem advise ignored as device does not support " - "concurrent managed access", - UR_RESULT_SUCCESS); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - - // TODO: If ptr points to valid system-allocated pageable memory we should - // check that the device also has the - // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property. 
- } - - unsigned int is_managed; - PI_CHECK_ERROR(cuPointerGetAttribute( - &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr)); - if (!is_managed) { - setErrorMessage( - "Memory advice ignored as memory advices only works with USM", - UR_RESULT_SUCCESS); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - - pi_result result = PI_SUCCESS; - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_USER, queue, queue->get_next_transfer_stream())); - event_ptr->start(); - } - - switch (advice) { - case PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY: - case PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY: - case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION: - case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION: - case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY: - case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY: - result = PI_CHECK_ERROR(cuMemAdvise( - (CUdeviceptr)ptr, length, - (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1), - queue->get_context()->get_device()->get())); - break; - case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST: - case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST: - case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST: - case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST: - result = PI_CHECK_ERROR(cuMemAdvise( - (CUdeviceptr)ptr, length, - (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1 - - (PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST - - PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION)), - CU_DEVICE_CPU)); - break; - case PI_MEM_ADVICE_RESET: - PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, - CU_MEM_ADVISE_UNSET_READ_MOSTLY, - queue->get_context()->get_device()->get())); - PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, - CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, - queue->get_context()->get_device()->get())); - PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, - CU_MEM_ADVISE_UNSET_ACCESSED_BY, - queue->get_context()->get_device()->get())); - break; - default: - sycl::detail::pi::die("Unknown advice"); - } - if (event) { - result = map_ur_error(event_ptr->record()); - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } catch (...) { - result = PI_ERROR_UNKNOWN; - } - return result; -} - -// TODO: Implement this. Remember to return true for -// PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented. -pi_result cuda_piextUSMEnqueueFill2D(pi_queue, void *, size_t, size_t, - const void *, size_t, size_t, pi_uint32, - const pi_event *, pi_event *) { - sycl::detail::pi::die("piextUSMEnqueueFill2D: not implemented"); - return {}; -} - -// TODO: Implement this. Remember to return true for -// PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT when it is implemented. 
-pi_result cuda_piextUSMEnqueueMemset2D(pi_queue, void *, size_t, int, size_t, - size_t, pi_uint32, const pi_event *, - pi_event *) { - sycl::detail::pi::die("cuda_piextUSMEnqueueMemset2D: not implemented"); - return {}; -} - -/// 2D Memcpy API -/// -/// \param queue is the queue to submit to -/// \param blocking is whether this operation should block the host -/// \param dst_ptr is the location the data will be copied -/// \param dst_pitch is the total width of the destination memory including -/// padding -/// \param src_ptr is the data to be copied -/// \param dst_pitch is the total width of the source memory including padding -/// \param width is width in bytes of each row to be copied -/// \param height is height the columns to be copied -/// \param num_events_in_waitlist is the number of events to wait on -/// \param events_waitlist is an array of events to wait on -/// \param event is the event that represents this operation -pi_result cuda_piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking, - void *dst_ptr, size_t dst_pitch, - const void *src_ptr, size_t src_pitch, - size_t width, size_t height, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - - assert(queue != nullptr); - - pi_result result = PI_SUCCESS; - - try { - ScopedContext active(queue->get_context()); - CUstream cuStream = queue->get_next_transfer_stream(); - result = enqueueEventsWait(queue, cuStream, num_events_in_wait_list, - event_wait_list); - if (event) { - (*event) = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, - queue, cuStream); - (*event)->start(); - } - - // Determine the direction of copy using cuPointerGetAttribute - // for both the src_ptr and dst_ptr - CUDA_MEMCPY2D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - - getUSMHostOrDevicePtr(src_ptr, &cpyDesc.srcMemoryType, &cpyDesc.srcDevice, - &cpyDesc.srcHost); - getUSMHostOrDevicePtr(dst_ptr, &cpyDesc.dstMemoryType, &cpyDesc.dstDevice, - &cpyDesc.dstHost); - - cpyDesc.dstPitch = dst_pitch; - cpyDesc.srcPitch = src_pitch; - cpyDesc.WidthInBytes = width; - cpyDesc.Height = height; - - result = PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cuStream)); - - if (event) { - (*event)->record(); - } - if (blocking) { - result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - } catch (pi_result err) { - result = err; - } - return result; -} - -/// API to query information about USM allocated pointers -/// Valid Queries: -/// PI_MEM_ALLOC_TYPE returns host/device/shared pi_host_usm value -/// PI_MEM_ALLOC_BASE_PTR returns the base ptr of an allocation if -/// the queried pointer fell inside an allocation. -/// Result must fit in void * -/// PI_MEM_ALLOC_SIZE returns how big the queried pointer's -/// allocation is in bytes. Result is a size_t. 
-/// PI_MEM_ALLOC_DEVICE returns the pi_device this was allocated against -/// -/// \param context is the pi_context -/// \param ptr is the pointer to query -/// \param param_name is the type of query to perform -/// \param param_value_size is the size of the result in bytes -/// \param param_value is the result -/// \param param_value_size_ret is how many bytes were written -pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, - pi_mem_alloc_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) { - assert(context != nullptr); - assert(ptr != nullptr); - pi_result result = PI_SUCCESS; - - try { - ScopedContext active(context); - switch (param_name) { - case PI_MEM_ALLOC_TYPE: { - unsigned int value; - // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE - CUresult ret = cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr); - if (ret == CUDA_ERROR_INVALID_VALUE) { - // pointer not known to the CUDA subsystem - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_UNKNOWN); - } - result = check_error(ret, __func__, __LINE__ - 5, __FILE__); - if (value) { - // pointer to managed memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_SHARED); - } - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)ptr)); - assert(value == CU_MEMORYTYPE_DEVICE || value == CU_MEMORYTYPE_HOST); - if (value == CU_MEMORYTYPE_DEVICE) { - // pointer to device memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_DEVICE); - } - if (value == CU_MEMORYTYPE_HOST) { - // pointer to host memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_HOST); - } - // should never get here -#ifdef _MSC_VER - __assume(0); -#else - __builtin_unreachable(); -#endif - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_UNKNOWN); - } - case PI_MEM_ALLOC_BASE_PTR: { -#if CUDA_VERSION >= 10020 - // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was introduced in CUDA 10.2 - unsigned int value; - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)ptr)); - return getInfo(param_value_size, param_value, param_value_size_ret, - value); -#else - return PI_ERROR_INVALID_VALUE; -#endif - } - case PI_MEM_ALLOC_SIZE: { -#if CUDA_VERSION >= 10020 - // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2 - unsigned int value; - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)ptr)); - return getInfo(param_value_size, param_value, param_value_size_ret, - value); -#else - return PI_ERROR_INVALID_VALUE; -#endif - } - case PI_MEM_ALLOC_DEVICE: { - // get device index associated with this pointer - unsigned int device_idx; - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &device_idx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr)); - - // currently each device is in its own platform, so find the platform at - // the same index - std::vector platforms; - platforms.resize(device_idx + 1); - result = pi2ur::piPlatformsGet(device_idx + 1, platforms.data(), nullptr); - - // get the device from the platform - // TODO(ur): Remove cast when this entry point is moved to UR - pi_device device = - reinterpret_cast(platforms[device_idx]->devices_[0].get()); - return getInfo(param_value_size, param_value, 
param_value_size_ret, - device); - } - } - } catch (pi_result error) { - result = error; - } - return result; -} - pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( pi_queue queue, pi_program program, const char *name, pi_bool blocking_write, size_t count, size_t offset, const void *src, @@ -1984,7 +1399,7 @@ pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( if (offset + count > device_global_size) return PI_ERROR_INVALID_VALUE; - return cuda_piextUSMEnqueueMemcpy( + return pi2ur::piextUSMEnqueueMemcpy( queue, blocking_write, reinterpret_cast(device_global + offset), src, count, num_events_in_wait_list, event_wait_list, event); } catch (pi_result error) { @@ -2021,7 +1436,7 @@ pi_result cuda_piextEnqueueDeviceGlobalVariableRead( if (offset + count > device_global_size) return PI_ERROR_INVALID_VALUE; - return cuda_piextUSMEnqueueMemcpy( + return pi2ur::piextUSMEnqueueMemcpy( queue, blocking_read, dst, reinterpret_cast(device_global + offset), count, num_events_in_wait_list, event_wait_list, event); @@ -2206,18 +1621,18 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piEnqueueMemBufferMap, cuda_piEnqueueMemBufferMap) _PI_CL(piEnqueueMemUnmap, cuda_piEnqueueMemUnmap) // USM - _PI_CL(piextUSMHostAlloc, cuda_piextUSMHostAlloc) - _PI_CL(piextUSMDeviceAlloc, cuda_piextUSMDeviceAlloc) - _PI_CL(piextUSMSharedAlloc, cuda_piextUSMSharedAlloc) - _PI_CL(piextUSMFree, cuda_piextUSMFree) - _PI_CL(piextUSMEnqueueMemset, cuda_piextUSMEnqueueMemset) - _PI_CL(piextUSMEnqueueMemcpy, cuda_piextUSMEnqueueMemcpy) - _PI_CL(piextUSMEnqueuePrefetch, cuda_piextUSMEnqueuePrefetch) - _PI_CL(piextUSMEnqueueMemAdvise, cuda_piextUSMEnqueueMemAdvise) - _PI_CL(piextUSMEnqueueFill2D, cuda_piextUSMEnqueueFill2D) - _PI_CL(piextUSMEnqueueMemset2D, cuda_piextUSMEnqueueMemset2D) - _PI_CL(piextUSMEnqueueMemcpy2D, cuda_piextUSMEnqueueMemcpy2D) - _PI_CL(piextUSMGetMemAllocInfo, cuda_piextUSMGetMemAllocInfo) + _PI_CL(piextUSMHostAlloc, pi2ur::piextUSMHostAlloc) + _PI_CL(piextUSMDeviceAlloc, pi2ur::piextUSMDeviceAlloc) + _PI_CL(piextUSMSharedAlloc, pi2ur::piextUSMSharedAlloc) + _PI_CL(piextUSMFree, pi2ur::piextUSMFree) + _PI_CL(piextUSMEnqueueMemset, pi2ur::piextUSMEnqueueMemset) + _PI_CL(piextUSMEnqueueMemcpy, pi2ur::piextUSMEnqueueMemcpy) + _PI_CL(piextUSMEnqueuePrefetch, pi2ur::piextUSMEnqueuePrefetch) + _PI_CL(piextUSMEnqueueMemAdvise, pi2ur::piextUSMEnqueueMemAdvise) + _PI_CL(piextUSMEnqueueFill2D, pi2ur::piextUSMEnqueueFill2D) + _PI_CL(piextUSMEnqueueMemset2D, pi2ur::piextUSMEnqueueMemset2D) + _PI_CL(piextUSMEnqueueMemcpy2D, pi2ur::piextUSMEnqueueMemcpy2D) + _PI_CL(piextUSMGetMemAllocInfo, pi2ur::piextUSMGetMemAllocInfo) // Device global variable _PI_CL(piextEnqueueDeviceGlobalVariableWrite, cuda_piextEnqueueDeviceGlobalVariableWrite) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 86f3049697cf3..2288a8e9949e1 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -150,10 +150,11 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/queue.hpp" "ur/adapters/cuda/sampler.cpp" "ur/adapters/cuda/sampler.hpp" - "ur/adapters/cuda/ur_interface_loader.cpp" - "ur/adapters/cuda/tracing.cpp" "ur/adapters/cuda/memory.cpp" "ur/adapters/cuda/memory.hpp" + "ur/adapters/cuda/usm.cpp" + "ur/adapters/cuda/ur_interface_loader.cpp" + "ur/adapters/cuda/tracing.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp index c2195c958cfd7..9d01edd8a5ec3 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp @@ -57,3 +57,5 @@ struct ur_device_handle_t_ { int get_max_work_group_size() const noexcept { return max_work_group_size; }; }; + +int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 68c70aa1ae9ec..7e0e7b5905f31 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -45,6 +45,89 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, CUstream stream, } } +template +void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type, + CUdeviceptr *out_dev_ptr, PtrT *out_host_ptr) { + // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE + // checks with PI_CHECK_ERROR are not suggested + CUresult ret = cuPointerGetAttribute( + out_mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)usm_ptr); + // ARRAY, UNIFIED types are not supported! + assert(*out_mem_type != CU_MEMORYTYPE_ARRAY && + *out_mem_type != CU_MEMORYTYPE_UNIFIED); + + // pointer not known to the CUDA subsystem (possibly a system allocated ptr) + if (ret == CUDA_ERROR_INVALID_VALUE) { + *out_mem_type = CU_MEMORYTYPE_HOST; + *out_dev_ptr = 0; + *out_host_ptr = usm_ptr; + + // todo: resets the above "non-stick" error + } else if (ret == CUDA_SUCCESS) { + *out_dev_ptr = (*out_mem_type == CU_MEMORYTYPE_DEVICE) + ? reinterpret_cast(usm_ptr) + : 0; + *out_host_ptr = (*out_mem_type == CU_MEMORYTYPE_HOST) ? 
usm_ptr : nullptr; + } else { + UR_CHECK_ERROR(ret); + } +} + +ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, + ur_usm_advice_flags_t ur_advice_flags, + CUdevice device) { + std::unordered_map + URToCUMemAdviseDeviceFlagsMap = { + {UR_USM_ADVICE_FLAG_SET_READ_MOSTLY, CU_MEM_ADVISE_SET_READ_MOSTLY}, + {UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY, + CU_MEM_ADVISE_UNSET_READ_MOSTLY}, + {UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION, + CU_MEM_ADVISE_SET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE, + CU_MEM_ADVISE_SET_ACCESSED_BY}, + {UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE, + CU_MEM_ADVISE_UNSET_ACCESSED_BY}, + }; + for (auto &FlagPair : URToCUMemAdviseDeviceFlagsMap) { + if (ur_advice_flags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(devPtr, size, FlagPair.second, device)); + } + } + + std::unordered_map + URToCUMemAdviseHostFlagsMap = { + {UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION_HOST, + CU_MEM_ADVISE_SET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION_HOST, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_HOST, + CU_MEM_ADVISE_SET_ACCESSED_BY}, + {UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST, + CU_MEM_ADVISE_UNSET_ACCESSED_BY}, + }; + + for (auto &FlagPair : URToCUMemAdviseHostFlagsMap) { + if (ur_advice_flags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(devPtr, size, FlagPair.second, CU_DEVICE_CPU)); + } + } + + std::array UnmappedMemAdviceFlags = { + UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY, + UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY, + UR_USM_ADVICE_FLAG_BIAS_CACHED, UR_USM_ADVICE_FLAG_BIAS_UNCACHED}; + + for (auto &unMappedFlag : UnmappedMemAdviceFlags) { + if (ur_advice_flags & unMappedFlag) { + throw UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + return UR_RESULT_SUCCESS; +} + // Determine local work sizes that result in uniform work groups. // The default threadsPerBlock only require handling the first work_dim // dimension. @@ -389,3 +472,292 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } return retError; } + +/// TODO(ur): Add support for the offset. 
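A note on the getUSMHostOrDevicePtr helper introduced above: it relies on cuPointerGetAttribute returning CUDA_ERROR_INVALID_VALUE for pointers the CUDA driver does not track, and treats those as plain host memory. The standalone sketch below is illustrative only and not part of the patch; it assumes device 0, a freshly created context, and reduces error handling to asserts.

// Sketch: classify a pointer the same way getUSMHostOrDevicePtr does.
#include <cuda.h>
#include <cassert>
#include <cstdio>

static const char *classify(const void *ptr) {
  CUmemorytype memType{};
  CUresult ret = cuPointerGetAttribute(
      &memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
      reinterpret_cast<CUdeviceptr>(ptr));
  if (ret == CUDA_ERROR_INVALID_VALUE)
    return "system pointer (treated as host memory)";
  assert(ret == CUDA_SUCCESS);
  return memType == CU_MEMORYTYPE_DEVICE ? "device pointer" : "host pointer";
}

int main() {
  CUdevice dev;
  CUcontext ctx;
  assert(cuInit(0) == CUDA_SUCCESS);
  assert(cuDeviceGet(&dev, 0) == CUDA_SUCCESS);
  assert(cuCtxCreate(&ctx, 0, dev) == CUDA_SUCCESS);

  CUdeviceptr devPtr;
  void *pinned;
  int stackVar = 0;
  assert(cuMemAlloc(&devPtr, 64) == CUDA_SUCCESS);
  assert(cuMemAllocHost(&pinned, 64) == CUDA_SUCCESS);

  std::printf("%s\n", classify(reinterpret_cast<void *>(devPtr))); // device
  std::printf("%s\n", classify(pinned));                           // host
  std::printf("%s\n", classify(&stackVar));                        // system

  cuMemFreeHost(pinned);
  cuMemFree(devPtr);
  cuCtxDestroy(ctx);
  return 0;
}

The same CUDA_ERROR_INVALID_VALUE convention is what urUSMGetMemAllocInfo, added later in this series, uses to report UR_USM_TYPE_UNKNOWN.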
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( + ur_queue_handle_t hQueue, void *ptr, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(size % patternSize == 0, UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t result = UR_RESULT_SUCCESS; + std::unique_ptr event_ptr{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + uint32_t stream_token; + ur_stream_guard_ guard; + CUstream cuStream = hQueue->get_next_compute_stream( + numEventsInWaitList, phEventWaitList, guard, &stream_token); + result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + event_ptr = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_USM_FILL, hQueue, cuStream, stream_token)); + event_ptr->start(); + } + switch (patternSize) { + case 1: + result = UR_CHECK_ERROR( + cuMemsetD8Async((CUdeviceptr)ptr, *((const uint8_t *)pPattern) & 0xFF, + size, cuStream)); + break; + case 2: + result = UR_CHECK_ERROR(cuMemsetD16Async( + (CUdeviceptr)ptr, *((const uint16_t *)pPattern) & 0xFFFF, size, + cuStream)); + break; + case 4: + result = UR_CHECK_ERROR(cuMemsetD32Async( + (CUdeviceptr)ptr, *((const uint32_t *)pPattern) & 0xFFFFFFFF, size, + cuStream)); + break; + default: + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + if (phEvent) { + result = event_ptr->record(); + *phEvent = event_ptr.release(); + } + } catch (ur_result_t err) { + result = err; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( + ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ur_result_t result = UR_RESULT_SUCCESS; + + std::unique_ptr event_ptr{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + event_ptr = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_USM_MEMCPY, hQueue, cuStream)); + event_ptr->start(); + } + result = UR_CHECK_ERROR( + cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, cuStream)); + if (phEvent) { + result = event_ptr->record(); + } + if (blocking) { + result = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + if (phEvent) { + *phEvent = event_ptr.release(); + } + } catch (ur_result_t err) { + result = err; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( + ur_queue_handle_t hQueue, const void *pMem, size_t size, + ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + ur_device_handle_t device = hQueue->get_context()->get_device(); + + // Certain cuda devices and Windows do not have support for some Unified + // Memory features. cuMemPrefetchAsync requires concurrent memory access + // for managed memory. Therefore, ignore prefetch hint if concurrent managed + // memory access is not available.
+ if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + setErrorMessage("Prefetch hint ignored as device does not support " + "concurrent managed access", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + unsigned int is_managed; + UR_CHECK_ERROR(cuPointerGetAttribute( + &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); + if (!is_managed) { + setErrorMessage("Prefetch hint ignored as prefetch only works with USM", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + // flags is currently unused so fail if set + if (flags != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ur_result_t result = UR_RESULT_SUCCESS; + std::unique_ptr event_ptr{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + event_ptr = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, cuStream)); + event_ptr->start(); + } + result = UR_CHECK_ERROR( + cuMemPrefetchAsync((CUdeviceptr)pMem, size, device->get(), cuStream)); + if (phEvent) { + result = event_ptr->record(); + *phEvent = event_ptr.release(); + } + } catch (ur_result_t err) { + result = err; + } + return result; +} + +/// USM: memadvise API to govern behavior of automatic migration mechanisms +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, + ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + // Certain cuda devices and Windows do not have support for some Unified + // Memory features. Passing CU_MEM_ADVISE_SET/CLEAR_PREFERRED_LOCATION + // to cuMemAdvise on a GPU device requires the GPU device to report a non-zero + // value for CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therefore, ignore + // memory advice if concurrent managed memory access is not available. + if ((advice & UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION) || + (advice & UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION) || + (advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) || + (advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) || + (advice & UR_USM_ADVICE_FLAG_DEFAULT)) { + ur_device_handle_t device = hQueue->get_context()->get_device(); + if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + setErrorMessage("Mem advise ignored as device does not support " + "concurrent managed access", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + // TODO: If ptr points to valid system-allocated pageable memory we should + // check that the device also has the + // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property.
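Both urEnqueueUSMPrefetch above and urEnqueueUSMAdvise here gate managed-memory hints on device attributes queried through cuDeviceGetAttribute (wrapped by the getAttribute helper declared in device.hpp), and the TODO notes that system-allocated pageable pointers would additionally need CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. A standalone sketch of that guard, illustrative only and not part of the patch (assumes device 0 and uses asserts in place of real error handling):

#include <cuda.h>
#include <cassert>

int main() {
  CUdevice dev;
  CUcontext ctx;
  assert(cuInit(0) == CUDA_SUCCESS);
  assert(cuDeviceGet(&dev, 0) == CUDA_SUCCESS);
  assert(cuCtxCreate(&ctx, 0, dev) == CUDA_SUCCESS);

  int concurrentManaged = 0, pageableAccess = 0;
  assert(cuDeviceGetAttribute(&concurrentManaged,
                              CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
                              dev) == CUDA_SUCCESS);
  // Would matter for advising system-allocated pageable pointers (see TODO).
  assert(cuDeviceGetAttribute(&pageableAccess,
                              CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS,
                              dev) == CUDA_SUCCESS);
  (void)pageableAccess;

  CUdeviceptr managed;
  assert(cuMemAllocManaged(&managed, 1 << 20, CU_MEM_ATTACH_GLOBAL) ==
         CUDA_SUCCESS);

  // Mirror the adapter: skip the hints entirely when concurrent managed
  // access is unavailable (e.g. on Windows or older devices).
  if (concurrentManaged) {
    assert(cuMemAdvise(managed, 1 << 20, CU_MEM_ADVISE_SET_READ_MOSTLY, dev) ==
           CUDA_SUCCESS);
    assert(cuMemPrefetchAsync(managed, 1 << 20, dev, /*hStream=*/0) ==
           CUDA_SUCCESS);
    assert(cuCtxSynchronize() == CUDA_SUCCESS);
  }

  cuMemFree(managed);
  cuCtxDestroy(ctx);
  return 0;
}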
+ } + + unsigned int is_managed; + UR_CHECK_ERROR(cuPointerGetAttribute( + &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); + if (!is_managed) { + setErrorMessage( + "Memory advice ignored as memory advices only works with USM", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + ur_result_t result = UR_RESULT_SUCCESS; + std::unique_ptr event_ptr{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + + if (phEvent) { + event_ptr = std::unique_ptr( + ur_event_handle_t_::make_native(UR_COMMAND_USM_ADVISE, hQueue, + hQueue->get_next_transfer_stream())); + event_ptr->start(); + } + + if (advice & UR_USM_ADVICE_FLAG_DEFAULT) { + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_READ_MOSTLY, + hQueue->get_context()->get_device()->get())); + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, + hQueue->get_context()->get_device()->get())); + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_ACCESSED_BY, + hQueue->get_context()->get_device()->get())); + } else { + result = setCuMemAdvise((CUdeviceptr)pMem, size, advice, + hQueue->get_context()->get_device()->get()); + } + + if (phEvent) { + result = event_ptr->record(); + *phEvent = event_ptr.release(); + } + } catch (ur_result_t err) { + result = err; + } catch (...) { + result = UR_RESULT_ERROR_UNKNOWN; + } + return result; +} + +// TODO: Implement this. Remember to return true for +// PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented. +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( + ur_queue_handle_t hQueue, void *pMem, size_t pitch, size_t patternSize, + const void *pPattern, size_t width, size_t height, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( + ur_queue_handle_t hQueue, bool blocking, void *pDst, size_t dstPitch, + const void *pSrc, size_t srcPitch, size_t width, size_t height, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + ur_result_t result = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + (*phEvent) = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, cuStream); + (*phEvent)->start(); + } + + // Determine the direction of copy using cuPointerGetAttribute + // for both the src_ptr and dst_ptr + CUDA_MEMCPY2D cpyDesc = {0}; + + getUSMHostOrDevicePtr(pSrc, &cpyDesc.srcMemoryType, &cpyDesc.srcDevice, + &cpyDesc.srcHost); + getUSMHostOrDevicePtr(pDst, &cpyDesc.dstMemoryType, &cpyDesc.dstDevice, + &cpyDesc.dstHost); + + cpyDesc.dstPitch = dstPitch; + cpyDesc.srcPitch = srcPitch; + cpyDesc.WidthInBytes = width; + cpyDesc.Height = height; + + result = UR_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cuStream)); + + if (phEvent) { + (*phEvent)->record(); + } + if (blocking) { + result = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + } catch (ur_result_t err) { + result = err; + } + return result; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 
35d807ffb6db4..07ed631c5b31e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -181,12 +181,12 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnMemImageRead = nullptr; pDdiTable->pfnMemImageWrite = nullptr; pDdiTable->pfnMemUnmap = nullptr; - pDdiTable->pfnUSMFill2D = nullptr; - pDdiTable->pfnUSMFill = nullptr; - pDdiTable->pfnUSMAdvise = nullptr; - pDdiTable->pfnUSMMemcpy2D = nullptr; - pDdiTable->pfnUSMMemcpy = nullptr; - pDdiTable->pfnUSMPrefetch = nullptr; + pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; + pDdiTable->pfnUSMFill = urEnqueueUSMFill; + pDdiTable->pfnUSMAdvise = urEnqueueUSMAdvise; + pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; + pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; + pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; return UR_RESULT_SUCCESS; } @@ -225,14 +225,14 @@ urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnDeviceAlloc = nullptr; - pDdiTable->pfnFree = nullptr; - pDdiTable->pfnGetMemAllocInfo = nullptr; - pDdiTable->pfnHostAlloc = nullptr; + pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc; + pDdiTable->pfnFree = urUSMFree; + pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; + pDdiTable->pfnHostAlloc = urUSMHostAlloc; pDdiTable->pfnPoolCreate = nullptr; pDdiTable->pfnPoolDestroy = nullptr; pDdiTable->pfnPoolDestroy = nullptr; - pDdiTable->pfnSharedAlloc = nullptr; + pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp new file mode 100644 index 0000000000000..0309d4a7b627a --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp @@ -0,0 +1,256 @@ +//===--------- usm.cpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" +#include "event.hpp" +#include "platform.hpp" +#include "queue.hpp" + +#include + +/// USM: Implements USM Host allocations using CUDA Pinned Memory +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t device_max_mem_alloc_size = 0; + UR_ASSERT(urDeviceGetInfo(hContext->get_device(), + UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), + static_cast(&device_max_mem_alloc_size), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + result = UR_CHECK_ERROR(cuMemAllocHost(ppMem, size)); + } catch (ur_result_t error) { + result = error; + } + + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + assert(result == UR_RESULT_SUCCESS && + (!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return result; +} + +/// USM: Implements USM device allocations using a normal CUDA device pointer +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t device_max_mem_alloc_size = 0; + UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, + sizeof(size_t), + static_cast(&device_max_mem_alloc_size), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + result = UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ppMem, size)); + } catch (ur_result_t error) { + result = error; + } + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + assert(result == UR_RESULT_SUCCESS && + (!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return result; +} + +/// USM: Implements USM Shared allocations using CUDA Managed Memory +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t device_max_mem_alloc_size = 0; + UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, + sizeof(size_t), + static_cast(&device_max_mem_alloc_size), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size 
<= device_max_mem_alloc_size, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + result = UR_CHECK_ERROR( + cuMemAllocManaged((CUdeviceptr *)ppMem, size, CU_MEM_ATTACH_GLOBAL)); + } catch (ur_result_t error) { + result = error; + } + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + assert(result == UR_RESULT_SUCCESS && + (!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return result; +} + +/// USM: Frees the given USM pointer associated with the context. +/// +UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, + void *pMem) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + bool is_managed; + unsigned int type; + void *attribute_values[2] = {&is_managed, &type}; + CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, + CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; + result = UR_CHECK_ERROR(cuPointerGetAttributes( + 2, attributes, attribute_values, (CUdeviceptr)pMem)); + UR_ASSERT(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + if (is_managed || type == CU_MEMORYTYPE_DEVICE) { + // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed + // with cuMemFree + result = UR_CHECK_ERROR(cuMemFree((CUdeviceptr)pMem)); + } else { + // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost + result = UR_CHECK_ERROR(cuMemFreeHost(pMem)); + } + } catch (ur_result_t error) { + result = error; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, + ur_usm_alloc_info_t propName, size_t propValueSize, + void *pPropValue, size_t *pPropValueSizeRet) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t result = UR_RESULT_SUCCESS; + + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + try { + ScopedContext active(hContext); + switch (propName) { + case UR_USM_ALLOC_INFO_TYPE: { + unsigned int value; + // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE + CUresult ret = cuPointerGetAttribute( + &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem); + if (ret == CUDA_ERROR_INVALID_VALUE) { + // pointer not known to the CUDA subsystem + return ReturnValue(UR_USM_TYPE_UNKNOWN); + } + result = check_error_ur(ret, __func__, __LINE__ - 5, __FILE__); + if (value) { + // pointer to managed memory + return ReturnValue(UR_USM_TYPE_SHARED); + } + result = UR_CHECK_ERROR(cuPointerGetAttribute( + &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pMem)); + UR_ASSERT(value == CU_MEMORYTYPE_DEVICE || value == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + if (value == CU_MEMORYTYPE_DEVICE) { + // pointer to device memory + return ReturnValue(UR_USM_TYPE_DEVICE); + } + if (value == CU_MEMORYTYPE_HOST) { + // pointer to host memory + return ReturnValue(UR_USM_TYPE_HOST); + } + // should never get here +#ifdef _MSC_VER + __assume(0); +#else + __builtin_unreachable(); +#endif + return ReturnValue(UR_USM_TYPE_UNKNOWN); + } + case UR_USM_ALLOC_INFO_BASE_PTR: { +#if __CUDA_API_VERSION >= 10020 + // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR 
was introduced in CUDA 10.2 + unsigned int value; + result = UR_CHECK_ERROR(cuPointerGetAttribute( + &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)pMem)); + return ReturnValue(value); +#else + return UR_RESULT_ERROR_INVALID_VALUE; +#endif + } + case UR_USM_ALLOC_INFO_SIZE: { +#if __CUDA_API_VERSION >= 10020 + // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2 + unsigned int value; + result = UR_CHECK_ERROR(cuPointerGetAttribute( + &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); + return ReturnValue(value); +#else + return UR_RESULT_ERROR_INVALID_VALUE; +#endif + } + case UR_USM_ALLOC_INFO_DEVICE: { + // get device index associated with this pointer + unsigned int device_idx; + result = UR_CHECK_ERROR(cuPointerGetAttribute( + &device_idx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)pMem)); + + // currently each device is in its own platform, so find the platform at + // the same index + std::vector platforms; + platforms.resize(device_idx + 1); + result = urPlatformGet(device_idx + 1, platforms.data(), nullptr); + + // get the device from the platform + ur_device_handle_t device = platforms[device_idx]->devices_[0].get(); + return ReturnValue(device); + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } catch (ur_result_t error) { + result = error; + } + return result; +} From d185543356b377e682ab8e2076c99691b2666b72 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Thu, 27 Apr 2023 10:55:46 +0100 Subject: [PATCH 17/45] [UR][CUDA][SYCL] Fix sycl-e2e tests --- .../ur/adapters/cuda/memory.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp index 0827f09c79a9e..59975b0a7b821 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp @@ -23,8 +23,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // Validate flags UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, UR_RESULT_ERROR_INVALID_ENUMERATION); - if (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_HOST_POINTER | - UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { + if (flags & + (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { UR_ASSERT(pProperties && pProperties->pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); } @@ -251,8 +251,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_ASSERT(pImageDesc, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, UR_RESULT_ERROR_INVALID_ENUMERATION); - if (flags & (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | - UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) { + if (flags & + (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) { UR_ASSERT(pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); } const bool performInitialCopy = @@ -267,10 +267,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); UR_ASSERT(pImageDesc->numSamples == 0, UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - UR_ASSERT(pHost == nullptr && pImageDesc->rowPitch == 0, - UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - UR_ASSERT(pHost == nullptr && pImageDesc->slicePitch == 0, - UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + if (!pHost) { + UR_ASSERT(pImageDesc->rowPitch == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->slicePitch == 0, + 
UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + } ur_result_t retErr = UR_RESULT_SUCCESS; From 398f3e9603caa3cb70e5eb7eecc044c88c41a841 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Wed, 26 Apr 2023 11:24:54 +0100 Subject: [PATCH 18/45] [UR][CUDA] Port urEnqueueRead/Write & setArgMemObj --- sycl/plugins/cuda/pi_cuda.cpp | 10 +- .../ur/adapters/cuda/enqueue.cpp | 113 ++++++++++++++++++ .../ur/adapters/cuda/kernel.cpp | 34 ++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 6 +- 4 files changed, 155 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index ed10a030b665c..06650450a0a32 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1287,7 +1287,7 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) { // Pinned host memory is already on host so it doesn't need to be read. - ret_err = cuda_piEnqueueMemBufferRead( + ret_err = pi2ur::piEnqueueMemBufferRead( command_queue, buffer, blocking_map, offset, size, hostPtr, num_events_in_wait_list, event_wait_list, event); } else { @@ -1340,7 +1340,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, (memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE_INVALIDATE_REGION))) { // Pinned host memory is only on host so it doesn't need to be written to. - ret_err = cuda_piEnqueueMemBufferWrite( + ret_err = pi2ur::piEnqueueMemBufferWrite( command_queue, memobj, true, memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr), memobj->mem_.buffer_mem_.get_size(), mapped_ptr, @@ -1607,9 +1607,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piEnqueueNativeKernel, cuda_piEnqueueNativeKernel) _PI_CL(piEnqueueEventsWait, pi2ur::piEnqueueEventsWait) _PI_CL(piEnqueueEventsWaitWithBarrier, pi2ur::piEnqueueEventsWaitWithBarrier) - _PI_CL(piEnqueueMemBufferRead, cuda_piEnqueueMemBufferRead) + _PI_CL(piEnqueueMemBufferRead, pi2ur::piEnqueueMemBufferRead) _PI_CL(piEnqueueMemBufferReadRect, cuda_piEnqueueMemBufferReadRect) - _PI_CL(piEnqueueMemBufferWrite, cuda_piEnqueueMemBufferWrite) + _PI_CL(piEnqueueMemBufferWrite, pi2ur::piEnqueueMemBufferRead) _PI_CL(piEnqueueMemBufferWriteRect, cuda_piEnqueueMemBufferWriteRect) _PI_CL(piEnqueueMemBufferCopy, cuda_piEnqueueMemBufferCopy) _PI_CL(piEnqueueMemBufferCopyRect, cuda_piEnqueueMemBufferCopyRect) @@ -1643,7 +1643,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueReadHostPipe, cuda_piextEnqueueReadHostPipe) _PI_CL(piextEnqueueWriteHostPipe, cuda_piextEnqueueWriteHostPipe) - _PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj) + _PI_CL(piextKernelSetArgMemObj, pi2ur::piextKernelSetArgMemObj) _PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler) _PI_CL(piPluginGetLastError, pi2ur::piPluginGetLastError) _PI_CL(piTearDown, pi2ur::piTearDown) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 7e0e7b5905f31..674bea82ddef9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -10,6 +10,7 @@ #include "context.hpp" #include "event.hpp" #include "kernel.hpp" +#include "memory.hpp" #include "queue.hpp" #include @@ -761,3 +762,115 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( } return result; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( + ur_queue_handle_t hQueue, 
ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!hBuffer->is_image(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + if (phEventWaitList) { + UR_ASSERT(numEventsInWaitList > 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } else { + UR_ASSERT(numEventsInWaitList == 0, + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } + UR_ASSERT(offset + size <= hBuffer->mem_.buffer_mem_.size_, + UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + + retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_READ, hQueue, cuStream)); + retImplEv->start(); + } + + UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, devPtr + offset, size, cuStream)); + + if (phEvent) { + retErr = retImplEv->record(); + } + + if (blockingRead) { + UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + + if (phEvent) { + *phEvent = retImplEv.release(); + } + + } catch (ur_result_t err) { + retErr = err; + } + + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!hBuffer->is_image(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + if (phEventWaitList) { + UR_ASSERT(numEventsInWaitList > 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } else { + UR_ASSERT(numEventsInWaitList == 0, + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } + UR_ASSERT(offset + size <= hBuffer->mem_.buffer_mem_.size_, + UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + + retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_WRITE, hQueue, cuStream)); + retImplEv->start(); + } + + UR_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, pSrc, size, cuStream)); + + if (phEvent) { + retErr = retImplEv->record(); + } + + if (blockingWrite) { + UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + + if (phEvent) { + *phEvent = retImplEv.release(); + } + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index e80960f7ceb3c..e0f07b41e611b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ 
-7,6 +7,7 @@ //===-----------------------------------------------------------------===// #include "kernel.hpp" +#include "memory.hpp" UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, @@ -290,6 +291,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( + ur_kernel_handle_t hKernel, uint32_t argIndex, ur_mem_handle_t hArgValue) { + + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hArgValue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + try { + if (hArgValue->mem_type_ == ur_mem_handle_t_::mem_type::surface) { + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + UR_CHECK_ERROR(cuArray3DGetDescriptor( + &arrayDesc, hArgValue->mem_.surface_mem_.get_array())); + if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && + arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && + arrayDesc.Format != CU_AD_FORMAT_HALF && + arrayDesc.Format != CU_AD_FORMAT_FLOAT) { + setErrorMessage("PI CUDA kernels only support images with channel " + "types int32, uint32, float, and half.", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + CUsurfObject cuSurf = hArgValue->mem_.surface_mem_.get_surface(); + hKernel->set_kernel_arg(argIndex, sizeof(cuSurf), (void *)&cuSurf); + } else { + CUdeviceptr cuPtr = hArgValue->mem_.buffer_mem_.get(); + hKernel->set_kernel_arg(argIndex, sizeof(CUdeviceptr), (void *)&cuPtr); + } + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + // A NOP for the CUDA backend UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 07ed631c5b31e..085f87ab799ce 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -115,7 +115,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnRelease = urKernelRelease; pDdiTable->pfnRetain = urKernelRetain; pDdiTable->pfnSetArgLocal = nullptr; - pDdiTable->pfnSetArgMemObj = nullptr; + pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; pDdiTable->pfnSetArgSampler = nullptr; pDdiTable->pfnSetArgValue = urKernelSetArgValue; @@ -173,9 +173,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnMemBufferCopyRect = nullptr; pDdiTable->pfnMemBufferFill = nullptr; pDdiTable->pfnMemBufferMap = nullptr; - pDdiTable->pfnMemBufferRead = nullptr; + pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; pDdiTable->pfnMemBufferReadRect = nullptr; - pDdiTable->pfnMemBufferWrite = nullptr; + pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; pDdiTable->pfnMemBufferWriteRect = nullptr; pDdiTable->pfnMemImageCopy = nullptr; pDdiTable->pfnMemImageRead = nullptr; From e02d3d302d32fec3abdd8e69d0c1fdbbbfd9d12f Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 28 Apr 2023 10:08:38 +0100 Subject: [PATCH 19/45] Port piextKernelSetArgSampler --- sycl/plugins/cuda/pi_cuda.cpp | 175 +----------------- .../ur/adapters/cuda/kernel.cpp | 24 ++- .../ur/adapters/cuda/ur_interface_loader.cpp | 2 +- 3 files changed, 25 insertions(+), 176 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp 
b/sycl/plugins/cuda/pi_cuda.cpp index 06650450a0a32..7b599e17dd04f 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -413,179 +413,6 @@ pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -/// Created a PI image mem object from a CUDA image mem handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] pi_native_handle The native handle to create PI mem object from. -/// \param[in] pi_context The PI context of the memory allocation. -/// \param[in] ownNativeHandle Boolean indicates if we own the native memory -/// handle or it came from interop that asked to not transfer the ownership to -/// SYCL RT. \param[in] pi_image_format The format of the image. \param[in] -/// pi_image_desc The description information for the image. \param[out] pi_mem -/// Set to the PI mem object created from native handle. -/// -/// \return TBD -pi_result cuda_piextMemImageCreateWithNativeHandle(pi_native_handle, pi_context, - bool, - const pi_image_format *, - const pi_image_desc *, - pi_mem *) { - sycl::detail::pi::die( - "Creation of PI mem from native image handle not implemented"); - return {}; -} - -pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_write, size_t offset, - size_t size, const void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = - PI_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, ptr, size, cuStream)); - - if (event) { - retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_write) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if (event) { - *event = retImplEv.release(); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_read, size_t offset, - size_t size, void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = - PI_CHECK_ERROR(cuMemcpyDtoHAsync(ptr, devPtr + offset, size, cuStream)); - - if (event) { - retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if 
(event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, - const pi_mem *arg_value) { - - assert(kernel != nullptr); - assert(arg_value != nullptr); - - // Below sets kernel arg when zero-sized buffers are handled. - // In such case the corresponding memory is null. - if (*arg_value == nullptr) { - kernel->set_kernel_arg(arg_index, 0, nullptr); - return PI_SUCCESS; - } - - pi_result retErr = PI_SUCCESS; - try { - pi_mem arg_mem = *arg_value; - if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) { - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - PI_CHECK_ERROR(cuArray3DGetDescriptor( - &arrayDesc, arg_mem->mem_.surface_mem_.get_array())); - if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && - arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && - arrayDesc.Format != CU_AD_FORMAT_HALF && - arrayDesc.Format != CU_AD_FORMAT_FLOAT) { - setErrorMessage("PI CUDA kernels only support images with channel " - "types int32, uint32, float, and half.", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - CUsurfObject cuSurf = arg_mem->mem_.surface_mem_.get_surface(); - kernel->set_kernel_arg(arg_index, sizeof(cuSurf), (void *)&cuSurf); - } else { - CUdeviceptr cuPtr = arg_mem->mem_.buffer_mem_.get(); - kernel->set_kernel_arg(arg_index, sizeof(CUdeviceptr), (void *)&cuPtr); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, - const pi_sampler *arg_value) { - - assert(kernel != nullptr); - assert(arg_value != nullptr); - - pi_result retErr = PI_SUCCESS; - try { - pi_uint32 samplerProps = (*arg_value)->props_; - kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps); - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - /// \TODO Not implemented pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, pi_uint32, const pi_mem *, const void **, @@ -1644,7 +1471,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueWriteHostPipe, cuda_piextEnqueueWriteHostPipe) _PI_CL(piextKernelSetArgMemObj, pi2ur::piextKernelSetArgMemObj) - _PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler) + _PI_CL(piextKernelSetArgSampler, pi2ur::piextKernelSetArgSampler) _PI_CL(piPluginGetLastError, pi2ur::piPluginGetLastError) _PI_CL(piTearDown, pi2ur::piTearDown) _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index e0f07b41e611b..69f86ca319df5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -8,6 +8,7 @@ #include "kernel.hpp" #include "memory.hpp" +#include "sampler.hpp" UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, @@ -295,7 +296,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( ur_kernel_handle_t hKernel, uint32_t argIndex, ur_mem_handle_t hArgValue) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hArgValue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // Below sets kernel arg when zero-sized buffers are handled. + // In such case the corresponding memory is null. 
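Background on the set_kernel_arg calls in this hunk: cuLaunchKernel takes kernelParams as an array of pointers to the argument values, so the adapter stores each argument by value (a CUdeviceptr for buffers, a CUsurfObject for images) and passes its address at launch. A minimal sketch, illustrative only (func is assumed to come from cuModuleGetFunction elsewhere, and the launch configuration is arbitrary):

#include <cuda.h>

// Launch a kernel taking (pointer, int); each kernelParams entry is the
// address of the corresponding argument's storage.
CUresult launchWithBufferArg(CUfunction func, CUdeviceptr buffer, int n,
                             CUstream stream) {
  void *kernelParams[] = {&buffer, &n};
  return cuLaunchKernel(func,
                        /*gridDimX=*/1, /*gridDimY=*/1, /*gridDimZ=*/1,
                        /*blockDimX=*/32, /*blockDimY=*/1, /*blockDimZ=*/1,
                        /*sharedMemBytes=*/0, stream, kernelParams,
                        /*extra=*/nullptr);
}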
+ if (hArgValue == nullptr) { + hKernel->set_kernel_arg(argIndex, 0, nullptr); + return UR_RESULT_SUCCESS; + } ur_result_t retErr = UR_RESULT_SUCCESS; try { @@ -338,3 +345,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_kernel_handle_t *phKernel) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, + ur_sampler_handle_t hArgValue) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + try { + uint32_t samplerProps = hArgValue->props_; + hKernel->set_kernel_arg(argIndex, sizeof(uint32_t), (void *)&samplerProps); + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 085f87ab799ce..d7751a02e9707 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -117,7 +117,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgLocal = nullptr; pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; - pDdiTable->pfnSetArgSampler = nullptr; + pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = nullptr; From 816652ac6e2fe054b963f7b6bdaf32697f206791 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Fri, 28 Apr 2023 11:08:47 +0100 Subject: [PATCH 20/45] [UR][SYCL][CUDA] Point PI to correct entry point --- sycl/plugins/cuda/pi_cuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 7b599e17dd04f..51ffd1fb63455 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1436,7 +1436,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piEnqueueEventsWaitWithBarrier, pi2ur::piEnqueueEventsWaitWithBarrier) _PI_CL(piEnqueueMemBufferRead, pi2ur::piEnqueueMemBufferRead) _PI_CL(piEnqueueMemBufferReadRect, cuda_piEnqueueMemBufferReadRect) - _PI_CL(piEnqueueMemBufferWrite, pi2ur::piEnqueueMemBufferRead) + _PI_CL(piEnqueueMemBufferWrite, pi2ur::piEnqueueMemBufferWrite) _PI_CL(piEnqueueMemBufferWriteRect, cuda_piEnqueueMemBufferWriteRect) _PI_CL(piEnqueueMemBufferCopy, cuda_piEnqueueMemBufferCopy) _PI_CL(piEnqueueMemBufferCopyRect, cuda_piEnqueueMemBufferCopyRect) From 6d648f66266e0a921c3efcdb0227333fe75e7f18 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 27 Apr 2023 11:47:20 +0100 Subject: [PATCH 21/45] Port remaining queue entry-points --- sycl/plugins/cuda/pi_cuda.cpp | 916 +----------------- .../ur/adapters/cuda/enqueue.cpp | 768 ++++++++++++++- .../ur/adapters/cuda/ur_interface_loader.cpp | 20 +- 3 files changed, 780 insertions(+), 924 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 51ffd1fb63455..09c2fddc6e207 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -73,83 +73,6 @@ pi_result cuda_piPluginGetBackendOption(pi_platform, return PI_ERROR_INVALID_VALUE; } -pi_result map_ur_error(ur_result_t result) { - switch (result) { - case UR_RESULT_SUCCESS: - return PI_SUCCESS; - case UR_RESULT_ERROR_INVALID_OPERATION: - return 
PI_ERROR_INVALID_OPERATION; - case UR_RESULT_ERROR_INVALID_CONTEXT: - return PI_ERROR_INVALID_CONTEXT; - case UR_RESULT_ERROR_INVALID_DEVICE: - return PI_ERROR_INVALID_DEVICE; - case UR_RESULT_ERROR_INVALID_VALUE: - return PI_ERROR_INVALID_VALUE; - case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: - return PI_ERROR_OUT_OF_HOST_MEMORY; - case UR_RESULT_ERROR_OUT_OF_RESOURCES: - return PI_ERROR_OUT_OF_RESOURCES; - default: - return PI_ERROR_UNKNOWN; - } -} - -pi_mem_type map_ur_mem_type(ur_mem_type_t mem_type) { - switch (mem_type) { - case UR_MEM_TYPE_BUFFER: - default: - return PI_MEM_TYPE_BUFFER; - case UR_MEM_TYPE_IMAGE2D: - return PI_MEM_TYPE_IMAGE2D; - case UR_MEM_TYPE_IMAGE3D: - return PI_MEM_TYPE_IMAGE3D; - case UR_MEM_TYPE_IMAGE2D_ARRAY: - return PI_MEM_TYPE_IMAGE2D_ARRAY; - case UR_MEM_TYPE_IMAGE1D: - return PI_MEM_TYPE_IMAGE1D; - case UR_MEM_TYPE_IMAGE1D_ARRAY: - return PI_MEM_TYPE_IMAGE1D_ARRAY; - case UR_MEM_TYPE_IMAGE1D_BUFFER: - return PI_MEM_TYPE_IMAGE1D_BUFFER; - } -} - -template -inline pi_result -ConvertInputBitfield(pi_bitfield in, TypeOut *out, - const std::unordered_map &map) { - *out = 0; - for (auto &[FlagPI, FlagUR] : map) { - if (in & FlagPI) { - *out |= FlagUR; - } - } - - return PI_SUCCESS; -} - -// Convert bitfield flags from PI to UR for MemFlags -inline pi_result pi2urMemFlags(pi_mem_flags piFlags, ur_mem_flags_t *urFlags) { - static const std::unordered_map MemFlagsMap = { - {PI_MEM_FLAGS_ACCESS_RW, UR_MEM_FLAG_READ_WRITE}, - {PI_MEM_ACCESS_READ_ONLY, UR_MEM_FLAG_READ_ONLY}, - {PI_MEM_FLAGS_HOST_PTR_USE, UR_MEM_FLAG_USE_HOST_POINTER}, - {PI_MEM_FLAGS_HOST_PTR_COPY, UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER}, - {PI_MEM_FLAGS_HOST_PTR_ALLOC, UR_MEM_FLAG_ALLOC_HOST_POINTER}, - }; - - return ConvertInputBitfield(piFlags, urFlags, MemFlagsMap); -} - -// Convert bitfield flags from PI to UR for MapFlags -inline pi_result pi2urMapFlags(pi_mem_flags piFlags, ur_mem_flags_t *urFlags) { - static const std::unordered_map MapFlagsMap = { - {PI_MAP_READ, UR_MAP_FLAG_READ}, - {PI_MAP_WRITE, UR_MAP_FLAG_WRITE}, - }; - return ConvertInputBitfield(piFlags, urFlags, MapFlagsMap); -} - // Iterates over the event wait list, returns correct pi_result error codes. // Invokes the callback for the latest event of each queue in the wait list. // The callback must take a single pi_event argument and return a pi_result. @@ -280,36 +203,6 @@ pi_result getInfoArray(size_t array_length, size_t param_value_size, } /// \endcond -pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list) { - if (!event_wait_list) { - return PI_SUCCESS; - } - try { - ScopedContext active(command_queue->get_context()); - - auto result = forLatestEvents( - event_wait_list, num_events_in_wait_list, - [stream](pi_event event) -> pi_result { - if (event->get_stream() == stream) { - return PI_SUCCESS; - } else { - return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); - } - }); - - if (result != PI_SUCCESS) { - return result; - } - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - } // anonymous namespace /// ------ Error handling, matching OpenCL plugin semantics. 
@@ -413,790 +306,6 @@ pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -/// \TODO Not implemented -pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, - pi_uint32, const pi_mem *, const void **, - pi_uint32, const pi_event *, pi_event *) { - sycl::detail::pi::die("Not implemented in CUDA backend"); - return {}; -} - -/// General 3D memory copy operation. -/// This function requires the corresponding CUDA context to be at the top of -/// the context stack -/// If the source and/or destination is on the device, src_ptr and/or dst_ptr -/// must be a pointer to a CUdeviceptr -static pi_result commonEnqueueMemBufferCopyRect( - CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr, - const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset, - size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, - const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset, - size_t dst_row_pitch, size_t dst_slice_pitch) { - - assert(region != nullptr); - assert(src_offset != nullptr); - assert(dst_offset != nullptr); - - assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST); - assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST); - - src_row_pitch = (!src_row_pitch) ? region->width_bytes + src_offset->x_bytes - : src_row_pitch; - src_slice_pitch = - (!src_slice_pitch) - ? ((region->height_scalar + src_offset->y_scalar) * src_row_pitch) - : src_slice_pitch; - dst_row_pitch = (!dst_row_pitch) ? region->width_bytes + dst_offset->x_bytes - : dst_row_pitch; - dst_slice_pitch = - (!dst_slice_pitch) - ? ((region->height_scalar + dst_offset->y_scalar) * dst_row_pitch) - : dst_slice_pitch; - - CUDA_MEMCPY3D params = {}; - - params.WidthInBytes = region->width_bytes; - params.Height = region->height_scalar; - params.Depth = region->depth_scalar; - - params.srcMemoryType = src_type; - params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE - ? *static_cast(src_ptr) - : 0; - params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr; - params.srcXInBytes = src_offset->x_bytes; - params.srcY = src_offset->y_scalar; - params.srcZ = src_offset->z_scalar; - params.srcPitch = src_row_pitch; - params.srcHeight = src_slice_pitch / src_row_pitch; - - params.dstMemoryType = dst_type; - params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE - ? *static_cast(dst_ptr) - : 0; - params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? 
dst_ptr : nullptr; - params.dstXInBytes = dst_offset->x_bytes; - params.dstY = dst_offset->y_scalar; - params.dstZ = dst_offset->z_scalar; - params.dstPitch = dst_row_pitch; - params.dstHeight = dst_slice_pitch / dst_row_pitch; - - return PI_CHECK_ERROR(cuMemcpy3DAsync(¶ms, cu_stream)); -} - -pi_result cuda_piEnqueueMemBufferReadRect( - pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, - pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, - pi_buff_rect_region region, size_t buffer_row_pitch, - size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - void *ptr, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset, - buffer_row_pitch, buffer_slice_pitch, ptr, CU_MEMORYTYPE_HOST, - host_offset, host_row_pitch, host_slice_pitch); - - if (event) { - retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if (event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferWriteRect( - pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, - pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, - pi_buff_rect_region region, size_t buffer_row_pitch, - size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - const void *ptr, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, ptr, CU_MEMORYTYPE_HOST, host_offset, host_row_pitch, - host_slice_pitch, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset, - buffer_row_pitch, buffer_slice_pitch); - - if (event) { - retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_write) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if (event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, - pi_mem dst_buffer, size_t src_offset, - size_t dst_offset, size_t size, - pi_uint32 num_events_in_wait_list, - 
const pi_event *event_wait_list, - pi_event *event) { - if (!command_queue) { - return PI_ERROR_INVALID_QUEUE; - } - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - pi_result result; - - auto stream = command_queue->get_next_transfer_stream(); - result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream)); - result = map_ur_error(retImplEv->start()); - } - - auto src = src_buffer->mem_.buffer_mem_.get() + src_offset; - auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset; - - result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream)); - - if (event) { - result = map_ur_error(retImplEv->record()); - *event = retImplEv.release(); - } - - return result; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - -pi_result cuda_piEnqueueMemBufferCopyRect( - pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, - pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, - pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, - size_t dst_row_pitch, size_t dst_slice_pitch, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { - - assert(src_buffer != nullptr); - assert(dst_buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get(); - CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin, - src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE, - dst_origin, dst_row_pitch, dst_slice_pitch); - - if (event) { - retImplEv->record(); - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, - const void *pattern, size_t pattern_size, - size_t offset, size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - assert(command_queue != nullptr); - - auto args_are_multiples_of_pattern_size = - (offset % pattern_size == 0) || (size % pattern_size == 0); - - auto pattern_is_valid = (pattern != nullptr); - - auto pattern_size_is_valid = - ((pattern_size & (pattern_size - 1)) == 0) && // is power of two - (pattern_size > 0) && (pattern_size <= 128); // falls within valid range - - assert(args_are_multiples_of_pattern_size && pattern_is_valid && - pattern_size_is_valid); - (void)args_are_multiples_of_pattern_size; - (void)pattern_is_valid; - (void)pattern_size_is_valid; - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - - auto stream = command_queue->get_next_transfer_stream(); - pi_result result; - result = enqueueEventsWait(command_queue, stream, 
num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream)); - result = map_ur_error(retImplEv->start()); - } - - auto dstDevice = buffer->mem_.buffer_mem_.get() + offset; - auto N = size / pattern_size; - - // pattern size in bytes - switch (pattern_size) { - case 1: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(cuMemsetD8Async(dstDevice, value, N, stream)); - break; - } - case 2: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(cuMemsetD16Async(dstDevice, value, N, stream)); - break; - } - case 4: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(cuMemsetD32Async(dstDevice, value, N, stream)); - break; - } - default: { - // CUDA has no memset functions that allow setting values more than 4 - // bytes. PI API lets you pass an arbitrary "pattern" to the buffer - // fill, which can be more than 4 bytes. We must break up the pattern - // into 4 byte values, and set the buffer using multiple strided calls. - // This means that one cuMemsetD2D32Async call is made for every 4 bytes - // in the pattern. - - auto number_of_steps = pattern_size / sizeof(uint32_t); - - // we walk up the pattern in 4-byte steps, and call cuMemset for each - // 4-byte chunk of the pattern. - for (auto step = 0u; step < number_of_steps; ++step) { - // take 4 bytes of the pattern - auto value = *(static_cast(pattern) + step); - - // offset the pointer to the part of the buffer we want to write to - auto offset_ptr = dstDevice + (step * sizeof(uint32_t)); - - // set all of the pattern chunks - result = PI_CHECK_ERROR( - cuMemsetD2D32Async(offset_ptr, pattern_size, value, 1, N, stream)); - } - - break; - } - } - - if (event) { - result = map_ur_error(retImplEv->record()); - *event = retImplEv.release(); - } - - return result; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - -static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) { - switch (array_desc.Format) { - case CU_AD_FORMAT_UNSIGNED_INT8: - case CU_AD_FORMAT_SIGNED_INT8: - return 1; - case CU_AD_FORMAT_UNSIGNED_INT16: - case CU_AD_FORMAT_SIGNED_INT16: - case CU_AD_FORMAT_HALF: - return 2; - case CU_AD_FORMAT_UNSIGNED_INT32: - case CU_AD_FORMAT_SIGNED_INT32: - case CU_AD_FORMAT_FLOAT: - return 4; - default: - sycl::detail::pi::die("Invalid image format."); - return 0; - } -} - -/// General ND memory copy operation for images (where N > 1). 
-/// This function requires the corresponding CUDA context to be at the top of -/// the context stack -/// If the source and/or destination is an array, src_ptr and/or dst_ptr -/// must be a pointer to a CUarray -static pi_result commonEnqueueMemImageNDCopy( - CUstream cu_stream, pi_mem_type img_type, const size_t *region, - const void *src_ptr, const CUmemorytype_enum src_type, - const size_t *src_offset, void *dst_ptr, const CUmemorytype_enum dst_type, - const size_t *dst_offset) { - assert(region != nullptr); - - assert(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST); - assert(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST); - - if (img_type == PI_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.srcArray = *static_cast(src_ptr); - cpyDesc.srcXInBytes = src_offset[0]; - cpyDesc.srcY = src_offset[1]; - } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.dstArray = *static_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset[0]; - cpyDesc.dstY = dst_offset[1]; - } else { - cpyDesc.dstHost = dst_ptr; - } - cpyDesc.WidthInBytes = region[0]; - cpyDesc.Height = region[1]; - return PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream)); - } - if (img_type == PI_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.srcArray = *static_cast(src_ptr); - cpyDesc.srcXInBytes = src_offset[0]; - cpyDesc.srcY = src_offset[1]; - cpyDesc.srcZ = src_offset[2]; - } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.dstArray = *static_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset[0]; - cpyDesc.dstY = dst_offset[1]; - cpyDesc.dstZ = dst_offset[2]; - } else { - cpyDesc.dstHost = dst_ptr; - } - cpyDesc.WidthInBytes = region[0]; - cpyDesc.Height = region[1]; - cpyDesc.Depth = region[2]; - return PI_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream)); - } - return PI_ERROR_INVALID_VALUE; -} - -pi_result cuda_piEnqueueMemImageRead( - pi_queue command_queue, pi_mem image, pi_bool blocking_read, - const size_t *origin, const size_t *region, size_t row_pitch, - size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - // Ignore unused parameters - (void)row_pitch; - (void)slice_pitch; - - assert(command_queue != nullptr); - assert(image != nullptr); - assert(image->mem_type_ == _pi_mem::mem_type::surface); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - CUarray array = image->mem_.surface_mem_.get_array(); - - CUDA_ARRAY_DESCRIPTOR arrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); - - int elementByteSize = imageElementByteSize(arrayDesc); - - size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0]; - - pi_mem_type imgType = - map_ur_mem_type(image->mem_.surface_mem_.get_image_type()); - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - cuMemcpyAtoHAsync(ptr, array, byteOffsetX, 
bytesToCopy, cuStream)); - } else { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]}; - - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY, - srcOffset, ptr, CU_MEMORYTYPE_HOST, nullptr); - - if (retErr != PI_SUCCESS) { - return retErr; - } - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ, - command_queue, cuStream); - new_event->record(); - *event = new_event; - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -pi_result -cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, - pi_bool blocking_write, const size_t *origin, - const size_t *region, size_t input_row_pitch, - size_t input_slice_pitch, const void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - // Ignore unused parameters - (void)blocking_write; - (void)input_row_pitch; - (void)input_slice_pitch; - - assert(command_queue != nullptr); - assert(image != nullptr); - assert(image->mem_type_ == _pi_mem::mem_type::surface); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - CUarray array = image->mem_.surface_mem_.get_array(); - - CUDA_ARRAY_DESCRIPTOR arrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); - - int elementByteSize = imageElementByteSize(arrayDesc); - - size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0]; - - pi_mem_type imgType = - map_ur_mem_type(image->mem_.surface_mem_.get_image_type()); - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - cuMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, cuStream)); - } else { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]}; - - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, ptr, CU_MEMORYTYPE_HOST, nullptr, - &array, CU_MEMORYTYPE_ARRAY, dstOffset); - - if (retErr != PI_SUCCESS) { - return retErr; - } - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE, - command_queue, cuStream); - new_event->record(); - *event = new_event; - } - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, - pi_mem dst_image, const size_t *src_origin, - const size_t *dst_origin, - const size_t *region, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - assert(src_image->mem_type_ == _pi_mem::mem_type::surface); - assert(dst_image->mem_type_ == _pi_mem::mem_type::surface); - assert(src_image->mem_.surface_mem_.get_image_type() == - dst_image->mem_.surface_mem_.get_image_type()); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - CUarray srcArray = src_image->mem_.surface_mem_.get_array(); - CUarray dstArray = dst_image->mem_.surface_mem_.get_array(); - - CUDA_ARRAY_DESCRIPTOR srcArrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray)); - CUDA_ARRAY_DESCRIPTOR dstArrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray)); - - assert(srcArrayDesc.Format == dstArrayDesc.Format); - assert(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels); - - int elementByteSize = imageElementByteSize(srcArrayDesc); - - size_t dstByteOffsetX = - dst_origin[0] * elementByteSize * srcArrayDesc.NumChannels; - size_t srcByteOffsetX = - src_origin[0] * elementByteSize * dstArrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0]; - - pi_mem_type imgType = - map_ur_mem_type(src_image->mem_.surface_mem_.get_image_type()); - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, - srcByteOffsetX, bytesToCopy)); - } else { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; - size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; - - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, - srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset); - - if (retErr != PI_SUCCESS) { - return retErr; - } - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, - command_queue, cuStream); - new_event->record(); - *event = new_event; - } - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -/// \TODO Not implemented in CUDA. -pi_result cuda_piEnqueueMemImageFill(pi_queue, pi_mem, const void *, - const size_t *, const size_t *, pi_uint32, - const pi_event *, pi_event *) { - sycl::detail::pi::die("cuda_piEnqueueMemImageFill not implemented"); - return {}; -} - -/// Implements mapping on the host using a BufferRead operation. -/// Mapped pointers are stored in the pi_mem object. -/// If the buffer uses pinned host memory a pointer to that memory is returned -/// and no read operation is done. 
-/// -pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_map, - pi_map_flags map_flags, size_t offset, - size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event, void **ret_map) { - assert(ret_map != nullptr); - assert(command_queue != nullptr); - assert(buffer != nullptr); - assert(buffer->mem_type_ == _pi_mem::mem_type::buffer); - - pi_result ret_err = PI_ERROR_INVALID_OPERATION; - const bool is_pinned = buffer->mem_.buffer_mem_.allocMode_ == - _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - - // Currently no support for overlapping regions - if (buffer->mem_.buffer_mem_.get_map_ptr() != nullptr) { - return ret_err; - } - - // Allocate a pointer in the host to store the mapped information - // TODO(ur): Remove conversion when this is ported to UR. - ur_map_flags_t map_flags_ur; - pi2urMapFlags(map_flags, &map_flags_ur); - auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags_ur); - *ret_map = buffer->mem_.buffer_mem_.get_map_ptr(); - if (hostPtr) { - ret_err = PI_SUCCESS; - } - - if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) { - // Pinned host memory is already on host so it doesn't need to be read. - ret_err = pi2ur::piEnqueueMemBufferRead( - command_queue, buffer, blocking_map, offset, size, hostPtr, - num_events_in_wait_list, event_wait_list, event); - } else { - ScopedContext active(command_queue->get_context()); - - if (is_pinned) { - ret_err = pi2ur::piEnqueueEventsWait( - command_queue, num_events_in_wait_list, event_wait_list, nullptr); - } - - if (event) { - try { - *event = _pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue, - command_queue->get_next_transfer_stream()); - (*event)->start(); - (*event)->record(); - } catch (pi_result error) { - ret_err = error; - } - } - } - - return ret_err; -} - -/// Implements the unmap from the host, using a BufferWrite operation. -/// Requires the mapped pointer to be already registered in the given memobj. -/// If memobj uses pinned host memory, this will not do a write. -/// -pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, - void *mapped_ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - pi_result ret_err = PI_SUCCESS; - - assert(command_queue != nullptr); - assert(mapped_ptr != nullptr); - assert(memobj != nullptr); - assert(memobj->mem_type_ == _pi_mem::mem_type::buffer); - assert(memobj->mem_.buffer_mem_.get_map_ptr() != nullptr); - assert(memobj->mem_.buffer_mem_.get_map_ptr() == mapped_ptr); - - const bool is_pinned = memobj->mem_.buffer_mem_.allocMode_ == - _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - - if (!is_pinned && - ((memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE) || - (memobj->mem_.buffer_mem_.get_map_flags() & - PI_MAP_WRITE_INVALIDATE_REGION))) { - // Pinned host memory is only on host so it doesn't need to be written to. 
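For readers following the port, the map path being deleted here (and re-created for UR later in this series) only needs a device-to-host read when the buffer is not pinned host memory and the map asks for read or write access. A minimal predicate capturing that rule, written against the PI flag names used above (illustrative only, not part of the patch):

#include <sycl/detail/pi.h> // for pi_map_flags, PI_MAP_READ, PI_MAP_WRITE

// Pinned (alloc_host_ptr) buffers are already host visible, so mapping them
// never triggers a BufferRead; everything else does when R or W is requested.
static bool mapNeedsInitialRead(bool isPinned, pi_map_flags mapFlags) {
  return !isPinned && ((mapFlags & PI_MAP_READ) || (mapFlags & PI_MAP_WRITE));
}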
- ret_err = pi2ur::piEnqueueMemBufferWrite( - command_queue, memobj, true, - memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr), - memobj->mem_.buffer_mem_.get_size(), mapped_ptr, - num_events_in_wait_list, event_wait_list, event); - } else { - ScopedContext active(command_queue->get_context()); - - if (is_pinned) { - ret_err = pi2ur::piEnqueueEventsWait( - command_queue, num_events_in_wait_list, event_wait_list, nullptr); - } - - if (event) { - try { - *event = _pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, command_queue, - command_queue->get_next_transfer_stream()); - (*event)->start(); - (*event)->record(); - } catch (pi_result error) { - ret_err = error; - } - } - } - - memobj->mem_.buffer_mem_.unmap(mapped_ptr); - return ret_err; -} - pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( pi_queue queue, pi_program program, const char *name, pi_bool blocking_write, size_t count, size_t offset, const void *src, @@ -1431,22 +540,23 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piSamplerRelease, pi2ur::piSamplerRelease) // Queue commands _PI_CL(piEnqueueKernelLaunch, pi2ur::piEnqueueKernelLaunch) - _PI_CL(piEnqueueNativeKernel, cuda_piEnqueueNativeKernel) + _PI_CL(piEnqueueNativeKernel, pi2ur::piEnqueueNativeKernel) _PI_CL(piEnqueueEventsWait, pi2ur::piEnqueueEventsWait) _PI_CL(piEnqueueEventsWaitWithBarrier, pi2ur::piEnqueueEventsWaitWithBarrier) _PI_CL(piEnqueueMemBufferRead, pi2ur::piEnqueueMemBufferRead) - _PI_CL(piEnqueueMemBufferReadRect, cuda_piEnqueueMemBufferReadRect) + _PI_CL(piEnqueueMemBufferReadRect, pi2ur::piEnqueueMemBufferReadRect) _PI_CL(piEnqueueMemBufferWrite, pi2ur::piEnqueueMemBufferWrite) - _PI_CL(piEnqueueMemBufferWriteRect, cuda_piEnqueueMemBufferWriteRect) - _PI_CL(piEnqueueMemBufferCopy, cuda_piEnqueueMemBufferCopy) - _PI_CL(piEnqueueMemBufferCopyRect, cuda_piEnqueueMemBufferCopyRect) - _PI_CL(piEnqueueMemBufferFill, cuda_piEnqueueMemBufferFill) - _PI_CL(piEnqueueMemImageRead, cuda_piEnqueueMemImageRead) - _PI_CL(piEnqueueMemImageWrite, cuda_piEnqueueMemImageWrite) - _PI_CL(piEnqueueMemImageCopy, cuda_piEnqueueMemImageCopy) - _PI_CL(piEnqueueMemImageFill, cuda_piEnqueueMemImageFill) - _PI_CL(piEnqueueMemBufferMap, cuda_piEnqueueMemBufferMap) - _PI_CL(piEnqueueMemUnmap, cuda_piEnqueueMemUnmap) + _PI_CL(piEnqueueMemBufferWriteRect, pi2ur::piEnqueueMemBufferWriteRect) + _PI_CL(piEnqueueMemBufferCopy, pi2ur::piEnqueueMemBufferCopy) + _PI_CL(piEnqueueMemBufferCopyRect, pi2ur::piEnqueueMemBufferCopyRect) + _PI_CL(piEnqueueMemBufferFill, pi2ur::piEnqueueMemBufferFill) + _PI_CL(piEnqueueMemImageRead, pi2ur::piEnqueueMemImageRead) + _PI_CL(piEnqueueMemImageWrite, pi2ur::piEnqueueMemImageWrite) + _PI_CL(piEnqueueMemImageCopy, pi2ur::piEnqueueMemImageCopy) + _PI_CL(piEnqueueMemImageFill, pi2ur::piEnqueueMemImageFill) + _PI_CL(piEnqueueMemBufferMap, pi2ur::piEnqueueMemBufferMap) + _PI_CL(piEnqueueMemUnmap, pi2ur::piEnqueueMemUnmap) + // USM _PI_CL(piextUSMHostAlloc, pi2ur::piextUSMHostAlloc) _PI_CL(piextUSMDeviceAlloc, pi2ur::piextUSMDeviceAlloc) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 674bea82ddef9..fd2106dd6c141 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -19,9 +19,8 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, CUstream stream, uint32_t num_events_in_wait_list, const ur_event_handle_t *event_wait_list) { - if (!event_wait_list) { - return 
UR_RESULT_SUCCESS; - } + UR_ASSERT(event_wait_list, UR_RESULT_SUCCESS); + try { ScopedContext active(command_queue->get_context()); @@ -34,11 +33,7 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, CUstream stream, return UR_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); } }); - - if (result != UR_RESULT_SUCCESS) { - return result; - } - return UR_RESULT_SUCCESS; + return result; } catch (ur_result_t err) { return err; } catch (...) { @@ -225,9 +220,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { // This function makes one stream work on the previous work (or work // represented by input events) and then all future work waits on that stream. - if (!hQueue) { - return UR_RESULT_ERROR_INVALID_QUEUE; - } + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); ur_result_t result; @@ -474,6 +467,759 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return retError; } +/// General 3D memory copy operation. +/// This function requires the corresponding CUDA context to be at the top of +/// the context stack +/// If the source and/or destination is on the device, src_ptr and/or dst_ptr +/// must be a pointer to a CUdeviceptr +static ur_result_t commonEnqueueMemBufferCopyRect( + CUstream cu_stream, ur_rect_region_t region, const void *src_ptr, + const CUmemorytype_enum src_type, ur_rect_offset_t src_offset, + size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, + const CUmemorytype_enum dst_type, ur_rect_offset_t dst_offset, + size_t dst_row_pitch, size_t dst_slice_pitch) { + + UR_ASSERT(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + src_row_pitch = + (!src_row_pitch) ? region.width + src_offset.x : src_row_pitch; + src_slice_pitch = (!src_slice_pitch) + ? ((region.height + src_offset.y) * src_row_pitch) + : src_slice_pitch; + dst_row_pitch = + (!dst_row_pitch) ? region.width + dst_offset.x : dst_row_pitch; + dst_slice_pitch = (!dst_slice_pitch) + ? ((region.height + dst_offset.y) * dst_row_pitch) + : dst_slice_pitch; + + CUDA_MEMCPY3D params = {}; + + params.WidthInBytes = region.width; + params.Height = region.height; + params.Depth = region.depth; + + params.srcMemoryType = src_type; + params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE + ? *static_cast(src_ptr) + : 0; + params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr; + params.srcXInBytes = src_offset.x; + params.srcY = src_offset.y; + params.srcZ = src_offset.z; + params.srcPitch = src_row_pitch; + params.srcHeight = src_slice_pitch / src_row_pitch; + + params.dstMemoryType = dst_type; + params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE + ? *static_cast(dst_ptr) + : 0; + params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? 
dst_ptr : nullptr;
+  params.dstXInBytes = dst_offset.x;
+  params.dstY = dst_offset.y;
+  params.dstZ = dst_offset.z;
+  params.dstPitch = dst_row_pitch;
+  params.dstHeight = dst_slice_pitch / dst_row_pitch;
+
+  return UR_CHECK_ERROR(cuMemcpy3DAsync(&params, cu_stream));
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
+    ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead,
+    ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin,
+    ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch,
+    size_t hostRowPitch, size_t hostSlicePitch, void *pDst,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+
+  ur_result_t retErr = UR_RESULT_SUCCESS;
+  CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get();
+  std::unique_ptr<ur_event_handle_t_> retImplEv{nullptr};
+
+  try {
+    ScopedContext active(hQueue->get_context());
+    CUstream cuStream = hQueue->get_next_transfer_stream();
+
+    retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
+                               phEventWaitList);
+
+    if (phEvent) {
+      retImplEv =
+          std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::make_native(
+              UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, cuStream));
+      retImplEv->start();
+    }
+
+    retErr = commonEnqueueMemBufferCopyRect(
+        cuStream, region, &devPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin,
+        bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin,
+        hostRowPitch, hostSlicePitch);
+
+    if (phEvent) {
+      retErr = retImplEv->record();
+    }
+
+    if (blockingRead) {
+      retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream));
+    }
+
+    if (phEvent) {
+      *phEvent = retImplEv.release();
+    }
+
+  } catch (ur_result_t err) {
+    retErr = err;
+  }
+  return retErr;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
+    ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite,
+    ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin,
+    ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch,
+    size_t hostRowPitch, size_t hostSlicePitch, void *pSrc,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+
+  ur_result_t retErr = UR_RESULT_SUCCESS;
+  CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get();
+  std::unique_ptr<ur_event_handle_t_> retImplEv{nullptr};
+
+  try {
+    ScopedContext active(hQueue->get_context());
+    CUstream cuStream = hQueue->get_next_transfer_stream();
+    retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
+                               phEventWaitList);
+
+    if (phEvent) {
+      retImplEv =
+          std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::make_native(
+              UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, cuStream));
+      retImplEv->start();
+    }
+
+    retErr = commonEnqueueMemBufferCopyRect(
+        cuStream, region, pSrc, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch,
+        hostSlicePitch, &devPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin,
+        bufferRowPitch, bufferSlicePitch);
+
+    if (phEvent) {
+      retErr = retImplEv->record();
+    }
+
+    if (blockingWrite) {
+      retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream));
+    }
+
+    if (phEvent) {
+      *phEvent = retImplEv.release();
+    }
+
+  } catch (ur_result_t err) {
+    retErr = err;
+  }
+  return retErr;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
+    ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc,
+    ur_mem_handle_t hBufferDst, size_t srcOffset, size_t
dstOffset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + ur_result_t result; + + auto stream = hQueue->get_next_transfer_stream(); + result = + enqueueEventsWait(hQueue, stream, numEventsInWaitList, phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, stream)); + result = retImplEv->start(); + } + + auto src = hBufferSrc->mem_.buffer_mem_.get() + srcOffset; + auto dst = hBufferDst->mem_.buffer_mem_.get() + dstOffset; + + result = UR_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream)); + + if (phEvent) { + result = retImplEv->record(); + *phEvent = retImplEv.release(); + } + + return result; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hBufferSrc, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBufferDst, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + CUdeviceptr srcPtr = hBufferSrc->mem_.buffer_mem_.get(); + CUdeviceptr dstPtr = hBufferDst->mem_.buffer_mem_.get(); + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, cuStream)); + retImplEv->start(); + } + + retErr = commonEnqueueMemBufferCopyRect( + cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, srcRowPitch, + srcSlicePitch, &dstPtr, CU_MEMORYTYPE_DEVICE, dstOrigin, dstRowPitch, + dstSlicePitch); + + if (phEvent) { + retImplEv->record(); + *phEvent = retImplEv.release(); + } + + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, const void *pPattern, + size_t patternSize, size_t offset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + auto args_are_multiples_of_pattern_size = + (offset % patternSize == 0) || (size % patternSize == 0); + + auto pattern_is_valid = (pPattern != nullptr); + + auto pattern_size_is_valid = + ((patternSize & (patternSize - 1)) == 0) && // is power of two + (patternSize > 0) && (patternSize <= 128); // falls within valid range + + UR_ASSERT(args_are_multiples_of_pattern_size && pattern_is_valid && + pattern_size_is_valid, + UR_RESULT_ERROR_INVALID_SIZE); + + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + + auto stream = hQueue->get_next_transfer_stream(); + ur_result_t result; + result = 
+ enqueueEventsWait(hQueue, stream, numEventsInWaitList, phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_FILL, hQueue, stream)); + result = retImplEv->start(); + } + + auto dstDevice = hBuffer->mem_.buffer_mem_.get() + offset; + auto N = size / patternSize; + + // pattern size in bytes + switch (patternSize) { + case 1: { + auto value = *static_cast(pPattern); + result = UR_CHECK_ERROR(cuMemsetD8Async(dstDevice, value, N, stream)); + break; + } + case 2: { + auto value = *static_cast(pPattern); + result = UR_CHECK_ERROR(cuMemsetD16Async(dstDevice, value, N, stream)); + break; + } + case 4: { + auto value = *static_cast(pPattern); + result = UR_CHECK_ERROR(cuMemsetD32Async(dstDevice, value, N, stream)); + break; + } + default: { + // CUDA has no memset functions that allow setting values more than 4 + // bytes. PI API lets you pass an arbitrary "pattern" to the buffer + // fill, which can be more than 4 bytes. We must break up the pattern + // into 4 byte values, and set the buffer using multiple strided calls. + // This means that one cuMemsetD2D32Async call is made for every 4 bytes + // in the pattern. + + auto number_of_steps = patternSize / sizeof(uint32_t); + + // we walk up the pattern in 4-byte steps, and call cuMemset for each + // 4-byte chunk of the pattern. + for (auto step = 0u; step < number_of_steps; ++step) { + // take 4 bytes of the pattern + auto value = *(static_cast(pPattern) + step); + + // offset the pointer to the part of the buffer we want to write to + auto offset_ptr = dstDevice + (step * sizeof(uint32_t)); + + // set all of the pattern chunks + result = UR_CHECK_ERROR( + cuMemsetD2D32Async(offset_ptr, patternSize, value, 1, N, stream)); + } + + break; + } + } + + if (phEvent) { + result = retImplEv->record(); + *phEvent = retImplEv.release(); + } + + return result; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) { + switch (array_desc.Format) { + case CU_AD_FORMAT_UNSIGNED_INT8: + case CU_AD_FORMAT_SIGNED_INT8: + return 1; + case CU_AD_FORMAT_UNSIGNED_INT16: + case CU_AD_FORMAT_SIGNED_INT16: + case CU_AD_FORMAT_HALF: + return 2; + case CU_AD_FORMAT_UNSIGNED_INT32: + case CU_AD_FORMAT_SIGNED_INT32: + case CU_AD_FORMAT_FLOAT: + return 4; + default: + sycl::detail::ur::die("Invalid image format."); + return 0; + } +} + +/// General ND memory copy operation for images (where N > 1). 
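The default case of the pattern switch in urEnqueueMemBufferFill above is easiest to follow with a host-side model: 32-bit word k of the pattern is written at byte offset k*4 of every patternSize-sized repetition, which is exactly what each cuMemsetD2D32Async call with pitch = patternSize, width = 1 and height = N does on the device. A sketch of the same layout on the host (illustrative; the names are mine, not from the patch):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Reproduces on the host the byte layout produced by the strided device fill.
void fillWithLargePattern(std::vector<unsigned char> &buf, const void *pattern,
                          size_t patternSize) {
  const size_t numWords = patternSize / sizeof(uint32_t);
  const size_t repeats = buf.size() / patternSize;
  for (size_t step = 0; step < numWords; ++step) {
    uint32_t word;
    std::memcpy(&word, static_cast<const unsigned char *>(pattern) + step * 4,
                sizeof(word));
    // One cuMemsetD2D32Async call covers this inner loop on the device.
    for (size_t rep = 0; rep < repeats; ++rep)
      std::memcpy(buf.data() + rep * patternSize + step * 4, &word,
                  sizeof(word));
  }
}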
+/// This function requires the corresponding CUDA context to be at the top of
+/// the context stack
+/// If the source and/or destination is an array, src_ptr and/or dst_ptr
+/// must be a pointer to a CUarray
+static ur_result_t commonEnqueueMemImageNDCopy(
+    CUstream cu_stream, ur_mem_type_t img_type, const ur_rect_region_t region,
+    const void *src_ptr, const CUmemorytype_enum src_type,
+    const ur_rect_offset_t src_offset, void *dst_ptr,
+    const CUmemorytype_enum dst_type, const ur_rect_offset_t dst_offset) {
+  UR_ASSERT(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST,
+            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  UR_ASSERT(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST,
+            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+
+  if (img_type == UR_MEM_TYPE_IMAGE2D) {
+    CUDA_MEMCPY2D cpyDesc;
+    memset(&cpyDesc, 0, sizeof(cpyDesc));
+    cpyDesc.srcMemoryType = src_type;
+    if (src_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
+      cpyDesc.srcXInBytes = src_offset.x;
+      cpyDesc.srcY = src_offset.y;
+    } else {
+      cpyDesc.srcHost = src_ptr;
+    }
+    cpyDesc.dstMemoryType = dst_type;
+    if (dst_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
+      cpyDesc.dstXInBytes = dst_offset.x;
+      cpyDesc.dstY = dst_offset.y;
+    } else {
+      cpyDesc.dstHost = dst_ptr;
+    }
+    cpyDesc.WidthInBytes = region.width;
+    cpyDesc.Height = region.height;
+    return UR_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream));
+  }
+  if (img_type == UR_MEM_TYPE_IMAGE3D) {
+    CUDA_MEMCPY3D cpyDesc;
+    memset(&cpyDesc, 0, sizeof(cpyDesc));
+    cpyDesc.srcMemoryType = src_type;
+    if (src_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
+      cpyDesc.srcXInBytes = src_offset.x;
+      cpyDesc.srcY = src_offset.y;
+      cpyDesc.srcZ = src_offset.z;
+    } else {
+      cpyDesc.srcHost = src_ptr;
+    }
+    cpyDesc.dstMemoryType = dst_type;
+    if (dst_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
+      cpyDesc.dstXInBytes = dst_offset.x;
+      cpyDesc.dstY = dst_offset.y;
+      cpyDesc.dstZ = dst_offset.z;
+    } else {
+      cpyDesc.dstHost = dst_ptr;
+    }
+    cpyDesc.WidthInBytes = region.width;
+    cpyDesc.Height = region.height;
+    cpyDesc.Depth = region.depth;
+    return UR_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream));
+  }
+  return UR_RESULT_ERROR_INVALID_VALUE;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
+    ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingRead,
+    ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch,
+    size_t slicePitch, void *pDst, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface,
+            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+
+  ur_result_t retErr = UR_RESULT_SUCCESS;
+
+  try {
+    ScopedContext active(hQueue->get_context());
+    CUstream cuStream = hQueue->get_next_transfer_stream();
+    retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
+                               phEventWaitList);
+
+    CUarray array = hImage->mem_.surface_mem_.get_array();
+
+    CUDA_ARRAY_DESCRIPTOR arrayDesc;
+    retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
+
+    int elementByteSize = imageElementByteSize(arrayDesc);
+
+    size_t byteOffsetX = origin.x * elementByteSize * arrayDesc.NumChannels;
+    size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region.width;
+
+    ur_mem_type_t imgType
= hImage->mem_.surface_mem_.get_image_type(); + if (imgType == UR_MEM_TYPE_IMAGE1D) { + retErr = UR_CHECK_ERROR( + cuMemcpyAtoHAsync(pDst, array, byteOffsetX, bytesToCopy, cuStream)); + } else { + ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + region.depth}; + ur_rect_offset_t srcOffset = {byteOffsetX, origin.y, origin.z}; + + retErr = commonEnqueueMemImageNDCopy( + cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY, + srcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}); + + if (retErr != UR_RESULT_SUCCESS) { + return retErr; + } + } + + if (phEvent) { + auto new_event = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_IMAGE_READ, hQueue, cuStream); + new_event->record(); + *phEvent = new_event; + } + + if (blockingRead) { + retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t retErr = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + CUarray array = hImage->mem_.surface_mem_.get_array(); + + CUDA_ARRAY_DESCRIPTOR arrayDesc; + retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); + + int elementByteSize = imageElementByteSize(arrayDesc); + + size_t byteOffsetX = origin.x * elementByteSize * arrayDesc.NumChannels; + size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region.width; + + ur_mem_type_t imgType = hImage->mem_.surface_mem_.get_image_type(); + if (imgType == UR_MEM_TYPE_IMAGE1D) { + retErr = UR_CHECK_ERROR( + cuMemcpyHtoAAsync(array, byteOffsetX, pSrc, bytesToCopy, cuStream)); + } else { + ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + region.depth}; + ur_rect_offset_t dstOffset = {byteOffsetX, origin.y, origin.z}; + + retErr = commonEnqueueMemImageNDCopy( + cuStream, imgType, adjustedRegion, pSrc, CU_MEMORYTYPE_HOST, + ur_rect_offset_t{}, &array, CU_MEMORYTYPE_ARRAY, dstOffset); + + if (retErr != UR_RESULT_SUCCESS) { + return retErr; + } + } + + if (phEvent) { + auto new_event = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_IMAGE_WRITE, hQueue, cuStream); + new_event->record(); + *phEvent = new_event; + } + } catch (ur_result_t err) { + return err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( + ur_queue_handle_t hQueue, ur_mem_handle_t hImageSrc, + ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hImageSrc->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageDst->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageSrc->mem_.surface_mem_.get_image_type() == + hImageDst->mem_.surface_mem_.get_image_type(), + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t retErr = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + CUarray srcArray = hImageSrc->mem_.surface_mem_.get_array(); + CUarray dstArray = hImageDst->mem_.surface_mem_.get_array(); + + CUDA_ARRAY_DESCRIPTOR srcArrayDesc; + retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray)); + CUDA_ARRAY_DESCRIPTOR dstArrayDesc; + retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray)); + + UR_ASSERT(srcArrayDesc.Format == dstArrayDesc.Format, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + int elementByteSize = imageElementByteSize(srcArrayDesc); + + size_t dstByteOffsetX = + dstOrigin.x * elementByteSize * srcArrayDesc.NumChannels; + size_t srcByteOffsetX = + srcOrigin.x * elementByteSize * dstArrayDesc.NumChannels; + size_t bytesToCopy = + elementByteSize * srcArrayDesc.NumChannels * region.width; + + ur_mem_type_t imgType = hImageSrc->mem_.surface_mem_.get_image_type(); + if (imgType == UR_MEM_TYPE_IMAGE1D) { + retErr = UR_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, + srcByteOffsetX, bytesToCopy)); + } else { + ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + region.depth}; + ur_rect_offset_t srcOffset = {srcByteOffsetX, srcOrigin.y, srcOrigin.z}; + ur_rect_offset_t dstOffset = {dstByteOffsetX, dstOrigin.y, dstOrigin.z}; + + retErr = commonEnqueueMemImageNDCopy( + cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, + srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset); + + if (retErr != UR_RESULT_SUCCESS) { + return retErr; + } + } + + if (phEvent) { + auto new_event = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_IMAGE_COPY, hQueue, cuStream); + new_event->record(); + *phEvent = new_event; + } + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return retErr; +} + +/// Implements mapping on the host using a BufferRead operation. +/// Mapped pointers are stored in the pi_mem object. +/// If the buffer uses pinned host memory a pointer to that memory is returned +/// and no read operation is done. 
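One detail shared by the image read, write and copy entry points above: origin.x and region.width arrive in texels and are converted to bytes by multiplying with the element size and channel count from the CUDA_ARRAY_DESCRIPTOR. A hypothetical helper making that conversion explicit (not part of the patch):

#include <cstddef>

struct ImageRowSpan {
  size_t byteOffsetX; // byte offset of the first requested texel in a row
  size_t bytesToCopy; // contiguous bytes covered by region.width texels
};

// elementByteSize comes from imageElementByteSize(); numChannels comes from
// the array descriptor's NumChannels field.
static ImageRowSpan toByteSpan(size_t originX, size_t widthInTexels,
                               size_t elementByteSize, size_t numChannels) {
  const size_t texelBytes = elementByteSize * numChannels;
  return {originX * texelBytes, widthInTexels * texelBytes};
}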
+/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, void **ppRetMap) { + UR_ASSERT(ppRetMap != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hQueue != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t ret_err = UR_RESULT_ERROR_INVALID_MEM_OBJECT; + const bool is_pinned = + hBuffer->mem_.buffer_mem_.allocMode_ == + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + + // Currently no support for overlapping regions + if (hBuffer->mem_.buffer_mem_.get_map_ptr() != nullptr) { + return ret_err; + } + + // Allocate a pointer in the host to store the mapped information + auto hostPtr = hBuffer->mem_.buffer_mem_.map_to_ptr(offset, mapFlags); + *ppRetMap = hBuffer->mem_.buffer_mem_.get_map_ptr(); + if (hostPtr) { + ret_err = UR_RESULT_SUCCESS; + } + + if (!is_pinned && + ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) { + // Pinned host memory is already on host so it doesn't need to be read. + ret_err = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, + hostPtr, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + ScopedContext active(hQueue->get_context()); + + if (is_pinned) { + ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, + phEventWaitList, nullptr); + } + + if (phEvent) { + try { + *phEvent = + ur_event_handle_t_::make_native(UR_COMMAND_MEM_BUFFER_MAP, hQueue, + hQueue->get_next_transfer_stream()); + (*phEvent)->start(); + (*phEvent)->record(); + } catch (ur_result_t error) { + ret_err = error; + } + } + } + + return ret_err; +} + +/// Implements the unmap from the host, using a BufferWrite operation. +/// Requires the mapped pointer to be already registered in the given memobj. +/// If memobj uses pinned host memory, this will not do a write. +/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( + ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + ur_result_t ret_err = UR_RESULT_SUCCESS; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMappedPtr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() != nullptr, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() == pMappedPtr, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + const bool is_pinned = + hMem->mem_.buffer_mem_.allocMode_ == + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + + if (!is_pinned && + (hMem->mem_.buffer_mem_.get_map_flags() & UR_MAP_FLAG_WRITE)) { + // Pinned host memory is only on host so it doesn't need to be written to. 
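From the caller's point of view, the map/unmap pair implemented here behaves like a staged host copy: map stages the bytes on the host (via BufferRead unless the allocation is pinned), and unmap writes them back when the mapping requested write access. A hedged usage sketch against the entry points declared in this patch (error handling omitted, handle names are placeholders):

#include <cstring>

void zeroFirstBytes(ur_queue_handle_t queue, ur_mem_handle_t buffer,
                    size_t numBytes) {
  void *hostView = nullptr;
  urEnqueueMemBufferMap(queue, buffer, /*blockingMap=*/true, UR_MAP_FLAG_WRITE,
                        /*offset=*/0, numBytes, 0, nullptr, nullptr, &hostView);
  std::memset(hostView, 0, numBytes); // mutate the staged host copy
  // For non-pinned allocations the unmap performs the BufferWrite back to the
  // device and then releases the mapping.
  urEnqueueMemUnmap(queue, buffer, hostView, 0, nullptr, nullptr);
}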
+ ret_err = urEnqueueMemBufferWrite( + hQueue, hMem, true, hMem->mem_.buffer_mem_.get_map_offset(pMappedPtr), + hMem->mem_.buffer_mem_.get_size(), pMappedPtr, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + ScopedContext active(hQueue->get_context()); + + if (is_pinned) { + ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, + phEventWaitList, nullptr); + } + + if (phEvent) { + try { + *phEvent = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_UNMAP, hQueue, hQueue->get_next_transfer_stream()); + (*phEvent)->start(); + (*phEvent)->record(); + } catch (ur_result_t error) { + ret_err = error; + } + } + } + + hMem->mem_.buffer_mem_.unmap(pMappedPtr); + return ret_err; +} + /// TODO(ur): Add support for the offset. UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( ur_queue_handle_t hQueue, void *ptr, size_t patternSize, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index d7751a02e9707..c95eed5c24e05 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -169,18 +169,18 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnEventsWait = urEnqueueEventsWait; pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; - pDdiTable->pfnMemBufferCopy = nullptr; - pDdiTable->pfnMemBufferCopyRect = nullptr; - pDdiTable->pfnMemBufferFill = nullptr; - pDdiTable->pfnMemBufferMap = nullptr; + pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; + pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; + pDdiTable->pfnMemBufferFill = urEnqueueMemBufferFill; + pDdiTable->pfnMemBufferMap = urEnqueueMemBufferMap; pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; - pDdiTable->pfnMemBufferReadRect = nullptr; + pDdiTable->pfnMemBufferReadRect = urEnqueueMemBufferReadRect; pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; - pDdiTable->pfnMemBufferWriteRect = nullptr; - pDdiTable->pfnMemImageCopy = nullptr; - pDdiTable->pfnMemImageRead = nullptr; - pDdiTable->pfnMemImageWrite = nullptr; - pDdiTable->pfnMemUnmap = nullptr; + pDdiTable->pfnMemBufferWriteRect = urEnqueueMemBufferWriteRect; + pDdiTable->pfnMemImageCopy = urEnqueueMemImageCopy; + pDdiTable->pfnMemImageRead = urEnqueueMemImageRead; + pDdiTable->pfnMemImageWrite = urEnqueueMemImageWrite; + pDdiTable->pfnMemUnmap = urEnqueueMemUnmap; pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; pDdiTable->pfnUSMFill = urEnqueueUSMFill; pDdiTable->pfnUSMAdvise = urEnqueueUSMAdvise; From 4f9277bf6ca7afdc0e1dc906a5aa34cac531bffb Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Tue, 2 May 2023 10:48:34 +0100 Subject: [PATCH 22/45] Don't check MAX_MEM_ALLOC_SIZE when creating a buffer Rely on cuMemAlloc failing if the given size is too big instead --- sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp index 59975b0a7b821..abca91b594e19 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp @@ -31,10 +31,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // Need input memory object UR_ASSERT(phBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(size != 0, 
UR_RESULT_ERROR_INVALID_BUFFER_SIZE); - uint64_t maxAlloc = 0; - urDeviceGetInfo(hContext->get_device(), UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, - sizeof(maxAlloc), &maxAlloc, nullptr); - UR_ASSERT(size <= maxAlloc, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); // Currently, USE_HOST_PTR is not implemented using host register // since this triggers a weird segfault after program ends. From 8968c1f0aaf38a0663c1c7333fa566df0fc4f2b2 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 28 Apr 2023 10:00:15 +0100 Subject: [PATCH 23/45] [SYCL][CUDA] Port CUDA global variable read/write to UR --- sycl/plugins/cuda/pi_cuda.cpp | 80 +------------------ .../ur/adapters/cuda/enqueue.cpp | 73 +++++++++++++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 4 +- 3 files changed, 77 insertions(+), 80 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 09c2fddc6e207..992b69f4078d0 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -306,82 +306,6 @@ pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( - pi_queue queue, pi_program program, const char *name, - pi_bool blocking_write, size_t count, size_t offset, const void *src, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { - assert(queue != nullptr); - assert(program != nullptr); - - if (name == nullptr || src == nullptr) - return PI_ERROR_INVALID_VALUE; - - // Since CUDA requires a the global variable to be referenced by name, we use - // metadata to find the correct name to access it by. - auto device_global_name_it = program->globalIDMD_.find(name); - if (device_global_name_it == program->globalIDMD_.end()) - return PI_ERROR_INVALID_VALUE; - std::string device_global_name = device_global_name_it->second; - - pi_result result = PI_SUCCESS; - try { - CUdeviceptr device_global = 0; - size_t device_global_size = 0; - result = PI_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, program->get(), - device_global_name.c_str())); - - if (offset + count > device_global_size) - return PI_ERROR_INVALID_VALUE; - - return pi2ur::piextUSMEnqueueMemcpy( - queue, blocking_write, reinterpret_cast(device_global + offset), - src, count, num_events_in_wait_list, event_wait_list, event); - } catch (pi_result error) { - result = error; - } - return result; -} - -pi_result cuda_piextEnqueueDeviceGlobalVariableRead( - pi_queue queue, pi_program program, const char *name, pi_bool blocking_read, - size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - assert(queue != nullptr); - assert(program != nullptr); - - if (name == nullptr || dst == nullptr) - return PI_ERROR_INVALID_VALUE; - - // Since CUDA requires a the global variable to be referenced by name, we use - // metadata to find the correct name to access it by. 
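Returning briefly to the urMemBufferCreate hunk above (patch 22): with the explicit UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE check removed, an oversized request is expected to fail inside cuMemAlloc instead. A minimal sketch of that assumption; the error mapping shown is mine, not taken from the patch:

#include <cuda.h>

// cuMemAlloc reports CUDA_ERROR_OUT_OF_MEMORY (among other errors) when the
// requested size cannot be satisfied, which now replaces the up-front check.
static ur_result_t allocDeviceBuffer(size_t bytes, CUdeviceptr *outPtr) {
  CUresult rc = cuMemAlloc(outPtr, bytes);
  return rc == CUDA_SUCCESS ? UR_RESULT_SUCCESS
                            : UR_RESULT_ERROR_OUT_OF_RESOURCES; // hypothetical
}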
- auto device_global_name_it = program->globalIDMD_.find(name); - if (device_global_name_it == program->globalIDMD_.end()) - return PI_ERROR_INVALID_VALUE; - std::string device_global_name = device_global_name_it->second; - - pi_result result = PI_SUCCESS; - try { - CUdeviceptr device_global = 0; - size_t device_global_size = 0; - result = PI_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, program->get(), - device_global_name.c_str())); - - if (offset + count > device_global_size) - return PI_ERROR_INVALID_VALUE; - - return pi2ur::piextUSMEnqueueMemcpy( - queue, blocking_read, dst, - reinterpret_cast(device_global + offset), count, - num_events_in_wait_list, event_wait_list, event); - } catch (pi_result error) { - result = error; - } - return result; -} - /// Host Pipes pi_result cuda_piextEnqueueReadHostPipe( pi_queue queue, pi_program program, const char *pipe_symbol, @@ -572,9 +496,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextUSMGetMemAllocInfo, pi2ur::piextUSMGetMemAllocInfo) // Device global variable _PI_CL(piextEnqueueDeviceGlobalVariableWrite, - cuda_piextEnqueueDeviceGlobalVariableWrite) + pi2ur::piextEnqueueDeviceGlobalVariableWrite) _PI_CL(piextEnqueueDeviceGlobalVariableRead, - cuda_piextEnqueueDeviceGlobalVariableRead) + pi2ur::piextEnqueueDeviceGlobalVariableRead) // Host Pipe _PI_CL(piextEnqueueReadHostPipe, cuda_piextEnqueueReadHostPipe) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index fd2106dd6c141..99a8285d2cd44 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -1620,3 +1620,76 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( } return retErr; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingWrite, size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(name && pSrc, UR_RESULT_ERROR_INVALID_VALUE); + + // Since CUDA requires a the global variable to be referenced by name, we use + // metadata to find the correct name to access it by. 
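The globalIDMD_ lookup on the following lines is a plain string-to-string map built from the program's metadata: the key is the unique ID the SYCL runtime passes in as name, and the value is the symbol that actually exists in the CUDA module and can be handed to cuModuleGetGlobal. Conceptually it looks like this (the entries below are invented for illustration):

#include <string>
#include <unordered_map>

static const std::unordered_map<std::string, std::string> exampleGlobalIDMD = {
    {"dev_global_counter", "_ZL18dev_global_counter"}, // hypothetical entry
};

static const char *lookupDeviceGlobalSymbol(const std::string &uniqueId) {
  auto it = exampleGlobalIDMD.find(uniqueId);
  return it == exampleGlobalIDMD.end() ? nullptr : it->second.c_str();
}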
+ auto device_global_name_it = hProgram->globalIDMD_.find(name); + if (device_global_name_it == hProgram->globalIDMD_.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + std::string device_global_name = device_global_name_it->second; + + ur_result_t result = UR_RESULT_SUCCESS; + try { + CUdeviceptr device_global = 0; + size_t device_global_size = 0; + result = UR_CHECK_ERROR( + cuModuleGetGlobal(&device_global, &device_global_size, hProgram->get(), + device_global_name.c_str())); + + if (offset + count > device_global_size) + return UR_RESULT_ERROR_INVALID_VALUE; + + return urEnqueueUSMMemcpy( + hQueue, blockingWrite, reinterpret_cast(device_global + offset), + pSrc, count, numEventsInWaitList, phEventWaitList, phEvent); + } catch (ur_result_t error) { + result = error; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingRead, size_t count, size_t offset, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(name && pDst, UR_RESULT_ERROR_INVALID_VALUE); + + // Since CUDA requires a the global variable to be referenced by name, we use + // metadata to find the correct name to access it by. + auto device_global_name_it = hProgram->globalIDMD_.find(name); + if (device_global_name_it == hProgram->globalIDMD_.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + std::string device_global_name = device_global_name_it->second; + + ur_result_t result = UR_RESULT_SUCCESS; + try { + CUdeviceptr device_global = 0; + size_t device_global_size = 0; + result = UR_CHECK_ERROR( + cuModuleGetGlobal(&device_global, &device_global_size, hProgram->get(), + device_global_name.c_str())); + + if (offset + count > device_global_size) + return UR_RESULT_ERROR_INVALID_VALUE; + + return urEnqueueUSMMemcpy( + hQueue, blockingRead, pDst, + reinterpret_cast(device_global + offset), count, + numEventsInWaitList, phEventWaitList, phEvent); + } catch (ur_result_t error) { + result = error; + } + return result; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index c95eed5c24e05..b87e2f822d391 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -164,8 +164,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnDeviceGlobalVariableRead = nullptr; - pDdiTable->pfnDeviceGlobalVariableWrite = nullptr; + pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead; + pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; pDdiTable->pfnEventsWait = urEnqueueEventsWait; pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; From 1fb2afdb6a93d5944bee0ef0c84faefd75b4273c Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 3 May 2023 12:22:27 +0100 Subject: [PATCH 24/45] [SYCL][CUDA] Only build CUDA UR adapter when CUDA plugin is enabled --- sycl/plugins/unified_runtime/CMakeLists.txt | 78 +++++++++++---------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt 
b/sycl/plugins/unified_runtime/CMakeLists.txt index 2288a8e9949e1..eaec6367392b5 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -124,44 +124,46 @@ set_target_properties("ur_adapter_level_zero" PROPERTIES SOVERSION "0" ) -# Build CUDA adapter -add_sycl_library("ur_adapter_cuda" SHARED - SOURCES - "ur/ur.hpp" - "ur/ur.cpp" - "ur/usm_allocator.cpp" - "ur/usm_allocator.hpp" - "ur/adapters/cuda/common.cpp" - "ur/adapters/cuda/common.hpp" - "ur/adapters/cuda/context.cpp" - "ur/adapters/cuda/context.hpp" - "ur/adapters/cuda/device.cpp" - "ur/adapters/cuda/device.hpp" - "ur/adapters/cuda/enqueue.cpp" - "ur/adapters/cuda/event.cpp" - "ur/adapters/cuda/event.hpp" - "ur/adapters/cuda/platform.cpp" - "ur/adapters/cuda/platform.hpp" - "ur/adapters/cuda/program.cpp" - "ur/adapters/cuda/program.hpp" - "ur/adapters/cuda/kernel.cpp" - "ur/adapters/cuda/kernel.hpp" - "ur/adapters/cuda/queue.cpp" - "ur/adapters/cuda/queue.hpp" - "ur/adapters/cuda/sampler.cpp" - "ur/adapters/cuda/sampler.hpp" - "ur/adapters/cuda/memory.cpp" - "ur/adapters/cuda/memory.hpp" - "ur/adapters/cuda/usm.cpp" - "ur/adapters/cuda/ur_interface_loader.cpp" - "ur/adapters/cuda/tracing.cpp" - INCLUDE_DIRS - ${sycl_inc_dir} - LIBRARIES - UnifiedRuntime-Headers - Threads::Threads - cudadrv -) +if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) + # Build CUDA adapter + add_sycl_library("ur_adapter_cuda" SHARED + SOURCES + "ur/ur.hpp" + "ur/ur.cpp" + "ur/usm_allocator.cpp" + "ur/usm_allocator.hpp" + "ur/adapters/cuda/common.cpp" + "ur/adapters/cuda/common.hpp" + "ur/adapters/cuda/context.cpp" + "ur/adapters/cuda/context.hpp" + "ur/adapters/cuda/device.cpp" + "ur/adapters/cuda/device.hpp" + "ur/adapters/cuda/enqueue.cpp" + "ur/adapters/cuda/event.cpp" + "ur/adapters/cuda/event.hpp" + "ur/adapters/cuda/platform.cpp" + "ur/adapters/cuda/platform.hpp" + "ur/adapters/cuda/program.cpp" + "ur/adapters/cuda/program.hpp" + "ur/adapters/cuda/kernel.cpp" + "ur/adapters/cuda/kernel.hpp" + "ur/adapters/cuda/queue.cpp" + "ur/adapters/cuda/queue.hpp" + "ur/adapters/cuda/sampler.cpp" + "ur/adapters/cuda/sampler.hpp" + "ur/adapters/cuda/memory.cpp" + "ur/adapters/cuda/memory.hpp" + "ur/adapters/cuda/usm.cpp" + "ur/adapters/cuda/ur_interface_loader.cpp" + "ur/adapters/cuda/tracing.cpp" + INCLUDE_DIRS + ${sycl_inc_dir} + LIBRARIES + UnifiedRuntime-Headers + Threads::Threads + cudadrv + ) +endif() if (TARGET UnifiedRuntimeLoader) set_target_properties(hello_world PROPERTIES EXCLUDE_FROM_ALL 1 EXCLUDE_FROM_DEFAULT_BUILD 1) From 96c85ac0857de96ec3723badd5fd173b099ccdba Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 3 May 2023 14:14:27 +0100 Subject: [PATCH 25/45] [SYCL][CUDA] Don't link non-CUDA adapters with cudadrv --- sycl/plugins/unified_runtime/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index eaec6367392b5..9ceb01b670b98 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -116,7 +116,6 @@ add_sycl_library("ur_adapter_level_zero" SHARED LevelZeroLoader-Headers LevelZeroLoader Threads::Threads - cudadrv ) set_target_properties("ur_adapter_level_zero" PROPERTIES From 3812978c9af72f8ddfdc37249cab596bb344f4b5 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 4 May 2023 13:56:37 +0100 Subject: [PATCH 26/45] [SYCL][CUDA] Port piextGetDeviceFunctionPointer and piextDeviceSelectBinary to UR --- sycl/plugins/cuda/pi_cuda.cpp | 56 
+------------------ .../ur/adapters/cuda/device.cpp | 25 +++++++++ .../ur/adapters/cuda/program.cpp | 25 +++++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 4 +- 4 files changed, 54 insertions(+), 56 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 992b69f4078d0..6f3002e808cc7 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -253,58 +253,6 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { //-- PI API implementation extern "C" { -/// \return If available, the first binary that is PTX -/// -pi_result cuda_piextDeviceSelectBinary(pi_device device, - pi_device_binary *binaries, - pi_uint32 num_binaries, - pi_uint32 *selected_binary) { - // Ignore unused parameter - (void)device; - - if (!binaries) { - sycl::detail::pi::die("No list of device images provided"); - } - if (num_binaries < 1) { - sycl::detail::pi::die("No binary images in the list"); - } - - // Look for an image for the NVPTX64 target, and return the first one that is - // found - for (pi_uint32 i = 0; i < num_binaries; i++) { - if (strcmp(binaries[i]->DeviceTargetSpec, - __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64) == 0) { - *selected_binary = i; - return PI_SUCCESS; - } - } - - // No image can be loaded for the given device - return PI_ERROR_INVALID_BINARY; -} - -pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, - pi_program program, - const char *func_name, - pi_uint64 *func_pointer_ret) { - // Check if device passed is the same the device bound to the context - assert(device == program->get_context()->get_device()); - assert(func_pointer_ret != nullptr); - - CUfunction func; - CUresult ret = cuModuleGetFunction(&func, program->get(), func_name); - *func_pointer_ret = reinterpret_cast(func); - pi_result retError = PI_SUCCESS; - - if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_FOUND) - retError = PI_CHECK_ERROR(ret); - if (ret == CUDA_ERROR_NOT_FOUND) { - *func_pointer_ret = 0; - retError = PI_ERROR_INVALID_KERNEL_NAME; - } - - return retError; -} /// Host Pipes pi_result cuda_piextEnqueueReadHostPipe( @@ -375,8 +323,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piDevicePartition, pi2ur::piDevicePartition) _PI_CL(piDeviceRetain, pi2ur::piDeviceRetain) _PI_CL(piDeviceRelease, pi2ur::piDeviceRelease) - _PI_CL(piextDeviceSelectBinary, cuda_piextDeviceSelectBinary) - _PI_CL(piextGetDeviceFunctionPointer, cuda_piextGetDeviceFunctionPointer) + _PI_CL(piextDeviceSelectBinary, pi2ur::piextDeviceSelectBinary) + _PI_CL(piextGetDeviceFunctionPointer, pi2ur::piextGetDeviceFunctionPointer) _PI_CL(piextDeviceGetNativeHandle, pi2ur::piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, pi2ur::piextDeviceCreateWithNativeHandle) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index 567377be8796f..6d87373524341 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -1145,3 +1145,28 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } + +/// \return If available, the first binary that is PTX +/// +UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( + ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries, + uint32_t NumBinaries, uint32_t *pSelectedBinary) { + // Ignore unused parameter + (void)hDevice; + + UR_ASSERT(pBinaries, UR_RESULT_ERROR_INVALID_NULL_POINTER); + 
UR_ASSERT(NumBinaries > 0, UR_RESULT_ERROR_INVALID_ARGUMENT); + + // Look for an image for the NVPTX64 target, and return the first one that is + // found + for (uint32_t i = 0; i < NumBinaries; i++) { + if (strcmp(pBinaries[i].pDeviceTargetSpec, + UR_DEVICE_BINARY_TARGET_NVPTX64) == 0) { + *pSelectedBinary = i; + return UR_RESULT_SUCCESS; + } + } + + // No image can be loaded for the given device + return UR_RESULT_ERROR_INVALID_BINARY; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 129f4eb06b81e..82f6db76fda68 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -442,3 +442,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( + ur_device_handle_t hDevice, ur_program_handle_t hProgram, + const char *pFunctionName, void **ppFunctionPointer) { + // Check if device passed is the same the device bound to the context + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice == hProgram->get_context()->get_device(), + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(ppFunctionPointer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + CUfunction func; + CUresult ret = cuModuleGetFunction(&func, hProgram->get(), pFunctionName); + *ppFunctionPointer = func; + ur_result_t retError = UR_RESULT_SUCCESS; + + if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_FOUND) + retError = UR_CHECK_ERROR(ret); + if (ret == CUDA_ERROR_NOT_FOUND) { + *ppFunctionPointer = 0; + retError = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; + } + + return retError; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index b87e2f822d391..f7fb58c256d47 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -89,7 +89,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; - pDdiTable->pfnGetFunctionPointer = nullptr; + pDdiTable->pfnGetFunctionPointer = urProgramGetFunctionPointer; pDdiTable->pfnGetInfo = urProgramGetInfo; pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; pDdiTable->pfnLink = urProgramLink; @@ -250,7 +250,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( pDdiTable->pfnPartition = urDevicePartition; pDdiTable->pfnRelease = urDeviceRelease; pDdiTable->pfnRetain = urDeviceRetain; - pDdiTable->pfnSelectBinary = nullptr; + pDdiTable->pfnSelectBinary = urDeviceSelectBinary; return UR_RESULT_SUCCESS; } From 764e683bcfb54c40ec28db0f2632e1817a8cbdc4 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 5 May 2023 15:30:55 +0100 Subject: [PATCH 27/45] [SYCL][CUDA] Port piPluginGetBackendOption to UR --- sycl/plugins/cuda/pi_cuda.cpp | 21 +------------------ .../ur/adapters/cuda/platform.cpp | 20 ++++++++++++++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 1 + 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp 
b/sycl/plugins/cuda/pi_cuda.cpp index 6f3002e808cc7..70a7b319cb353 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -54,25 +54,6 @@ pi_result map_error(CUresult result) { } } -// Returns plugin specific backend option. -// Current support is only for optimization options. -// Return empty string for cuda. -// TODO: Determine correct string to be passed. -pi_result cuda_piPluginGetBackendOption(pi_platform, - const char *frontend_option, - const char **backend_option) { - using namespace std::literals; - if (frontend_option == nullptr) - return PI_ERROR_INVALID_VALUE; - if (frontend_option == "-O0"sv || frontend_option == "-O1"sv || - frontend_option == "-O2"sv || frontend_option == "-O3"sv || - frontend_option == ""sv) { - *backend_option = ""; - return PI_SUCCESS; - } - return PI_ERROR_INVALID_VALUE; -} - // Iterates over the event wait list, returns correct pi_result error codes. // Invokes the callback for the latest event of each queue in the wait list. // The callback must take a single pi_event argument and return a pi_result. @@ -457,7 +438,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piPluginGetLastError, pi2ur::piPluginGetLastError) _PI_CL(piTearDown, pi2ur::piTearDown) _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) - _PI_CL(piPluginGetBackendOption, cuda_piPluginGetBackendOption) + _PI_CL(piPluginGetBackendOption, pi2ur::piPluginGetBackendOption) #undef _PI_CL diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index 2ca8c516c08e3..dbbb177926c32 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -181,3 +181,23 @@ UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { disableCUDATracing(); return UR_RESULT_SUCCESS; } + +// Returns plugin specific backend option. +// Current support is only for optimization options. +// Return empty string for cuda. +// TODO: Determine correct string to be passed. 
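// Illustrative sketch, separate from the patch itself: the contract of the
// urPlatformGetBackendOption entry point added below, assuming `Platform` is
// a valid ur_platform_handle_t. Each recognised frontend flag ("-O0" through
// "-O3", or the empty string) maps to an empty backend option for CUDA, and
// anything else is rejected:
//
//   const char *BackendOpt = nullptr;
//   urPlatformGetBackendOption(Platform, "-O2", &BackendOpt); // BackendOpt == ""
//   urPlatformGetBackendOption(Platform, "-funsafe", &BackendOpt);
//   // -> UR_RESULT_ERROR_INVALID_VALUE; a null pFrontendOption yields
//   //    UR_RESULT_ERROR_INVALID_NULL_POINTER.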
+UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( + ur_platform_handle_t hPlatform, const char *pFrontendOption, + const char **ppPlatformOption) { + (void)hPlatform; + using namespace std::literals; + if (pFrontendOption == nullptr) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (pFrontendOption == "-O0"sv || pFrontendOption == "-O1"sv || + pFrontendOption == "-O2"sv || pFrontendOption == "-O3"sv || + pFrontendOption == ""sv) { + *ppPlatformOption = ""; + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index f7fb58c256d47..49189598be91d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -41,6 +41,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; pDdiTable->pfnGetInfo = urPlatformGetInfo; pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption; return UR_RESULT_SUCCESS; } From d98adf801cbb0ceb14474071074bed3e9feb118d Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 5 May 2023 16:36:21 +0100 Subject: [PATCH 28/45] [SYCL][CUDA] Port read/write host pipe to UR --- sycl/plugins/cuda/pi_cuda.cpp | 41 +------------------ .../ur/adapters/cuda/enqueue.cpp | 37 +++++++++++++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 2 + 3 files changed, 41 insertions(+), 39 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 70a7b319cb353..1d28c08f64098 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -235,43 +235,6 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { //-- PI API implementation extern "C" { -/// Host Pipes -pi_result cuda_piextEnqueueReadHostPipe( - pi_queue queue, pi_program program, const char *pipe_symbol, - pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, pi_event *event) { - (void)queue; - (void)program; - (void)pipe_symbol; - (void)blocking; - (void)ptr; - (void)size; - (void)num_events_in_waitlist; - (void)events_waitlist; - (void)event; - - sycl::detail::pi::die("cuda_piextEnqueueReadHostPipe not implemented"); - return {}; -} - -pi_result cuda_piextEnqueueWriteHostPipe( - pi_queue queue, pi_program program, const char *pipe_symbol, - pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, pi_event *event) { - (void)queue; - (void)program; - (void)pipe_symbol; - (void)blocking; - (void)ptr; - (void)size; - (void)num_events_in_waitlist; - (void)events_waitlist; - (void)event; - - sycl::detail::pi::die("cuda_piextEnqueueWriteHostPipe not implemented"); - return {}; -} - const char SupportedVersion[] = _PI_CUDA_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { @@ -430,8 +393,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { pi2ur::piextEnqueueDeviceGlobalVariableRead) // Host Pipe - _PI_CL(piextEnqueueReadHostPipe, cuda_piextEnqueueReadHostPipe) - _PI_CL(piextEnqueueWriteHostPipe, cuda_piextEnqueueWriteHostPipe) + _PI_CL(piextEnqueueReadHostPipe, pi2ur::piextEnqueueReadHostPipe) + _PI_CL(piextEnqueueWriteHostPipe, pi2ur::piextEnqueueWriteHostPipe) _PI_CL(piextKernelSetArgMemObj, pi2ur::piextKernelSetArgMemObj) 
_PI_CL(piextKernelSetArgSampler, pi2ur::piextKernelSetArgSampler) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 99a8285d2cd44..073a9ffce26a6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -1693,3 +1693,40 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( } return result; } + +/// Host Pipes +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pDst, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + (void)hQueue; + (void)hProgram; + (void)pipe_symbol; + (void)blocking; + (void)pDst; + (void)size; + (void)numEventsInWaitList; + (void)phEventWaitList; + (void)phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pSrc, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + (void)hQueue; + (void)hProgram; + (void)pipe_symbol; + (void)blocking; + (void)pSrc; + (void)size; + (void)numEventsInWaitList; + (void)phEventWaitList; + (void)phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 49189598be91d..ebb4bc771ccd2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -188,6 +188,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; + pDdiTable->pfnReadHostPipe = urEnqueueReadHostPipe; + pDdiTable->pfnWriteHostPipe = urEnqueueWriteHostPipe; return UR_RESULT_SUCCESS; } From 797d3f750c192b8c6051b9bc8710fc2914ea374d Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Mon, 15 May 2023 13:11:05 +0100 Subject: [PATCH 29/45] [CUDA][UR]Fix program_info_kernel_names --- sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 82f6db76fda68..91e0b5c85d1b1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -332,7 +332,7 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, return ReturnValue(&hProgram->binarySizeInBytes_, 1); case UR_PROGRAM_INFO_BINARIES: return ReturnValue(&hProgram->binary_, 1); - case UR_PROGRAM_INFO_NUM_KERNELS: + case UR_PROGRAM_INFO_KERNEL_NAMES: return getKernelNames(hProgram); default: break; From 6f68c7c7a085e1b63effaf919e0bc9995a58cad7 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Tue, 9 May 2023 16:38:53 +0100 Subject: [PATCH 30/45] [SYCL][CUDA] Remove unused code from CUDA PI and move remaining documentation to UR --- sycl/plugins/cuda/pi_cuda.cpp | 213 ------------------ sycl/plugins/cuda/pi_cuda.hpp | 175 -------------- 
.../ur/adapters/cuda/context.cpp | 4 - .../ur/adapters/cuda/context.hpp | 46 +++- .../ur/adapters/cuda/device.cpp | 3 - .../ur/adapters/cuda/enqueue.cpp | 1 - .../ur/adapters/cuda/event.hpp | 1 - .../ur/adapters/cuda/kernel.hpp | 16 ++ 8 files changed, 54 insertions(+), 405 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 1d28c08f64098..bc8cbaa1e7e2c 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -16,221 +16,8 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - // Forward declarations void enableCUDATracing(); -void disableCUDATracing(); - -namespace { -pi_result map_error(CUresult result) { - switch (result) { - case CUDA_SUCCESS: - return PI_SUCCESS; - case CUDA_ERROR_NOT_PERMITTED: - return PI_ERROR_INVALID_OPERATION; - case CUDA_ERROR_INVALID_CONTEXT: - return PI_ERROR_INVALID_CONTEXT; - case CUDA_ERROR_INVALID_DEVICE: - return PI_ERROR_INVALID_DEVICE; - case CUDA_ERROR_INVALID_VALUE: - return PI_ERROR_INVALID_VALUE; - case CUDA_ERROR_OUT_OF_MEMORY: - return PI_ERROR_OUT_OF_HOST_MEMORY; - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: - return PI_ERROR_OUT_OF_RESOURCES; - default: - return PI_ERROR_UNKNOWN; - } -} - -// Iterates over the event wait list, returns correct pi_result error codes. -// Invokes the callback for the latest event of each queue in the wait list. -// The callback must take a single pi_event argument and return a pi_result. -template -pi_result forLatestEvents(const pi_event *event_wait_list, - std::size_t num_events_in_wait_list, Func &&f) { - - if (event_wait_list == nullptr || num_events_in_wait_list == 0) { - return PI_ERROR_INVALID_EVENT_WAIT_LIST; - } - - // Fast path if we only have a single event - if (num_events_in_wait_list == 1) { - return f(event_wait_list[0]); - } - - std::vector events{event_wait_list, - event_wait_list + num_events_in_wait_list}; - std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) { - // Tiered sort creating sublists of streams (smallest value first) in which - // the corresponding events are sorted into a sequence of newest first. - return e0->get_stream() < e1->get_stream() || - (e0->get_stream() == e1->get_stream() && - e0->get_event_id() > e1->get_event_id()); - }); - - bool first = true; - CUstream lastSeenStream = 0; - for (pi_event event : events) { - if (!event || (!first && event->get_stream() == lastSeenStream)) { - continue; - } - - first = false; - lastSeenStream = event->get_stream(); - - auto result = f(event); - if (result != PI_SUCCESS) { - return result; - } - } - - return PI_SUCCESS; -} - -/// Converts CUDA error into PI error codes, and outputs error information -/// to stderr. -/// If PI_CUDA_ABORT env variable is defined, it aborts directly instead of -/// throwing the error. This is intended for debugging purposes. -/// \return PI_SUCCESS if \param result was CUDA_SUCCESS. -/// \throw pi_error exception (integer) if input was not success. 
-/// -pi_result check_error(CUresult result, const char *function, int line, - const char *file) { - if (result == CUDA_SUCCESS || result == CUDA_ERROR_DEINITIALIZED) { - return PI_SUCCESS; - } - - if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { - const char *errorString = nullptr; - const char *errorName = nullptr; - cuGetErrorName(result, &errorName); - cuGetErrorString(result, &errorString); - std::stringstream ss; - ss << "\nPI CUDA ERROR:" - << "\n\tValue: " << result - << "\n\tName: " << errorName - << "\n\tDescription: " << errorString - << "\n\tFunction: " << function << "\n\tSource Location: " << file - << ":" << line << "\n" - << std::endl; - std::cerr << ss.str(); - } - - if (std::getenv("PI_CUDA_ABORT") != nullptr) { - std::abort(); - } - - throw map_error(result); -} - -/// \cond NODOXY -#define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__) - -/// \cond NODOXY -template -pi_result getInfoImpl(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value, size_t value_size, - Assign &&assign_func) { - - if (param_value != nullptr) { - - if (param_value_size < value_size) { - return PI_ERROR_INVALID_VALUE; - } - - assign_func(param_value, value, value_size); - } - - if (param_value_size_ret != nullptr) { - *param_value_size_ret = value_size; - } - - return PI_SUCCESS; -} - -template -pi_result getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value) { - - auto assignment = [](void *param_value, T value, size_t value_size) { - // Ignore unused parameter - (void)value_size; - - *static_cast(param_value) = value; - }; - - return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - sizeof(T), assignment); -} - -template -pi_result getInfoArray(size_t array_length, size_t param_value_size, - void *param_value, size_t *param_value_size_ret, - T *value) { - return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - array_length * sizeof(T), memcpy); -} -/// \endcond - -} // anonymous namespace - -/// ------ Error handling, matching OpenCL plugin semantics. -namespace sycl { -__SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace detail { -namespace pi { - -// Report error and no return (keeps compiler from printing warnings). -// TODO: Probably change that to throw a catchable exception, -// but for now it is useful to see every failure. -// -[[noreturn]] void die(const char *Message) { - std::cerr << "pi_die: " << Message << std::endl; - std::terminate(); -} - -// Reports error messages -void cuPrint(const char *Message) { - std::cerr << "pi_print: " << Message << std::endl; -} - -void assertion(bool Condition, const char *Message) { - if (!Condition) - die(Message); -} - -} // namespace pi -} // namespace detail -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl - -//-------------- -// PI object implementation - -/// \endcond - -// makes all future work submitted to queue wait for all work captured in event. -pi_result enqueueEventWait(pi_queue queue, pi_event event) { - // for native events, the cuStreamWaitEvent call is used. - // This makes all future work submitted to stream wait for all - // work captured in event. 
- queue->for_each_stream([e = event->get()](CUstream s) { - PI_CHECK_ERROR(cuStreamWaitEvent(s, e, 0)); - }); - return PI_SUCCESS; -} //-- PI API implementation extern "C" { diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index c1c84fa2a4557..f1d15016bc0e5 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -25,23 +25,6 @@ #define _PI_CUDA_PLUGIN_VERSION_STRING \ _PI_PLUGIN_VERSION_STRING(_PI_CUDA_PLUGIN_VERSION) -#include "sycl/detail/pi.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - #include #include #include @@ -55,200 +38,42 @@ // Share code between the PI Plugin and UR Adapter #include -extern "C" { - -/// \cond IGNORE_BLOCK_IN_DOXYGEN -pi_result cuda_piMemRetain(pi_mem); -pi_result cuda_piMemRelease(pi_mem); -/// \endcond -} - using _pi_stream_guard = std::unique_lock; -/// A PI platform stores all known PI devices, -/// in the CUDA plugin this is just a vector of -/// available devices since initialization is done -/// when devices are used. -/// struct _pi_platform : ur_platform_handle_t_ { using ur_platform_handle_t_::ur_platform_handle_t_; }; -/// PI device mapping to a CUdevice. -/// Includes an observer pointer to the platform, -/// and implements the reference counting semantics since -/// CUDA objects are not refcounted. -/// struct _pi_device : ur_device_handle_t_ { using ur_device_handle_t_::ur_device_handle_t_; }; -/// PI context mapping to a CUDA context object. -/// -/// There is no direct mapping between a CUDA context and a PI context, -/// main differences described below: -/// -/// CUDA context vs PI context -/// -/// One of the main differences between the PI API and the CUDA driver API is -/// that the second modifies the state of the threads by assigning -/// `CUcontext` objects to threads. `CUcontext` objects store data associated -/// with a given device and control access to said device from the user side. -/// PI API context are objects that are passed to functions, and not bound -/// to threads. -/// The _pi_context object doesn't implement this behavior, only holds the -/// CUDA context data. The RAII object \ref ScopedContext implements the active -/// context behavior. -/// -/// Primary vs User-defined context -/// -/// CUDA has two different types of context, the Primary context, -/// which is usable by all threads on a given process for a given device, and -/// the aforementioned custom contexts. -/// CUDA documentation, and performance analysis, indicates it is recommended -/// to use Primary context whenever possible. -/// Primary context is used as well by the CUDA Runtime API. -/// For PI applications to interop with CUDA Runtime API, they have to use -/// the primary context - and make that active in the thread. -/// The `_pi_context` object can be constructed with a `kind` parameter -/// that allows to construct a Primary or `user-defined` context, so that -/// the PI object interface is always the same. -/// -/// Destructor callback -/// -/// Required to implement CP023, SYCL Extended Context Destruction, -/// the PI Context can store a number of callback functions that will be -/// called upon destruction of the PI Context. -/// See proposal for details. -/// struct _pi_context : ur_context_handle_t_ { using ur_context_handle_t_::ur_context_handle_t_; }; -/// PI Mem mapping to CUDA memory allocations, both data and texture/surface. 
-/// \brief Represents non-SVM allocations on the CUDA backend. -/// Keeps tracks of all mapped regions used for Map/Unmap calls. -/// Only one region can be active at the same time per allocation. struct _pi_mem : ur_mem_handle_t_ { using ur_mem_handle_t_::ur_mem_handle_t_; }; -/// PI queue mapping on to CUstream objects. -/// struct _pi_queue : ur_queue_handle_t_ { using ur_queue_handle_t_::ur_queue_handle_t_; }; -typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus, - void *userData); - struct _pi_event : ur_event_handle_t_ { using ur_event_handle_t_::ur_event_handle_t_; - - // Helpers for queue command implementations until they also get ported to UR - static pi_event - make_native(pi_command_type type, pi_queue queue, CUstream stream, - uint32_t stream_token = std::numeric_limits::max()) { - auto urQueue = reinterpret_cast(queue); - static std::unordered_map<_pi_command_type, ur_command_t> cmdMap = { - {PI_COMMAND_TYPE_NDRANGE_KERNEL, UR_COMMAND_KERNEL_LAUNCH}, - {PI_COMMAND_TYPE_MEM_BUFFER_READ, UR_COMMAND_MEM_BUFFER_READ}, - {PI_COMMAND_TYPE_MEM_BUFFER_WRITE, UR_COMMAND_MEM_BUFFER_WRITE}, - {PI_COMMAND_TYPE_MEM_BUFFER_COPY, UR_COMMAND_MEM_BUFFER_COPY}, - {PI_COMMAND_TYPE_MEM_BUFFER_MAP, UR_COMMAND_MEM_BUFFER_MAP}, - {PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, UR_COMMAND_MEM_UNMAP}, - {PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, UR_COMMAND_MEM_BUFFER_READ_RECT}, - {PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, - UR_COMMAND_MEM_BUFFER_WRITE_RECT}, - {PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, UR_COMMAND_MEM_BUFFER_COPY_RECT}, - {PI_COMMAND_TYPE_MEM_BUFFER_FILL, UR_COMMAND_MEM_BUFFER_FILL}, - {PI_COMMAND_TYPE_IMAGE_READ, UR_COMMAND_MEM_IMAGE_READ}, - {PI_COMMAND_TYPE_IMAGE_WRITE, UR_COMMAND_MEM_IMAGE_WRITE}, - {PI_COMMAND_TYPE_IMAGE_COPY, UR_COMMAND_MEM_IMAGE_COPY}, - {PI_COMMAND_TYPE_BARRIER, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER}, - {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_READ, - UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ}, - {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_WRITE, - UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE}, - }; - - // TODO(ur): There is no exact mapping for the following commands. Just - // default to KERNEL_LAUNCH for now. 
- // PI_COMMAND_TYPE_USER - // PI_COMMAND_TYPE_MEM_BUFFER_FILL, - // PI_COMMAND_TYPE_IMAGE_READ, - // PI_COMMAND_TYPE_IMAGE_WRITE, - // PI_COMMAND_TYPE_IMAGE_COPY, - // PI_COMMAND_TYPE_NATIVE_KERNEL, - // PI_COMMAND_TYPE_COPY_BUFFER_TO_IMAGE, - // PI_COMMAND_TYPE_COPY_IMAGE_TO_BUFFER, - // PI_COMMAND_TYPE_MAP_IMAGE, - // PI_COMMAND_TYPE_MARKER, - // PI_COMMAND_TYPE_ACQUIRE_GL_OBJECTS, - // PI_COMMAND_TYPE_RELEASE_GL_OBJECTS, - // PI_COMMAND_TYPE_BARRIER, - // PI_COMMAND_TYPE_MIGRATE_MEM_OBJECTS, - // PI_COMMAND_TYPE_FILL_IMAGE - // PI_COMMAND_TYPE_SVM_FREE - // PI_COMMAND_TYPE_SVM_MEMCPY - // PI_COMMAND_TYPE_SVM_MEMFILL - // PI_COMMAND_TYPE_SVM_MAP - // PI_COMMAND_TYPE_SVM_UNMAP - - ur_command_t urCmd = UR_COMMAND_KERNEL_LAUNCH; - auto cmdIt = cmdMap.find(type); - if (cmdIt != cmdMap.end()) { - urCmd = cmdIt->second; - } - return reinterpret_cast( - ur_event_handle_t_::make_native(urCmd, urQueue, stream, stream_token)); - } - - static pi_event make_with_native(ur_context_handle_t context, - CUevent eventNative) { - auto urContext = reinterpret_cast(context); - return reinterpret_cast( - ur_event_handle_t_::make_with_native(urContext, eventNative)); - } }; -/// Implementation of PI Program on CUDA Module object -/// struct _pi_program : ur_program_handle_t_ { using ur_program_handle_t_::ur_program_handle_t_; }; -/// Implementation of a PI Kernel for CUDA -/// -/// PI Kernels are used to set kernel arguments, -/// creating a state on the Kernel object for a given -/// invocation. This is not the case of CUFunction objects, -/// which are simply passed together with the arguments on the invocation. -/// The PI Kernel implementation for CUDA stores the list of arguments, -/// argument sizes and offsets to emulate the interface of PI Kernel, -/// saving the arguments for the later dispatch. -/// Note that in PI API, the Local memory is specified as a size per -/// individual argument, but in CUDA only the total usage of shared -/// memory is required since it is not passed as a parameter. -/// A compiler pass converts the PI API local memory model into the -/// CUDA shared model. This object simply calculates the total of -/// shared memory, and the initial offsets of each parameter. -/// struct _pi_kernel : ur_kernel_handle_t_ { using ur_kernel_handle_t_::ur_kernel_handle_t_; }; -/// Implementation of samplers for CUDA -/// -/// Sampler property layout: -/// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | -/// | N/A | addressing mode | fiter mode | normalize coords | struct _pi_sampler : ur_sampler_handle_t_ { using ur_sampler_handle_t_::ur_sampler_handle_t_; }; -// ------------------------------------------------------------- -// Helper types and functions -// - #endif // PI_CUDA_HPP diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp index a84d4c71c8dd2..27ed647639a6c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp @@ -133,10 +133,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( (void)hNativeContext; (void)phContext; - // TODO(ur): Needed for the conformance test to pass, but it may be valid - // to have a null CUDA context - UR_ASSERT(hNativeContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - return UR_RESULT_ERROR_INVALID_OPERATION; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp index 34575829c318b..bc3cb32f55b9c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp @@ -14,15 +14,49 @@ #include #include -// We need this declaration temporarily while UR and PI share ScopedContext -class _pi_context; -using pi_context = _pi_context *; - #include "common.hpp" #include "device.hpp" typedef void (*ur_context_extended_deleter_t)(void *user_data); +/// UR context mapping to a CUDA context object. +/// +/// There is no direct mapping between a CUDA context and a UR context, +/// main differences described below: +/// +/// CUDA context vs UR context +/// +/// One of the main differences between the UR API and the CUDA driver API is +/// that the second modifies the state of the threads by assigning +/// `CUcontext` objects to threads. `CUcontext` objects store data associated +/// with a given device and control access to said device from the user side. +/// UR API context are objects that are passed to functions, and not bound +/// to threads. +/// The _ur_context object doesn't implement this behavior, only holds the +/// CUDA context data. The RAII object \ref ScopedContext implements the active +/// context behavior. +/// +/// Primary vs User-defined context +/// +/// CUDA has two different types of context, the Primary context, +/// which is usable by all threads on a given process for a given device, and +/// the aforementioned custom contexts. +/// CUDA documentation, and performance analysis, indicates it is recommended +/// to use Primary context whenever possible. +/// Primary context is used as well by the CUDA Runtime API. +/// For UR applications to interop with CUDA Runtime API, they have to use +/// the primary context - and make that active in the thread. +/// The `_ur_context` object can be constructed with a `kind` parameter +/// that allows to construct a Primary or `user-defined` context, so that +/// the UR object interface is always the same. +/// +/// Destructor callback +/// +/// Required to implement CP023, SYCL Extended Context Destruction, +/// the PI Context can store a number of callback functions that will be +/// called upon destruction of the UR Context. +/// See proposal for details. 
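// A minimal illustrative sketch, separate from the adapter code in this
// patch, of the RAII "active context" idiom described above. The CUDA driver
// API binds a CUcontext to the calling thread, so each entry point makes the
// UR context's CUcontext current for the duration of the call and restores
// the previously bound context on exit. The class name below is invented for
// illustration.
#include <cuda.h>

class ScopedActiveContextSketch {
  CUcontext Previous = nullptr;
  bool NeedsRestore = false;

public:
  explicit ScopedActiveContextSketch(CUcontext Desired) {
    // Query the context currently bound to this thread, then bind the
    // requested one only if it differs.
    cuCtxGetCurrent(&Previous);
    if (Previous != Desired) {
      cuCtxSetCurrent(Desired);
      NeedsRestore = true;
    }
  }

  ~ScopedActiveContextSketch() {
    // Re-bind whatever was active before this call, so the entry point does
    // not disturb the application's thread state.
    if (NeedsRestore)
      cuCtxSetCurrent(Previous);
  }
};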
+/// struct ur_context_handle_t_ { struct deleter_data { @@ -76,10 +110,6 @@ struct ur_context_handle_t_ { namespace { class ScopedContext { public: - // TODO(ur): Needed for compatibility with PI; once the CUDA PI plugin is - // fully moved over we can drop this constructor - ScopedContext(pi_context ctxt); - ScopedContext(ur_context_handle_t ctxt) { if (!ctxt) { throw UR_RESULT_ERROR_INVALID_CONTEXT; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index 6d87373524341..06544fbbfdba5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -1067,9 +1067,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, ur_device_handle_t *phDevice) { - // TODO(ur): This is neede for the UR CTS, but it might be valid to to have a - // null native handle - UR_ASSERT(hNativeDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER); // We can't cast between ur_native_handle_t and CUdevice, so memcpy the bits diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 073a9ffce26a6..1cb7b912da1a8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -1220,7 +1220,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( return ret_err; } -/// TODO(ur): Add support for the offset. UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( ur_queue_handle_t hQueue, void *ptr, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp index d0c7fef8a2b48..b0f10b33a5822 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp @@ -80,7 +80,6 @@ struct ur_event_handle_t_ { static ur_event_handle_t make_native(ur_command_t type, ur_queue_handle_t queue, CUstream stream, uint32_t stream_token = std::numeric_limits::max()) { - // TODO(ur): Remove cast when pi_event is ported to UR return new ur_event_handle_t_(type, queue->get_context(), queue, stream, stream_token); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp index 42e624cefba48..00f0792479979 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp @@ -16,6 +16,22 @@ #include "program.hpp" +/// Implementation of a UR Kernel for CUDA +/// +/// UR Kernels are used to set kernel arguments, +/// creating a state on the Kernel object for a given +/// invocation. This is not the case of CUFunction objects, +/// which are simply passed together with the arguments on the invocation. +/// The UR Kernel implementation for CUDA stores the list of arguments, +/// argument sizes and offsets to emulate the interface of UR Kernel, +/// saving the arguments for the later dispatch. +/// Note that in UR API, the Local memory is specified as a size per +/// individual argument, but in CUDA only the total usage of shared +/// memory is required since it is not passed as a parameter. 
+/// A compiler pass converts the UR API local memory model into the +/// CUDA shared model. This object simply calculates the total of +/// shared memory, and the initial offsets of each parameter. +/// struct ur_kernel_handle_t_ { using native_type = CUfunction; From ff2559fccce15333670b589ec340c7a4da1d3831 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Tue, 23 May 2023 11:51:39 +0100 Subject: [PATCH 31/45] [SYCL][CUDA] Add a few extra checks to the cuda UR program implementation. --- .../ur/adapters/cuda/program.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 91e0b5c85d1b1..de31ed7735a9e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -371,7 +371,15 @@ urProgramRelease(ur_program_handle_t program) { try { ScopedContext active(program->get_context()); auto cuModule = program->get(); - result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); + // "0" is a valid handle for a cuModule, so the best way to check if we + // actually loaded a module and need to unload it is to look at the build + // status. + if (program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_SUCCESS) { + result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); + } else if(program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_NONE) { + // Nothing to free. + result = UR_RESULT_SUCCESS; + } } catch (...) { result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -391,6 +399,7 @@ urProgramRelease(ur_program_handle_t program) { UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( ur_program_handle_t program, ur_native_handle_t *nativeHandle) { UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(nativeHandle, UR_RESULT_ERROR_INVALID_NULL_POINTER); *nativeHandle = reinterpret_cast(program->get()); return UR_RESULT_SUCCESS; } @@ -417,8 +426,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( std::unique_ptr retProgram{ new ur_program_handle_t_{hContext}}; - retError = - retProgram->set_metadata(pProperties->pMetadatas, pProperties->count); + if (pProperties && pProperties->pMetadatas) { + retError = + retProgram->set_metadata(pProperties->pMetadatas, pProperties->count); + } UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); auto pBinary_string = reinterpret_cast(pBinary); From d69f029f2faa695a821d1498272fc114ded2e292 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Tue, 23 May 2023 12:35:23 +0100 Subject: [PATCH 32/45] [SYCL][CUDA] Implement UR_DEVICE_INFO_IL_VERSION query for cuda. 
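The query added below derives the supported PTX ISA version from the driver
version reported by cuDriverGetVersion, following the versioning table in the
PTX ISA release notes. A few worked examples of the mapping it implements
(illustrative only):

    driver 12010 -> major 12, minor 1 -> "nvptx-8.1"
    driver 11040 -> major 11, minor 4 -> "nvptx-7.4"
    driver 10020 -> major 10, minor 2 -> "nvptx-6.5"  (major == 10 special case: minor + 3)
    driver  9020 -> major  9           -> UR_RESULT_ERROR_INVALID_VALUE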
--- .../ur/adapters/cuda/device.cpp | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index 06544fbbfdba5..9d6a80d98c907 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -938,6 +938,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(memory_bandwidth); } + case UR_DEVICE_INFO_IL_VERSION: { + std::string il_version = "nvptx-"; + + int driver_version = 0; + cuDriverGetVersion(&driver_version); + int major = driver_version / 1000; + int minor = driver_version % 1000 / 10; + + // We can work out which ptx ISA version we support based on the versioning + // table published here + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes + // Major versions that we support are consistent in how they line up, so we + // can derive that easily. The minor versions for version 10 don't line up + // the same so it needs a special case. This is not ideal but it does seem + // to be the best bet to avoid a maintenance burden here. + il_version += std::to_string(major - 4) + "."; + if (major == 10) { + il_version += std::to_string(minor + 3); + } else if (major >= 11) { + il_version += std::to_string(minor); + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return ReturnValue(il_version.data(), il_version.size()); + } case UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { // Maximum number of 32-bit registers available to a thread block. // Note: This number is shared by all thread blocks simultaneously resident From 190f3c7116738f74ca3af944638cb4bb362d8b87 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Wed, 24 May 2023 09:04:24 +0100 Subject: [PATCH 33/45] [SYCL][CUDA][UR] Remove queue backward compatability apis --- sycl/plugins/cuda/pi_cuda.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index bc8cbaa1e7e2c..9af47b47a6b2a 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -71,18 +71,14 @@ pi_result piPluginInit(pi_plugin *PluginInit) { // Queue _PI_CL(piQueueCreate, pi2ur::piQueueCreate) _PI_CL(piextQueueCreate, pi2ur::piextQueueCreate) - _PI_CL(piextQueueCreate2, pi2ur::piextQueueCreate2) _PI_CL(piQueueGetInfo, pi2ur::piQueueGetInfo) _PI_CL(piQueueFinish, pi2ur::piQueueFinish) _PI_CL(piQueueFlush, pi2ur::piQueueFlush) _PI_CL(piQueueRetain, pi2ur::piQueueRetain) _PI_CL(piQueueRelease, pi2ur::piQueueRelease) _PI_CL(piextQueueGetNativeHandle, pi2ur::piextQueueGetNativeHandle) - _PI_CL(piextQueueGetNativeHandle2, pi2ur::piextQueueGetNativeHandle2) _PI_CL(piextQueueCreateWithNativeHandle, pi2ur::piextQueueCreateWithNativeHandle) - _PI_CL(piextQueueCreateWithNativeHandle2, - pi2ur::piextQueueCreateWithNativeHandle2) // Memory _PI_CL(piMemBufferCreate, pi2ur::piMemBufferCreate) _PI_CL(piMemImageCreate, pi2ur::piMemImageCreate) From a4415034f35f6fdeecc59a113f20d0b10a2b6cb7 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 25 May 2023 04:22:11 +0100 Subject: [PATCH 34/45] [SYCL][CUDA][UR] Add usmPool entry points to ddi tables and fix ur*nativeHandle apis --- sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp | 1 + sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp | 1 + sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp | 5 +++-- 
.../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp | 5 +++-- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index 9d6a80d98c907..a5889ddba9b06 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -1092,6 +1092,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, + const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index de31ed7735a9e..bf1af9441aed6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -281,6 +281,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, /// \return TBD UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, + const ur_program_native_properties_t *pProperties, ur_program_handle_t *phProgram) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp index 371c3363b4e75..2c13c6ea29d14 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -236,8 +236,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( - ur_queue_handle_t hQueue, ur_native_handle_t *phNativeQueue) { +UR_APIEXPORT ur_result_t UR_APICALL +urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, + ur_native_handle_t *phNativeQueue) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index ebb4bc771ccd2..bd57bfd762429 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -233,8 +233,9 @@ urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; pDdiTable->pfnHostAlloc = urUSMHostAlloc; pDdiTable->pfnPoolCreate = nullptr; - pDdiTable->pfnPoolDestroy = nullptr; - pDdiTable->pfnPoolDestroy = nullptr; + pDdiTable->pfnPoolRetain = nullptr; + pDdiTable->pfnPoolRelease = nullptr; + pDdiTable->pfnPoolGetInfo = nullptr; pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; return UR_RESULT_SUCCESS; } From 3b6536941f9aad8dc196c63d6243c03ee6070bda Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 31 May 2023 11:48:56 +0100 Subject: [PATCH 35/45] Fix CUDA adapter formatting --- sycl/plugins/cuda/pi_cuda.hpp | 6 +++--- sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp | 1 - sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp | 3 ++- sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp | 4 ++-- 
sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp | 2 +- sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp | 6 +++--- sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp | 2 +- sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp | 4 ++-- .../ur/adapters/cuda/ur_interface_loader.cpp | 3 ++- 9 files changed, 16 insertions(+), 15 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index f1d15016bc0e5..8fb4664199286 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -27,13 +27,13 @@ #include #include +#include #include +#include #include #include -#include #include #include -#include // Share code between the PI Plugin and UR Adapter #include @@ -52,7 +52,7 @@ struct _pi_context : ur_context_handle_t_ { using ur_context_handle_t_::ur_context_handle_t_; }; -struct _pi_mem : ur_mem_handle_t_ { +struct _pi_mem : ur_mem_handle_t_ { using ur_mem_handle_t_::ur_mem_handle_t_; }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp index f25aa88b3e292..de767c929d638 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -86,7 +86,6 @@ void sycl::detail::ur::cuPrint(const char *Message) { std::cerr << "ur_print: " << Message << std::endl; } - // Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; thread_local char ErrorMessage[MaxMessageSize]; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index a5889ddba9b06..f53caafcb587d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -988,7 +988,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == CUDA_SUCCESS); // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written - sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); + sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == + 12); return ReturnValue(AddressBuffer, strnlen(AddressBuffer, AddressBufferSize - 1) + 1); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp index 6788de883e971..f1a0b9d2a97d2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp @@ -46,8 +46,8 @@ ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t context, : commandType_{UR_COMMAND_EVENTS_WAIT}, refCount_{1}, has_ownership_{false}, hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, - evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, context_{ - context} { + evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, + context_{context} { urContextRetain(context_); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp index 44484250f062b..5712218b06425 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp @@ -7,9 +7,9 @@ //===-----------------------------------------------------------------===// #pragma once +#include 
#include #include -#include #include "common.hpp" diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index dbbb177926c32..fdf0f723e168f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -57,9 +57,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo( /// However because multiple devices in a context is not currently supported, /// place each device in a separate platform. /// -UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGet(uint32_t NumEntries, - ur_platform_handle_t *phPlatforms, - uint32_t *pNumPlatforms) { +UR_DLLEXPORT ur_result_t UR_APICALL +urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, + uint32_t *pNumPlatforms) { try { static std::once_flag initFlag; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index bf1af9441aed6..0081e921ec677 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -377,7 +377,7 @@ urProgramRelease(ur_program_handle_t program) { // status. if (program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_SUCCESS) { result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); - } else if(program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_NONE) { + } else if (program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_NONE) { // Nothing to free. result = UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp index 99a7904b82b7e..daa1017d0f0aa 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp @@ -69,8 +69,8 @@ struct ur_queue_handle_t_ { device_{device}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, transfer_stream_idx_{0}, num_compute_streams_{0}, num_transfer_streams_{0}, last_sync_compute_streams_{0}, - last_sync_transfer_streams_{0}, flags_(flags), - ur_flags_(ur_flags), has_ownership_{backend_owns} { + last_sync_transfer_streams_{0}, flags_(flags), ur_flags_(ur_flags), + has_ownership_{backend_owns} { urContextRetain(context_); urDeviceRetain(device_); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index bd57bfd762429..f0eb6008d8a36 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -148,7 +148,8 @@ urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { } pDdiTable->pfnBufferCreate = urMemBufferCreate; pDdiTable->pfnBufferPartition = urMemBufferPartition; - pDdiTable->pfnBufferCreateWithNativeHandle = urMemBufferCreateWithNativeHandle; + pDdiTable->pfnBufferCreateWithNativeHandle = + urMemBufferCreateWithNativeHandle; pDdiTable->pfnImageCreateWithNativeHandle = urMemImageCreateWithNativeHandle; pDdiTable->pfnGetInfo = urMemGetInfo; pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; From 0011b9178664eef57a928d6f1fd0097ebcacbf61 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 31 May 2023 14:08:11 +0100 Subject: [PATCH 36/45] Mark KernelFusion/sync_two_queues_event_dep as unsupported on cuda pending further investigation --- sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp | 1 + 1 file changed, 1 
insertion(+) diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp index 4fe263431aed2..4c3c4f5f8ecb7 100644 --- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp +++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp @@ -1,5 +1,6 @@ // For this test, complete_fusion must be supported. // REQUIRES: fusion +// UNSUPPORTED: cuda // RUN: %{build} -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s From 9e97af7f1d97a7af8b0ef2708a25d216c74bb932 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Wed, 24 May 2023 15:09:53 +0100 Subject: [PATCH 37/45] [SYCL][CUDA] Fix assumption about work dimensions in EnqueueKernelLaunch. --- .../ur/adapters/cuda/enqueue.cpp | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 1cb7b912da1a8..b0c4562d60525 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -128,7 +128,7 @@ ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, // The default threadsPerBlock only require handling the first work_dim // dimension. void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, - const size_t *global_work_size, + const size_t *global_work_size, const uint32_t work_dim, const size_t maxThreadsPerBlock[3], ur_kernel_handle_t kernel, uint32_t local_size) { assert(threadsPerBlock != nullptr); @@ -136,6 +136,13 @@ void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, assert(kernel != nullptr); int minGrid, maxBlockSize, maxBlockDim[3]; + // The below assumes a three dimensional range but this is not guaranteed by + // UR. + size_t global_size_normalized[3] = {1, 1, 1}; + for (uint32_t i = 0; i < work_dim; i++) { + global_size_normalized[i] = global_work_size[i]; + } + static auto isPrime = [](size_t number) -> bool { auto lastNumToCheck = ceil(sqrt(number)); if (number < 2) @@ -160,23 +167,24 @@ void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, maxThreadsPerBlock[0])); - threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2])); - threadsPerBlock[1] = - std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], - size_t(maxBlockDim[1]))); + threadsPerBlock[2] = + std::min(global_size_normalized[2], size_t(maxBlockDim[2])); + threadsPerBlock[1] = std::min( + global_size_normalized[1], + std::min(maxBlockSize / threadsPerBlock[2], size_t(maxBlockDim[1]))); maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], - std::min(global_work_size[0], size_t(maxBlockDim[0]))); + std::min(global_size_normalized[0], size_t(maxBlockDim[0]))); - // When global_work_size[0] is prime threadPerBlock[0] will later computed as - // 1, which is not efficient configuration. In such case we use - // global_work_size[0] + 1 to compute threadPerBlock[0]. + // When global_size_normalized[0] is prime threadPerBlock[0] will later + // computed as 1, which is not efficient configuration. In such case we use + // global_size_normalized[0] + 1 to compute threadPerBlock[0]. int adjusted_0_dim_global_work_size = - (isPrime(global_work_size[0]) && - (threadsPerBlock[0] != global_work_size[0])) - ? 
global_work_size[0] + 1 - : global_work_size[0]; + (isPrime(global_size_normalized[0]) && + (threadsPerBlock[0] != global_size_normalized[0])) + ? global_size_normalized[0] + 1 + : global_size_normalized[0]; static auto isPowerOf2 = [](size_t value) -> bool { return value && !(value & (value - 1)); @@ -209,7 +217,7 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t device, kernel->get())); return blockSize * regsPerThread > size_t(maxRegsPerBlock); -}; +} /// Enqueues a wait on the given CUstream for all specified events (See /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued @@ -309,7 +317,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - // Preconditions UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hQueue->get_context() == hKernel->get_context(), @@ -376,7 +383,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } else { guessLocalWorkSize(hQueue->device_, threadsPerBlock, pGlobalWorkSize, - maxThreadsPerBlock, hKernel, local_size); + workDim, maxThreadsPerBlock, hKernel, local_size); } } From b538dd89ced1166b6bcdb6dc90462469c228702e Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Mon, 29 May 2023 16:50:05 +0100 Subject: [PATCH 38/45] [SYCL][CUDA] Correct return type of cuda USM capability queries. --- .../unified_runtime/ur/adapters/cuda/device.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index f53caafcb587d..b633086d057e8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -707,7 +707,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access page-locked host memory, possibly // through PCIe, using the same pointer as the host - uint64_t value = {}; + uint32_t value = {}; if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { // the device shares a unified address space with the host if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= @@ -734,7 +734,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // associated with this device." // // query how the device can access memory allocated on the device itself (?) - uint64_t value = + uint32_t value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | @@ -747,7 +747,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // allocation associated with this device." 
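// Caller-side sketch of why the value written by these queries must be 32 bits
// wide: the USM support queries return a bit-field of
// UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_* values, so storing them through a
// uint64_t (as the old code did) writes more bytes than the caller provides.
// Assumes the standard ur_api.h header and a valid hDevice; error handling is
// omitted for brevity.
#include <ur_api.h>

bool deviceUsmSupportsAtomicAccess(ur_device_handle_t hDevice) {
  ur_device_usm_access_capability_flags_t Caps = 0;
  urDeviceGetInfo(hDevice, UR_DEVICE_INFO_USM_DEVICE_SUPPORT, sizeof(Caps),
                  &Caps, nullptr);
  return (Caps & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS) != 0;
}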
// // query if/how the device can access managed memory associated to it - uint64_t value = {}; + uint32_t value = {}; if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { // the device can allocate managed memory on this system value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | @@ -775,7 +775,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access managed memory associated to other // devices - uint64_t value = {}; + uint32_t value = {}; if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { // the device can allocate managed memory on this system value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; @@ -804,7 +804,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access pageable host memory allocated by the // system allocator - uint64_t value = {}; + uint32_t value = {}; if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) { // the device suppports coherently accessing pageable memory without // calling cuMemHostRegister/cudaHostRegister on it From 9811f9b258265c1da39cdee78bde6eac9fdd6f19 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Mon, 5 Jun 2023 12:16:20 +0100 Subject: [PATCH 39/45] [SYCL][CUDA] A number of small cuda adapter fixes for cts/spec compliance. --- .../ur/adapters/cuda/device.cpp | 2 ++ .../ur/adapters/cuda/kernel.cpp | 25 ++++++++++++++++--- .../ur/adapters/cuda/kernel.hpp | 2 +- .../ur/adapters/cuda/program.cpp | 16 ++++++------ sycl/plugins/unified_runtime/ur/ur.hpp | 5 +++- 5 files changed, 38 insertions(+), 12 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index b633086d057e8..39d582405a1e1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -993,6 +993,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(AddressBuffer, strnlen(AddressBuffer, AddressBufferSize - 1) + 1); } + case UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS: + return ReturnValue(false); // TODO: Investigate if this information is available on CUDA. case UR_DEVICE_INFO_GPU_EU_COUNT: case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index 69f86ca319df5..900b23dd84306 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -15,6 +15,7 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, ur_kernel_handle_t *phKernel) { UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pKernelName, UR_RESULT_ERROR_INVALID_NULL_POINTER); ur_result_t retErr = UR_RESULT_SUCCESS; std::unique_ptr retKernel{nullptr}; @@ -23,8 +24,16 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, ScopedContext active(hProgram->get_context()); CUfunction cuFunc; - retErr = UR_CHECK_ERROR( - cuModuleGetFunction(&cuFunc, hProgram->get(), pKernelName)); + CUresult functionResult = + cuModuleGetFunction(&cuFunc, hProgram->get(), pKernelName); + + // We can't add this as a generic mapping in UR_CHECK_ERROR since cuda's + // NOT_FOUND error applies to more than just functions. 
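// Reduced sketch of the lookup-failure mapping the lines below add:
// cuModuleGetFunction reports a missing symbol as CUDA_ERROR_NOT_FOUND, which
// for kernel creation should surface as UR_RESULT_ERROR_INVALID_KERNEL_NAME
// rather than a generic failure. The helper name is illustrative and all
// other CUresult values collapse to a single fallback here.
#include <cuda.h>
#include <ur_api.h>

ur_result_t findKernel(CUmodule Module, const char *Name, CUfunction *Func) {
  CUresult Res = cuModuleGetFunction(Func, Module, Name);
  if (Res == CUDA_ERROR_NOT_FOUND)
    return UR_RESULT_ERROR_INVALID_KERNEL_NAME;
  return Res == CUDA_SUCCESS ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_UNKNOWN;
}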
+ if (functionResult == CUDA_ERROR_NOT_FOUND) { + throw UR_RESULT_ERROR_INVALID_KERNEL_NAME; + } else { + retErr = UR_CHECK_ERROR(functionResult); + } std::string kernel_name_woffset = std::string(pKernelName) + "_with_offset"; CUfunction cuFuncWithOffsetParam; @@ -187,6 +196,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, const void *pArgValue) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); ur_result_t retErr = UR_RESULT_SUCCESS; try { @@ -335,7 +345,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, size_t propSize, const void *pPropValue) { - return UR_RESULT_SUCCESS; + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pPropValue, UR_RESULT_ERROR_INVALID_NULL_POINTER); + switch (propName) { + case UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS: + case UR_KERNEL_EXEC_INFO_USM_PTRS: + case UR_KERNEL_EXEC_INFO_CACHE_CONFIG: + return UR_RESULT_SUCCESS; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } } UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp index 00f0792479979..9308b7b408b44 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp @@ -175,7 +175,7 @@ struct ur_kernel_handle_t_ { /// Note this only returns the current known number of arguments, not the /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API - uint32_t get_num_args() const noexcept { return args_.indices_.size() - 1; } + size_t get_num_args() const noexcept { return args_.indices_.size() - 1; } void set_kernel_arg(int index, size_t size, const void *arg) { args_.add_arg(index, size, arg); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 0081e921ec677..314a9a866c813 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -418,27 +418,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY); + UR_ASSERT(pBinary != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext->get_device()->get() == hDevice->get(), UR_RESULT_ERROR_INVALID_CONTEXT); + UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); ur_result_t retError = UR_RESULT_SUCCESS; std::unique_ptr retProgram{ new ur_program_handle_t_{hContext}}; - if (pProperties && pProperties->pMetadatas) { + if (pProperties) { + if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) { + return UR_RESULT_ERROR_INVALID_SIZE; + } retError = retProgram->set_metadata(pProperties->pMetadatas, pProperties->count); } UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); auto pBinary_string = reinterpret_cast(pBinary); - if (size == 0) { - size = 
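// Standalone sketch of the metadata validation rule used in
// urProgramCreateWithBinary above: a non-zero count with a null pMetadatas is
// a null-pointer error, while a zero count with a non-null pMetadatas is a
// size error. The helper name is illustrative.
#include <ur_api.h>

ur_result_t validateProgramProperties(const ur_program_properties_t *Props) {
  if (!Props)
    return UR_RESULT_SUCCESS; // properties are optional
  if (Props->count > 0 && Props->pMetadatas == nullptr)
    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
  if (Props->count == 0 && Props->pMetadatas != nullptr)
    return UR_RESULT_ERROR_INVALID_SIZE;
  return UR_RESULT_SUCCESS;
}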
strlen(pBinary_string) + 1; - } - - UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); retError = retProgram->set_binary(pBinary_string, size); UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); @@ -463,6 +464,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice == hProgram->get_context()->get_device(), UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(pFunctionName, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(ppFunctionPointer, UR_RESULT_ERROR_INVALID_NULL_POINTER); CUfunction func; diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index c2f3a3782f9a0..2099b31529176 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -205,11 +205,14 @@ template ur_result_t getInfoImpl(size_t param_value_size, void *param_value, size_t *param_value_size_ret, T value, size_t value_size, Assign &&assign_func) { + if (!param_value && !param_value_size_ret) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } if (param_value != nullptr) { if (param_value_size < value_size) { - return UR_RESULT_ERROR_INVALID_VALUE; + return UR_RESULT_ERROR_INVALID_SIZE; } assign_func(param_value, value, value_size); From fce479c0c244cccbc41370e6e7d234cbe96d13fb Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Tue, 6 Jun 2023 14:44:50 +0100 Subject: [PATCH 40/45] [SYCL][UR] Avoid zero-length new in pi2ur. --- sycl/plugins/unified_runtime/pi2ur.hpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 2408fa452351f..5fed9d0f933f7 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1611,17 +1611,20 @@ inline pi_result piProgramCreateWithBinary( reinterpret_cast(Context); auto UrDevice = reinterpret_cast(DeviceList[0]); - std::unique_ptr pMetadatas( - new ur_program_metadata_t[NumMetadataEntries]); - for (unsigned i = 0; i < NumMetadataEntries; i++) { - HANDLE_ERRORS(mapPIMetadataToUR(&Metadata[i], &pMetadatas[i])); - } - - ur_program_properties_t Properties; + ur_program_properties_t Properties = {}; Properties.stype = UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES; Properties.pNext = nullptr; Properties.count = NumMetadataEntries; - Properties.pMetadatas = pMetadatas.get(); + + std::unique_ptr pMetadatas; + if (NumMetadataEntries) { + pMetadatas.reset(new ur_program_metadata_t[NumMetadataEntries]); + for (unsigned i = 0; i < NumMetadataEntries; i++) { + HANDLE_ERRORS(mapPIMetadataToUR(&Metadata[i], &pMetadatas[i])); + } + + Properties.pMetadatas = pMetadatas.get(); + } ur_program_handle_t *UrProgram = reinterpret_cast(Program); From 9b3448afbb123f67867d60c3c240f2c62bb0bd99 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 8 Jun 2023 16:32:07 +0100 Subject: [PATCH 41/45] [SYCL][CUDA] Mass fixup of code style in the CUDA adapter --- .../ur/adapters/cuda/common.cpp | 51 +- .../ur/adapters/cuda/common.hpp | 16 +- .../ur/adapters/cuda/context.cpp | 58 +- .../ur/adapters/cuda/context.hpp | 68 +- .../ur/adapters/cuda/device.cpp | 775 +++++----- .../ur/adapters/cuda/device.hpp | 52 +- .../ur/adapters/cuda/enqueue.cpp | 1363 ++++++++--------- .../ur/adapters/cuda/event.cpp | 209 ++- .../ur/adapters/cuda/event.hpp | 155 +- .../ur/adapters/cuda/kernel.cpp | 212 ++- .../ur/adapters/cuda/kernel.hpp | 162 +- .../ur/adapters/cuda/memory.cpp | 356 +++-- .../ur/adapters/cuda/memory.hpp | 181 ++- 
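// Caller-side sketch of the query contract that the getInfoImpl change above
// enforces: at least one of the two output pointers must be non-null, and a
// too-small buffer is now reported as UR_RESULT_ERROR_INVALID_SIZE. The usual
// pattern is to ask for the size first, then for the value. Error checks are
// omitted for brevity.
#include <string>
#include <ur_api.h>
#include <vector>

std::string queryDeviceName(ur_device_handle_t hDevice) {
  size_t Size = 0;
  urDeviceGetInfo(hDevice, UR_DEVICE_INFO_NAME, 0, nullptr, &Size);
  std::vector<char> Name(Size);
  urDeviceGetInfo(hDevice, UR_DEVICE_INFO_NAME, Size, Name.data(), nullptr);
  return std::string(Name.data());
}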
.../ur/adapters/cuda/platform.cpp | 115 +- .../ur/adapters/cuda/platform.hpp | 2 +- .../ur/adapters/cuda/program.cpp | 293 ++-- .../ur/adapters/cuda/program.hpp | 41 +- .../ur/adapters/cuda/queue.cpp | 259 ++-- .../ur/adapters/cuda/queue.hpp | 281 ++-- .../ur/adapters/cuda/sampler.cpp | 40 +- .../ur/adapters/cuda/sampler.hpp | 16 +- .../unified_runtime/ur/adapters/cuda/usm.cpp | 153 +- 22 files changed, 2410 insertions(+), 2448 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp index de767c929d638..86975e5097257 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -12,8 +12,8 @@ #include -ur_result_t map_error_ur(CUresult result) { - switch (result) { +ur_result_t mapErrorUR(CUresult Result) { + switch (Result) { case CUDA_SUCCESS: return UR_RESULT_SUCCESS; case CUDA_ERROR_NOT_PERMITTED: @@ -33,33 +33,33 @@ ur_result_t map_error_ur(CUresult result) { } } -ur_result_t check_error_ur(CUresult result, const char *function, int line, - const char *file) { - if (result == CUDA_SUCCESS || result == CUDA_ERROR_DEINITIALIZED) { +ur_result_t checkErrorUR(CUresult Result, const char *Function, int Line, + const char *File) { + if (Result == CUDA_SUCCESS || Result == CUDA_ERROR_DEINITIALIZED) { return UR_RESULT_SUCCESS; } if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { - const char *errorString = nullptr; - const char *errorName = nullptr; - cuGetErrorName(result, &errorName); - cuGetErrorString(result, &errorString); - std::stringstream ss; - ss << "\nUR CUDA ERROR:" - << "\n\tValue: " << result - << "\n\tName: " << errorName - << "\n\tDescription: " << errorString - << "\n\tFunction: " << function << "\n\tSource Location: " << file - << ":" << line << "\n" + const char *ErrorString = nullptr; + const char *ErrorName = nullptr; + cuGetErrorName(Result, &ErrorName); + cuGetErrorString(Result, &ErrorString); + std::stringstream SS; + SS << "\nUR CUDA ERROR:" + << "\n\tValue: " << Result + << "\n\tName: " << ErrorName + << "\n\tDescription: " << ErrorString + << "\n\tFunction: " << Function << "\n\tSource Location: " << File + << ":" << Line << "\n" << std::endl; - std::cerr << ss.str(); + std::cerr << SS.str(); } if (std::getenv("PI_CUDA_ABORT") != nullptr) { std::abort(); } - throw map_error_ur(result); + throw mapErrorUR(Result); } std::string getCudaVersionString() { @@ -91,16 +91,11 @@ thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; thread_local char ErrorMessage[MaxMessageSize]; // Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *message, - ur_result_t error_code) { - assert(strlen(message) <= MaxMessageSize); - strcpy(ErrorMessage, message); - ErrorMessageCode = error_code; -} - -ur_result_t zerPluginGetLastError(char **message) { - *message = &ErrorMessage[0]; - return ErrorMessageCode; +[[maybe_unused]] void setErrorMessage(const char *pMessage, + ur_result_t ErrorCode) { + assert(strlen(pMessage) <= MaxMessageSize); + strcpy(ErrorMessage, pMessage); + ErrorMessageCode = ErrorCode; } // Returns plugin specific error and warning messages; common implementation diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp index 3aa23c67bf492..5cfa609018b29 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +++ 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp @@ -11,20 +11,20 @@ #include #include -ur_result_t map_error_ur(CUresult result); +ur_result_t mapErrorUR(CUresult Result); /// Converts CUDA error into UR error codes, and outputs error information /// to stderr. /// If PI_CUDA_ABORT env variable is defined, it aborts directly instead of /// throwing the error. This is intended for debugging purposes. -/// \return UR_RESULT_SUCCESS if \param result was CUDA_SUCCESS. +/// \return UR_RESULT_SUCCESS if \param Result was CUDA_SUCCESS. /// \throw ur_result_t exception (integer) if input was not success. /// -ur_result_t check_error_ur(CUresult result, const char *function, int line, - const char *file); +ur_result_t checkErrorUR(CUresult Result, const char *Function, int Line, + const char *File); -#define UR_CHECK_ERROR(result) \ - check_error_ur(result, __func__, __LINE__, __FILE__) +#define UR_CHECK_ERROR(Result) \ + checkErrorUR(Result, __func__, __LINE__, __FILE__) std::string getCudaVersionString(); @@ -33,8 +33,8 @@ extern thread_local ur_result_t ErrorMessageCode; extern thread_local char ErrorMessage[MaxMessageSize]; // Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *message, - ur_result_t error_code); +[[maybe_unused]] void setErrorMessage(const char *pMessage, + ur_result_t ErrorCode); /// ------ Error handling, matching OpenCL plugin semantics. namespace sycl { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp index 27ed647639a6c..c922e8a3ddad6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp @@ -26,19 +26,19 @@ urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, UR_ASSERT(phContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); assert(DeviceCount == 1); - ur_result_t errcode_ret = UR_RESULT_SUCCESS; + ur_result_t RetErr = UR_RESULT_SUCCESS; - std::unique_ptr piContextPtr{nullptr}; + std::unique_ptr ContextPtr{nullptr}; try { - piContextPtr = std::unique_ptr( + ContextPtr = std::unique_ptr( new ur_context_handle_t_{*phDevices}); - *phContext = piContextPtr.release(); - } catch (ur_result_t err) { - errcode_ret = err; + *phContext = ContextPtr.release(); + } catch (ur_result_t Err) { + RetErr = Err; } catch (...) 
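// Reduced sketch of the error-checking pattern declared in common.hpp above:
// CUresult values are translated to ur_result_t and failing results are
// thrown, so call sites can wrap driver calls in a single macro. This mirrors
// the shape of mapErrorUR/checkErrorUR but handles only a couple of codes and
// is not the adapter's full mapping.
#include <cuda.h>
#include <ur_api.h>

inline ur_result_t toUrResult(CUresult Result) {
  switch (Result) {
  case CUDA_SUCCESS:
    return UR_RESULT_SUCCESS;
  case CUDA_ERROR_OUT_OF_MEMORY:
    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
  default:
    return UR_RESULT_ERROR_UNKNOWN;
  }
}

inline ur_result_t checkCu(CUresult Result) {
  if (Result != CUDA_SUCCESS)
    throw toUrResult(Result);
  return UR_RESULT_SUCCESS;
}

#define CHECK_CU(Call) checkCu(Call)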
{ - errcode_ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + RetErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return errcode_ret; + return RetErr; } UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( @@ -52,24 +52,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( case UR_CONTEXT_INFO_NUM_DEVICES: return ReturnValue(1); case UR_CONTEXT_INFO_DEVICES: - return ReturnValue(hContext->get_device()); + return ReturnValue(hContext->getDevice()); case UR_CONTEXT_INFO_REFERENCE_COUNT: - return ReturnValue(hContext->get_reference_count()); + return ReturnValue(hContext->getReferenceCount()); case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - uint32_t capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + uint32_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hContext->get_device()->get()) == CUDA_SUCCESS); - uint32_t capabilities = - (major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + hContext->getDevice()->get()) == CUDA_SUCCESS); + uint32_t Capabilities = + (Major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | @@ -78,7 +78,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: // 2D USM memcpy is supported. 
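// Standalone sketch of the capability selection in urContextGetInfo above:
// the system-wide memory scope is only reported for devices of compute
// capability 7.x (Volta) and newer, while the narrower scopes are always
// reported. The helper name is illustrative.
#include <ur_api.h>

ur_memory_scope_capability_flags_t scopeCapsForMajor(int ccMajor) {
  ur_memory_scope_capability_flags_t Caps =
      UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
      UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
      UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP |
      UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE;
  if (ccMajor >= 7)
    Caps |= UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM;
  return Caps;
}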
@@ -94,25 +94,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return UR_RESULT_ERROR_INVALID_ENUMERATION; } -UR_APIEXPORT ur_result_t UR_APICALL urContextRelease(ur_context_handle_t ctxt) { - UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_APIEXPORT ur_result_t UR_APICALL +urContextRelease(ur_context_handle_t hContext) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - if (ctxt->decrement_reference_count() > 0) { + if (hContext->decrementReferenceCount() > 0) { return UR_RESULT_SUCCESS; } - ctxt->invoke_extended_deleters(); + hContext->invokeExtendedDeleters(); - std::unique_ptr context{ctxt}; + std::unique_ptr Context{hContext}; return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urContextRetain(ur_context_handle_t ctxt) { - UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_APIEXPORT ur_result_t UR_APICALL +urContextRetain(ur_context_handle_t hContext) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - assert(ctxt->get_reference_count() > 0); + assert(hContext->getReferenceCount() > 0); - ctxt->increment_reference_count(); + hContext->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -142,6 +144,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pfnDeleter, UR_RESULT_ERROR_INVALID_NULL_POINTER); - hContext->set_extended_deleter(pfnDeleter, pUserData); + hContext->setExtendedDeleter(pfnDeleter, pUserData); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp index bc3cb32f55b9c..96103d4d52c14 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp @@ -60,78 +60,78 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); struct ur_context_handle_t_ { struct deleter_data { - ur_context_extended_deleter_t function; - void *user_data; + ur_context_extended_deleter_t Function; + void *UserData; - void operator()() { function(user_data); } + void operator()() { Function(UserData); } }; using native_type = CUcontext; - native_type cuContext_; - ur_device_handle_t deviceId_; - std::atomic_uint32_t refCount_; + native_type CUContext; + ur_device_handle_t DeviceID; + std::atomic_uint32_t RefCount; - ur_context_handle_t_(ur_device_handle_t_ *devId) - : cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} { - urDeviceRetain(deviceId_); + ur_context_handle_t_(ur_device_handle_t_ *DevID) + : CUContext{DevID->getContext()}, DeviceID{DevID}, RefCount{1} { + urDeviceRetain(DeviceID); }; - ~ur_context_handle_t_() { urDeviceRelease(deviceId_); } + ~ur_context_handle_t_() { urDeviceRelease(DeviceID); } - void invoke_extended_deleters() { - std::lock_guard guard(mutex_); - for (auto &deleter : extended_deleters_) { - deleter(); + void invokeExtendedDeleters() { + std::lock_guard Guard(Mutex); + for (auto &Deleter : ExtendedDeleters) { + Deleter(); } } - void set_extended_deleter(ur_context_extended_deleter_t function, - void *user_data) { - std::lock_guard guard(mutex_); - extended_deleters_.emplace_back(deleter_data{function, user_data}); + void setExtendedDeleter(ur_context_extended_deleter_t Function, + void *UserData) { + std::lock_guard Guard(Mutex); + ExtendedDeleters.emplace_back(deleter_data{Function, UserData}); } - ur_device_handle_t get_device() const noexcept { return deviceId_; } + ur_device_handle_t getDevice() const noexcept { 
return DeviceID; } - native_type get() const noexcept { return cuContext_; } + native_type get() const noexcept { return CUContext; } - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } private: - std::mutex mutex_; - std::vector extended_deleters_; + std::mutex Mutex; + std::vector ExtendedDeleters; }; namespace { class ScopedContext { public: - ScopedContext(ur_context_handle_t ctxt) { - if (!ctxt) { + ScopedContext(ur_context_handle_t Context) { + if (!Context) { throw UR_RESULT_ERROR_INVALID_CONTEXT; } - set_context(ctxt->get()); + setContext(Context->get()); } - ScopedContext(CUcontext ctxt) { set_context(ctxt); } + ScopedContext(CUcontext NativeContext) { setContext(NativeContext); } ~ScopedContext() {} private: - void set_context(CUcontext desired) { - CUcontext original = nullptr; + void setContext(CUcontext Desired) { + CUcontext Original = nullptr; - UR_CHECK_ERROR(cuCtxGetCurrent(&original)); + UR_CHECK_ERROR(cuCtxGetCurrent(&Original)); // Make sure the desired context is active on the current thread, setting // it if necessary - if (original != desired) { - UR_CHECK_ERROR(cuCtxSetCurrent(desired)); + if (Original != Desired) { + UR_CHECK_ERROR(cuCtxSetCurrent(Desired)); } } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index 39d582405a1e1..c3028a58717c6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -20,27 +20,27 @@ int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { return value; } -uint64_t ur_device_handle_t_::get_elapsed_time(CUevent ev) const { - float miliSeconds = 0.0f; +uint64_t ur_device_handle_t_::getElapsedTime(CUevent ev) const { + float Milliseconds = 0.0f; - UR_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, evBase_, ev)); + UR_CHECK_ERROR(cuEventElapsedTime(&Milliseconds, EvBase, ev)); - return static_cast(miliSeconds * 1.0e6); + return static_cast(Milliseconds * 1.0e6); } -UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, - ur_device_info_t infoType, +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + ur_device_info_t propName, size_t propSize, - void *pDeviceInfo, + void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propSize, pDeviceInfo, pPropSizeRet); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - static constexpr uint32_t max_work_item_dimensions = 3u; + static constexpr uint32_t MaxWorkItemDimensions = 3u; - ScopedContext active(device->get_context()); + ScopedContext Active(hDevice->getContext()); - switch ((uint32_t)infoType) { + switch ((uint32_t)propName) { case UR_DEVICE_INFO_TYPE: { return ReturnValue(UR_DEVICE_TYPE_GPU); } @@ -48,80 +48,80 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(4318u); } case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int compute_units = 0; + int ComputeUnits = 0; sycl::detail::ur::assertion( - 
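// Sketch of the context-activation guard defined in context.hpp above: the
// CUDA driver API is stateful per thread, so each entry point makes the UR
// context's CUcontext current before issuing driver calls. Like the adapter's
// ScopedContext, this version only switches when the desired context is not
// already current and does not restore the previous one on destruction.
#include <cuda.h>

class ScopedCudaContext {
public:
  explicit ScopedCudaContext(CUcontext Desired) {
    CUcontext Current = nullptr;
    cuCtxGetCurrent(&Current);
    if (Current != Desired)
      cuCtxSetCurrent(Desired);
  }
};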
cuDeviceGetAttribute(&compute_units, + cuDeviceGetAttribute(&ComputeUnits, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(compute_units >= 0); - return ReturnValue(static_cast(compute_units)); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(ComputeUnits >= 0); + return ReturnValue(static_cast(ComputeUnits)); } case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { - return ReturnValue(max_work_item_dimensions); + return ReturnValue(MaxWorkItemDimensions); } case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { struct { - size_t sizes[max_work_item_dimensions]; - } return_sizes; + size_t Sizes[MaxWorkItemDimensions]; + } ReturnSizes; - int max_x = 0, max_y = 0, max_z = 0; + int MaxX = 0, MaxY = 0, MaxZ = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_x >= 0); + cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxX >= 0); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_y >= 0); + cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxY >= 0); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_z >= 0); + cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxZ >= 0); - return_sizes.sizes[0] = size_t(max_x); - return_sizes.sizes[1] = size_t(max_y); - return_sizes.sizes[2] = size_t(max_z); - return ReturnValue(return_sizes); + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); + ReturnSizes.Sizes[2] = size_t(MaxZ); + return ReturnValue(ReturnSizes); } case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: { struct { - size_t sizes[max_work_item_dimensions]; - } return_sizes; - int max_x = 0, max_y = 0, max_z = 0; + size_t Sizes[MaxWorkItemDimensions]; + } ReturnSizes; + int MaxX = 0, MaxY = 0, MaxZ = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_x >= 0); + cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxX >= 0); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_y >= 0); + cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxY >= 0); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_z >= 0); + cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxZ >= 0); - return_sizes.sizes[0] = size_t(max_x); - return_sizes.sizes[1] = size_t(max_y); - return_sizes.sizes[2] = size_t(max_z); - return ReturnValue(return_sizes); + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); + ReturnSizes.Sizes[2] = size_t(MaxZ); + return 
ReturnValue(ReturnSizes); } case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { - int max_work_group_size = 0; + int MaxWorkGroupSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_work_group_size, + cuDeviceGetAttribute(&MaxWorkGroupSize, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_work_group_size >= 0); + sycl::detail::ur::assertion(MaxWorkGroupSize >= 0); - return ReturnValue(size_t(max_work_group_size)); + return ReturnValue(size_t(MaxWorkGroupSize)); } case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { return ReturnValue(1u); @@ -167,55 +167,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, } case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; + int MaxThreads = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_threads, + cuDeviceGetAttribute(&MaxThreads, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - int warpSize = 0; + hDevice->get()) == CUDA_SUCCESS); + int WarpSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return ReturnValue(maxWarps); + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(MaxWarps); } case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { // Volta provides independent thread scheduling // TODO: Revisit for previous generation GPUs - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - bool ifp = (major >= 7); - return ReturnValue(ifp); + hDevice->get()) == CUDA_SUCCESS); + bool IFP = (Major >= 7); + return ReturnValue(IFP); } case UR_DEVICE_INFO_ATOMIC_64: { - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); - bool atomic64 = (major >= 6) ? true : false; - return ReturnValue(atomic64); + bool Atomic64 = (Major >= 6) ? true : false; + return ReturnValue(Atomic64); } case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - uint64_t capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + uint64_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - uint64_t capabilities = - (major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + hDevice->get()) == CUDA_SUCCESS); + uint64_t Capabilities = + (Major >= 7) ? 
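// Sketch of the sub-group count computation used for
// UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS above: the number of sub-groups per
// work-group is the maximum block size divided by the warp size, rounded up.
// Attribute names are the CUDA driver API ones used in the hunk; error
// handling is omitted.
#include <cuda.h>

int maxSubGroupsPerWorkGroup(CUdevice Dev) {
  int MaxThreads = 0, WarpSize = 0;
  cuDeviceGetAttribute(&MaxThreads, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                       Dev);
  cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Dev);
  return (MaxThreads + WarpSize - 1) / WarpSize;
}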
UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | @@ -224,18 +224,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { // SYCL2020 4.6.4.2 minimum mandated capabilities for // atomic_fence_order_capabilities. - ur_memory_order_capability_flags_t capabilities = + ur_memory_order_capability_flags_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { // SYCL2020 4.6.4.2 minimum mandated capabilities for @@ -243,42 +243,42 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // Because scopes are hierarchical, wider scopes support all narrower // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) - ur_memory_scope_capability_flags_t capabilities = + ur_memory_scope_capability_flags_t Capabilities = UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_DEVICE_INFO_BFLOAT16: { - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); - bool bfloat16 = (major >= 8) ? true : false; - return ReturnValue(bfloat16); + bool BFloat16 = (Major >= 8) ? true : false; + return ReturnValue(BFloat16); } case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // NVIDIA devices only support one sub-group size (the warp size) - int warpSize = 0; + int WarpSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - size_t sizes[1] = {static_cast(warpSize)}; - return ReturnValue(sizes, 1); + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + size_t Sizes[1] = {static_cast(WarpSize)}; + return ReturnValue(Sizes, 1); } case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { - int clock_freq = 0; + int ClockFreq = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&clock_freq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(clock_freq >= 0); - return ReturnValue(static_cast(clock_freq) / 1000u); + cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(ClockFreq >= 0); + return ReturnValue(static_cast(ClockFreq) / 1000u); } case UR_DEVICE_INFO_ADDRESS_BITS: { - auto bits = uint32_t{std::numeric_limits::digits}; - return ReturnValue(bits); + auto Bits = uint32_t{std::numeric_limits::digits}; + return ReturnValue(Bits); } case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { // Max size of memory object allocation in bytes. 
@@ -287,22 +287,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // 32 × 1024 × 1024) for devices that are not of type // CL_DEVICE_TYPE_CUSTOM. - size_t global = 0; - sycl::detail::ur::assertion(cuDeviceTotalMem(&global, device->get()) == + size_t Global = 0; + sycl::detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == CUDA_SUCCESS); - auto quarter_global = static_cast(global / 4u); + auto QuarterGlobal = static_cast(Global / 4u); - auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global), - 32u * 1024u * 1024u); + auto MaxAlloc = std::max(std::min(1024u * 1024u * 1024u, QuarterGlobal), + 32u * 1024u * 1024u); - return ReturnValue(uint64_t{max_alloc}); + return ReturnValue(uint64_t{MaxAlloc}); } case UR_DEVICE_INFO_IMAGE_SUPPORTED: { - bool enabled = false; + bool Enabled = false; if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { - enabled = true; + Enabled = true; } else { sycl::detail::ur::cuPrint( "Images are not fully supported by the CUDA BE, their support is " @@ -311,7 +311,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, "runtime."); } - return ReturnValue(uint32_t{enabled}); + return ReturnValue(uint32_t{Enabled}); } case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { // This call doesn't match to CUDA as it doesn't have images, but instead @@ -327,117 +327,117 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, } case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { // Take the smaller of maximum surface and maximum texture height. - int tex_height = 0; + int TexHeight = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_height, + cuDeviceGetAttribute(&TexHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_height >= 0); - int surf_height = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_height, + cuDeviceGetAttribute(&SurfHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_height >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfHeight >= 0); - int min = std::min(tex_height, surf_height); + int Min = std::min(TexHeight, SurfHeight); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { // Take the smaller of maximum surface and maximum texture width. 
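// Sketch of the allocation-size heuristic used for
// UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE above: report the larger of a quarter of
// total device memory (capped at 1 GiB) and the 32 MiB minimum the
// OpenCL-style query mandates. Same driver call as the hunk; error handling
// omitted.
#include <algorithm>
#include <cstdint>
#include <cuda.h>

uint64_t maxMemAllocSize(CUdevice Dev) {
  size_t TotalBytes = 0;
  cuDeviceTotalMem(&TotalBytes, Dev);
  uint32_t QuarterGlobal = static_cast<uint32_t>(TotalBytes / 4u);
  uint32_t MaxAlloc = std::max(std::min(1024u * 1024u * 1024u, QuarterGlobal),
                               32u * 1024u * 1024u);
  return uint64_t{MaxAlloc};
}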
- int tex_width = 0; + int TexWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_width, + cuDeviceGetAttribute(&TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_width >= 0); - int surf_width = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_width, + cuDeviceGetAttribute(&SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_width >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfWidth >= 0); - int min = std::min(tex_width, surf_width); + int Min = std::min(TexWidth, SurfWidth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { // Take the smaller of maximum surface and maximum texture height. - int tex_height = 0; + int TexHeight = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_height, + cuDeviceGetAttribute(&TexHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_height >= 0); - int surf_height = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_height, + cuDeviceGetAttribute(&SurfHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_height >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfHeight >= 0); - int min = std::min(tex_height, surf_height); + int Min = std::min(TexHeight, SurfHeight); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; + int TexWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_width, + cuDeviceGetAttribute(&TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_width >= 0); - int surf_width = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_width, + cuDeviceGetAttribute(&SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_width >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfWidth >= 0); - int min = std::min(tex_width, surf_width); + int Min = std::min(TexWidth, SurfWidth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { // Take the smaller of maximum surface and maximum texture depth. 
- int tex_depth = 0; + int TexDepth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_depth, + cuDeviceGetAttribute(&TexDepth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_depth >= 0); - int surf_depth = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexDepth >= 0); + int SurfDepth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_depth, + cuDeviceGetAttribute(&SurfDepth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_depth >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfDepth >= 0); - int min = std::min(tex_depth, surf_depth); + int Min = std::min(TexDepth, SurfDepth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; + int TexWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_width, + cuDeviceGetAttribute(&TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_width >= 0); - int surf_width = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_width, + cuDeviceGetAttribute(&SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_width >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfWidth >= 0); - int min = std::min(tex_width, surf_width); + int Min = std::min(TexWidth, SurfWidth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { return ReturnValue(0lu); @@ -454,14 +454,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(4000lu); } case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { - int mem_base_addr_align = 0; + int MemBaseAddrAlign = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&mem_base_addr_align, + cuDeviceGetAttribute(&MemBaseAddrAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); // Multiply by 8 as clGetDeviceInfo returns this value in bits - mem_base_addr_align *= 8; - return ReturnValue(mem_base_addr_align); + MemBaseAddrAlign *= 8; + return ReturnValue(MemBaseAddrAlign); } case UR_DEVICE_INFO_HALF_FP_CONFIG: { // TODO: is this config consistent across all NVIDIA GPUs? @@ -469,7 +469,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, } case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { // TODO: is this config consistent across all NVIDIA GPUs? - uint64_t config = + uint64_t Config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | @@ -477,17 +477,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | UR_DEVICE_FP_CAPABILITY_FLAG_FMA | UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; - return ReturnValue(config); + return ReturnValue(Config); } case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { // TODO: is this config consistent across all NVIDIA GPUs? 
- uint64_t config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + uint64_t Config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | UR_DEVICE_FP_CAPABILITY_FLAG_FMA; - return ReturnValue(config); + return ReturnValue(Config); } case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { // TODO: is this config consistent across all NVIDIA GPUs? @@ -499,30 +499,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(128u); } case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { - int cache_size = 0; + int CacheSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&cache_size, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(cache_size >= 0); + cuDeviceGetAttribute(&CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(CacheSize >= 0); // The L2 cache is global to the GPU. - return ReturnValue(static_cast(cache_size)); + return ReturnValue(static_cast(CacheSize)); } case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { - size_t bytes = 0; + size_t Bytes = 0; // Runtime API has easy access to this value, driver API info is scarse. - sycl::detail::ur::assertion(cuDeviceTotalMem(&bytes, device->get()) == + sycl::detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t{bytes}); + return ReturnValue(uint64_t{Bytes}); } case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { - int constant_memory = 0; + int ConstantMemory = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&constant_memory, + cuDeviceGetAttribute(&ConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(constant_memory >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(ConstantMemory >= 0); - return ReturnValue(static_cast(constant_memory)); + return ReturnValue(static_cast(ConstantMemory)); } case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: { // TODO: is there a way to retrieve this from CUDA driver API? @@ -537,32 +537,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // OpenCL's "local memory" maps most closely to CUDA's "shared memory". // CUDA has its own definition of "local memory", which maps to OpenCL's // "private memory". 
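// Sketch of the query the local-memory comment above refers to: OpenCL/SYCL
// "local memory" corresponds to CUDA's per-block shared memory, so the local
// memory size is read from CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK.
// Error handling omitted.
#include <cstdint>
#include <cuda.h>

uint64_t localMemSizeBytes(CUdevice Dev) {
  int SharedPerBlock = 0;
  cuDeviceGetAttribute(&SharedPerBlock,
                       CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Dev);
  return static_cast<uint64_t>(SharedPerBlock);
}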
- int local_mem_size = 0; + int LocalMemSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&local_mem_size, + cuDeviceGetAttribute(&LocalMemSize, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(local_mem_size >= 0); - return ReturnValue(static_cast(local_mem_size)); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(LocalMemSize >= 0); + return ReturnValue(static_cast(LocalMemSize)); } case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { - int ecc_enabled = 0; + int ECCEnabled = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&ecc_enabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, - device->get()) == CUDA_SUCCESS); + cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, + hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); - auto result = static_cast(ecc_enabled); - return ReturnValue(result); + sycl::detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); + auto Result = static_cast(ECCEnabled); + return ReturnValue(Result); } case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { - int is_integrated = 0; + int IsIntegrated = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&is_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, - device->get()) == CUDA_SUCCESS); + cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, + hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion((is_integrated == 0) | (is_integrated == 1)); - auto result = static_cast(is_integrated); + sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); + auto result = static_cast(IsIntegrated); return ReturnValue(result); } case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { @@ -586,9 +586,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(true); } case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: { - auto capability = ur_device_exec_capability_flags_t{ + auto Capability = ur_device_exec_capability_flags_t{ UR_DEVICE_EXEC_CAPABILITY_FLAG_KERNEL}; - return ReturnValue(capability); + return ReturnValue(Capability); } case UR_DEVICE_INFO_QUEUE_PROPERTIES: return ReturnValue( @@ -596,14 +596,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, UR_QUEUE_FLAG_PROFILING_ENABLE)); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { // The mandated minimum capability: - uint64_t capability = UR_QUEUE_FLAG_PROFILING_ENABLE | + uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE | UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - return ReturnValue(capability); + return ReturnValue(Capability); } case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { // The mandated minimum capability: - uint64_t capability = UR_QUEUE_FLAG_PROFILING_ENABLE; - return ReturnValue(capability); + uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE; + return ReturnValue(Capability); } case UR_DEVICE_INFO_BUILT_IN_KERNELS: { // An empty string is returned if no built-in kernels are supported by the @@ -611,27 +611,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(""); } case UR_DEVICE_INFO_PLATFORM: { - return ReturnValue(device->get_platform()); + return ReturnValue(hDevice->getPlatform()); } case UR_DEVICE_INFO_NAME: { - static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; - char name[MAX_DEVICE_NAME_LENGTH]; - sycl::detail::ur::assertion(cuDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, - device->get()) == CUDA_SUCCESS); - return ReturnValue(name, 
strlen(name) + 1); + static constexpr size_t MaxDeviceNameLength = 256u; + char Name[MaxDeviceNameLength]; + sycl::detail::ur::assertion( + cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == + CUDA_SUCCESS); + return ReturnValue(Name, strlen(Name) + 1); } case UR_DEVICE_INFO_VENDOR: { return ReturnValue("NVIDIA Corporation"); } case UR_DEVICE_INFO_DRIVER_VERSION: { - auto version = getCudaVersionString(); - return ReturnValue(version.c_str()); + auto Version = getCudaVersionString(); + return ReturnValue(Version.c_str()); } case UR_DEVICE_INFO_PROFILE: { return ReturnValue("CUDA"); } case UR_DEVICE_INFO_REFERENCE_COUNT: { - return ReturnValue(device->get_reference_count()); + return ReturnValue(hDevice->getReferenceCount()); } case UR_DEVICE_INFO_VERSION: { std::stringstream SS; @@ -639,13 +640,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, sycl::detail::ur::assertion( cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); SS << Major; int Minor; sycl::detail::ur::assertion( cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); SS << "." << Minor; return ReturnValue(SS.str().c_str()); } @@ -658,19 +659,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, SupportedExtensions += "pi_ext_intel_devicelib_assert "; SupportedExtensions += " "; - int major = 0; - int minor = 0; + int Major = 0; + int Minor = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&minor, + cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); - if ((major >= 6) || ((major == 5) && (minor >= 3))) { + if ((Major >= 6) || ((Major == 5) && (Minor >= 3))) { SupportedExtensions += "cl_khr_fp16 "; } @@ -707,14 +708,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access page-locked host memory, possibly // through PCIe, using the same pointer as the host - uint32_t value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { // the device shares a unified address space with the host - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 6) { // compute capability 6.x introduces operations that are atomic with // respect to other CPUs and GPUs in the system - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; @@ -722,11 +723,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // on GPU architectures with compute capability lower than 6.x, atomic // operations from the GPU to CPU memory will not be atomic with respect // to CPU initiated atomic operations - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = 
UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; } } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -734,12 +735,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // associated with this device." // // query how the device can access memory allocated on the device itself (?) - uint32_t value = + uint32_t Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -747,24 +748,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // allocation associated with this device." // // query if/how the device can access managed memory associated to it - uint32_t value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { // the device can allocate managed memory on this system - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { // the device can coherently access managed memory concurrently with the // CPU - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 6) { // compute capability 6.x introduces operations that are atomic with // respect to other CPUs and GPUs in the system - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; } } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -775,27 +776,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access managed memory associated to other // devices - uint32_t value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { // the device can allocate managed memory on this system - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS // attribute can coherently access managed memory concurrently with the // CPU - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + if (getAttribute(hDevice, 
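// A hedged sketch of the decision tree behind the USM *_SHARED_SUPPORT
// queries above, written as a free function. It assumes the capability flag
// enums come from ur_api.h; only the three CUDA attributes the adapter
// actually consults are read, and error checking is omitted.
#include <cuda.h>
#include <ur_api.h>
#include <cstdint>

static uint32_t sketchSingleSharedUsmCaps(CUdevice Dev) {
  int Managed = 0, Concurrent = 0, CcMajor = 0;
  cuDeviceGetAttribute(&Managed, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Dev);
  cuDeviceGetAttribute(&Concurrent,
                       CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Dev);
  cuDeviceGetAttribute(&CcMajor,
                       CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Dev);
  uint32_t Caps = 0;
  if (Managed) // device can allocate managed (cuMemAllocManaged) memory
    Caps = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS |
           UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS;
  if (Concurrent) // CPU and GPU may touch the allocation at the same time
    Caps |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS;
  if (Concurrent && CcMajor >= 6) // system-wide atomics need sm_60 or newer
    Caps |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS;
  return Caps;
}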
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 6) { // compute capability 6.x introduces operations that are atomic with // respect to other CPUs and GPUs in the system - if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS) - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; - if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS) - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS) + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; + if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS) + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -804,39 +805,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access pageable host memory allocated by the // system allocator - uint32_t value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) { + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) { // the device suppports coherently accessing pageable memory without // calling cuMemHostRegister/cudaHostRegister on it - if (getAttribute(device, + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) { // the link between the device and the host supports native atomic // operations - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; } else { // the link between the device and the host does not support native // atomic operations - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; } } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_ASYNC_BARRIER: { - int value = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8; - return ReturnValue(static_cast(value)); + int Value = getAttribute(hDevice, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8; + return ReturnValue(static_cast(Value)); } case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: { - int major = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); - int minor = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); - std::string result = std::to_string(major) + "." + std::to_string(minor); - return ReturnValue(result.c_str()); + int Major = + getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + int Minor = + getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + std::string Result = std::to_string(Major) + "." 
+ std::to_string(Minor); + return ReturnValue(Result.c_str()); } case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { @@ -848,103 +849,102 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(FreeMemory); } case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { - int value = 0; + int Value = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(value >= 0); + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(Value >= 0); // Convert kilohertz to megahertz when returning. - return ReturnValue(value / 1000); + return ReturnValue(Value / 1000); } case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { - int value = 0; + int Value = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&value, + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(value >= 0); - return ReturnValue(value); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(Value >= 0); + return ReturnValue(Value); } case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { return ReturnValue(int32_t{1}); } case UR_DEVICE_INFO_DEVICE_ID: { - int value = 0; + int Value = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(value >= 0); - return ReturnValue(value); + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(Value >= 0); + return ReturnValue(Value); } case UR_DEVICE_INFO_UUID: { - int driver_version = 0; - cuDriverGetVersion(&driver_version); - int major = driver_version / 1000; - int minor = driver_version % 1000 / 10; - CUuuid uuid; - if ((major > 11) || (major == 11 && minor >= 4)) { - sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) == + int DriverVersion = 0; + cuDriverGetVersion(&DriverVersion); + int Major = DriverVersion / 1000; + int Minor = DriverVersion % 1000 / 10; + CUuuid UUID; + if ((Major > 11) || (Major == 11 && Minor >= 4)) { + sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == CUDA_SUCCESS); } else { - sycl::detail::ur::assertion(cuDeviceGetUuid(&uuid, device->get()) == + sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == CUDA_SUCCESS); } - std::array name; - std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); - return ReturnValue(name.data(), 16); + std::array Name; + std::copy(UUID.bytes, UUID.bytes + 16, Name.begin()); + return ReturnValue(Name.data(), 16); } case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); - int minor = 0; + int Minor = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&minor, + cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); // Some specific devices seem to need special handling. 
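// cuDriverGetVersion packs the driver version as major * 1000 + minor * 10
// (11.4 is reported as 11040), which is why the UUID query above decodes it
// before choosing between cuDeviceGetUuid_v2 (driver 11.4+) and the older
// cuDeviceGetUuid. A small sketch of that decoding, illustration only:
#include <cuda.h>

static bool driverIsAtLeast(int WantMajor, int WantMinor) {
  int Packed = 0;
  cuDriverGetVersion(&Packed);      // e.g. 11040
  int Major = Packed / 1000;        // 11
  int Minor = (Packed % 1000) / 10; // 4
  return Major > WantMajor || (Major == WantMajor && Minor >= WantMinor);
}
// e.g. driverIsAtLeast(11, 4) is the condition that selects
// cuDeviceGetUuid_v2 in the code above.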
See reference // https://github.com/jeffhammond/HPCInfo/blob/master/cuda/gpu-detect.cu - bool is_xavier_agx = major == 7 && minor == 2; - bool is_orin_agx = major == 8 && minor == 7; - - int memory_clock_khz = 0; - if (is_xavier_agx) { - memory_clock_khz = 2133000; - } else if (is_orin_agx) { - memory_clock_khz = 3200000; + bool IsXavierAGX = Major == 7 && Minor == 2; + bool IsOrinAGX = Major == 8 && Minor == 7; + + int MemoryClockKHz = 0; + if (IsXavierAGX) { + MemoryClockKHz = 2133000; + } else if (IsOrinAGX) { + MemoryClockKHz = 3200000; } else { sycl::detail::ur::assertion( - cuDeviceGetAttribute(&memory_clock_khz, + cuDeviceGetAttribute(&MemoryClockKHz, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); } - int memory_bus_width = 0; - if (is_orin_agx) { - memory_bus_width = 256; + int MemoryBusWidth = 0; + if (IsOrinAGX) { + MemoryBusWidth = 256; } else { sycl::detail::ur::assertion( - cuDeviceGetAttribute(&memory_bus_width, + cuDeviceGetAttribute(&MemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); } - uint64_t memory_bandwidth = - uint64_t(memory_clock_khz) * memory_bus_width * 250; + uint64_t MemoryBandwidth = uint64_t(MemoryClockKHz) * MemoryBusWidth * 250; - return ReturnValue(memory_bandwidth); + return ReturnValue(MemoryBandwidth); } case UR_DEVICE_INFO_IL_VERSION: { - std::string il_version = "nvptx-"; + std::string ILVersion = "nvptx-"; - int driver_version = 0; - cuDriverGetVersion(&driver_version); - int major = driver_version / 1000; - int minor = driver_version % 1000 / 10; + int DriverVersion = 0; + cuDriverGetVersion(&DriverVersion); + int Major = DriverVersion / 1000; + int Minor = DriverVersion % 1000 / 10; // We can work out which ptx ISA version we support based on the versioning // table published here @@ -953,29 +953,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // can derive that easily. The minor versions for version 10 don't line up // the same so it needs a special case. This is not ideal but it does seem // to be the best bet to avoid a maintenance burden here. - il_version += std::to_string(major - 4) + "."; - if (major == 10) { - il_version += std::to_string(minor + 3); - } else if (major >= 11) { - il_version += std::to_string(minor); + ILVersion += std::to_string(Major - 4) + "."; + if (Major == 10) { + ILVersion += std::to_string(Minor + 3); + } else if (Major >= 11) { + ILVersion += std::to_string(Minor); } else { return UR_RESULT_ERROR_INVALID_VALUE; } - return ReturnValue(il_version.data(), il_version.size()); + return ReturnValue(ILVersion.data(), ILVersion.size()); } case UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { // Maximum number of 32-bit registers available to a thread block. // Note: This number is shared by all thread blocks simultaneously resident // on a multiprocessor. 
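// The peak-bandwidth estimate above folds all unit conversions into a single
// factor of 250:
//   bytes/s = clock[kHz] * 1000 (kHz -> Hz) * 2 (DDR) * busWidth[bits] / 8
//           = clock[kHz] * busWidth * 250
// A worked example with hypothetical numbers (no specific GPU implied):
#include <cstdint>

static uint64_t estimateBandwidth(uint64_t MemClockKHz, uint64_t BusWidthBits) {
  return MemClockKHz * BusWidthBits * 250; // bytes per second
}
// estimateBandwidth(5005000, 256) == 320320000000, roughly 320 GB/s.
//
// The IL version string above is derived arithmetically in the same spirit:
// driver 11.x and newer map to PTX ISA (major - 4).minor, while the 10.x
// series maps to 6.(minor + 3), e.g. driver 10.2 -> PTX ISA 6.5.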
- int max_registers{-1}; + int MaxRegisters{-1}; UR_CHECK_ERROR(cuDeviceGetAttribute( - &max_registers, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - device->get())); + &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + hDevice->get())); - sycl::detail::ur::assertion(max_registers >= 0); + sycl::detail::ur::assertion(MaxRegisters >= 0); - return ReturnValue(static_cast(max_registers)); + return ReturnValue(static_cast(MaxRegisters)); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: return ReturnValue(false); @@ -985,7 +985,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, constexpr size_t AddressBufferSize = 13; char AddressBuffer[AddressBufferSize]; sycl::detail::ur::assertion( - cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == + cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == CUDA_SUCCESS); // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == @@ -1012,8 +1012,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, /// \return PI_SUCCESS if the function is executed successfully /// CUDA devices are always root devices so retain always returns success. -UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t device) { - UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t hDevice) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return UR_RESULT_SUCCESS; } @@ -1026,8 +1026,9 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, /// \return UR_RESULT_SUCCESS always since CUDA devices are always root /// devices. -UR_DLLEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t device) { - UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_DLLEXPORT ur_result_t UR_APICALL +urDeviceRelease(ur_device_handle_t hDevice) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return UR_RESULT_SUCCESS; } @@ -1037,32 +1038,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, uint32_t NumEntries, ur_device_handle_t *phDevices, uint32_t *pNumDevices) { - ur_result_t err = UR_RESULT_SUCCESS; - const bool askingForAll = DeviceType == UR_DEVICE_TYPE_ALL; - const bool askingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT; - const bool askingForGPU = DeviceType == UR_DEVICE_TYPE_GPU; - const bool returnDevices = askingForDefault || askingForAll || askingForGPU; + ur_result_t Result = UR_RESULT_SUCCESS; + const bool AskingForAll = DeviceType == UR_DEVICE_TYPE_ALL; + const bool AskingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT; + const bool AskingForGPU = DeviceType == UR_DEVICE_TYPE_GPU; + const bool ReturnDevices = AskingForDefault || AskingForAll || AskingForGPU; UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t numDevices = returnDevices ? hPlatform->devices_.size() : 0; + size_t NumDevices = ReturnDevices ? 
hPlatform->Devices.size() : 0; try { UR_ASSERT(pNumDevices || phDevices, UR_RESULT_ERROR_INVALID_VALUE); if (pNumDevices) { - *pNumDevices = numDevices; + *pNumDevices = NumDevices; } - if (returnDevices && phDevices) { - for (size_t i = 0; i < std::min(size_t(NumEntries), numDevices); ++i) { - phDevices[i] = hPlatform->devices_[i].get(); + if (ReturnDevices && phDevices) { + for (size_t i = 0; i < std::min(size_t(NumEntries), NumDevices); ++i) { + phDevices[i] = hPlatform->Devices[i].get(); } } - return err; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -1101,41 +1102,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( // We can't cast between ur_native_handle_t and CUdevice, so memcpy the bits // instead - CUdevice cu_device = 0; - memcpy(&cu_device, hNativeDevice, sizeof(CUdevice)); + CUdevice CuDevice = 0; + memcpy(&CuDevice, hNativeDevice, sizeof(CUdevice)); - auto is_device = [=](std::unique_ptr &dev) { - return dev->get() == cu_device; + auto IsDevice = [=](std::unique_ptr &Dev) { + return Dev->get() == CuDevice; }; // If a platform is provided just check if the device is in it if (hPlatform) { - auto search_res = std::find_if(begin(hPlatform->devices_), - end(hPlatform->devices_), is_device); - if (search_res != end(hPlatform->devices_)) { - *phDevice = search_res->get(); + auto SearchRes = std::find_if(begin(hPlatform->Devices), + end(hPlatform->Devices), IsDevice); + if (SearchRes != end(hPlatform->Devices)) { + *phDevice = SearchRes->get(); return UR_RESULT_SUCCESS; } } // Get list of platforms - uint32_t num_platforms = 0; - ur_result_t result = urPlatformGet(0, nullptr, &num_platforms); - if (result != UR_RESULT_SUCCESS) - return result; + uint32_t NumPlatforms = 0; + ur_result_t Result = urPlatformGet(0, nullptr, &NumPlatforms); + if (Result != UR_RESULT_SUCCESS) + return Result; - ur_platform_handle_t *plat = static_cast( - malloc(num_platforms * sizeof(ur_platform_handle_t))); - result = urPlatformGet(num_platforms, plat, nullptr); - if (result != UR_RESULT_SUCCESS) - return result; + ur_platform_handle_t *Plat = static_cast( + malloc(NumPlatforms * sizeof(ur_platform_handle_t))); + Result = urPlatformGet(NumPlatforms, Plat, nullptr); + if (Result != UR_RESULT_SUCCESS) + return Result; // Iterate through platforms to find device that matches nativeHandle - for (uint32_t j = 0; j < num_platforms; ++j) { - auto search_res = std::find_if(begin(plat[j]->devices_), - end(plat[j]->devices_), is_device); - if (search_res != end(plat[j]->devices_)) { - *phDevice = static_cast((*search_res).get()); + for (uint32_t j = 0; j < NumPlatforms; ++j) { + auto SearchRes = + std::find_if(begin(Plat[j]->Devices), end(Plat[j]->Devices), IsDevice); + if (SearchRes != end(Plat[j]->Devices)) { + *phDevice = static_cast((*SearchRes).get()); return UR_RESULT_SUCCESS; } } @@ -1150,12 +1151,12 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, uint64_t *pHostTimestamp) { UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - CUevent event; - ScopedContext active(hDevice->get_context()); + CUevent Event; + ScopedContext Active(hDevice->getContext()); if (pDeviceTimestamp) { - UR_CHECK_ERROR(cuEventCreate(&event, CU_EVENT_DEFAULT)); - UR_CHECK_ERROR(cuEventRecord(event, 0)); + UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR(cuEventRecord(Event, 0)); } if (pHostTimestamp) { @@ -1166,8 +1167,8 @@ 
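// Callers typically drive urDeviceGet with the usual count-then-fill
// convention: query the device count with a null buffer, then call again
// with storage. A hedged usage sketch against the signature above; error
// handling is omitted and the enumerateGpus helper name is made up.
#include <ur_api.h>
#include <vector>

static std::vector<ur_device_handle_t>
enumerateGpus(ur_platform_handle_t Platform) {
  uint32_t Count = 0;
  urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, 0, nullptr, &Count);
  std::vector<ur_device_handle_t> Devices(Count);
  urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, Count, Devices.data(), nullptr);
  return Devices;
}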
ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, } if (pDeviceTimestamp) { - UR_CHECK_ERROR(cuEventSynchronize(event)); - *pDeviceTimestamp = hDevice->get_elapsed_time(event); + UR_CHECK_ERROR(cuEventSynchronize(Event)); + *pDeviceTimestamp = hDevice->getElapsedTime(Event); } return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp index 9d01edd8a5ec3..ff8d85cf7a3d9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp @@ -13,49 +13,47 @@ struct ur_device_handle_t_ { private: using native_type = CUdevice; - native_type cuDevice_; - CUcontext cuContext_; - CUevent evBase_; // CUDA event used as base counter - std::atomic_uint32_t refCount_; - ur_platform_handle_t platform_; + native_type CuDevice; + CUcontext CuContext; + CUevent EvBase; // CUDA event used as base counter + std::atomic_uint32_t RefCount; + ur_platform_handle_t Platform; - static constexpr uint32_t max_work_item_dimensions = 3u; - size_t max_work_item_sizes[max_work_item_dimensions]; - int max_work_group_size; + static constexpr uint32_t MaxWorkItemDimensions = 3u; + size_t MaxWorkItemSizes[MaxWorkItemDimensions]; + int MaxWorkGroupSize; public: ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase, ur_platform_handle_t platform) - : cuDevice_(cuDevice), cuContext_(cuContext), evBase_(evBase), - refCount_{1}, platform_(platform) {} + : CuDevice(cuDevice), CuContext(cuContext), EvBase(evBase), RefCount{1}, + Platform(platform) {} - ur_device_handle_t_() { cuDevicePrimaryCtxRelease(cuDevice_); } + ur_device_handle_t_() { cuDevicePrimaryCtxRelease(CuDevice); } - native_type get() const noexcept { return cuDevice_; }; + native_type get() const noexcept { return CuDevice; }; - CUcontext get_context() const noexcept { return cuContext_; }; + CUcontext getContext() const noexcept { return CuContext; }; - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - ur_platform_handle_t get_platform() const noexcept { return platform_; }; + ur_platform_handle_t getPlatform() const noexcept { return Platform; }; - uint64_t get_elapsed_time(CUevent) const; + uint64_t getElapsedTime(CUevent) const; - void save_max_work_item_sizes(size_t size, - size_t *save_max_work_item_sizes) noexcept { - memcpy(max_work_item_sizes, save_max_work_item_sizes, size); + void saveMaxWorkItemSizes(size_t Size, + size_t *SaveMaxWorkItemSizes) noexcept { + memcpy(MaxWorkItemSizes, SaveMaxWorkItemSizes, Size); }; - void save_max_work_group_size(int value) noexcept { - max_work_group_size = value; - }; + void saveMaxWorkGroupSize(int Value) noexcept { MaxWorkGroupSize = Value; }; - void get_max_work_item_sizes(size_t ret_size, - size_t *ret_max_work_item_sizes) const noexcept { - memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size); + void getMaxWorkItemSizes(size_t RetSize, + size_t *RetMaxWorkItemSizes) const noexcept { + memcpy(RetMaxWorkItemSizes, MaxWorkItemSizes, RetSize); }; - int get_max_work_group_size() const noexcept { return max_work_group_size; }; + int getMaxWorkGroupSize() const noexcept { return MaxWorkGroupSize; }; }; -int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute); +int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute); diff --git 
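// device.hpp above keeps EvBase, an event recorded when the device handle is
// set up, precisely so that getElapsedTime can turn a freshly recorded event
// into a device timestamp relative to that base. One plausible shape of that
// helper, stated as an assumption for illustration rather than the adapter's
// actual definition:
#include <cuda.h>
#include <cstdint>

static uint64_t elapsedSinceBaseNs(CUevent EvBase, CUevent Ev) {
  float Ms = 0.0f;
  // cuEventElapsedTime reports milliseconds with roughly 0.5us resolution.
  cuEventElapsedTime(&Ms, EvBase, Ev);
  return static_cast<uint64_t>(Ms * 1.0e6); // convert to nanoseconds
}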
a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index b0c4562d60525..ef87dab96d2fa 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -16,62 +16,62 @@ #include #include -ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, CUstream stream, - uint32_t num_events_in_wait_list, - const ur_event_handle_t *event_wait_list) { - UR_ASSERT(event_wait_list, UR_RESULT_SUCCESS); +ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) { + UR_ASSERT(EventWaitList, UR_RESULT_SUCCESS); try { - ScopedContext active(command_queue->get_context()); + ScopedContext Active(CommandQueue->getContext()); - auto result = forLatestEvents( - event_wait_list, num_events_in_wait_list, - [stream](ur_event_handle_t event) -> ur_result_t { - if (event->get_stream() == stream) { + auto Result = forLatestEvents( + EventWaitList, NumEventsInWaitList, + [Stream](ur_event_handle_t Event) -> ur_result_t { + if (Event->getStream() == Stream) { return UR_RESULT_SUCCESS; } else { - return UR_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); + return UR_CHECK_ERROR(cuStreamWaitEvent(Stream, Event->get(), 0)); } }); - return result; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } } template -void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type, - CUdeviceptr *out_dev_ptr, PtrT *out_host_ptr) { +void getUSMHostOrDevicePtr(PtrT USMPtr, CUmemorytype *OutMemType, + CUdeviceptr *OutDevPtr, PtrT *OutHostPtr) { // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE // checks with PI_CHECK_ERROR are not suggested - CUresult ret = cuPointerGetAttribute( - out_mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)usm_ptr); + CUresult Ret = cuPointerGetAttribute( + OutMemType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)USMPtr); // ARRAY, UNIFIED types are not supported! - assert(*out_mem_type != CU_MEMORYTYPE_ARRAY && - *out_mem_type != CU_MEMORYTYPE_UNIFIED); + assert(*OutMemType != CU_MEMORYTYPE_ARRAY && + *OutMemType != CU_MEMORYTYPE_UNIFIED); // pointer not known to the CUDA subsystem (possibly a system allocated ptr) - if (ret == CUDA_ERROR_INVALID_VALUE) { - *out_mem_type = CU_MEMORYTYPE_HOST; - *out_dev_ptr = 0; - *out_host_ptr = usm_ptr; + if (Ret == CUDA_ERROR_INVALID_VALUE) { + *OutMemType = CU_MEMORYTYPE_HOST; + *OutDevPtr = 0; + *OutHostPtr = USMPtr; // todo: resets the above "non-stick" error - } else if (ret == CUDA_SUCCESS) { - *out_dev_ptr = (*out_mem_type == CU_MEMORYTYPE_DEVICE) - ? reinterpret_cast(usm_ptr) - : 0; - *out_host_ptr = (*out_mem_type == CU_MEMORYTYPE_HOST) ? usm_ptr : nullptr; + } else if (Ret == CUDA_SUCCESS) { + *OutDevPtr = (*OutMemType == CU_MEMORYTYPE_DEVICE) + ? reinterpret_cast(USMPtr) + : 0; + *OutHostPtr = (*OutMemType == CU_MEMORYTYPE_HOST) ? 
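// cuPointerGetAttribute is the key to getUSMHostOrDevicePtr above: it
// reports CU_MEMORYTYPE_DEVICE or CU_MEMORYTYPE_HOST for pointers CUDA knows
// about, and fails with CUDA_ERROR_INVALID_VALUE for ordinary system
// allocations, which the adapter then treats as host memory. A minimal
// classification sketch (the PtrKind enum is invented for illustration):
#include <cuda.h>

enum class PtrKind { Device, PinnedHost, SystemHost };

static PtrKind classifyUsmPointer(const void *Ptr) {
  CUmemorytype MemType{};
  CUresult Ret = cuPointerGetAttribute(
      &MemType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)Ptr);
  if (Ret == CUDA_ERROR_INVALID_VALUE)
    return PtrKind::SystemHost; // not registered with CUDA at all
  return MemType == CU_MEMORYTYPE_DEVICE ? PtrKind::Device
                                         : PtrKind::PinnedHost;
}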
USMPtr : nullptr; } else { - UR_CHECK_ERROR(ret); + UR_CHECK_ERROR(Ret); } } -ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, - ur_usm_advice_flags_t ur_advice_flags, - CUdevice device) { +ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size, + ur_usm_advice_flags_t URAdviceFlags, + CUdevice Device) { std::unordered_map URToCUMemAdviseDeviceFlagsMap = { {UR_USM_ADVICE_FLAG_SET_READ_MOSTLY, CU_MEM_ADVISE_SET_READ_MOSTLY}, @@ -87,8 +87,8 @@ ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, CU_MEM_ADVISE_UNSET_ACCESSED_BY}, }; for (auto &FlagPair : URToCUMemAdviseDeviceFlagsMap) { - if (ur_advice_flags & FlagPair.first) { - UR_CHECK_ERROR(cuMemAdvise(devPtr, size, FlagPair.second, device)); + if (URAdviceFlags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, Device)); } } @@ -105,8 +105,8 @@ ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, }; for (auto &FlagPair : URToCUMemAdviseHostFlagsMap) { - if (ur_advice_flags & FlagPair.first) { - UR_CHECK_ERROR(cuMemAdvise(devPtr, size, FlagPair.second, CU_DEVICE_CPU)); + if (URAdviceFlags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, CU_DEVICE_CPU)); } } @@ -115,8 +115,8 @@ ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY, UR_USM_ADVICE_FLAG_BIAS_CACHED, UR_USM_ADVICE_FLAG_BIAS_UNCACHED}; - for (auto &unMappedFlag : UnmappedMemAdviceFlags) { - if (ur_advice_flags & unMappedFlag) { + for (auto &UnmappedFlag : UnmappedMemAdviceFlags) { + if (URAdviceFlags & UnmappedFlag) { throw UR_RESULT_ERROR_INVALID_ENUMERATION; } } @@ -127,76 +127,76 @@ ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, // Determine local work sizes that result in uniform work groups. // The default threadsPerBlock only require handling the first work_dim // dimension. -void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, - const size_t *global_work_size, const uint32_t work_dim, - const size_t maxThreadsPerBlock[3], - ur_kernel_handle_t kernel, uint32_t local_size) { - assert(threadsPerBlock != nullptr); - assert(global_work_size != nullptr); - assert(kernel != nullptr); - int minGrid, maxBlockSize, maxBlockDim[3]; +void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, const uint32_t WorkDim, + const size_t MaxThreadsPerBlock[3], + ur_kernel_handle_t Kernel, uint32_t LocalSize) { + assert(ThreadsPerBlock != nullptr); + assert(GlobalWorkSize != nullptr); + assert(Kernel != nullptr); + int MinGrid, MaxBlockSize, MaxBlockDim[3]; // The below assumes a three dimensional range but this is not guaranteed by // UR. 
- size_t global_size_normalized[3] = {1, 1, 1}; - for (uint32_t i = 0; i < work_dim; i++) { - global_size_normalized[i] = global_work_size[i]; + size_t GlobalSizeNormalized[3] = {1, 1, 1}; + for (uint32_t i = 0; i < WorkDim; i++) { + GlobalSizeNormalized[i] = GlobalWorkSize[i]; } - static auto isPrime = [](size_t number) -> bool { - auto lastNumToCheck = ceil(sqrt(number)); - if (number < 2) + static auto IsPrime = [](size_t Number) -> bool { + auto LastNumToCheck = ceil(sqrt(Number)); + if (Number < 2) return false; - if (number == 2) + if (Number == 2) return true; - if (number % 2 == 0) + if (Number % 2 == 0) return false; - for (int i = 3; i <= lastNumToCheck; i += 2) { - if (number % i == 0) + for (int i = 3; i <= LastNumToCheck; i += 2) { + if (Number % i == 0) return false; } return true; }; - cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()); - cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()); - - UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( - &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, - maxThreadsPerBlock[0])); - - threadsPerBlock[2] = - std::min(global_size_normalized[2], size_t(maxBlockDim[2])); - threadsPerBlock[1] = std::min( - global_size_normalized[1], - std::min(maxBlockSize / threadsPerBlock[2], size_t(maxBlockDim[1]))); - maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); - threadsPerBlock[0] = - std::min(maxThreadsPerBlock[0], - std::min(global_size_normalized[0], size_t(maxBlockDim[0]))); - - // When global_size_normalized[0] is prime threadPerBlock[0] will later + cuDeviceGetAttribute(&MaxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + Device->get()); + cuDeviceGetAttribute(&MaxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + Device->get()); + + UR_CHECK_ERROR( + cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(), + NULL, LocalSize, MaxThreadsPerBlock[0])); + + ThreadsPerBlock[2] = + std::min(GlobalSizeNormalized[2], size_t(MaxBlockDim[2])); + ThreadsPerBlock[1] = std::min( + GlobalSizeNormalized[1], + std::min(MaxBlockSize / ThreadsPerBlock[2], size_t(MaxBlockDim[1]))); + MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]); + ThreadsPerBlock[0] = + std::min(MaxThreadsPerBlock[0], + std::min(GlobalSizeNormalized[0], size_t(MaxBlockDim[0]))); + + // When GlobalSizeNormalized[0] is prime threadPerBlock[0] will later // computed as 1, which is not efficient configuration. In such case we use - // global_size_normalized[0] + 1 to compute threadPerBlock[0]. - int adjusted_0_dim_global_work_size = - (isPrime(global_size_normalized[0]) && - (threadsPerBlock[0] != global_size_normalized[0])) - ? global_size_normalized[0] + 1 - : global_size_normalized[0]; - - static auto isPowerOf2 = [](size_t value) -> bool { - return value && !(value & (value - 1)); + // GlobalSizeNormalized[0] + 1 to compute threadPerBlock[0]. + int Adjusted0DimGlobalWorkSize = + (IsPrime(GlobalSizeNormalized[0]) && + (ThreadsPerBlock[0] != GlobalSizeNormalized[0])) + ? GlobalSizeNormalized[0] + 1 + : GlobalSizeNormalized[0]; + + static auto IsPowerOf2 = [](size_t Value) -> bool { + return Value && !(Value & (Value - 1)); }; // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. // Additionally, for best compute utilisation, the local size has // to be a power of two. 
- while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) || - !isPowerOf2(threadsPerBlock[0])) { - --threadsPerBlock[0]; + while (0u != (Adjusted0DimGlobalWorkSize % ThreadsPerBlock[0]) || + !IsPowerOf2(ThreadsPerBlock[0])) { + --ThreadsPerBlock[0]; } } @@ -204,19 +204,19 @@ void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, // If the kernel requires a number of registers for the entire thread // block exceeds the hardware limitations, then the cuLaunchKernel call // will fail to launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error. -bool hasExceededMaxRegistersPerBlock(ur_device_handle_t device, - ur_kernel_handle_t kernel, - size_t blockSize) { - int maxRegsPerBlock{0}; +bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, + ur_kernel_handle_t Kernel, + size_t BlockSize) { + int MaxRegsPerBlock{0}; UR_CHECK_ERROR(cuDeviceGetAttribute( - &maxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - device->get())); + &MaxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + Device->get())); - int regsPerThread{0}; - UR_CHECK_ERROR(cuFuncGetAttribute(®sPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get())); + int RegsPerThread{0}; + UR_CHECK_ERROR(cuFuncGetAttribute(&RegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, + Kernel->get())); - return blockSize * regsPerThread > size_t(maxRegsPerBlock); + return BlockSize * RegsPerThread > size_t(MaxRegsPerBlock); } /// Enqueues a wait on the given CUstream for all specified events (See @@ -230,71 +230,69 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // represented by input events) and then all future work waits on that stream. UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); - ur_result_t result; + ur_result_t Result; try { - ScopedContext active(hQueue->get_context()); - uint32_t stream_token; - ur_stream_guard_ guard; - CUstream cuStream = hQueue->get_next_compute_stream( - numEventsInWaitList, phEventWaitList, guard, &stream_token); + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); { - std::lock_guard guard(hQueue->barrier_mutex_); - if (hQueue->barrier_event_ == nullptr) { + std::lock_guard GuardBarrier(hQueue->BarrierMutex); + if (hQueue->BarrierEvent == nullptr) { UR_CHECK_ERROR( - cuEventCreate(&hQueue->barrier_event_, CU_EVENT_DISABLE_TIMING)); + cuEventCreate(&hQueue->BarrierEvent, CU_EVENT_DISABLE_TIMING)); } if (numEventsInWaitList == 0) { // wait on all work - if (hQueue->barrier_tmp_event_ == nullptr) { - UR_CHECK_ERROR(cuEventCreate(&hQueue->barrier_tmp_event_, - CU_EVENT_DISABLE_TIMING)); + if (hQueue->BarrierTmpEvent == nullptr) { + UR_CHECK_ERROR( + cuEventCreate(&hQueue->BarrierTmpEvent, CU_EVENT_DISABLE_TIMING)); } - hQueue->sync_streams( - [cuStream, tmp_event = hQueue->barrier_tmp_event_](CUstream s) { - if (cuStream != s) { + hQueue->syncStreams( + [CuStream, TmpEvent = hQueue->BarrierTmpEvent](CUstream s) { + if (CuStream != s) { // record a new CUDA event on every stream and make one stream // wait for these events - UR_CHECK_ERROR(cuEventRecord(tmp_event, s)); - UR_CHECK_ERROR(cuStreamWaitEvent(cuStream, tmp_event, 0)); + UR_CHECK_ERROR(cuEventRecord(TmpEvent, s)); + UR_CHECK_ERROR(cuStreamWaitEvent(CuStream, TmpEvent, 0)); } }); } else { // wait just on given events forLatestEvents(phEventWaitList, numEventsInWaitList, - [cuStream](ur_event_handle_t event) -> ur_result_t { - if 
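// A worked example of the local-size search above, with made-up numbers:
// suppose GlobalSizeNormalized[0] == 96 and the occupancy query capped
// ThreadsPerBlock[0] at 60. 60 neither divides 96 nor is a power of two, so
// the loop counts down until it reaches 32, which satisfies both conditions
// (96 % 32 == 0, and 32 is a power of two). For a prime size such as 97 the
// adjusted value 98 is used instead, so the search settles on 2 rather than
// collapsing straight to 1. A condensed standalone version of the same loop
// (helper name invented for illustration):
#include <cstddef>

static size_t pickLocalSize(size_t GlobalSize, size_t UpperBound) {
  auto IsPowerOf2 = [](size_t V) { return V && !(V & (V - 1)); };
  size_t Local = UpperBound;
  while (Local > 1 && (GlobalSize % Local != 0 || !IsPowerOf2(Local)))
    --Local;
  return Local; // pickLocalSize(96, 60) == 32
}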
(event->get_queue()->has_been_synchronized( - event->get_compute_stream_token())) { + [CuStream](ur_event_handle_t Event) -> ur_result_t { + if (Event->getQueue()->hasBeenSynchronized( + Event->getComputeStreamToken())) { return UR_RESULT_SUCCESS; } else { return UR_CHECK_ERROR( - cuStreamWaitEvent(cuStream, event->get(), 0)); + cuStreamWaitEvent(CuStream, Event->get(), 0)); } }); } - result = UR_CHECK_ERROR(cuEventRecord(hQueue->barrier_event_, cuStream)); - for (unsigned int i = 0; i < hQueue->compute_applied_barrier_.size(); - i++) { - hQueue->compute_applied_barrier_[i] = false; + Result = UR_CHECK_ERROR(cuEventRecord(hQueue->BarrierEvent, CuStream)); + for (unsigned int i = 0; i < hQueue->ComputeAppliedBarrier.size(); i++) { + hQueue->ComputeAppliedBarrier[i] = false; } - for (unsigned int i = 0; i < hQueue->transfer_applied_barrier_.size(); - i++) { - hQueue->transfer_applied_barrier_[i] = false; + for (unsigned int i = 0; i < hQueue->TransferAppliedBarrier.size(); i++) { + hQueue->TransferAppliedBarrier[i] = false; } } - if (result != UR_RESULT_SUCCESS) { - return result; + if (Result != UR_RESULT_SUCCESS) { + return Result; } if (phEvent) { - *phEvent = ur_event_handle_t_::make_native( - UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, cuStream, stream_token); + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, CuStream, StreamToken); (*phEvent)->start(); (*phEvent)->record(); } return UR_RESULT_SUCCESS; - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } @@ -319,7 +317,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { // Preconditions UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hQueue->get_context() == hKernel->get_context(), + UR_ASSERT(hQueue->getContext() == hKernel->getContext(), UR_RESULT_ERROR_INVALID_KERNEL); UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pGlobalWorkOffset, UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -333,162 +331,162 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Set the number of threads per block to the number of threads per warp // by default unless user has provided a better number - size_t threadsPerBlock[3] = {32u, 1u, 1u}; - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - bool providedLocalWorkGroupSize = (pLocalWorkSize != nullptr); - int32_t local_size = hKernel->get_local_size(); - ur_result_t retError = UR_RESULT_SUCCESS; + size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr); + int32_t LocalSize = hKernel->getLocalSize(); + ur_result_t Result = UR_RESULT_SUCCESS; try { // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); { - size_t *reqdThreadsPerBlock = hKernel->reqdThreadsPerBlock_; - maxWorkGroupSize = hQueue->device_->get_max_work_group_size(); - hQueue->device_->get_max_work_item_sizes(sizeof(maxThreadsPerBlock), - maxThreadsPerBlock); - - if (providedLocalWorkGroupSize) { - auto isValid = [&](int dim) { - if (reqdThreadsPerBlock[dim] != 0 && - pLocalWorkSize[dim] != reqdThreadsPerBlock[dim]) + size_t *ReqdThreadsPerBlock = hKernel->ReqdThreadsPerBlock; + MaxWorkGroupSize = hQueue->Device->getMaxWorkGroupSize(); + 
hQueue->Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock); + + if (ProvidedLocalWorkGroupSize) { + auto IsValid = [&](int Dim) { + if (ReqdThreadsPerBlock[Dim] != 0 && + pLocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - if (pLocalWorkSize[dim] > maxThreadsPerBlock[dim]) + if (pLocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; // Checks that local work sizes are a divisor of the global work sizes // which includes that the local work sizes are neither larger than // the global work sizes and not 0. - if (0u == pLocalWorkSize[dim]) + if (0u == pLocalWorkSize[Dim]) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - if (0u != (pGlobalWorkSize[dim] % pLocalWorkSize[dim])) + if (0u != (pGlobalWorkSize[Dim] % pLocalWorkSize[Dim])) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - threadsPerBlock[dim] = pLocalWorkSize[dim]; + ThreadsPerBlock[Dim] = pLocalWorkSize[Dim]; return UR_RESULT_SUCCESS; }; - size_t kernelLocalWorkGroupSize = 0; - for (size_t dim = 0; dim < workDim; dim++) { - auto err = isValid(dim); - if (err != UR_RESULT_SUCCESS) - return err; + size_t KernelLocalWorkGroupSize = 0; + for (size_t Dim = 0; Dim < workDim; Dim++) { + auto Err = IsValid(Dim); + if (Err != UR_RESULT_SUCCESS) + return Err; // If no error then sum the total local work size per dim. - kernelLocalWorkGroupSize += pLocalWorkSize[dim]; + KernelLocalWorkGroupSize += pLocalWorkSize[Dim]; } - if (hasExceededMaxRegistersPerBlock(hQueue->device_, hKernel, - kernelLocalWorkGroupSize)) { + if (hasExceededMaxRegistersPerBlock(hQueue->Device, hKernel, + KernelLocalWorkGroupSize)) { return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; } } else { - guessLocalWorkSize(hQueue->device_, threadsPerBlock, pGlobalWorkSize, - workDim, maxThreadsPerBlock, hKernel, local_size); + guessLocalWorkSize(hQueue->Device, ThreadsPerBlock, pGlobalWorkSize, + workDim, MaxThreadsPerBlock, hKernel, LocalSize); } } - if (maxWorkGroupSize < - size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { + if (MaxWorkGroupSize < + size_t(ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2])) { return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; } - size_t blocksPerGrid[3] = {1u, 1u, 1u}; + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; for (size_t i = 0; i < workDim; i++) { - blocksPerGrid[i] = - (pGlobalWorkSize[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i]; + BlocksPerGrid[i] = + (pGlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; } - std::unique_ptr retImplEv{nullptr}; + std::unique_ptr RetImplEvent{nullptr}; - uint32_t stream_token; - ur_stream_guard_ guard; - CUstream cuStream = hQueue->get_next_compute_stream( - numEventsInWaitList, phEventWaitList, guard, &stream_token); - CUfunction cuFunc = hKernel->get(); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + CUfunction CuFunc = hKernel->get(); - retError = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, - phEventWaitList); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); // Set the implicit global offset parameter if kernel has offset variant if (hKernel->get_with_offset_parameter()) { - std::uint32_t cuda_implicit_offset[3] = {0, 0, 0}; + std::uint32_t CudaImplicitOffset[3] = {0, 0, 0}; if (pGlobalWorkOffset) { for (size_t i = 0; i < workDim; i++) { - cuda_implicit_offset[i] = + 
CudaImplicitOffset[i] = static_cast(pGlobalWorkOffset[i]); if (pGlobalWorkOffset[i] != 0) { - cuFunc = hKernel->get_with_offset_parameter(); + CuFunc = hKernel->get_with_offset_parameter(); } } } - hKernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset), - cuda_implicit_offset); + hKernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset), + CudaImplicitOffset); } - auto &argIndices = hKernel->get_arg_indices(); + auto &ArgIndices = hKernel->getArgIndices(); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_KERNEL_LAUNCH, hQueue, cuStream, stream_token)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken)); + RetImplEvent->start(); } // Set local mem max size if env var is present - static const char *local_mem_sz_ptr = + static const char *LocalMemSizePtr = std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE"); - if (local_mem_sz_ptr) { - int device_max_local_mem = 0; + if (LocalMemSizePtr) { + int DeviceMaxLocalMem = 0; cuDeviceGetAttribute( - &device_max_local_mem, + &DeviceMaxLocalMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, hQueue->get_device()->get()); - static const int env_val = std::atoi(local_mem_sz_ptr); - if (env_val <= 0 || env_val > device_max_local_mem) { + static const int EnvVal = std::atoi(LocalMemSizePtr); + if (EnvVal <= 0 || EnvVal > DeviceMaxLocalMem) { setErrorMessage("Invalid value specified for " "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", UR_RESULT_ERROR_ADAPTER_SPECIFIC); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } UR_CHECK_ERROR(cuFuncSetAttribute( - cuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, env_val)); + CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, EnvVal)); } - retError = UR_CHECK_ERROR(cuLaunchKernel( - cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], - threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size, - cuStream, const_cast(argIndices.data()), nullptr)); - if (local_size != 0) - hKernel->clear_local_size(); + Result = UR_CHECK_ERROR(cuLaunchKernel( + CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], + ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, + CuStream, const_cast(ArgIndices.data()), nullptr)); + if (LocalSize != 0) + hKernel->clearLocalSize(); if (phEvent) { - retError = retImplEv->record(); - *phEvent = retImplEv.release(); + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retError = err; + } catch (ur_result_t Err) { + Result = Err; } - return retError; + return Result; } /// General 3D memory copy operation. 
/// This function requires the corresponding CUDA context to be at the top of /// the context stack -/// If the source and/or destination is on the device, src_ptr and/or dst_ptr +/// If the source and/or destination is on the device, SrcPtr and/or DstPtr /// must be a pointer to a CUdeviceptr static ur_result_t commonEnqueueMemBufferCopyRect( - CUstream cu_stream, ur_rect_region_t region, const void *src_ptr, - const CUmemorytype_enum src_type, ur_rect_offset_t src_offset, - size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, - const CUmemorytype_enum dst_type, ur_rect_offset_t dst_offset, + CUstream cu_stream, ur_rect_region_t region, const void *SrcPtr, + const CUmemorytype_enum SrcType, ur_rect_offset_t src_offset, + size_t src_row_pitch, size_t src_slice_pitch, void *DstPtr, + const CUmemorytype_enum DstType, ur_rect_offset_t dst_offset, size_t dst_row_pitch, size_t dst_slice_pitch) { - UR_ASSERT(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST, + UR_ASSERT(SrcType == CU_MEMORYTYPE_DEVICE || SrcType == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST, + UR_ASSERT(DstType == CU_MEMORYTYPE_DEVICE || DstType == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); src_row_pitch = @@ -508,22 +506,21 @@ static ur_result_t commonEnqueueMemBufferCopyRect( params.Height = region.height; params.Depth = region.depth; - params.srcMemoryType = src_type; - params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE - ? *static_cast(src_ptr) + params.srcMemoryType = SrcType; + params.srcDevice = SrcType == CU_MEMORYTYPE_DEVICE + ? *static_cast(SrcPtr) : 0; - params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr; + params.srcHost = SrcType == CU_MEMORYTYPE_HOST ? SrcPtr : nullptr; params.srcXInBytes = src_offset.x; params.srcY = src_offset.y; params.srcZ = src_offset.z; params.srcPitch = src_row_pitch; params.srcHeight = src_slice_pitch / src_row_pitch; - params.dstMemoryType = dst_type; - params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE - ? *static_cast(dst_ptr) - : 0; - params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr; + params.dstMemoryType = DstType; + params.dstDevice = + DstType == CU_MEMORYTYPE_DEVICE ? *static_cast(DstPtr) : 0; + params.dstHost = DstType == CU_MEMORYTYPE_HOST ? 
DstPtr : nullptr; params.dstXInBytes = dst_offset.x; params.dstY = dst_offset.y; params.dstZ = dst_offset.z; @@ -543,45 +540,45 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; - CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, cuStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, CuStream)); + RetImplEvent->start(); } - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, &devPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, + Result = commonEnqueueMemBufferCopyRect( + CuStream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch, bufferSlicePitch); if (phEvent) { - retErr = retImplEv->record(); + Result = RetImplEvent->record(); } if (blockingRead) { - retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + Result = UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( @@ -594,44 +591,44 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; - CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext active(hQueue->getContext()); + CUstream cuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, cuStream)); - retImplEv->start(); + RetImplEvent->start(); } - retErr = commonEnqueueMemBufferCopyRect( + Result = commonEnqueueMemBufferCopyRect( cuStream, region, pSrc, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch, - hostSlicePitch, &devPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, + hostSlicePitch, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, bufferRowPitch, bufferSlicePitch); if (phEvent) { - retErr = retImplEv->record(); + Result = 
RetImplEvent->record(); } if (blockingWrite) { - retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + Result = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( @@ -641,36 +638,36 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - std::unique_ptr retImplEv{nullptr}; + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - ur_result_t result; + ScopedContext Active(hQueue->getContext()); + ur_result_t Result; - auto stream = hQueue->get_next_transfer_stream(); - result = - enqueueEventsWait(hQueue, stream, numEventsInWaitList, phEventWaitList); + auto Stream = hQueue->getNextTransferStream(); + Result = + enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_COPY, hQueue, stream)); - result = retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, Stream)); + Result = RetImplEvent->start(); } - auto src = hBufferSrc->mem_.buffer_mem_.get() + srcOffset; - auto dst = hBufferDst->mem_.buffer_mem_.get() + dstOffset; + auto Src = hBufferSrc->Mem.BufferMem.get() + srcOffset; + auto Dst = hBufferDst->Mem.BufferMem.get() + dstOffset; - result = UR_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream)); + Result = UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream)); if (phEvent) { - result = retImplEv->record(); - *phEvent = retImplEv.release(); + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } - return result; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_UNKNOWN; } @@ -687,38 +684,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( UR_ASSERT(hBufferDst, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; - CUdeviceptr srcPtr = hBufferSrc->mem_.buffer_mem_.get(); - CUdeviceptr dstPtr = hBufferDst->mem_.buffer_mem_.get(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr SrcPtr = hBufferSrc->Mem.BufferMem.get(); + CUdeviceptr DstPtr = hBufferDst->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, cuStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, CuStream)); + RetImplEvent->start(); } - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, srcRowPitch, - srcSlicePitch, &dstPtr, CU_MEMORYTYPE_DEVICE, dstOrigin, dstRowPitch, + Result = commonEnqueueMemBufferCopyRect( + CuStream, region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, srcRowPitch, + srcSlicePitch, &DstPtr, CU_MEMORYTYPE_DEVICE, dstOrigin, dstRowPitch, dstSlicePitch); if (phEvent) { - retImplEv->record(); - *phEvent = retImplEv.release(); + RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } } catch (ur_result_t err) { - retErr = err; + Result = err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( @@ -728,54 +725,54 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - auto args_are_multiples_of_pattern_size = + auto ArgsAreMultiplesOfPatternSize = (offset % patternSize == 0) || (size % patternSize == 0); - auto pattern_is_valid = (pPattern != nullptr); + auto PatternIsValid = (pPattern != nullptr); - auto pattern_size_is_valid = + auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) && // is power of two (patternSize > 0) && (patternSize <= 128); // falls within valid range - UR_ASSERT(args_are_multiples_of_pattern_size && pattern_is_valid && - pattern_size_is_valid, + UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid && + PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE); - std::unique_ptr retImplEv{nullptr}; + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - auto stream = hQueue->get_next_transfer_stream(); - ur_result_t result; - result = - enqueueEventsWait(hQueue, stream, numEventsInWaitList, phEventWaitList); + auto Stream = hQueue->getNextTransferStream(); + ur_result_t Result; + Result = + enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_FILL, hQueue, stream)); - result = retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_FILL, 
hQueue, Stream)); + Result = RetImplEvent->start(); } - auto dstDevice = hBuffer->mem_.buffer_mem_.get() + offset; + auto DstDevice = hBuffer->Mem.BufferMem.get() + offset; auto N = size / patternSize; // pattern size in bytes switch (patternSize) { case 1: { - auto value = *static_cast(pPattern); - result = UR_CHECK_ERROR(cuMemsetD8Async(dstDevice, value, N, stream)); + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(cuMemsetD8Async(DstDevice, Value, N, Stream)); break; } case 2: { - auto value = *static_cast(pPattern); - result = UR_CHECK_ERROR(cuMemsetD16Async(dstDevice, value, N, stream)); + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(cuMemsetD16Async(DstDevice, Value, N, Stream)); break; } case 4: { - auto value = *static_cast(pPattern); - result = UR_CHECK_ERROR(cuMemsetD32Async(dstDevice, value, N, stream)); + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(cuMemsetD32Async(DstDevice, Value, N, Stream)); break; } default: { @@ -786,20 +783,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( // This means that one cuMemsetD2D32Async call is made for every 4 bytes // in the pattern. - auto number_of_steps = patternSize / sizeof(uint32_t); + auto NumberOfSteps = patternSize / sizeof(uint32_t); // we walk up the pattern in 4-byte steps, and call cuMemset for each // 4-byte chunk of the pattern. - for (auto step = 0u; step < number_of_steps; ++step) { + for (auto Step = 0u; Step < NumberOfSteps; ++Step) { // take 4 bytes of the pattern - auto value = *(static_cast(pPattern) + step); + auto Value = *(static_cast(pPattern) + Step); // offset the pointer to the part of the buffer we want to write to - auto offset_ptr = dstDevice + (step * sizeof(uint32_t)); + auto OffsetPtr = DstDevice + (Step * sizeof(uint32_t)); // set all of the pattern chunks - result = UR_CHECK_ERROR( - cuMemsetD2D32Async(offset_ptr, patternSize, value, 1, N, stream)); + Result = UR_CHECK_ERROR( + cuMemsetD2D32Async(OffsetPtr, patternSize, Value, 1, N, Stream)); } break; @@ -807,20 +804,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( } if (phEvent) { - result = retImplEv->record(); - *phEvent = retImplEv.release(); + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } - return result; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } } -static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) { - switch (array_desc.Format) { +static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { + switch (ArrayDesc.Format) { case CU_AD_FORMAT_UNSIGNED_INT8: case CU_AD_FORMAT_SIGNED_INT8: return 1; @@ -841,66 +838,66 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) { /// General ND memory copy operation for images (where N > 1). 
/// This function requires the corresponding CUDA context to be at the top of /// the context stack -/// If the source and/or destination is an array, src_ptr and/or dst_ptr +/// If the source and/or destination is an array, SrcPtr and/or DstPtr /// must be a pointer to a CUarray static ur_result_t commonEnqueueMemImageNDCopy( - CUstream cu_stream, ur_mem_type_t img_type, const ur_rect_region_t region, - const void *src_ptr, const CUmemorytype_enum src_type, - const ur_rect_offset_t src_offset, void *dst_ptr, - const CUmemorytype_enum dst_type, const ur_rect_offset_t dst_offset) { - UR_ASSERT(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST, + CUstream CuStream, ur_mem_type_t ImgType, const ur_rect_region_t Region, + const void *SrcPtr, const CUmemorytype_enum SrcType, + const ur_rect_offset_t SrcOffset, void *DstPtr, + const CUmemorytype_enum DstType, const ur_rect_offset_t DstOffset) { + UR_ASSERT(SrcType == CU_MEMORYTYPE_ARRAY || SrcType == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST, + UR_ASSERT(DstType == CU_MEMORYTYPE_ARRAY || DstType == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (img_type == UR_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.srcArray = *static_cast(src_ptr); - cpyDesc.srcXInBytes = src_offset.x; - cpyDesc.srcY = src_offset.y; + if (ImgType == UR_MEM_TYPE_IMAGE2D) { + CUDA_MEMCPY2D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = SrcType; + if (SrcType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.srcArray = *static_cast(SrcPtr); + CpyDesc.srcXInBytes = SrcOffset.x; + CpyDesc.srcY = SrcOffset.y; } else { - cpyDesc.srcHost = src_ptr; + CpyDesc.srcHost = SrcPtr; } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.dstArray = *static_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset.x; - cpyDesc.dstY = dst_offset.y; + CpyDesc.dstMemoryType = DstType; + if (DstType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.dstArray = *static_cast(DstPtr); + CpyDesc.dstXInBytes = DstOffset.x; + CpyDesc.dstY = DstOffset.y; } else { - cpyDesc.dstHost = dst_ptr; + CpyDesc.dstHost = DstPtr; } - cpyDesc.WidthInBytes = region.width; - cpyDesc.Height = region.height; - return UR_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream)); + CpyDesc.WidthInBytes = Region.width; + CpyDesc.Height = Region.height; + return UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, CuStream)); } - if (img_type == UR_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.srcArray = *static_cast(src_ptr); - cpyDesc.srcXInBytes = src_offset.x; - cpyDesc.srcY = src_offset.y; - cpyDesc.srcZ = src_offset.z; + if (ImgType == UR_MEM_TYPE_IMAGE3D) { + CUDA_MEMCPY3D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = SrcType; + if (SrcType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.srcArray = *static_cast(SrcPtr); + CpyDesc.srcXInBytes = SrcOffset.x; + CpyDesc.srcY = SrcOffset.y; + CpyDesc.srcZ = SrcOffset.z; } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.dstArray = *static_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset.x; - cpyDesc.dstY = dst_offset.y; - cpyDesc.dstZ = dst_offset.z; + CpyDesc.srcHost = SrcPtr; + } + 
CpyDesc.dstMemoryType = DstType; + if (DstType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.dstArray = *static_cast(DstPtr); + CpyDesc.dstXInBytes = DstOffset.x; + CpyDesc.dstY = DstOffset.y; + CpyDesc.dstZ = DstOffset.z; } else { - cpyDesc.dstHost = dst_ptr; + CpyDesc.dstHost = DstPtr; } - cpyDesc.WidthInBytes = region.width; - cpyDesc.Height = region.height; - cpyDesc.Depth = region.depth; - return UR_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream)); + CpyDesc.WidthInBytes = Region.width; + CpyDesc.Height = Region.height; + CpyDesc.Depth = Region.depth; + return UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc, CuStream)); } return UR_RESULT_ERROR_INVALID_VALUE; } @@ -912,62 +909,62 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray array = hImage->mem_.surface_mem_.get_array(); + CUarray Array = hImage->Mem.SurfaceMem.getArray(); - CUDA_ARRAY_DESCRIPTOR arrayDesc; - retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); + CUDA_ARRAY_DESCRIPTOR ArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); - int elementByteSize = imageElementByteSize(arrayDesc); + int ElementByteSize = imageElementByteSize(ArrayDesc); - size_t byteOffsetX = origin.x * elementByteSize * arrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region.width; + size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; + size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; - ur_mem_type_t imgType = hImage->mem_.surface_mem_.get_image_type(); - if (imgType == UR_MEM_TYPE_IMAGE1D) { - retErr = UR_CHECK_ERROR( - cuMemcpyAtoHAsync(pDst, array, byteOffsetX, bytesToCopy, cuStream)); + ur_mem_type_t ImgType = hImage->Mem.SurfaceMem.getImageType(); + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + Result = UR_CHECK_ERROR( + cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, CuStream)); } else { - ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, region.depth}; - ur_rect_offset_t srcOffset = {byteOffsetX, origin.y, origin.z}; + ur_rect_offset_t SrcOffset = {ByteOffsetX, origin.y, origin.z}; - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY, - srcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}); + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY, + SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}); - if (retErr != UR_RESULT_SUCCESS) { - return retErr; + if (Result != UR_RESULT_SUCCESS) { + return Result; } } if (phEvent) { - auto new_event = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_IMAGE_READ, hQueue, cuStream); - 
new_event->record(); - *phEvent = new_event; + auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_READ, + hQueue, CuStream); + NewEvent->record(); + *phEvent = NewEvent; } if (blockingRead) { - retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + Result = UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); } - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( @@ -977,58 +974,58 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray array = hImage->mem_.surface_mem_.get_array(); + CUarray Array = hImage->Mem.SurfaceMem.getArray(); - CUDA_ARRAY_DESCRIPTOR arrayDesc; - retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); + CUDA_ARRAY_DESCRIPTOR ArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); - int elementByteSize = imageElementByteSize(arrayDesc); + int ElementByteSize = imageElementByteSize(ArrayDesc); - size_t byteOffsetX = origin.x * elementByteSize * arrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region.width; + size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; + size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; - ur_mem_type_t imgType = hImage->mem_.surface_mem_.get_image_type(); - if (imgType == UR_MEM_TYPE_IMAGE1D) { - retErr = UR_CHECK_ERROR( - cuMemcpyHtoAAsync(array, byteOffsetX, pSrc, bytesToCopy, cuStream)); + ur_mem_type_t ImgType = hImage->Mem.SurfaceMem.getImageType(); + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + Result = UR_CHECK_ERROR( + cuMemcpyHtoAAsync(Array, ByteOffsetX, pSrc, BytesToCopy, CuStream)); } else { - ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, region.depth}; - ur_rect_offset_t dstOffset = {byteOffsetX, origin.y, origin.z}; + ur_rect_offset_t DstOffset = {ByteOffsetX, origin.y, origin.z}; - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, pSrc, CU_MEMORYTYPE_HOST, - ur_rect_offset_t{}, &array, CU_MEMORYTYPE_ARRAY, dstOffset); + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, pSrc, CU_MEMORYTYPE_HOST, + ur_rect_offset_t{}, &Array, CU_MEMORYTYPE_ARRAY, DstOffset); - if (retErr != UR_RESULT_SUCCESS) { - return retErr; + if (Result != UR_RESULT_SUCCESS) { + return Result; } } if (phEvent) { - auto new_event = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_IMAGE_WRITE, hQueue, cuStream); - new_event->record(); - *phEvent = new_event; + auto NewEvent = 
ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_WRITE, + hQueue, CuStream); + NewEvent->record(); + *phEvent = NewEvent; } - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( @@ -1037,76 +1034,76 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( ur_rect_offset_t dstOrigin, ur_rect_region_t region, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hImageSrc->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageDst->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageSrc->mem_.surface_mem_.get_image_type() == - hImageDst->mem_.surface_mem_.get_image_type(), + UR_ASSERT(hImageSrc->Mem.SurfaceMem.getImageType() == + hImageDst->Mem.SurfaceMem.getImageType(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray srcArray = hImageSrc->mem_.surface_mem_.get_array(); - CUarray dstArray = hImageDst->mem_.surface_mem_.get_array(); + CUarray SrcArray = hImageSrc->Mem.SurfaceMem.getArray(); + CUarray DstArray = hImageDst->Mem.SurfaceMem.getArray(); - CUDA_ARRAY_DESCRIPTOR srcArrayDesc; - retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray)); - CUDA_ARRAY_DESCRIPTOR dstArrayDesc; - retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray)); + CUDA_ARRAY_DESCRIPTOR SrcArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&SrcArrayDesc, SrcArray)); + CUDA_ARRAY_DESCRIPTOR DstArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&DstArrayDesc, DstArray)); - UR_ASSERT(srcArrayDesc.Format == dstArrayDesc.Format, + UR_ASSERT(SrcArrayDesc.Format == DstArrayDesc.Format, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels, + UR_ASSERT(SrcArrayDesc.NumChannels == DstArrayDesc.NumChannels, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - int elementByteSize = imageElementByteSize(srcArrayDesc); + int ElementByteSize = imageElementByteSize(SrcArrayDesc); - size_t dstByteOffsetX = - dstOrigin.x * elementByteSize * srcArrayDesc.NumChannels; - size_t srcByteOffsetX = - srcOrigin.x * elementByteSize * dstArrayDesc.NumChannels; - size_t bytesToCopy = - elementByteSize * srcArrayDesc.NumChannels * region.width; + size_t DstByteOffsetX = + dstOrigin.x * ElementByteSize * SrcArrayDesc.NumChannels; + size_t SrcByteOffsetX = + srcOrigin.x * ElementByteSize * DstArrayDesc.NumChannels; + size_t BytesToCopy = + ElementByteSize * SrcArrayDesc.NumChannels * region.width; - ur_mem_type_t imgType = hImageSrc->mem_.surface_mem_.get_image_type(); - if (imgType == UR_MEM_TYPE_IMAGE1D) { - retErr = UR_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, - srcByteOffsetX, bytesToCopy)); + ur_mem_type_t ImgType = 
hImageSrc->Mem.SurfaceMem.getImageType(); + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + Result = UR_CHECK_ERROR(cuMemcpyAtoA(DstArray, DstByteOffsetX, SrcArray, + SrcByteOffsetX, BytesToCopy)); } else { - ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, region.depth}; - ur_rect_offset_t srcOffset = {srcByteOffsetX, srcOrigin.y, srcOrigin.z}; - ur_rect_offset_t dstOffset = {dstByteOffsetX, dstOrigin.y, dstOrigin.z}; + ur_rect_offset_t SrcOffset = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z}; + ur_rect_offset_t DstOffset = {DstByteOffsetX, dstOrigin.y, dstOrigin.z}; - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, - srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset); + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, &SrcArray, CU_MEMORYTYPE_ARRAY, + SrcOffset, &DstArray, CU_MEMORYTYPE_ARRAY, DstOffset); - if (retErr != UR_RESULT_SUCCESS) { - return retErr; + if (Result != UR_RESULT_SUCCESS) { + return Result; } } if (phEvent) { - auto new_event = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_IMAGE_COPY, hQueue, cuStream); - new_event->record(); - *phEvent = new_event; + auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_COPY, + hQueue, CuStream); + NewEvent->record(); + *phEvent = NewEvent; } - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - return retErr; + return Result; } /// Implements mapping on the host using a BufferRead operation. @@ -1122,54 +1119,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( UR_ASSERT(ppRetMap != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hQueue != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hBuffer != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hBuffer->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t ret_err = UR_RESULT_ERROR_INVALID_MEM_OBJECT; - const bool is_pinned = - hBuffer->mem_.buffer_mem_.allocMode_ == - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + ur_result_t Result = UR_RESULT_ERROR_INVALID_MEM_OBJECT; + const bool IsPinned = + hBuffer->Mem.BufferMem.MemAllocMode == + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; // Currently no support for overlapping regions - if (hBuffer->mem_.buffer_mem_.get_map_ptr() != nullptr) { - return ret_err; + if (hBuffer->Mem.BufferMem.getMapPtr() != nullptr) { + return Result; } // Allocate a pointer in the host to store the mapped information - auto hostPtr = hBuffer->mem_.buffer_mem_.map_to_ptr(offset, mapFlags); - *ppRetMap = hBuffer->mem_.buffer_mem_.get_map_ptr(); - if (hostPtr) { - ret_err = UR_RESULT_SUCCESS; + auto HostPtr = hBuffer->Mem.BufferMem.mapToPtr(offset, mapFlags); + *ppRetMap = hBuffer->Mem.BufferMem.getMapPtr(); + if (HostPtr) { + Result = UR_RESULT_SUCCESS; } - if (!is_pinned && + if (!IsPinned && ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) { // Pinned host memory is already on host so it doesn't need to be read. 
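For reference, the map path here is emulated rather than backed by a real host mapping: map stages the buffer contents in a host allocation (the BufferRead just below), the user works on that host copy, and unmap later writes it back when the mapping was writable; pinned (alloc-host-ptr) buffers skip both copies. A minimal standalone sketch of the same idea against the raw CUDA driver API follows; the checkCu helper and the std::vector staging area are illustrative only, and error handling is reduced to an abort.

#include <cuda.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

static void checkCu(CUresult Res) {
  if (Res != CUDA_SUCCESS) {
    std::fprintf(stderr, "CUDA driver error %d\n", static_cast<int>(Res));
    std::exit(1);
  }
}

int main() {
  checkCu(cuInit(0));
  CUdevice Dev;
  checkCu(cuDeviceGet(&Dev, 0));
  CUcontext Ctx;
  checkCu(cuCtxCreate(&Ctx, 0, Dev));
  CUstream Stream;
  checkCu(cuStreamCreate(&Stream, CU_STREAM_DEFAULT));

  const size_t Size = 256;
  CUdeviceptr Buf;
  checkCu(cuMemAlloc(&Buf, Size));
  checkCu(cuMemsetD8Async(Buf, 0xAB, Size, Stream));

  // "Map": stage the device contents in a host allocation (a buffer read)
  // and hand the host pointer to the user.
  std::vector<unsigned char> Host(Size);
  checkCu(cuMemcpyDtoHAsync(Host.data(), Buf, Size, Stream));
  checkCu(cuStreamSynchronize(Stream)); // behaves like a blocking map

  Host[0] = 0xCD; // the user writes through the "mapped" pointer

  // "Unmap": write the possibly modified host copy back (a buffer write).
  checkCu(cuMemcpyHtoDAsync(Buf, Host.data(), Size, Stream));
  checkCu(cuStreamSynchronize(Stream));

  checkCu(cuMemFree(Buf));
  checkCu(cuStreamDestroy(Stream));
  checkCu(cuCtxDestroy(Ctx));
  return 0;
}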
- ret_err = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, - hostPtr, numEventsInWaitList, - phEventWaitList, phEvent); + Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, + HostPtr, numEventsInWaitList, + phEventWaitList, phEvent); } else { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - if (is_pinned) { - ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, - phEventWaitList, nullptr); + if (IsPinned) { + Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, + nullptr); } if (phEvent) { try { - *phEvent = - ur_event_handle_t_::make_native(UR_COMMAND_MEM_BUFFER_MAP, hQueue, - hQueue->get_next_transfer_stream()); + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_MAP, hQueue, hQueue->getNextTransferStream()); (*phEvent)->start(); (*phEvent)->record(); - } catch (ur_result_t error) { - ret_err = error; + } catch (ur_result_t Err) { + Result = Err; } } } - return ret_err; + return Result; } /// Implements the unmap from the host, using a BufferWrite operation. @@ -1180,51 +1176,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t ret_err = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMappedPtr, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() != nullptr, + UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() != nullptr, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() == pMappedPtr, + UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() == pMappedPtr, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - const bool is_pinned = - hMem->mem_.buffer_mem_.allocMode_ == - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + const bool IsPinned = + hMem->Mem.BufferMem.MemAllocMode == + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; - if (!is_pinned && - (hMem->mem_.buffer_mem_.get_map_flags() & UR_MAP_FLAG_WRITE)) { + if (!IsPinned && (hMem->Mem.BufferMem.getMapFlags() & UR_MAP_FLAG_WRITE)) { // Pinned host memory is only on host so it doesn't need to be written to. 
- ret_err = urEnqueueMemBufferWrite( - hQueue, hMem, true, hMem->mem_.buffer_mem_.get_map_offset(pMappedPtr), - hMem->mem_.buffer_mem_.get_size(), pMappedPtr, numEventsInWaitList, + Result = urEnqueueMemBufferWrite( + hQueue, hMem, true, hMem->Mem.BufferMem.getMapOffset(pMappedPtr), + hMem->Mem.BufferMem.getSize(), pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent); } else { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - if (is_pinned) { - ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, - phEventWaitList, nullptr); + if (IsPinned) { + Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, + nullptr); } if (phEvent) { try { - *phEvent = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_UNMAP, hQueue, hQueue->get_next_transfer_stream()); + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_UNMAP, hQueue, hQueue->getNextTransferStream()); (*phEvent)->start(); (*phEvent)->record(); - } catch (ur_result_t error) { - ret_err = error; + } catch (ur_result_t Err) { + Result = Err; } } } - hMem->mem_.buffer_mem_.unmap(pMappedPtr); - return ret_err; + hMem->Mem.BufferMem.unmap(pMappedPtr); + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( @@ -1235,50 +1230,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(size % patternSize == 0, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t result = UR_RESULT_SUCCESS; - std::unique_ptr event_ptr{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; try { - ScopedContext active(hQueue->get_context()); - uint32_t stream_token; - ur_stream_guard_ guard; - CUstream cuStream = hQueue->get_next_compute_stream( - numEventsInWaitList, phEventWaitList, guard, &stream_token); - result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - event_ptr = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_USM_FILL, hQueue, cuStream, stream_token)); - event_ptr->start(); + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_FILL, hQueue, CuStream, StreamToken)); + EventPtr->start(); } switch (patternSize) { case 1: - result = UR_CHECK_ERROR( + Result = UR_CHECK_ERROR( cuMemsetD8Async((CUdeviceptr)ptr, *((const uint8_t *)pPattern) & 0xFF, - size, cuStream)); + size, CuStream)); break; case 2: - result = UR_CHECK_ERROR(cuMemsetD16Async( + Result = UR_CHECK_ERROR(cuMemsetD16Async( (CUdeviceptr)ptr, *((const uint16_t *)pPattern) & 0xFFFF, size, - cuStream)); + CuStream)); break; case 4: - result = UR_CHECK_ERROR(cuMemsetD32Async( + Result = UR_CHECK_ERROR(cuMemsetD32Async( (CUdeviceptr)ptr, *((const uint32_t *)pPattern) & 0xFFFFFFFF, size, - cuStream)); + CuStream)); break; default: return UR_RESULT_ERROR_INVALID_ARGUMENT; } if (phEvent) { - result = event_ptr->record(); - *phEvent = event_ptr.release(); + Result = EventPtr->record(); + *phEvent = EventPtr.release(); } - } catch (ur_result_t err) { - result = err; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( @@ -1288,36 +1283,36 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urEnqueueUSMMemcpy( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; - std::unique_ptr event_ptr{nullptr}; + std::unique_ptr EventPtr{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - event_ptr = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_USM_MEMCPY, hQueue, cuStream)); - event_ptr->start(); + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_MEMCPY, hQueue, CuStream)); + EventPtr->start(); } - result = UR_CHECK_ERROR( - cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, cuStream)); + Result = UR_CHECK_ERROR( + cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream)); if (phEvent) { - result = event_ptr->record(); + Result = EventPtr->record(); } if (blocking) { - result = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + Result = UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); } if (phEvent) { - *phEvent = event_ptr.release(); + *phEvent = EventPtr.release(); } - } catch (ur_result_t err) { - result = err; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( @@ -1325,23 +1320,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); - ur_device_handle_t device = hQueue->get_context()->get_device(); + ur_device_handle_t Device = hQueue->getContext()->getDevice(); // Certain cuda devices and Windows do not have support for some Unified // Memory features. cuMemPrefetchAsync requires concurrent memory access // for managed memory. Therfore, ignore prefetch hint if concurrent managed // memory access is not available. 
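The checks that follow gate the prefetch on two conditions: the device must report CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, and the pointer must actually be managed memory. A compact sketch of just that gating plus the cuMemPrefetchAsync call (without the event bookkeeping the adapter also performs) is below; the helper name is illustrative and a current CUDA context owning Ptr is assumed.

#include <cuda.h>
#include <cstddef>

static CUresult prefetchIfSupported(CUdeviceptr Ptr, size_t Size, CUdevice Dev,
                                    CUstream Stream) {
  int ConcurrentManagedAccess = 0;
  CUresult Res = cuDeviceGetAttribute(
      &ConcurrentManagedAccess, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
      Dev);
  if (Res != CUDA_SUCCESS)
    return Res;
  if (!ConcurrentManagedAccess)
    return CUDA_SUCCESS; // hint is ignored, as in the adapter

  unsigned int IsManaged = 0;
  Res = cuPointerGetAttribute(&IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, Ptr);
  if (Res != CUDA_SUCCESS)
    return Res;
  if (!IsManaged)
    return CUDA_SUCCESS; // prefetch only applies to managed (USM-style) memory

  return cuMemPrefetchAsync(Ptr, Size, Dev, Stream);
}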
- if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Prefetch hint ignored as device does not support " "concurrent managed access", UR_RESULT_SUCCESS); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } - unsigned int is_managed; + unsigned int IsManaged; UR_CHECK_ERROR(cuPointerGetAttribute( - &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); - if (!is_managed) { + &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); + if (!IsManaged) { setErrorMessage("Prefetch hint ignored as prefetch only works with USM", UR_RESULT_SUCCESS); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; @@ -1352,30 +1347,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_ERROR_INVALID_VALUE; UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; - std::unique_ptr event_ptr{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - event_ptr = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_COPY, hQueue, cuStream)); - event_ptr->start(); + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, CuStream)); + EventPtr->start(); } - result = UR_CHECK_ERROR( - cuMemPrefetchAsync((CUdeviceptr)pMem, size, device->get(), cuStream)); + Result = UR_CHECK_ERROR( + cuMemPrefetchAsync((CUdeviceptr)pMem, size, Device->get(), CuStream)); if (phEvent) { - result = event_ptr->record(); - *phEvent = event_ptr.release(); + Result = EventPtr->record(); + *phEvent = EventPtr.release(); } - } catch (ur_result_t err) { - result = err; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } /// USM: memadvise API to govern behavior of automatic migration mechanisms @@ -1395,8 +1390,8 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, (advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) || (advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) || (advice & UR_USM_ADVICE_FLAG_DEFAULT)) { - ur_device_handle_t device = hQueue->get_context()->get_device(); - if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + ur_device_handle_t Device = hQueue->getContext()->getDevice(); + if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Mem advise ignored as device does not support " "concurrent managed access", UR_RESULT_SUCCESS); @@ -1408,54 +1403,54 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property. 
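In the default-advice branch further below, UR_USM_ADVICE_FLAG_DEFAULT is lowered onto three cuMemAdvise unset calls that clear read-mostly, preferred-location and accessed-by hints. A minimal sketch of that mapping, assuming a managed pointer and a valid device; the helper name is not part of the adapter.

#include <cuda.h>
#include <cstddef>

static CUresult resetUsmAdvice(CUdeviceptr Ptr, size_t Size, CUdevice Dev) {
  // Clear each of the three hints the adapter resets for "default" advice.
  CUresult Res = cuMemAdvise(Ptr, Size, CU_MEM_ADVISE_UNSET_READ_MOSTLY, Dev);
  if (Res != CUDA_SUCCESS)
    return Res;
  Res = cuMemAdvise(Ptr, Size, CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, Dev);
  if (Res != CUDA_SUCCESS)
    return Res;
  return cuMemAdvise(Ptr, Size, CU_MEM_ADVISE_UNSET_ACCESSED_BY, Dev);
}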
} - unsigned int is_managed; + unsigned int IsManaged; UR_CHECK_ERROR(cuPointerGetAttribute( - &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); - if (!is_managed) { + &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); + if (!IsManaged) { setErrorMessage( "Memory advice ignored as memory advices only works with USM", UR_RESULT_SUCCESS); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } - ur_result_t result = UR_RESULT_SUCCESS; - std::unique_ptr event_ptr{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; try { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); if (phEvent) { - event_ptr = std::unique_ptr( - ur_event_handle_t_::make_native(UR_COMMAND_USM_ADVISE, hQueue, - hQueue->get_next_transfer_stream())); - event_ptr->start(); + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_ADVISE, hQueue, hQueue->getNextTransferStream())); + EventPtr->start(); } if (advice & UR_USM_ADVICE_FLAG_DEFAULT) { UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, CU_MEM_ADVISE_UNSET_READ_MOSTLY, - hQueue->get_context()->get_device()->get())); + hQueue->getContext()->getDevice()->get())); UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, - hQueue->get_context()->get_device()->get())); + hQueue->getContext()->getDevice()->get())); UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, CU_MEM_ADVISE_UNSET_ACCESSED_BY, - hQueue->get_context()->get_device()->get())); + hQueue->getContext()->getDevice()->get())); } else { - result = setCuMemAdvise((CUdeviceptr)pMem, size, advice, - hQueue->get_context()->get_device()->get()); + Result = setCuMemAdvise((CUdeviceptr)pMem, size, advice, + hQueue->getContext()->getDevice()->get()); } if (phEvent) { - result = event_ptr->record(); - *phEvent = event_ptr.release(); + Result = EventPtr->record(); + *phEvent = EventPtr.release(); } } catch (ur_result_t err) { - result = err; + Result = err; } catch (...) { - result = UR_RESULT_ERROR_UNKNOWN; + Result = UR_RESULT_ERROR_UNKNOWN; } - return result; + return Result; } // TODO: Implement this. 
Remember to return true for @@ -1477,31 +1472,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( ur_result_t result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); + ScopedContext active(hQueue->getContext()); + CUstream cuStream = hQueue->getNextTransferStream(); result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - (*phEvent) = ur_event_handle_t_::make_native( + (*phEvent) = ur_event_handle_t_::makeNative( UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, cuStream); (*phEvent)->start(); } // Determine the direction of copy using cuPointerGetAttribute - // for both the src_ptr and dst_ptr - CUDA_MEMCPY2D cpyDesc = {0}; + // for both the SrcPtr and DstPtr + CUDA_MEMCPY2D CpyDesc = {0}; - getUSMHostOrDevicePtr(pSrc, &cpyDesc.srcMemoryType, &cpyDesc.srcDevice, - &cpyDesc.srcHost); - getUSMHostOrDevicePtr(pDst, &cpyDesc.dstMemoryType, &cpyDesc.dstDevice, - &cpyDesc.dstHost); + getUSMHostOrDevicePtr(pSrc, &CpyDesc.srcMemoryType, &CpyDesc.srcDevice, + &CpyDesc.srcHost); + getUSMHostOrDevicePtr(pDst, &CpyDesc.dstMemoryType, &CpyDesc.dstDevice, + &CpyDesc.dstHost); - cpyDesc.dstPitch = dstPitch; - cpyDesc.srcPitch = srcPitch; - cpyDesc.WidthInBytes = width; - cpyDesc.Height = height; + CpyDesc.dstPitch = dstPitch; + CpyDesc.srcPitch = srcPitch; + CpyDesc.WidthInBytes = width; + CpyDesc.Height = height; - result = UR_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cuStream)); + result = UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, cuStream)); if (phEvent) { (*phEvent)->record(); @@ -1522,7 +1517,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(!hBuffer->is_image(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); if (phEventWaitList) { UR_ASSERT(numEventsInWaitList > 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); @@ -1530,46 +1525,46 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_ASSERT(numEventsInWaitList == 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); } - UR_ASSERT(offset + size <= hBuffer->mem_.buffer_mem_.size_, + UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t retErr = UR_RESULT_SUCCESS; - CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_READ, hQueue, cuStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_READ, hQueue, CuStream)); + RetImplEvent->start(); } - UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, devPtr + offset, size, cuStream)); + UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, DevPtr + offset, size, CuStream)); if (phEvent) { - retErr = 
retImplEv->record(); + Result = RetImplEvent->record(); } if (blockingRead) { - UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( @@ -1579,7 +1574,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(!hBuffer->is_image(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); if (phEventWaitList) { UR_ASSERT(numEventsInWaitList > 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); @@ -1587,44 +1582,44 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_ASSERT(numEventsInWaitList == 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); } - UR_ASSERT(offset + size <= hBuffer->mem_.buffer_mem_.size_, + UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t retErr = UR_RESULT_SUCCESS; - CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_WRITE, hQueue, cuStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_WRITE, hQueue, CuStream)); + RetImplEvent->start(); } - UR_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, pSrc, size, cuStream)); + UR_CHECK_ERROR(cuMemcpyHtoDAsync(DevPtr + offset, pSrc, size, CuStream)); if (phEvent) { - retErr = retImplEv->record(); + Result = RetImplEvent->record(); } if (blockingWrite) { - UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( @@ -1638,29 +1633,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( // Since CUDA requires a the global variable to be referenced by name, we use // metadata to find the correct name to access it by. 
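The write path that follows resolves the metadata-provided global name to a device address with cuModuleGetGlobal, bounds-checks the access, and then reuses the USM memcpy path. A self-contained sketch of the same lookup-then-copy pattern, assuming an already loaded CUmodule and a current context; the helper name is illustrative.

#include <cuda.h>
#include <cstddef>

static CUresult writeDeviceGlobal(CUmodule Module, const char *Name,
                                  const void *Src, size_t Count, size_t Offset,
                                  CUstream Stream) {
  CUdeviceptr GlobalPtr = 0;
  size_t GlobalSize = 0;
  // Look the global up by name in the loaded module.
  CUresult Res = cuModuleGetGlobal(&GlobalPtr, &GlobalSize, Module, Name);
  if (Res != CUDA_SUCCESS)
    return Res;
  // Same bounds check as the adapter performs before copying.
  if (Offset + Count > GlobalSize)
    return CUDA_ERROR_INVALID_VALUE;
  return cuMemcpyHtoDAsync(GlobalPtr + Offset, Src, Count, Stream);
}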
- auto device_global_name_it = hProgram->globalIDMD_.find(name); - if (device_global_name_it == hProgram->globalIDMD_.end()) + auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); + if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) return UR_RESULT_ERROR_INVALID_VALUE; - std::string device_global_name = device_global_name_it->second; + std::string DeviceGlobalName = DeviceGlobalNameIt->second; - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - CUdeviceptr device_global = 0; - size_t device_global_size = 0; - result = UR_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, hProgram->get(), - device_global_name.c_str())); + CUdeviceptr DeviceGlobal = 0; + size_t DeviceGlobalSize = 0; + Result = UR_CHECK_ERROR(cuModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize, + hProgram->get(), + DeviceGlobalName.c_str())); - if (offset + count > device_global_size) + if (offset + count > DeviceGlobalSize) return UR_RESULT_ERROR_INVALID_VALUE; return urEnqueueUSMMemcpy( - hQueue, blockingWrite, reinterpret_cast(device_global + offset), + hQueue, blockingWrite, reinterpret_cast(DeviceGlobal + offset), pSrc, count, numEventsInWaitList, phEventWaitList, phEvent); - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( @@ -1674,30 +1669,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( // Since CUDA requires a the global variable to be referenced by name, we use // metadata to find the correct name to access it by. - auto device_global_name_it = hProgram->globalIDMD_.find(name); - if (device_global_name_it == hProgram->globalIDMD_.end()) + auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); + if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) return UR_RESULT_ERROR_INVALID_VALUE; - std::string device_global_name = device_global_name_it->second; + std::string DeviceGlobalName = DeviceGlobalNameIt->second; - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - CUdeviceptr device_global = 0; - size_t device_global_size = 0; - result = UR_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, hProgram->get(), - device_global_name.c_str())); + CUdeviceptr DeviceGlobal = 0; + size_t DeviceGlobalSize = 0; + Result = UR_CHECK_ERROR(cuModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize, + hProgram->get(), + DeviceGlobalName.c_str())); - if (offset + count > device_global_size) + if (offset + count > DeviceGlobalSize) return UR_RESULT_ERROR_INVALID_VALUE; return urEnqueueUSMMemcpy( hQueue, blockingRead, pDst, - reinterpret_cast(device_global + offset), count, + reinterpret_cast(DeviceGlobal + offset), count, numEventsInWaitList, phEventWaitList, phEvent); - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } /// Host Pipes diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp index f1a0b9d2a97d2..8916197b73f1c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp @@ -15,150 +15,148 @@ #include #include -ur_event_handle_t_::ur_event_handle_t_(ur_command_t type, - ur_context_handle_t context, - ur_queue_handle_t queue, CUstream stream, - uint32_t stream_token) - : commandType_{type}, refCount_{1}, 
has_ownership_{true}, - hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, - streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, - evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { +ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, + ur_context_handle_t Context, + ur_queue_handle_t Queue, CUstream Stream, + uint32_t StreamToken) + : CommandType{Type}, RefCount{1}, HasOwnership{true}, + HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false}, + StreamToken{StreamToken}, EvEnd{nullptr}, EvStart{nullptr}, + EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { - bool profilingEnabled = queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE; + bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; UR_CHECK_ERROR(cuEventCreate( - &evEnd_, profilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); + &EvEnd, ProfilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); - if (profilingEnabled) { - UR_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT)); - UR_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT)); + if (ProfilingEnabled) { + UR_CHECK_ERROR(cuEventCreate(&EvQueued, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR(cuEventCreate(&EvStart, CU_EVENT_DEFAULT)); } - if (queue_ != nullptr) { - urQueueRetain(queue_); + if (Queue != nullptr) { + urQueueRetain(Queue); } - urContextRetain(context_); + urContextRetain(Context); } -ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t context, - CUevent eventNative) - // TODO(ur): Missing user command type - : commandType_{UR_COMMAND_EVENTS_WAIT}, refCount_{1}, has_ownership_{false}, - hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, - streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, - evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, - context_{context} { - urContextRetain(context_); +ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t Context, + CUevent EventNative) + : CommandType{UR_COMMAND_EVENTS_WAIT}, RefCount{1}, HasOwnership{false}, + HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false}, + StreamToken{std::numeric_limits::max()}, EvEnd{EventNative}, + EvStart{nullptr}, EvQueued{nullptr}, Queue{nullptr}, Context{Context} { + urContextRetain(Context); } ur_event_handle_t_::~ur_event_handle_t_() { - if (queue_ != nullptr) { - urQueueRelease(queue_); + if (Queue != nullptr) { + urQueueRelease(Queue); } - urContextRelease(context_); + urContextRelease(Context); } ur_result_t ur_event_handle_t_::start() { - assert(!is_started()); - ur_result_t result = UR_RESULT_SUCCESS; + assert(!isStarted()); + ur_result_t Result = UR_RESULT_SUCCESS; try { - if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { // NOTE: This relies on the default stream to be unused. 
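The profiling path below records EvQueued on the default stream and EvStart on the command's stream, and record() later records EvEnd, so the profiling queries can be answered from elapsed times between pairs of CUDA events. A standalone sketch of that standard event-timing pattern, with a memset standing in for the real command and error checking elided for brevity:

#include <cuda.h>
#include <cstdio>

int main() {
  cuInit(0);
  CUdevice Dev;
  cuDeviceGet(&Dev, 0);
  CUcontext Ctx;
  cuCtxCreate(&Ctx, 0, Dev);
  CUstream Stream;
  cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);

  // CU_EVENT_DEFAULT keeps timing enabled (unlike CU_EVENT_DISABLE_TIMING).
  CUevent EvStart, EvEnd;
  cuEventCreate(&EvStart, CU_EVENT_DEFAULT);
  cuEventCreate(&EvEnd, CU_EVENT_DEFAULT);

  CUdeviceptr Buf;
  cuMemAlloc(&Buf, 1 << 20);

  cuEventRecord(EvStart, Stream);           // "start" of the command
  cuMemsetD8Async(Buf, 0, 1 << 20, Stream); // the timed work
  cuEventRecord(EvEnd, Stream);             // "end" of the command
  cuEventSynchronize(EvEnd);

  float Ms = 0.0f;
  cuEventElapsedTime(&Ms, EvStart, EvEnd);
  std::printf("elapsed: %f ms\n", Ms);

  cuMemFree(Buf);
  cuEventDestroy(EvStart);
  cuEventDestroy(EvEnd);
  cuStreamDestroy(Stream);
  cuCtxDestroy(Ctx);
  return 0;
}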
- result = UR_CHECK_ERROR(cuEventRecord(evQueued_, 0)); - result = UR_CHECK_ERROR(cuEventRecord(evStart_, stream_)); + Result = UR_CHECK_ERROR(cuEventRecord(EvQueued, 0)); + Result = UR_CHECK_ERROR(cuEventRecord(EvStart, Stream)); } - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } - isStarted_ = true; - return result; + IsStarted = true; + return Result; } -bool ur_event_handle_t_::is_completed() const noexcept { - if (!isRecorded_) { +bool ur_event_handle_t_::isCompleted() const noexcept { + if (!IsRecorded) { return false; } - if (!hasBeenWaitedOn_) { - const CUresult ret = cuEventQuery(evEnd_); - if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_READY) { - UR_CHECK_ERROR(ret); + if (!HasBeenWaitedOn) { + const CUresult Result = cuEventQuery(EvEnd); + if (Result != CUDA_SUCCESS && Result != CUDA_ERROR_NOT_READY) { + UR_CHECK_ERROR(Result); return false; } - if (ret == CUDA_ERROR_NOT_READY) { + if (Result == CUDA_ERROR_NOT_READY) { return false; } } return true; } -uint64_t ur_event_handle_t_::get_queued_time() const { - assert(is_started()); - return queue_->get_device()->get_elapsed_time(evQueued_); +uint64_t ur_event_handle_t_::getQueuedTime() const { + assert(isStarted()); + return Queue->get_device()->getElapsedTime(EvQueued); } -uint64_t ur_event_handle_t_::get_start_time() const { - assert(is_started()); - return queue_->get_device()->get_elapsed_time(evStart_); +uint64_t ur_event_handle_t_::getStartTime() const { + assert(isStarted()); + return Queue->get_device()->getElapsedTime(EvStart); } -uint64_t ur_event_handle_t_::get_end_time() const { - assert(is_started() && is_recorded()); - return queue_->get_device()->get_elapsed_time(evEnd_); +uint64_t ur_event_handle_t_::getEndTime() const { + assert(isStarted() && isRecorded()); + return Queue->get_device()->getElapsedTime(EvEnd); } ur_result_t ur_event_handle_t_::record() { - if (is_recorded() || !is_started()) { + if (isRecorded() || !isStarted()) { return UR_RESULT_ERROR_INVALID_EVENT; } - ur_result_t result = UR_RESULT_ERROR_INVALID_OPERATION; + ur_result_t Result = UR_RESULT_ERROR_INVALID_OPERATION; - UR_ASSERT(queue_, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(Queue, UR_RESULT_ERROR_INVALID_QUEUE); try { - eventId_ = queue_->get_next_event_id(); - if (eventId_ == 0) { + EventID = Queue->getNextEventID(); + if (EventID == 0) { sycl::detail::ur::die( "Unrecoverable program state reached in event identifier overflow"); } - result = UR_CHECK_ERROR(cuEventRecord(evEnd_, stream_)); + Result = UR_CHECK_ERROR(cuEventRecord(EvEnd, Stream)); } catch (ur_result_t error) { - result = error; + Result = error; } - if (result == UR_RESULT_SUCCESS) { - isRecorded_ = true; + if (Result == UR_RESULT_SUCCESS) { + IsRecorded = true; } - return result; + return Result; } ur_result_t ur_event_handle_t_::wait() { - ur_result_t retErr; + ur_result_t Result; try { - retErr = UR_CHECK_ERROR(cuEventSynchronize(evEnd_)); - hasBeenWaitedOn_ = true; + Result = UR_CHECK_ERROR(cuEventSynchronize(EvEnd)); + HasBeenWaitedOn = true; } catch (ur_result_t error) { - retErr = error; + Result = error; } - return retErr; + return Result; } ur_result_t ur_event_handle_t_::release() { - if (!backend_has_ownership()) + if (!backendHasOwnership()) return UR_RESULT_SUCCESS; - assert(queue_ != nullptr); + assert(Queue != nullptr); - UR_CHECK_ERROR(cuEventDestroy(evEnd_)); + UR_CHECK_ERROR(cuEventDestroy(EvEnd)); - if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { - UR_CHECK_ERROR(cuEventDestroy(evQueued_)); - 
UR_CHECK_ERROR(cuEventDestroy(evStart_)); + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + UR_CHECK_ERROR(cuEventDestroy(EvQueued)); + UR_CHECK_ERROR(cuEventDestroy(EvStart)); } return UR_RESULT_SUCCESS; @@ -174,15 +172,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, switch (propName) { case UR_EVENT_INFO_COMMAND_QUEUE: - return ReturnValue(hEvent->get_queue()); + return ReturnValue(hEvent->getQueue()); case UR_EVENT_INFO_COMMAND_TYPE: - return ReturnValue(hEvent->get_command_type()); + return ReturnValue(hEvent->getCommandType()); case UR_EVENT_INFO_REFERENCE_COUNT: - return ReturnValue(hEvent->get_reference_count()); + return ReturnValue(hEvent->getReferenceCount()); case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: - return ReturnValue(hEvent->get_execution_status()); + return ReturnValue(hEvent->getExecutionStatus()); case UR_EVENT_INFO_CONTEXT: - return ReturnValue(hEvent->get_context()); + return ReturnValue(hEvent->getContext()); default: sycl::detail::ur::die("Event info request not implemented"); } @@ -198,9 +196,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); - ur_queue_handle_t queue = hEvent->get_queue(); - if (queue == nullptr || - !(queue->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + ur_queue_handle_t Queue = hEvent->getQueue(); + if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } @@ -208,11 +205,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( case UR_PROFILING_INFO_COMMAND_QUEUED: case UR_PROFILING_INFO_COMMAND_SUBMIT: // Note: No user for this case - return ReturnValue(static_cast(hEvent->get_queued_time())); + return ReturnValue(static_cast(hEvent->getQueuedTime())); case UR_PROFILING_INFO_COMMAND_START: - return ReturnValue(static_cast(hEvent->get_start_time())); + return ReturnValue(static_cast(hEvent->getStartTime())); case UR_PROFILING_INFO_COMMAND_END: - return ReturnValue(static_cast(hEvent->get_end_time())); + return ReturnValue(static_cast(hEvent->getEndTime())); default: break; } @@ -234,19 +231,19 @@ urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { UR_ASSERT(phEventWaitList, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); UR_ASSERT(numEvents > 0, UR_RESULT_ERROR_INVALID_VALUE); - auto context = phEventWaitList[0]->get_context(); - ScopedContext active(context); + auto Context = phEventWaitList[0]->getContext(); + ScopedContext Active(Context); - auto waitFunc = [context](ur_event_handle_t event) -> ur_result_t { - UR_ASSERT(event, UR_RESULT_ERROR_INVALID_EVENT); - UR_ASSERT(event->get_context() == context, + auto WaitFunc = [Context](ur_event_handle_t Event) -> ur_result_t { + UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT); + UR_ASSERT(Event->getContext() == Context, UR_RESULT_ERROR_INVALID_CONTEXT); - return event->wait(); + return Event->wait(); }; - return forLatestEvents(phEventWaitList, numEvents, waitFunc); - } catch (ur_result_t err) { - return err; + return forLatestEvents(phEventWaitList, numEvents, WaitFunc); + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -255,10 +252,10 @@ urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - const auto refCount = hEvent->increment_reference_count(); + const auto RefCount = hEvent->incrementReferenceCount(); sycl::detail::ur::assertion( - refCount != 0, "Reference count overflow detected in urEventRetain."); + RefCount != 0, "Reference count overflow detected in urEventRetain."); return UR_RESULT_SUCCESS; } @@ -269,20 +266,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. sycl::detail::ur::assertion( - hEvent->get_reference_count() != 0, + hEvent->getReferenceCount() != 0, "Reference count overflow detected in urEventRelease."); // decrement ref count. If it is 0, delete the event. - if (hEvent->decrement_reference_count() == 0) { + if (hEvent->decrementReferenceCount() == 0) { std::unique_ptr event_ptr{hEvent}; - ur_result_t result = UR_RESULT_ERROR_INVALID_EVENT; + ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; try { - ScopedContext active(hEvent->get_context()); - result = hEvent->release(); + ScopedContext Active(hEvent->getContext()); + Result = hEvent->release(); } catch (...) { - result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return result; + return Result; } return UR_RESULT_SUCCESS; @@ -298,11 +295,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, const ur_event_native_properties_t *pProperties, ur_event_handle_t *phEvent) { - (void)pProperties; + std::ignore = pProperties; - std::unique_ptr event_ptr{nullptr}; + std::unique_ptr EventPtr{nullptr}; - *phEvent = ur_event_handle_t_::make_with_native( + *phEvent = ur_event_handle_t_::makeWithNative( hContext, reinterpret_cast(hNativeEvent)); return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp index b0f10b33a5822..b1e0f939940ca 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp @@ -24,68 +24,68 @@ struct ur_event_handle_t_ { ur_result_t start(); - native_type get() const noexcept { return evEnd_; }; + native_type get() const noexcept { return EvEnd; }; - ur_queue_handle_t get_queue() const noexcept { return queue_; } + ur_queue_handle_t getQueue() const noexcept { return Queue; } - CUstream get_stream() const noexcept { return stream_; } + CUstream getStream() const noexcept { return Stream; } - uint32_t get_compute_stream_token() const noexcept { return streamToken_; } + uint32_t getComputeStreamToken() const noexcept { return StreamToken; } - ur_command_t get_command_type() const noexcept { return commandType_; } + ur_command_t getCommandType() const noexcept { return CommandType; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - bool is_recorded() const noexcept { return isRecorded_; } + bool isRecorded() const noexcept { return IsRecorded; } - bool is_started() const noexcept { return isStarted_; } + bool isStarted() const noexcept { return IsStarted; } - bool is_completed() const noexcept; + bool isCompleted() const 
noexcept; - uint32_t get_execution_status() const noexcept { + uint32_t getExecutionStatus() const noexcept { - if (!is_recorded()) { + if (!isRecorded()) { return UR_EVENT_STATUS_SUBMITTED; } - if (!is_completed()) { + if (!isCompleted()) { return UR_EVENT_STATUS_RUNNING; } return UR_EVENT_STATUS_COMPLETE; } - ur_context_handle_t get_context() const noexcept { return context_; }; + ur_context_handle_t getContext() const noexcept { return Context; }; - uint32_t increment_reference_count() { return ++refCount_; } + uint32_t incrementReferenceCount() { return ++RefCount; } - uint32_t decrement_reference_count() { return --refCount_; } + uint32_t decrementReferenceCount() { return --RefCount; } - uint32_t get_event_id() const noexcept { return eventId_; } + uint32_t getEventID() const noexcept { return EventID; } - bool backend_has_ownership() const noexcept { return has_ownership_; } + bool backendHasOwnership() const noexcept { return HasOwnership; } // Returns the counter time when the associated command(s) were enqueued // - uint64_t get_queued_time() const; + uint64_t getQueuedTime() const; // Returns the counter time when the associated command(s) started execution // - uint64_t get_start_time() const; + uint64_t getStartTime() const; // Returns the counter time when the associated command(s) completed // - uint64_t get_end_time() const; + uint64_t getEndTime() const; // construct a native CUDA. This maps closely to the underlying CUDA event. static ur_event_handle_t - make_native(ur_command_t type, ur_queue_handle_t queue, CUstream stream, - uint32_t stream_token = std::numeric_limits::max()) { - return new ur_event_handle_t_(type, queue->get_context(), queue, stream, - stream_token); + makeNative(ur_command_t Type, ur_queue_handle_t Queue, CUstream Stream, + uint32_t StreamToken = std::numeric_limits::max()) { + return new ur_event_handle_t_(Type, Queue->getContext(), Queue, Stream, + StreamToken); } - static ur_event_handle_t make_with_native(ur_context_handle_t context, - CUevent eventNative) { + static ur_event_handle_t makeWithNative(ur_context_handle_t context, + CUevent eventNative) { return new ur_event_handle_t_(context, eventNative); } @@ -94,95 +94,94 @@ struct ur_event_handle_t_ { ~ur_event_handle_t_(); private: - // This constructor is private to force programmers to use the make_native / + // This constructor is private to force programmers to use the makeNative / // make_user static members in order to create a pi_event for CUDA. - ur_event_handle_t_(ur_command_t type, ur_context_handle_t context, - ur_queue_handle_t queue, CUstream stream, - uint32_t stream_token); + ur_event_handle_t_(ur_command_t Type, ur_context_handle_t Context, + ur_queue_handle_t Queue, CUstream Stream, + uint32_t StreamToken); // This constructor is private to force programmers to use the - // make_with_native for event introp - ur_event_handle_t_(ur_context_handle_t context, CUevent eventNative); + // makeWithNative for event introp + ur_event_handle_t_(ur_context_handle_t Context, CUevent EventNative); - ur_command_t commandType_; // The type of command associated with event. + ur_command_t CommandType; // The type of command associated with event. - std::atomic_uint32_t refCount_; // Event reference count. + std::atomic_uint32_t RefCount; // Event reference count. - bool has_ownership_; // Signifies if event owns the native type. + bool HasOwnership; // Signifies if event owns the native type. 
-  bool hasBeenWaitedOn_; // Signifies whether the event has been waited
-                         // on through a call to wait(), which implies
-                         // that it has completed.
+  bool HasBeenWaitedOn; // Signifies whether the event has been waited
+                        // on through a call to wait(), which implies
+                        // that it has completed.

-  bool isRecorded_; // Signifies wether a native CUDA event has been recorded
-                    // yet.
-  bool isStarted_; // Signifies wether the operation associated with the
-                   // PI event has started or not
-                   //
+  bool IsRecorded; // Signifies whether a native CUDA event has been recorded
+                   // yet.
+  bool IsStarted; // Signifies whether the operation associated with the
+                  // PI event has started or not

-  uint32_t streamToken_;
-  uint32_t eventId_; // Queue identifier of the event.
+  uint32_t StreamToken;
+  uint32_t EventID; // Queue identifier of the event.

-  native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
-                      // event, this will be nullptr.
+  native_type EvEnd; // CUDA event handle. If this _pi_event represents a user
+                     // event, this will be nullptr.

-  native_type evStart_; // CUDA event handle associated with the start
+  native_type EvStart; // CUDA event handle associated with the start

-  native_type evQueued_; // CUDA event handle associated with the time
-                         // the command was enqueued
+  native_type EvQueued; // CUDA event handle associated with the time
+                        // the command was enqueued

-  ur_queue_handle_t queue_; // pi_queue associated with the event. If this is a
-                            // user event, this will be nullptr.
+  ur_queue_handle_t Queue; // pi_queue associated with the event. If this is a
+                           // user event, this will be nullptr.

-  CUstream stream_; // CUstream associated with the event. If this is a user
-                    // event, this will be uninitialized.
+  CUstream Stream; // CUstream associated with the event. If this is a user
+                   // event, this will be uninitialized.

-  ur_context_handle_t context_; // pi_context associated with the event. If this
-                                // is a native event, this will be the same
-                                // context associated with the queue_ member.
+  ur_context_handle_t Context; // pi_context associated with the event. If this
+                               // is a native event, this will be the same
+                               // context associated with the queue_ member.
};

// Iterates over the event wait list, returns correct ur_result_t error codes.
// Invokes the callback for the latest event of each queue in the wait list.
// The callback must take a single pi_event argument and return a ur_result_t.
template -ur_result_t forLatestEvents(const ur_event_handle_t *event_wait_list, - std::size_t num_events_in_wait_list, Func &&f) { +ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList, + std::size_t NumEventsInWaitList, Func &&F) { - if (event_wait_list == nullptr || num_events_in_wait_list == 0) { + if (EventWaitList == nullptr || NumEventsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } // Fast path if we only have a single event - if (num_events_in_wait_list == 1) { - return f(event_wait_list[0]); + if (NumEventsInWaitList == 1) { + return F(EventWaitList[0]); } - std::vector events{ - event_wait_list, event_wait_list + num_events_in_wait_list}; - std::sort(events.begin(), events.end(), - [](ur_event_handle_t e0, ur_event_handle_t e1) { + std::vector Events{EventWaitList, + EventWaitList + NumEventsInWaitList}; + std::sort(Events.begin(), Events.end(), + [](ur_event_handle_t Event0, ur_event_handle_t Event1) { // Tiered sort creating sublists of streams (smallest value first) // in which the corresponding events are sorted into a sequence of // newest first. - return e0->get_stream() < e1->get_stream() || - (e0->get_stream() == e1->get_stream() && - e0->get_event_id() > e1->get_event_id()); + return Event0->getStream() < Event1->getStream() || + (Event0->getStream() == Event1->getStream() && + Event0->getEventID() > Event1->getEventID()); }); - bool first = true; - CUstream lastSeenStream = 0; - for (ur_event_handle_t event : events) { - if (!event || (!first && event->get_stream() == lastSeenStream)) { + bool First = true; + CUstream LastSeenStream = 0; + for (ur_event_handle_t Event : Events) { + if (!Event || (!First && Event->getStream() == LastSeenStream)) { continue; } - first = false; - lastSeenStream = event->get_stream(); + First = false; + LastSeenStream = Event->getStream(); - auto result = f(event); - if (result != UR_RESULT_SUCCESS) { - return result; + auto Result = F(Event); + if (Result != UR_RESULT_SUCCESS) { + return Result; } } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index 900b23dd84306..f3c05e016e441 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -17,46 +17,47 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, UR_ASSERT(phKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pKernelName, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t retErr = UR_RESULT_SUCCESS; - std::unique_ptr retKernel{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr Kernel{nullptr}; try { - ScopedContext active(hProgram->get_context()); + ScopedContext Active(hProgram->getContext()); - CUfunction cuFunc; - CUresult functionResult = - cuModuleGetFunction(&cuFunc, hProgram->get(), pKernelName); + CUfunction CuFunc; + CUresult FunctionResult = + cuModuleGetFunction(&CuFunc, hProgram->get(), pKernelName); // We can't add this as a generic mapping in UR_CHECK_ERROR since cuda's // NOT_FOUND error applies to more than just functions. 
- if (functionResult == CUDA_ERROR_NOT_FOUND) { + if (FunctionResult == CUDA_ERROR_NOT_FOUND) { throw UR_RESULT_ERROR_INVALID_KERNEL_NAME; } else { - retErr = UR_CHECK_ERROR(functionResult); + Result = UR_CHECK_ERROR(FunctionResult); } - std::string kernel_name_woffset = std::string(pKernelName) + "_with_offset"; - CUfunction cuFuncWithOffsetParam; - CUresult offsetRes = cuModuleGetFunction( - &cuFuncWithOffsetParam, hProgram->get(), kernel_name_woffset.c_str()); + std::string KernelNameWithOffset = + std::string(pKernelName) + "_with_offset"; + CUfunction CuFuncWithOffsetParam; + CUresult OffsetRes = cuModuleGetFunction( + &CuFuncWithOffsetParam, hProgram->get(), KernelNameWithOffset.c_str()); // If there is no kernel with global offset parameter we mark it as missing - if (offsetRes == CUDA_ERROR_NOT_FOUND) { - cuFuncWithOffsetParam = nullptr; + if (OffsetRes == CUDA_ERROR_NOT_FOUND) { + CuFuncWithOffsetParam = nullptr; } else { - retErr = UR_CHECK_ERROR(offsetRes); + Result = UR_CHECK_ERROR(OffsetRes); } - retKernel = std::unique_ptr( - new ur_kernel_handle_t_{cuFunc, cuFuncWithOffsetParam, pKernelName, - hProgram, hProgram->get_context()}); - } catch (ur_result_t err) { - retErr = err; + Kernel = std::unique_ptr( + new ur_kernel_handle_t_{CuFunc, CuFuncWithOffsetParam, pKernelName, + hProgram, hProgram->getContext()}); + } catch (ur_result_t Err) { + Result = Err; } catch (...) { - retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - *phKernel = retKernel.release(); - return retErr; + *phKernel = Kernel.release(); + return Result; } UR_APIEXPORT ur_result_t UR_APICALL @@ -70,82 +71,78 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, switch (propName) { case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - size_t global_work_size[3] = {0, 0, 0}; + size_t GlobalWorkSize[3] = {0, 0, 0}; - int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; + int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_block_dimX, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, hDevice->get()) == CUDA_SUCCESS); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_block_dimY, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + cuDeviceGetAttribute(&MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, hDevice->get()) == CUDA_SUCCESS); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_block_dimZ, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + cuDeviceGetAttribute(&MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, hDevice->get()) == CUDA_SUCCESS); - int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; + int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0}; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_grid_dimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, hDevice->get()) == CUDA_SUCCESS); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_grid_dimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + cuDeviceGetAttribute(&MaxGridDimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, hDevice->get()) == CUDA_SUCCESS); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_grid_dimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + cuDeviceGetAttribute(&MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, hDevice->get()) == CUDA_SUCCESS); - global_work_size[0] = max_block_dimX * max_grid_dimX; - global_work_size[1] = max_block_dimY * max_grid_dimY; - global_work_size[2] = 
max_block_dimZ * max_grid_dimZ; - return ReturnValue(global_work_size, 3); + GlobalWorkSize[0] = MaxBlockDimX * MaxGridDimX; + GlobalWorkSize[1] = MaxBlockDimY * MaxGridDimY; + GlobalWorkSize[2] = MaxBlockDimZ * MaxGridDimZ; + return ReturnValue(GlobalWorkSize, 3); } case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; + int MaxThreads = 0; sycl::detail::ur::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(size_t(max_threads)); + return ReturnValue(size_t(MaxThreads)); } case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - size_t group_size[3] = {0, 0, 0}; - const auto &reqd_wg_size_md_map = - hKernel->program_->kernelReqdWorkGroupSizeMD_; - const auto reqd_wg_size_md = reqd_wg_size_md_map.find(hKernel->name_); - if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { - const auto reqd_wg_size = reqd_wg_size_md->second; - group_size[0] = std::get<0>(reqd_wg_size); - group_size[1] = std::get<1>(reqd_wg_size); - group_size[2] = std::get<2>(reqd_wg_size); + size_t GroupSize[3] = {0, 0, 0}; + const auto &ReqdWGSizeMDMap = + hKernel->get_program()->KernelReqdWorkGroupSizeMD; + const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->getName()); + if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) { + const auto ReqdWGSize = ReqdWGSizeMD->second; + GroupSize[0] = std::get<0>(ReqdWGSize); + GroupSize[1] = std::get<1>(ReqdWGSize); + GroupSize[2] = std::get<2>(ReqdWGSize); } - return ReturnValue(group_size, 3); + return ReturnValue(GroupSize, 3); } case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { // OpenCL LOCAL == CUDA SHARED - int bytes = 0; + int Bytes = 0; sycl::detail::ur::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t(bytes)); + return ReturnValue(uint64_t(Bytes)); } case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { // Work groups should be multiples of the warp size - int warpSize = 0; + int WarpSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(warpSize)); + return ReturnValue(static_cast(WarpSize)); } case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { // OpenCL PRIVATE == CUDA LOCAL - int bytes = 0; + int Bytes = 0; sycl::detail::ur::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t(bytes)); + return ReturnValue(uint64_t(Bytes)); } default: break; @@ -156,10 +153,9 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel->get_reference_count() > 0u, - UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(hKernel->getReferenceCount() > 0u, UR_RESULT_ERROR_INVALID_KERNEL); - hKernel->increment_reference_count(); + hKernel->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -169,11 +165,10 @@ urKernelRelease(ur_kernel_handle_t hKernel) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. 
- UR_ASSERT(hKernel->get_reference_count() != 0, - UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(hKernel->getReferenceCount() != 0, UR_RESULT_ERROR_INVALID_KERNEL); // decrement ref count. If it is 0, delete the program. - if (hKernel->decrement_reference_count() == 0) { + if (hKernel->decrementReferenceCount() == 0) { // no internal cuda resources to clean up. Just delete it. delete hKernel; return UR_RESULT_SUCCESS; @@ -198,17 +193,17 @@ urKernelSetArgValue(ur_kernel_handle_t hKernel, uint32_t argIndex, UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { if (pArgValue) { - hKernel->set_kernel_arg(argIndex, argSize, pArgValue); + hKernel->setKernelArg(argIndex, argSize, pArgValue); } else { - hKernel->set_kernel_local_arg(argIndex, argSize); + hKernel->setKernelLocalArg(argIndex, argSize); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, @@ -222,23 +217,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, switch (propName) { case UR_KERNEL_INFO_FUNCTION_NAME: - return ReturnValue(hKernel->get_name()); + return ReturnValue(hKernel->getName()); case UR_KERNEL_INFO_NUM_ARGS: - return ReturnValue(hKernel->get_num_args()); + return ReturnValue(hKernel->getNumArgs()); case UR_KERNEL_INFO_REFERENCE_COUNT: - return ReturnValue(hKernel->get_reference_count()); + return ReturnValue(hKernel->getReferenceCount()); case UR_KERNEL_INFO_CONTEXT: - return ReturnValue(hKernel->get_context()); + return ReturnValue(hKernel->getContext()); case UR_KERNEL_INFO_PROGRAM: return ReturnValue(hKernel->get_program()); case UR_KERNEL_INFO_ATTRIBUTES: return ReturnValue(""); case UR_KERNEL_INFO_NUM_REGS: { - int numRegs = 0; + int NumRegs = 0; sycl::detail::ur::assertion( - cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, + cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(numRegs)); + return ReturnValue(static_cast(NumRegs)); } default: break; @@ -257,25 +252,24 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, switch (propName) { case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { // Sub-group size is equivalent to warp size - int warpSize = 0; + int WarpSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(warpSize)); + return ReturnValue(static_cast(WarpSize)); } case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; + int MaxThreads = 0; sycl::detail::ur::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get()) == CUDA_SUCCESS); - int warpSize = 0; + int WarpSize = 0; urKernelGetSubGroupInfo(hKernel, hDevice, UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE, - sizeof(uint32_t), &warpSize, nullptr); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return ReturnValue(static_cast(maxWarps)); + sizeof(uint32_t), &WarpSize, nullptr); + int MaxWarps = 
(MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(static_cast(MaxWarps)); } case UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS: { // Return value of 0 => not specified @@ -298,7 +292,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( ur_kernel_handle_t hKernel, uint32_t argIndex, const void *pArgValue) { - hKernel->set_kernel_arg(argIndex, sizeof(pArgValue), pArgValue); + hKernel->setKernelArg(argIndex, sizeof(pArgValue), pArgValue); return UR_RESULT_SUCCESS; } @@ -310,16 +304,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( // Below sets kernel arg when zero-sized buffers are handled. // In such case the corresponding memory is null. if (hArgValue == nullptr) { - hKernel->set_kernel_arg(argIndex, 0, nullptr); + hKernel->setKernelArg(argIndex, 0, nullptr); return UR_RESULT_SUCCESS; } - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - if (hArgValue->mem_type_ == ur_mem_handle_t_::mem_type::surface) { + if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) { CUDA_ARRAY3D_DESCRIPTOR arrayDesc; UR_CHECK_ERROR(cuArray3DGetDescriptor( - &arrayDesc, hArgValue->mem_.surface_mem_.get_array())); + &arrayDesc, hArgValue->Mem.SurfaceMem.getArray())); if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && arrayDesc.Format != CU_AD_FORMAT_HALF && @@ -329,16 +323,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( UR_RESULT_ERROR_ADAPTER_SPECIFIC); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } - CUsurfObject cuSurf = hArgValue->mem_.surface_mem_.get_surface(); - hKernel->set_kernel_arg(argIndex, sizeof(cuSurf), (void *)&cuSurf); + CUsurfObject CuSurf = hArgValue->Mem.SurfaceMem.getSurface(); + hKernel->setKernelArg(argIndex, sizeof(CuSurf), (void *)&CuSurf); } else { - CUdeviceptr cuPtr = hArgValue->mem_.buffer_mem_.get(); - hKernel->set_kernel_arg(argIndex, sizeof(CUdeviceptr), (void *)&cuPtr); + CUdeviceptr CuPtr = hArgValue->Mem.BufferMem.get(); + hKernel->setKernelArg(argIndex, sizeof(CUdeviceptr), (void *)&CuPtr); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } // A NOP for the CUDA backend @@ -370,12 +364,12 @@ urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, ur_sampler_handle_t hArgValue) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - uint32_t samplerProps = hArgValue->props_; - hKernel->set_kernel_arg(argIndex, sizeof(uint32_t), (void *)&samplerProps); - } catch (ur_result_t err) { - retErr = err; + uint32_t SamplerProps = hArgValue->Props; + hKernel->setKernelArg(argIndex, sizeof(uint32_t), (void *)&SamplerProps); + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp index 9308b7b408b44..3707cab1d1e0f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp @@ -35,38 +35,37 @@ struct ur_kernel_handle_t_ { using native_type = CUfunction; - native_type function_; - native_type functionWithOffsetParam_; - std::string name_; - ur_context_handle_t context_; - ur_program_handle_t program_; - std::atomic_uint32_t refCount_; + native_type Function; 
+ native_type FunctionWithOffsetParam; + std::string Name; + ur_context_handle_t Context; + ur_program_handle_t Program; + std::atomic_uint32_t RefCount; - static constexpr uint32_t REQD_THREADS_PER_BLOCK_DIMENSIONS = 3u; - size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]; + static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u; + size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions]; /// Structure that holds the arguments to the kernel. - /// Note earch argument size is known, since it comes + /// Note each argument size is known, since it comes /// from the kernel signature. /// This is not something can be queried from the CUDA API /// so there is a hard-coded size (\ref MAX_PARAM_BYTES) /// and a storage. - /// struct arguments { - static constexpr size_t MAX_PARAM_BYTES = 4000u; - using args_t = std::array; + static constexpr size_t MaxParamBytes = 4000u; + using args_t = std::array; using args_size_t = std::vector; using args_index_t = std::vector; - args_t storage_; - args_size_t paramSizes_; - args_index_t indices_; - args_size_t offsetPerIndex_; + args_t Storage; + args_size_t ParamSizes; + args_index_t Indices; + args_size_t OffsetPerIndex; - std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; + std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; arguments() { // Place the implicit offset index at the end of the indicies collection - indices_.emplace_back(&implicitOffsetArgs_); + Indices.emplace_back(&ImplicitOffsetArgs); } /// Adds an argument to the kernel. @@ -74,126 +73,127 @@ struct ur_kernel_handle_t_ { /// Otherwise, it is added. /// Gaps are filled with empty arguments. /// Implicit offset argument is kept at the back of the indices collection. - void add_arg(size_t index, size_t size, const void *arg, - size_t localSize = 0) { - if (index + 2 > indices_.size()) { + void addArg(size_t Index, size_t Size, const void *Arg, + size_t LocalSize = 0) { + if (Index + 2 > Indices.size()) { // Move implicit offset argument index with the end - indices_.resize(index + 2, indices_.back()); + Indices.resize(Index + 2, Indices.back()); // Ensure enough space for the new argument - paramSizes_.resize(index + 1); - offsetPerIndex_.resize(index + 1); + ParamSizes.resize(Index + 1); + OffsetPerIndex.resize(Index + 1); } - paramSizes_[index] = size; + ParamSizes[Index] = Size; // calculate the insertion point on the array - size_t insertPos = std::accumulate(std::begin(paramSizes_), - std::begin(paramSizes_) + index, 0); + size_t InsertPos = std::accumulate(std::begin(ParamSizes), + std::begin(ParamSizes) + Index, 0); // Update the stored value for the argument - std::memcpy(&storage_[insertPos], arg, size); - indices_[index] = &storage_[insertPos]; - offsetPerIndex_[index] = localSize; + std::memcpy(&Storage[InsertPos], Arg, Size); + Indices[Index] = &Storage[InsertPos]; + OffsetPerIndex[Index] = LocalSize; } - void add_local_arg(size_t index, size_t size) { - size_t localOffset = this->get_local_size(); + void addLocalArg(size_t Index, size_t Size) { + size_t LocalOffset = this->getLocalSize(); // maximum required alignment is the size of the largest vector type - const size_t max_alignment = sizeof(double) * 16; + const size_t MaxAlignment = sizeof(double) * 16; // for arguments smaller than the maximum alignment simply align to the // size of the argument - const size_t alignment = std::min(max_alignment, size); + const size_t Alignment = std::min(MaxAlignment, Size); // align the argument - size_t alignedLocalOffset = localOffset; - if (localOffset % alignment != 
0) { - alignedLocalOffset += alignment - (localOffset % alignment); + size_t AlignedLocalOffset = LocalOffset; + if (LocalOffset % Alignment != 0) { + AlignedLocalOffset += Alignment - (LocalOffset % Alignment); } - add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset), - size + (alignedLocalOffset - localOffset)); + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + Size + (AlignedLocalOffset - LocalOffset)); } - void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { - assert(size == sizeof(std::uint32_t) * 3); - std::memcpy(implicitOffsetArgs_, implicitOffset, size); + void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { + assert(Size == sizeof(std::uint32_t) * 3); + std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - void clear_local_size() { - std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); + void clearLocalSize() { + std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); } - const args_index_t &get_indices() const noexcept { return indices_; } + const args_index_t &getIndices() const noexcept { return Indices; } - uint32_t get_local_size() const { - return std::accumulate(std::begin(offsetPerIndex_), - std::end(offsetPerIndex_), 0); + uint32_t getLocalSize() const { + return std::accumulate(std::begin(OffsetPerIndex), + std::end(OffsetPerIndex), 0); } - } args_; - - ur_kernel_handle_t_(CUfunction func, CUfunction funcWithOffsetParam, - const char *name, ur_program_handle_t program, - ur_context_handle_t ctxt) - : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, - name_{name}, context_{ctxt}, program_{program}, refCount_{1} { - urProgramRetain(program_); - urContextRetain(context_); + } Args; + + ur_kernel_handle_t_(CUfunction Func, CUfunction FuncWithOffsetParam, + const char *Name, ur_program_handle_t Program, + ur_context_handle_t Context) + : Function{Func}, FunctionWithOffsetParam{FuncWithOffsetParam}, + Name{Name}, Context{Context}, Program{Program}, RefCount{1} { + urProgramRetain(Program); + urContextRetain(Context); /// Note: this code assumes that there is only one device per context ur_result_t retError = urKernelGetGroupInfo( - this, ctxt->get_device(), UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, - sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr); + this, Context->getDevice(), + UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr); assert(retError == UR_RESULT_SUCCESS); } ~ur_kernel_handle_t_() { - urProgramRelease(program_); - urContextRelease(context_); + urProgramRelease(Program); + urContextRelease(Context); } - ur_program_handle_t get_program() const noexcept { return program_; } + ur_program_handle_t get_program() const noexcept { return Program; } - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - native_type get() const noexcept { return function_; }; + native_type get() const noexcept { return Function; }; native_type get_with_offset_parameter() const noexcept { - return functionWithOffsetParam_; + return FunctionWithOffsetParam; }; bool has_with_offset_parameter() const noexcept { - return functionWithOffsetParam_ != 
nullptr; + return FunctionWithOffsetParam != nullptr; } - ur_context_handle_t get_context() const noexcept { return context_; }; + ur_context_handle_t getContext() const noexcept { return Context; }; - const char *get_name() const noexcept { return name_.c_str(); } + const char *getName() const noexcept { return Name.c_str(); } /// Returns the number of arguments, excluding the implicit global offset. /// Note this only returns the current known number of arguments, not the /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API - size_t get_num_args() const noexcept { return args_.indices_.size() - 1; } + size_t getNumArgs() const noexcept { return Args.Indices.size() - 1; } - void set_kernel_arg(int index, size_t size, const void *arg) { - args_.add_arg(index, size, arg); + void setKernelArg(int Index, size_t Size, const void *Arg) { + Args.addArg(Index, Size, Arg); } - void set_kernel_local_arg(int index, size_t size) { - args_.add_local_arg(index, size); + void setKernelLocalArg(int Index, size_t Size) { + Args.addLocalArg(Index, Size); } - void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { - return args_.set_implicit_offset(size, implicitOffset); + void setImplicitOffsetArg(size_t Size, std::uint32_t *ImplicitOffset) { + return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &get_arg_indices() const { - return args_.get_indices(); + const arguments::args_index_t &getArgIndices() const { + return Args.getIndices(); } - uint32_t get_local_size() const noexcept { return args_.get_local_size(); } + uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } - void clear_local_size() { args_.clear_local_size(); } + void clearLocalSize() { Args.clearLocalSize(); } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp index abca91b594e19..b88d5307f4711 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp @@ -35,76 +35,74 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // Currently, USE_HOST_PTR is not implemented using host register // since this triggers a weird segfault after program ends. // Setting this constant to true enables testing that behavior. - const bool enableUseHostPtr = false; - const bool performInitialCopy = + const bool EnableUseHostPtr = false; + const bool PerformInitialCopy = (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || - ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !enableUseHostPtr); - ur_result_t retErr = UR_RESULT_SUCCESS; - ur_mem_handle_t retMemObj = nullptr; + ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !EnableUseHostPtr); + ur_result_t Result = UR_RESULT_SUCCESS; + ur_mem_handle_t MemObj = nullptr; try { - ScopedContext active(hContext); - CUdeviceptr ptr; - auto pHost = pProperties ? pProperties->pHost : nullptr; - - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; - - if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && enableUseHostPtr) { - retErr = UR_CHECK_ERROR( - cuMemHostRegister(pHost, size, CU_MEMHOSTREGISTER_DEVICEMAP)); - retErr = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, pHost, 0)); - allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr; + ScopedContext Active(hContext); + CUdeviceptr Ptr; + auto HostPtr = pProperties ? 
pProperties->pHost : nullptr; + + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; + + if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { + Result = UR_CHECK_ERROR( + cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); + Result = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { - retErr = UR_CHECK_ERROR(cuMemAllocHost(&pHost, size)); - retErr = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, pHost, 0)); - allocMode = - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + Result = UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size)); + Result = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; } else { - retErr = UR_CHECK_ERROR(cuMemAlloc(&ptr, size)); + Result = UR_CHECK_ERROR(cuMemAlloc(&Ptr, size)); if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { - allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in; + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn; } } - if (retErr == UR_RESULT_SUCCESS) { + if (Result == UR_RESULT_SUCCESS) { ur_mem_handle_t parentBuffer = nullptr; - auto piMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, parentBuffer, flags, allocMode, ptr, pHost, size}); - if (piMemObj != nullptr) { - retMemObj = piMemObj.release(); - if (performInitialCopy) { + auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, parentBuffer, flags, AllocMode, Ptr, HostPtr, size}); + if (URMemObj != nullptr) { + MemObj = URMemObj.release(); + if (PerformInitialCopy) { // Operates on the default stream of the current CUDA context. - retErr = UR_CHECK_ERROR(cuMemcpyHtoD(ptr, pHost, size)); + Result = UR_CHECK_ERROR(cuMemcpyHtoD(Ptr, HostPtr, size)); // Synchronize with default stream implicitly used by cuMemcpyHtoD // to make buffer data available on device before any other UR call // uses it. - if (retErr == UR_RESULT_SUCCESS) { + if (Result == UR_RESULT_SUCCESS) { CUstream defaultStream = 0; - retErr = UR_CHECK_ERROR(cuStreamSynchronize(defaultStream)); + Result = UR_CHECK_ERROR(cuStreamSynchronize(defaultStream)); } } } else { - retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } catch (...) 
{ - retErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - *phBuffer = retMemObj; + *phBuffer = MemObj; - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hMem->get_reference_count() > 0, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - hMem->increment_reference_count(); + UR_ASSERT(hMem->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_MEM_OBJECT); + hMem->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -115,52 +113,52 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t ret = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { // Do nothing if there are other references - if (hMem->decrement_reference_count() > 0) { + if (hMem->decrementReferenceCount() > 0) { return UR_RESULT_SUCCESS; } - // make sure hMem is released in case check_error_ur throws - std::unique_ptr uniqueMemObj(hMem); + // make sure hMem is released in case checkErrorUR throws + std::unique_ptr MemObjPtr(hMem); - if (hMem->is_sub_buffer()) { + if (hMem->isSubBuffer()) { return UR_RESULT_SUCCESS; } - ScopedContext active(uniqueMemObj->get_context()); + ScopedContext Active(MemObjPtr->getContext()); - if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer) { - switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in: - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic: - ret = UR_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_)); + if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { + switch (MemObjPtr->Mem.BufferMem.MemAllocMode) { + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn: + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic: + Result = UR_CHECK_ERROR(cuMemFree(MemObjPtr->Mem.BufferMem.Ptr)); break; - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr: - ret = UR_CHECK_ERROR( - cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr: + Result = UR_CHECK_ERROR( + cuMemHostUnregister(MemObjPtr->Mem.BufferMem.HostPtr)); break; - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr: - ret = UR_CHECK_ERROR( - cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr: + Result = + UR_CHECK_ERROR(cuMemFreeHost(MemObjPtr->Mem.BufferMem.HostPtr)); }; - } else if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::surface) { - ret = UR_CHECK_ERROR( - cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface())); - ret = UR_CHECK_ERROR( - cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array())); + } else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) { + Result = UR_CHECK_ERROR( + cuSurfObjectDestroy(MemObjPtr->Mem.SurfaceMem.getSurface())); + Result = + UR_CHECK_ERROR(cuArrayDestroy(MemObjPtr->Mem.SurfaceMem.getArray())); } - } catch (ur_result_t err) { - ret = err; + } catch (ur_result_t Err) { + Result = Err; } catch (...) 
{ - ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - if (ret != UR_RESULT_SUCCESS) { + if (Result != UR_RESULT_SUCCESS) { // A reported CUDA error is either an implementation or an asynchronous CUDA // error for which it is unclear if the function that reported it succeeded // or not. Either way, the state of the program is compromised and likely @@ -183,7 +181,7 @@ urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phNativeMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); *phNativeMem = - reinterpret_cast(hMem->mem_.buffer_mem_.get()); + reinterpret_cast(hMem->Mem.BufferMem.get()); return UR_RESULT_SUCCESS; } @@ -195,27 +193,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, UR_ASSERT(hMemory, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(MemInfoType <= UR_MEM_INFO_CONTEXT, UR_RESULT_ERROR_INVALID_ENUMERATION); - UR_ASSERT(hMemory->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMemory->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); - ScopedContext active(hMemory->get_context()); + ScopedContext Active(hMemory->getContext()); switch (MemInfoType) { case UR_MEM_INFO_SIZE: { try { - size_t allocSize = 0; - UR_CHECK_ERROR(cuMemGetAddressRange(nullptr, &allocSize, - hMemory->mem_.buffer_mem_.ptr_)); - return ReturnValue(allocSize); - } catch (ur_result_t err) { - return err; + size_t AllocSize = 0; + UR_CHECK_ERROR(cuMemGetAddressRange(nullptr, &AllocSize, + hMemory->Mem.BufferMem.Ptr)); + return ReturnValue(AllocSize); + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } } case UR_MEM_INFO_CONTEXT: { - return ReturnValue(hMemory->get_context()); + return ReturnValue(hMemory->getContext()); } default: @@ -251,7 +249,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) { UR_ASSERT(pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); } - const bool performInitialCopy = + const bool PerformInitialCopy = (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || ((flags & UR_MEM_FLAG_USE_HOST_POINTER)); @@ -270,7 +268,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); } - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; // We only support RBGA channel order // TODO: check SYCL CTS and spec. May also have to support BGRA @@ -280,58 +278,58 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( // We have to use cuArray3DCreate, which has some caveats. The height and // depth parameters must be set to 0 produce 1D or 2D arrays. pImageDesc gives // a minimum value of 1, so we need to convert the answer. 
- CUDA_ARRAY3D_DESCRIPTOR array_desc; - array_desc.NumChannels = 4; // Only support 4 channel image - array_desc.Flags = 0; // No flags required - array_desc.Width = pImageDesc->width; + CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; + ArrayDesc.NumChannels = 4; // Only support 4 channel image + ArrayDesc.Flags = 0; // No flags required + ArrayDesc.Width = pImageDesc->width; if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - array_desc.Height = 0; - array_desc.Depth = 0; + ArrayDesc.Height = 0; + ArrayDesc.Depth = 0; } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - array_desc.Height = pImageDesc->height; - array_desc.Depth = 0; + ArrayDesc.Height = pImageDesc->height; + ArrayDesc.Depth = 0; } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - array_desc.Height = pImageDesc->height; - array_desc.Depth = pImageDesc->depth; + ArrayDesc.Height = pImageDesc->height; + ArrayDesc.Depth = pImageDesc->depth; } // We need to get this now in bytes for calculating the total image size later - size_t pixel_type_size_bytes; + size_t PixelTypeSizeBytes; switch (pImageFormat->channelType) { case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - pixel_type_size_bytes = 1; + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8; + PixelTypeSizeBytes = 1; break; case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT8; - pixel_type_size_bytes = 1; + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8; + PixelTypeSizeBytes = 1; break; case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; - pixel_type_size_bytes = 2; + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16; + PixelTypeSizeBytes = 2; break; case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT16; - pixel_type_size_bytes = 2; + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16; + PixelTypeSizeBytes = 2; break; case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - array_desc.Format = CU_AD_FORMAT_HALF; - pixel_type_size_bytes = 2; + ArrayDesc.Format = CU_AD_FORMAT_HALF; + PixelTypeSizeBytes = 2; break; case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32; - pixel_type_size_bytes = 4; + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32; + PixelTypeSizeBytes = 4; break; case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT32; - pixel_type_size_bytes = 4; + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32; + PixelTypeSizeBytes = 4; break; case UR_IMAGE_CHANNEL_TYPE_FLOAT: - array_desc.Format = CU_AD_FORMAT_FLOAT; - pixel_type_size_bytes = 4; + ArrayDesc.Format = CU_AD_FORMAT_FLOAT; + PixelTypeSizeBytes = 4; break; default: sycl::detail::ur::die( @@ -339,51 +337,51 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( } // When a dimension isn't used pImageDesc has the size set to 1 - size_t pixel_size_bytes = - pixel_type_size_bytes * 4; // 4 is the only number of channels we support - size_t image_size_bytes = pixel_size_bytes * pImageDesc->width * - pImageDesc->height * pImageDesc->depth; + size_t PixelSizeBytes = + PixelTypeSizeBytes * 4; // 4 is the only number of channels we support + size_t ImageSizeBytes = PixelSizeBytes * pImageDesc->width * + pImageDesc->height * pImageDesc->depth; - ScopedContext active(hContext); - CUarray image_array = nullptr; + ScopedContext Active(hContext); + CUarray ImageArray = nullptr; try { - retErr = UR_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc)); - } catch 
(ur_result_t err) { - if (err == UR_RESULT_ERROR_INVALID_VALUE) { + Result = UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &ArrayDesc)); + } catch (ur_result_t Err) { + if (Err == UR_RESULT_ERROR_INVALID_VALUE) { return UR_RESULT_ERROR_INVALID_IMAGE_SIZE; } - return err; + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } try { - if (performInitialCopy) { + if (PerformInitialCopy) { // We have to use a different copy function for each image dimensionality if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - retErr = UR_CHECK_ERROR( - cuMemcpyHtoA(image_array, 0, pHost, image_size_bytes)); + Result = + UR_CHECK_ERROR(cuMemcpyHtoA(ImageArray, 0, pHost, ImageSizeBytes)); } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = pHost; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = image_array; - cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; - cpy_desc.Height = pImageDesc->height; - retErr = UR_CHECK_ERROR(cuMemcpy2D(&cpy_desc)); + CUDA_MEMCPY2D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc.srcHost = pHost; + CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + CpyDesc.dstArray = ImageArray; + CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; + CpyDesc.Height = pImageDesc->height; + Result = UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc)); } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = pHost; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = image_array; - cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; - cpy_desc.Height = pImageDesc->height; - cpy_desc.Depth = pImageDesc->depth; - retErr = UR_CHECK_ERROR(cuMemcpy3D(&cpy_desc)); + CUDA_MEMCPY3D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc.srcHost = pHost; + CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + CpyDesc.dstArray = ImageArray; + CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; + CpyDesc.Height = pImageDesc->height; + CpyDesc.Depth = pImageDesc->depth; + Result = UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc)); } } @@ -396,35 +394,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( // handle. 
// CUDA_RESOURCE_DESC::flags must be set to zero - CUDA_RESOURCE_DESC image_res_desc; - image_res_desc.res.array.hArray = image_array; - image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY; - image_res_desc.flags = 0; + CUDA_RESOURCE_DESC ImageResDesc; + ImageResDesc.res.array.hArray = ImageArray; + ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY; + ImageResDesc.flags = 0; - CUsurfObject surface; - retErr = UR_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); + CUsurfObject Surface; + Result = UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); - auto urMemObj = std::unique_ptr(new ur_mem_handle_t_( - hContext, image_array, surface, flags, pImageDesc->type, phMem)); + auto MemObj = std::unique_ptr(new ur_mem_handle_t_( + hContext, ImageArray, Surface, flags, pImageDesc->type, phMem)); - if (urMemObj == nullptr) { + if (MemObj == nullptr) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - *phMem = urMemObj.release(); - } catch (ur_result_t err) { - if (image_array) { - cuArrayDestroy(image_array); + *phMem = MemObj.release(); + } catch (ur_result_t Err) { + if (ImageArray) { + cuArrayDestroy(ImageArray); } - return err; + return Err; } catch (...) { - if (image_array) { - cuArrayDestroy(image_array); + if (ImageArray) { + cuArrayDestroy(ImageArray); } return UR_RESULT_ERROR_UNKNOWN; } - return retErr; + return Result; } /// \TODO Not implemented @@ -445,8 +443,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, UR_RESULT_ERROR_INVALID_ENUMERATION); - UR_ASSERT(hBuffer->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(!hBuffer->is_sub_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->isSubBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); // Default value for flags means UR_MEM_FLAG_READ_WRITE. if (flags == 0) { @@ -457,11 +455,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)), UR_RESULT_ERROR_INVALID_VALUE); - if (hBuffer->memFlags_ & UR_MEM_FLAG_WRITE_ONLY) { + if (hBuffer->MemFlags & UR_MEM_FLAG_WRITE_ONLY) { UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_READ_ONLY)), UR_RESULT_ERROR_INVALID_VALUE); } - if (hBuffer->memFlags_ & UR_MEM_FLAG_READ_ONLY) { + if (hBuffer->MemFlags & UR_MEM_FLAG_READ_ONLY) { UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)), UR_RESULT_ERROR_INVALID_VALUE); } @@ -474,38 +472,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow"); - UR_ASSERT(((pRegion->origin + pRegion->size) <= - hBuffer->mem_.buffer_mem_.get_size()), - UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + UR_ASSERT( + ((pRegion->origin + pRegion->size) <= hBuffer->Mem.BufferMem.getSize()), + UR_RESULT_ERROR_INVALID_BUFFER_SIZE); // Retained indirectly due to retaining parent buffer below. 
- ur_context_handle_t context = hBuffer->context_; + ur_context_handle_t Context = hBuffer->Context; - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; - assert(hBuffer->mem_.buffer_mem_.ptr_ != - ur_mem_handle_t_::mem_::buffer_mem_::native_type{0}); - ur_mem_handle_t_::mem_::buffer_mem_::native_type ptr = - hBuffer->mem_.buffer_mem_.ptr_ + pRegion->origin; + assert(hBuffer->Mem.BufferMem.Ptr != + ur_mem_handle_t_::MemImpl::BufferMem::native_type{0}); + ur_mem_handle_t_::MemImpl::BufferMem::native_type Ptr = + hBuffer->Mem.BufferMem.Ptr + pRegion->origin; - void *hostPtr = nullptr; - if (hBuffer->mem_.buffer_mem_.hostPtr_) { - hostPtr = static_cast(hBuffer->mem_.buffer_mem_.hostPtr_) + - pRegion->origin; + void *HostPtr = nullptr; + if (hBuffer->Mem.BufferMem.HostPtr) { + HostPtr = + static_cast(hBuffer->Mem.BufferMem.HostPtr) + pRegion->origin; } - std::unique_ptr retMemObj{nullptr}; + std::unique_ptr MemObj{nullptr}; try { - retMemObj = std::unique_ptr{new ur_mem_handle_t_{ - context, hBuffer, flags, allocMode, ptr, hostPtr, pRegion->size}}; - } catch (ur_result_t err) { + MemObj = std::unique_ptr{new ur_mem_handle_t_{ + Context, hBuffer, flags, AllocMode, Ptr, HostPtr, pRegion->size}}; + } catch (ur_result_t Err) { *phMem = nullptr; - return err; + return Err; } catch (...) { *phMem = nullptr; return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - *phMem = retMemObj.release(); + *phMem = MemObj.release(); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp index 5712218b06425..a1b484e3212bf 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp @@ -18,178 +18,173 @@ /// Keeps tracks of all mapped regions used for Map/Unmap calls. /// Only one region can be active at the same time per allocation. struct ur_mem_handle_t_ { - - // TODO: Move as much shared data up as possible - using ur_context = ur_context_handle_t_ *; - using ur_mem = ur_mem_handle_t_ *; - // Context where the memory object is accessibles - ur_context context_; + ur_context_handle_t Context; /// Reference counting of the handler - std::atomic_uint32_t refCount_; - enum class mem_type { buffer, surface } mem_type_; + std::atomic_uint32_t RefCount; + enum class Type { Buffer, Surface } MemType; // Original mem flags passed - ur_mem_flags_t memFlags_; + ur_mem_flags_t MemFlags; /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). /// In CUDA their API handlers are different. Whereas "Buffers" are allocated /// as pointer-like structs, "Images" are stored in Textures or Surfaces /// This union allows implementation to use either from the same handler. 
- union mem_ { + union MemImpl { // Handler for plain, pointer-based CUDA allocations - struct buffer_mem_ { + struct BufferMem { using native_type = CUdeviceptr; // If this allocation is a sub-buffer (i.e., a view on an existing // allocation), this is the pointer to the parent handler structure - ur_mem parent_; + ur_mem_handle_t Parent; // CUDA handler for the pointer - native_type ptr_; + native_type Ptr; /// Pointer associated with this device on the host - void *hostPtr_; + void *HostPtr; /// Size of the allocation in bytes - size_t size_; + size_t Size; /// Offset of the active mapped region. - size_t mapOffset_; + size_t MapOffset; /// Pointer to the active mapped region, if any - void *mapPtr_; + void *MapPtr; /// Original flags for the mapped region - ur_map_flags_t mapFlags_; + ur_map_flags_t MapFlags; - /** alloc_mode + /** AllocMode * classic: Just a normal buffer allocated on the device via cuda malloc * use_host_ptr: Use an address on the host for the device * copy_in: The data for the device comes from the host but the host pointer is not available later for re-use * alloc_host_ptr: Uses pinned-memory allocation */ - enum class alloc_mode { - classic, - use_host_ptr, - copy_in, - alloc_host_ptr - } allocMode_; + enum class AllocMode { + Classic, + UseHostPtr, + CopyIn, + AllocHostPtr, + } MemAllocMode; - native_type get() const noexcept { return ptr_; } + native_type get() const noexcept { return Ptr; } - size_t get_size() const noexcept { return size_; } + size_t getSize() const noexcept { return Size; } - void *get_map_ptr() const noexcept { return mapPtr_; } + void *getMapPtr() const noexcept { return MapPtr; } - size_t get_map_offset(void *) const noexcept { return mapOffset_; } + size_t getMapOffset(void *) const noexcept { return MapOffset; } /// Returns a pointer to data visible on the host that contains /// the data on the device associated with this allocation. /// The offset is used to index into the CUDA allocation. /// - void *map_to_ptr(size_t offset, ur_map_flags_t flags) noexcept { - assert(mapPtr_ == nullptr); - mapOffset_ = offset; - mapFlags_ = flags; - if (hostPtr_) { - mapPtr_ = static_cast(hostPtr_) + offset; + void *mapToPtr(size_t Offset, ur_map_flags_t Flags) noexcept { + assert(MapPtr == nullptr); + MapOffset = Offset; + MapFlags = Flags; + if (HostPtr) { + MapPtr = static_cast(HostPtr) + Offset; } else { // TODO: Allocate only what is needed based on the offset - mapPtr_ = static_cast(malloc(this->get_size())); + MapPtr = static_cast(malloc(this->getSize())); } - return mapPtr_; + return MapPtr; } /// Detach the allocation from the host memory. void unmap(void *) noexcept { - assert(mapPtr_ != nullptr); + assert(MapPtr != nullptr); - if (mapPtr_ != hostPtr_) { - free(mapPtr_); + if (MapPtr != HostPtr) { + free(MapPtr); } - mapPtr_ = nullptr; - mapOffset_ = 0; + MapPtr = nullptr; + MapOffset = 0; } - ur_map_flags_t get_map_flags() const noexcept { - assert(mapPtr_ != nullptr); - return mapFlags_; + ur_map_flags_t getMapFlags() const noexcept { + assert(MapPtr != nullptr); + return MapFlags; } - } buffer_mem_; + } BufferMem; // Handler data for surface object (i.e. 
Images) - struct surface_mem_ { - CUarray array_; - CUsurfObject surfObj_; - ur_mem_type_t imageType_; + struct SurfaceMem { + CUarray Array; + CUsurfObject SurfObj; + ur_mem_type_t ImageType; - CUarray get_array() const noexcept { return array_; } + CUarray getArray() const noexcept { return Array; } - CUsurfObject get_surface() const noexcept { return surfObj_; } + CUsurfObject getSurface() const noexcept { return SurfObj; } - ur_mem_type_t get_image_type() const noexcept { return imageType_; } - } surface_mem_; - } mem_; + ur_mem_type_t getImageType() const noexcept { return ImageType; } + } SurfaceMem; + } Mem; /// Constructs the UR mem handler for a non-typed allocation ("buffer") - ur_mem_handle_t_(ur_context ctxt, ur_mem parent, ur_mem_flags_t mem_flags, - mem_::buffer_mem_::alloc_mode mode, CUdeviceptr ptr, - void *host_ptr, size_t size) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer}, - memFlags_{mem_flags} { - mem_.buffer_mem_.ptr_ = ptr; - mem_.buffer_mem_.parent_ = parent; - mem_.buffer_mem_.hostPtr_ = host_ptr; - mem_.buffer_mem_.size_ = size; - mem_.buffer_mem_.mapOffset_ = 0; - mem_.buffer_mem_.mapPtr_ = nullptr; - mem_.buffer_mem_.mapFlags_ = UR_MAP_FLAG_WRITE; - mem_.buffer_mem_.allocMode_ = mode; - if (is_sub_buffer()) { - urMemRetain(mem_.buffer_mem_.parent_); + ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent, + ur_mem_flags_t MemFlags, MemImpl::BufferMem::AllocMode Mode, + CUdeviceptr Ptr, void *HostPtr, size_t Size) + : Context{Context}, RefCount{1}, MemType{Type::Buffer}, + MemFlags{MemFlags} { + Mem.BufferMem.Ptr = Ptr; + Mem.BufferMem.Parent = Parent; + Mem.BufferMem.HostPtr = HostPtr; + Mem.BufferMem.Size = Size; + Mem.BufferMem.MapOffset = 0; + Mem.BufferMem.MapPtr = nullptr; + Mem.BufferMem.MapFlags = UR_MAP_FLAG_WRITE; + Mem.BufferMem.MemAllocMode = Mode; + if (isSubBuffer()) { + urMemRetain(Mem.BufferMem.Parent); } else { - urContextRetain(context_); + urContextRetain(Context); } }; /// Constructs the UR allocation for an Image object (surface in CUDA) - ur_mem_handle_t_(ur_context ctxt, CUarray array, CUsurfObject surf, - ur_mem_flags_t mem_flags, ur_mem_type_t image_type, - void *host_ptr) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface}, - memFlags_{mem_flags} { + ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, + CUsurfObject Surf, ur_mem_flags_t MemFlags, + ur_mem_type_t ImageType, void *HostPtr) + : Context{Context}, RefCount{1}, MemType{Type::Surface}, + MemFlags{MemFlags} { // Ignore unused parameter - (void)host_ptr; + (void)HostPtr; - mem_.surface_mem_.array_ = array; - mem_.surface_mem_.surfObj_ = surf; - mem_.surface_mem_.imageType_ = image_type; - urContextRetain(context_); + Mem.SurfaceMem.Array = Array; + Mem.SurfaceMem.SurfObj = Surf; + Mem.SurfaceMem.ImageType = ImageType; + urContextRetain(Context); } ~ur_mem_handle_t_() { - if (mem_type_ == mem_type::buffer) { - if (is_sub_buffer()) { - urMemRelease(mem_.buffer_mem_.parent_); + if (MemType == Type::Buffer) { + if (isSubBuffer()) { + urMemRelease(Mem.BufferMem.Parent); return; } } - urContextRelease(context_); + urContextRelease(Context); } // TODO: Move as many shared funcs up as possible - bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; } + bool isBuffer() const noexcept { return MemType == Type::Buffer; } - bool is_sub_buffer() const noexcept { - return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr)); + bool isSubBuffer() const noexcept { + return (isBuffer() && (Mem.BufferMem.Parent != nullptr)); } 
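  // Usage sketch for the map helpers above (illustrative only, not part of this
  // patch; `Buf` is a hypothetical ur_mem_handle_t whose BufferMem has no
  // associated HostPtr, so mapToPtr falls back to a temporary host allocation
  // that unmap releases again):
  //
  //   void *MappedPtr =
  //       Buf->Mem.BufferMem.mapToPtr(/*Offset=*/0, UR_MAP_FLAG_WRITE);
  //   // ... fill MappedPtr on the host, then copy it to Buf's device pointer ...
  //   Buf->Mem.BufferMem.unmap(MappedPtr);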
- bool is_image() const noexcept { return mem_type_ == mem_type::surface; } + bool isImage() const noexcept { return MemType == Type::Surface; } - ur_context get_context() const noexcept { return context_; } + ur_context_handle_t getContext() const noexcept { return Context; } - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index fdf0f723e168f..f28f76c2a95df 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -33,8 +33,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo( case UR_PLATFORM_INFO_PROFILE: return ReturnValue("FULL PROFILE"); case UR_PLATFORM_INFO_VERSION: { - auto version = getCudaVersionString(); - return ReturnValue(version.c_str()); + auto Version = getCudaVersionString(); + return ReturnValue(Version.c_str()); } case UR_PLATFORM_INFO_EXTENSIONS: { return ReturnValue(""); @@ -62,102 +62,103 @@ urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { try { - static std::once_flag initFlag; - static uint32_t numPlatforms = 1; - static std::vector platformIds; + static std::once_flag InitFlag; + static uint32_t NumPlatforms = 1; + static std::vector Platforms; UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE); UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t err = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; std::call_once( - initFlag, - [](ur_result_t &err) { + InitFlag, + [](ur_result_t &Result) { if (cuInit(0) != CUDA_SUCCESS) { - numPlatforms = 0; + NumPlatforms = 0; return; } - int numDevices = 0; - err = UR_CHECK_ERROR(cuDeviceGetCount(&numDevices)); - if (numDevices == 0) { - numPlatforms = 0; + int NumDevices = 0; + Result = UR_CHECK_ERROR(cuDeviceGetCount(&NumDevices)); + if (NumDevices == 0) { + NumPlatforms = 0; return; } try { // make one platform per device - numPlatforms = numDevices; - platformIds.resize(numDevices); + NumPlatforms = NumDevices; + Platforms.resize(NumDevices); - for (int i = 0; i < numDevices; ++i) { - CUdevice device; - err = UR_CHECK_ERROR(cuDeviceGet(&device, i)); - CUcontext context; - err = UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device)); + for (int i = 0; i < NumDevices; ++i) { + CUdevice Device; + Result = UR_CHECK_ERROR(cuDeviceGet(&Device, i)); + CUcontext Context; + Result = + UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Context, Device)); - ScopedContext active(context); - CUevent evBase; - err = UR_CHECK_ERROR(cuEventCreate(&evBase, CU_EVENT_DEFAULT)); + ScopedContext active(Context); + CUevent EvBase; + Result = UR_CHECK_ERROR(cuEventCreate(&EvBase, CU_EVENT_DEFAULT)); // Use default stream to record base event counter - err = UR_CHECK_ERROR(cuEventRecord(evBase, 0)); + Result = UR_CHECK_ERROR(cuEventRecord(EvBase, 0)); - platformIds[i].devices_.emplace_back(new ur_device_handle_t_{ - device, context, evBase, &platformIds[i]}); + Platforms[i].Devices.emplace_back(new ur_device_handle_t_{ + Device, Context, EvBase, 
&Platforms[i]}); { - const auto &dev = platformIds[i].devices_.back().get(); - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - ur_result_t retError = urDeviceGetInfo( - dev, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, - sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); - if (retError != UR_RESULT_SUCCESS) { - throw retError; + const auto &Dev = Platforms[i].Devices.back().get(); + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + ur_result_t RetError = urDeviceGetInfo( + Dev, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, + sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr); + if (RetError != UR_RESULT_SUCCESS) { + throw RetError; } - retError = urDeviceGetInfo( - dev, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, - sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); - if (retError != UR_RESULT_SUCCESS) { - throw retError; + RetError = urDeviceGetInfo( + Dev, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, + sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr); + if (RetError != UR_RESULT_SUCCESS) { + throw RetError; } - dev->save_max_work_item_sizes(sizeof(maxThreadsPerBlock), - maxThreadsPerBlock); - dev->save_max_work_group_size(maxWorkGroupSize); + Dev->saveMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock); + Dev->saveMaxWorkGroupSize(MaxWorkGroupSize); } } } catch (const std::bad_alloc &) { // Signal out-of-memory situation - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); + for (int i = 0; i < NumDevices; ++i) { + Platforms[i].Devices.clear(); } - platformIds.clear(); - err = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + Platforms.clear(); + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { // Clear and rethrow to allow retry - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); + for (int i = 0; i < NumDevices; ++i) { + Platforms[i].Devices.clear(); } - platformIds.clear(); + Platforms.clear(); throw; } }, - err); + Result); if (pNumPlatforms != nullptr) { - *pNumPlatforms = numPlatforms; + *pNumPlatforms = NumPlatforms; } if (phPlatforms != nullptr) { - for (unsigned i = 0; i < std::min(NumEntries, numPlatforms); ++i) { - phPlatforms[i] = &platformIds[i]; + for (unsigned i = 0; i < std::min(NumEntries, NumPlatforms); ++i) { + phPlatforms[i] = &Platforms[i]; } } - return err; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -189,7 +190,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( ur_platform_handle_t hPlatform, const char *pFrontendOption, const char **ppPlatformOption) { - (void)hPlatform; + std::ignore = hPlatform; using namespace std::literals; if (pFrontendOption == nullptr) return UR_RESULT_ERROR_INVALID_NULL_POINTER; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp index 5b2e79f49be8d..187290718aebf 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp @@ -11,5 +11,5 @@ #include struct ur_platform_handle_t_ { - std::vector> devices_; + std::vector> Devices; }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 314a9a866c813..ce8d7c705ae83 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -8,48 +8,47 @@ #include "program.hpp" -bool getMaxRegistersJitOptionValue(const std::string &build_options, - unsigned int &value) { +bool getMaxRegistersJitOptionValue(const std::string &BuildOptions, + unsigned int &Value) { using namespace std::string_view_literals; - const std::size_t optionPos = build_options.find_first_of("maxrregcount"sv); - if (optionPos == std::string::npos) { + const std::size_t OptionPos = BuildOptions.find_first_of("maxrregcount"sv); + if (OptionPos == std::string::npos) { return false; } - const std::size_t delimPos = build_options.find('=', optionPos + 1u); - if (delimPos == std::string::npos) { + const std::size_t DelimPos = BuildOptions.find('=', OptionPos + 1u); + if (DelimPos == std::string::npos) { return false; } - const std::size_t length = build_options.length(); - const std::size_t startPos = delimPos + 1u; - if (delimPos == std::string::npos || startPos >= length) { + const std::size_t Length = BuildOptions.length(); + const std::size_t StartPos = DelimPos + 1u; + if (DelimPos == std::string::npos || StartPos >= Length) { return false; } - std::size_t pos = startPos; - while (pos < length && - std::isdigit(static_cast(build_options[pos]))) { - pos++; + std::size_t Pos = StartPos; + while (Pos < Length && + std::isdigit(static_cast(BuildOptions[Pos]))) { + Pos++; } - const std::string valueString = - build_options.substr(startPos, pos - startPos); - if (valueString.empty()) { + const std::string ValueString = BuildOptions.substr(StartPos, Pos - StartPos); + if (ValueString.empty()) { return false; } - value = static_cast(std::stoi(valueString)); + Value = static_cast(std::stoi(ValueString)); return true; } -ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t ctxt) - : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, - context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { - urContextRetain(context_); +ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Context) + : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, + Context{Context}, KernelReqdWorkGroupSizeMD{} { + urContextRetain(Context); } -ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(context_); } +ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); } std::pair splitMetadataName(const std::string &metadataName) { @@ -61,18 +60,18 @@ splitMetadataName(const std::string &metadataName) 
{ } ur_result_t -ur_program_handle_t_::set_metadata(const ur_program_metadata_t *metadata, - size_t length) { - for (size_t i = 0; i < length; ++i) { - const ur_program_metadata_t metadataElement = metadata[i]; - std::string metadataElementName{metadataElement.pName}; +ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, + size_t Length) { + for (size_t i = 0; i < Length; ++i) { + const ur_program_metadata_t MetadataElement = Metadata[i]; + std::string MetadataElementName{MetadataElement.pName}; - auto [prefix, tag] = splitMetadataName(metadataElementName); + auto [Prefix, Tag] = splitMetadataName(MetadataElementName); - if (tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { + if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { // If metadata is reqd_work_group_size, record it for the corresponding // kernel name. - size_t MDElemsSize = metadataElement.size - sizeof(std::uint64_t); + size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t); // Expect between 1 and 3 32-bit integer values. UR_ASSERT(MDElemsSize >= sizeof(std::uint32_t) && @@ -81,80 +80,79 @@ ur_program_handle_t_::set_metadata(const ur_program_metadata_t *metadata, // Get pointer to data, skipping 64-bit size at the start of the data. const char *ValuePtr = - reinterpret_cast(metadataElement.value.pData) + + reinterpret_cast(MetadataElement.value.pData) + sizeof(std::uint64_t); // Read values and pad with 1's for values not present. - std::uint32_t reqdWorkGroupElements[] = {1, 1, 1}; - std::memcpy(reqdWorkGroupElements, ValuePtr, MDElemsSize); - kernelReqdWorkGroupSizeMD_[prefix] = - std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1], - reqdWorkGroupElements[2]); - } else if (tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { - const char *metadataValPtr = - reinterpret_cast(metadataElement.value.pData) + + std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1}; + std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize); + KernelReqdWorkGroupSizeMD[Prefix] = + std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1], + ReqdWorkGroupElements[2]); + } else if (Tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { + const char *MetadataValPtr = + reinterpret_cast(MetadataElement.value.pData) + sizeof(std::uint64_t); - const char *metadataValPtrEnd = - metadataValPtr + metadataElement.size - sizeof(std::uint64_t); - globalIDMD_[prefix] = std::string{metadataValPtr, metadataValPtrEnd}; + const char *MetadataValPtrEnd = + MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t); + GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd}; } } return UR_RESULT_SUCCESS; } -ur_result_t ur_program_handle_t_::set_binary(const char *source, - size_t length) { +ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) { // Do not re-set program binary data which has already been set as that will // delete the old binary data. 
- UR_ASSERT(binary_ == nullptr && binarySizeInBytes_ == 0, + UR_ASSERT(Binary == nullptr && BinarySizeInBytes == 0, UR_RESULT_ERROR_INVALID_OPERATION); - binary_ = source; - binarySizeInBytes_ = length; + Binary = Source; + BinarySizeInBytes = Length; return UR_RESULT_SUCCESS; } -ur_result_t ur_program_handle_t_::build_program(const char *build_options) { - if (build_options) { - this->buildOptions_ = build_options; +ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { + if (BuildOptions) { + this->BuildOptions = BuildOptions; } - constexpr const unsigned int numberOfOptions = 4u; + constexpr const unsigned int NumberOfOptions = 4u; - std::vector options(numberOfOptions); - std::vector optionVals(numberOfOptions); + std::vector Options(NumberOfOptions); + std::vector OptionVals(NumberOfOptions); // Pass a buffer for info messages - options[0] = CU_JIT_INFO_LOG_BUFFER; - optionVals[0] = (void *)infoLog_; + Options[0] = CU_JIT_INFO_LOG_BUFFER; + OptionVals[0] = (void *)InfoLog; // Pass the size of the info buffer - options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - optionVals[1] = (void *)(long)MAX_LOG_SIZE; + Options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + OptionVals[1] = (void *)(long)MaxLogSize; // Pass a buffer for error message - options[2] = CU_JIT_ERROR_LOG_BUFFER; - optionVals[2] = (void *)errorLog_; + Options[2] = CU_JIT_ERROR_LOG_BUFFER; + OptionVals[2] = (void *)ErrorLog; // Pass the size of the error buffer - options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - optionVals[3] = (void *)(long)MAX_LOG_SIZE; - - if (!buildOptions_.empty()) { - unsigned int maxRegs; - bool valid = getMaxRegistersJitOptionValue(buildOptions_, maxRegs); - if (valid) { - options.push_back(CU_JIT_MAX_REGISTERS); - optionVals.push_back(reinterpret_cast(maxRegs)); + Options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + OptionVals[3] = (void *)(long)MaxLogSize; + + if (!this->BuildOptions.empty()) { + unsigned int MaxRegs; + bool Valid = getMaxRegistersJitOptionValue(BuildOptions, MaxRegs); + if (Valid) { + Options.push_back(CU_JIT_MAX_REGISTERS); + OptionVals.push_back(reinterpret_cast(MaxRegs)); } } auto result = UR_CHECK_ERROR( - cuModuleLoadDataEx(&module_, static_cast(binary_), - options.size(), options.data(), optionVals.data())); + cuModuleLoadDataEx(&Module, static_cast(Binary), + Options.size(), Options.data(), OptionVals.data())); - const auto success = (result == UR_RESULT_SUCCESS); + const auto Success = (result == UR_RESULT_SUCCESS); - buildStatus_ = - success ? UR_PROGRAM_BUILD_STATUS_SUCCESS : UR_PROGRAM_BUILD_STATUS_ERROR; + BuildStatus = + Success ? UR_PROGRAM_BUILD_STATUS_SUCCESS : UR_PROGRAM_BUILD_STATUS_ERROR; // If no exception, result is correct - return success ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; + return Success ? 
UR_RESULT_SUCCESS : UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; } /// Finds kernel names by searching for entry points in the PTX source, as the @@ -178,7 +176,7 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, ur_program_handle_t *phProgram) { UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_device_handle_t hDevice = hContext->get_device(); + ur_device_handle_t hDevice = hContext->getDevice(); auto pBinary = reinterpret_cast(pIL); return urProgramCreateWithBinary(hContext, hDevice, length, pBinary, @@ -204,17 +202,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, const char *pOptions) { UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retError = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hProgram->get_context()); + ScopedContext Active(hProgram->getContext()); - hProgram->build_program(pOptions); + hProgram->buildProgram(pOptions); - } catch (ur_result_t err) { - retError = err; + } catch (ur_result_t Err) { + Result = Err; } - return retError; + return Result; } /// Creates a new UR program object that is the outcome of linking all input @@ -230,44 +228,44 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, UR_ASSERT(phPrograms, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t retError = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); + ScopedContext Active(hContext); - CUlinkState state; - std::unique_ptr retProgram{ + CUlinkState State; + std::unique_ptr RetProgram{ new ur_program_handle_t_{hContext}}; - retError = UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &state)); + Result = UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &State)); try { for (size_t i = 0; i < count; ++i) { - ur_program_handle_t program = phPrograms[i]; - retError = UR_CHECK_ERROR(cuLinkAddData( - state, CU_JIT_INPUT_PTX, const_cast(program->binary_), - program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr)); + ur_program_handle_t Program = phPrograms[i]; + Result = UR_CHECK_ERROR(cuLinkAddData( + State, CU_JIT_INPUT_PTX, const_cast(Program->Binary), + Program->BinarySizeInBytes, nullptr, 0, nullptr, nullptr)); } - void *cubin = nullptr; - size_t cubinSize = 0; - retError = UR_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize)); + void *CuBin = nullptr; + size_t CuBinSize = 0; + Result = UR_CHECK_ERROR(cuLinkComplete(State, &CuBin, &CuBinSize)); - retError = - retProgram->set_binary(static_cast(cubin), cubinSize); + Result = + RetProgram->setBinary(static_cast(CuBin), CuBinSize); - retError = retProgram->build_program(pOptions); + Result = RetProgram->buildProgram(pOptions); } catch (...) { // Upon error attempt cleanup - UR_CHECK_ERROR(cuLinkDestroy(state)); + UR_CHECK_ERROR(cuLinkDestroy(State)); throw; } - retError = UR_CHECK_ERROR(cuLinkDestroy(state)); - *phProgram = retProgram.release(); + Result = UR_CHECK_ERROR(cuLinkDestroy(State)); + *phProgram = RetProgram.release(); - } catch (ur_result_t err) { - retError = err; + } catch (ur_result_t Err) { + Result = Err; } - return retError; + return Result; } /// Created a UR program object from a CUDA program handle. 
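For reference, a hedged caller-side sketch of the link path implemented above. Ctx, ProgA and ProgB are hypothetical handles (the programs assumed to already hold PTX binaries), and a real caller should check every result:

  ur_program_handle_t Inputs[] = {ProgA, ProgB};
  ur_program_handle_t Linked = nullptr;
  if (urProgramLink(Ctx, 2, Inputs, /*pOptions=*/nullptr, &Linked) ==
      UR_RESULT_SUCCESS) {
    // Linked wraps the CUBIN produced by cuLinkComplete and loaded via
    // cuModuleLoadDataEx inside buildProgram().
    urProgramRelease(Linked);
  }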
@@ -299,12 +297,12 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, switch (propName) { case UR_PROGRAM_BUILD_INFO_STATUS: { - return ReturnValue(hProgram->buildStatus_); + return ReturnValue(hProgram->BuildStatus); } case UR_PROGRAM_BUILD_INFO_OPTIONS: - return ReturnValue(hProgram->buildOptions_.c_str()); + return ReturnValue(hProgram->BuildOptions.c_str()); case UR_PROGRAM_BUILD_INFO_LOG: - return ReturnValue(hProgram->infoLog_, hProgram->MAX_LOG_SIZE); + return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize); default: break; } @@ -320,19 +318,19 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, switch (propName) { case UR_PROGRAM_INFO_REFERENCE_COUNT: - return ReturnValue(hProgram->get_reference_count()); + return ReturnValue(hProgram->getReferenceCount()); case UR_PROGRAM_INFO_CONTEXT: - return ReturnValue(hProgram->context_); + return ReturnValue(hProgram->Context); case UR_PROGRAM_INFO_NUM_DEVICES: return ReturnValue(1u); case UR_PROGRAM_INFO_DEVICES: - return ReturnValue(&hProgram->context_->deviceId_, 1); + return ReturnValue(&hProgram->Context->DeviceID, 1); case UR_PROGRAM_INFO_SOURCE: - return ReturnValue(hProgram->binary_); + return ReturnValue(hProgram->Binary); case UR_PROGRAM_INFO_BINARY_SIZES: - return ReturnValue(&hProgram->binarySizeInBytes_, 1); + return ReturnValue(&hProgram->BinarySizeInBytes, 1); case UR_PROGRAM_INFO_BINARIES: - return ReturnValue(&hProgram->binary_, 1); + return ReturnValue(&hProgram->Binary, 1); case UR_PROGRAM_INFO_KERNEL_NAMES: return getKernelNames(hProgram); default: @@ -344,9 +342,8 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, UR_APIEXPORT ur_result_t UR_APICALL urProgramRetain(ur_program_handle_t program) { UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(program->get_reference_count() > 0, - UR_RESULT_ERROR_INVALID_PROGRAM); - program->increment_reference_count(); + UR_ASSERT(program->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_PROGRAM); + program->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -354,38 +351,38 @@ urProgramRetain(ur_program_handle_t program) { /// When the reference count reaches 0, it unloads the module from /// the context. UR_APIEXPORT ur_result_t UR_APICALL -urProgramRelease(ur_program_handle_t program) { - UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +urProgramRelease(ur_program_handle_t hProgram) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // double delete or someone is messing with the ref count. // either way, cannot safely proceed. - UR_ASSERT(program->get_reference_count() != 0, + UR_ASSERT(hProgram->getReferenceCount() != 0, UR_RESULT_ERROR_INVALID_PROGRAM); // decrement ref count. If it is 0, delete the program. - if (program->decrement_reference_count() == 0) { + if (hProgram->decrementReferenceCount() == 0) { - std::unique_ptr program_ptr{program}; + std::unique_ptr ProgramPtr{hProgram}; - ur_result_t result = UR_RESULT_ERROR_INVALID_PROGRAM; + ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM; try { - ScopedContext active(program->get_context()); - auto cuModule = program->get(); + ScopedContext Active(hProgram->getContext()); + auto cuModule = hProgram->get(); // "0" is a valid handle for a cuModule, so the best way to check if we // actually loaded a module and need to unload it is to look at the build // status. 
- if (program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_SUCCESS) { - result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); - } else if (program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_NONE) { + if (hProgram->BuildStatus == UR_PROGRAM_BUILD_STATUS_SUCCESS) { + Result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); + } else if (hProgram->BuildStatus == UR_PROGRAM_BUILD_STATUS_NONE) { // Nothing to free. - result = UR_RESULT_SUCCESS; + Result = UR_RESULT_SUCCESS; } } catch (...) { - result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return result; + return Result; } return UR_RESULT_SUCCESS; @@ -419,13 +416,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pBinary != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hContext->get_device()->get() == hDevice->get(), + UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), UR_RESULT_ERROR_INVALID_CONTEXT); UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t retError = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; - std::unique_ptr retProgram{ + std::unique_ptr RetProgram{ new ur_program_handle_t_{hContext}}; if (pProperties) { @@ -434,19 +431,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) { return UR_RESULT_ERROR_INVALID_SIZE; } - retError = - retProgram->set_metadata(pProperties->pMetadatas, pProperties->count); + Result = + RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count); } - UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); auto pBinary_string = reinterpret_cast(pBinary); - retError = retProgram->set_binary(pBinary_string, size); - UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + Result = RetProgram->setBinary(pBinary_string, size); + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); - *phProgram = retProgram.release(); + *phProgram = RetProgram.release(); - return retError; + return Result; } // This entry point is only used for native specialization constants (SPIR-V), @@ -462,22 +459,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( // Check if device passed is the same the device bound to the context UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hDevice == hProgram->get_context()->get_device(), + UR_ASSERT(hDevice == hProgram->getContext()->getDevice(), UR_RESULT_ERROR_INVALID_DEVICE); UR_ASSERT(pFunctionName, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(ppFunctionPointer, UR_RESULT_ERROR_INVALID_NULL_POINTER); - CUfunction func; - CUresult ret = cuModuleGetFunction(&func, hProgram->get(), pFunctionName); - *ppFunctionPointer = func; - ur_result_t retError = UR_RESULT_SUCCESS; + CUfunction Func; + CUresult Ret = cuModuleGetFunction(&Func, hProgram->get(), pFunctionName); + *ppFunctionPointer = Func; + ur_result_t Result = UR_RESULT_SUCCESS; - if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_FOUND) - retError = UR_CHECK_ERROR(ret); - if (ret == CUDA_ERROR_NOT_FOUND) { + if (Ret != CUDA_SUCCESS && Ret != CUDA_ERROR_NOT_FOUND) + Result = UR_CHECK_ERROR(Ret); + if (Ret == CUDA_ERROR_NOT_FOUND) { *ppFunctionPointer = 0; - retError = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; + Result = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; } - return retError; + return Result; 
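  // Example of the mapping above (illustrative, not part of this patch):
  // requesting a name that is absent from the module, e.g.
  // cuModuleGetFunction(&Func, Module, "no_such_kernel"), returns
  // CUDA_ERROR_NOT_FOUND, which is reported as
  // UR_RESULT_ERROR_INVALID_FUNCTION_NAME with *ppFunctionPointer set to 0
  // rather than being treated as a hard driver error.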
} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp index 35ac6fb215ea0..6d47df5b78523 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp @@ -17,39 +17,38 @@ struct ur_program_handle_t_ { using native_type = CUmodule; - native_type module_; - const char *binary_; - size_t binarySizeInBytes_; - std::atomic_uint32_t refCount_; - ur_context_handle_t context_; + native_type Module; + const char *Binary; + size_t BinarySizeInBytes; + std::atomic_uint32_t RefCount; + ur_context_handle_t Context; // Metadata std::unordered_map> - kernelReqdWorkGroupSizeMD_; - std::unordered_map globalIDMD_; + KernelReqdWorkGroupSizeMD; + std::unordered_map GlobalIDMD; - constexpr static size_t MAX_LOG_SIZE = 8192u; + constexpr static size_t MaxLogSize = 8192u; - char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; - std::string buildOptions_; - ur_program_build_status_t buildStatus_ = UR_PROGRAM_BUILD_STATUS_NONE; + char ErrorLog[MaxLogSize], InfoLog[MaxLogSize]; + std::string BuildOptions; + ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE; - ur_program_handle_t_(ur_context_handle_t ctxt); + ur_program_handle_t_(ur_context_handle_t Context); ~ur_program_handle_t_(); - ur_result_t set_metadata(const ur_program_metadata_t *metadata, - size_t length); + ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length); - ur_result_t set_binary(const char *binary, size_t binarySizeInBytes); + ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes); - ur_result_t build_program(const char *build_options); - ur_context_handle_t get_context() const { return context_; }; + ur_result_t buildProgram(const char *BuildOptions); + ur_context_handle_t getContext() const { return Context; }; - native_type get() const noexcept { return module_; }; + native_type get() const noexcept { return Module; }; - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp index 2c13c6ea29d14..82edf55612669 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -14,102 +14,101 @@ #include #include -void ur_queue_handle_t_::compute_stream_wait_for_barrier_if_needed( - CUstream stream, uint32_t stream_i) { - if (barrier_event_ && !compute_applied_barrier_[stream_i]) { - UR_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); - compute_applied_barrier_[stream_i] = true; +void ur_queue_handle_t_::computeStreamWaitForBarrierIfNeeded(CUstream Stream, + uint32_t StreamI) { + if (BarrierEvent && !ComputeAppliedBarrier[StreamI]) { + UR_CHECK_ERROR(cuStreamWaitEvent(Stream, BarrierEvent, 0)); + ComputeAppliedBarrier[StreamI] = true; } } -void ur_queue_handle_t_::transfer_stream_wait_for_barrier_if_needed( - CUstream stream, uint32_t stream_i) { - if (barrier_event_ && !transfer_applied_barrier_[stream_i]) { - UR_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); - 
transfer_applied_barrier_[stream_i] = true; +void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded( + CUstream Stream, uint32_t StreamI) { + if (BarrierEvent && !TransferAppliedBarrier[StreamI]) { + UR_CHECK_ERROR(cuStreamWaitEvent(Stream, BarrierEvent, 0)); + TransferAppliedBarrier[StreamI] = true; } } -CUstream ur_queue_handle_t_::get_next_compute_stream(uint32_t *stream_token) { - uint32_t stream_i; - uint32_t token; +CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { + uint32_t StreamI; + uint32_t Token; while (true) { - if (num_compute_streams_ < compute_streams_.size()) { + if (NumComputeStreams < ComputeStreams.size()) { // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(compute_stream_mutex_); + std::lock_guard guard(ComputeStreamMutex); // The second check is done after mutex is locked so other threads can not - // change num_compute_streams_ after that - if (num_compute_streams_ < compute_streams_.size()) { + // change NumComputeStreams after that + if (NumComputeStreams < ComputeStreams.size()) { UR_CHECK_ERROR( - cuStreamCreate(&compute_streams_[num_compute_streams_++], flags_)); + cuStreamCreate(&ComputeStreams[NumComputeStreams++], Flags)); } } - token = compute_stream_idx_++; - stream_i = token % compute_streams_.size(); + Token = ComputeStreamIndex++; + StreamI = Token % ComputeStreams.size(); // if a stream has been reused before it was next selected round-robin // fashion, we want to delay its next use and instead select another one // that is more likely to have completed all the enqueued work. - if (delay_compute_[stream_i]) { - delay_compute_[stream_i] = false; + if (DelayCompute[StreamI]) { + DelayCompute[StreamI] = false; } else { break; } } - if (stream_token) { - *stream_token = token; + if (StreamToken) { + *StreamToken = Token; } - CUstream res = compute_streams_[stream_i]; - compute_stream_wait_for_barrier_if_needed(res, stream_i); + CUstream res = ComputeStreams[StreamI]; + computeStreamWaitForBarrierIfNeeded(res, StreamI); return res; } -CUstream ur_queue_handle_t_::get_next_compute_stream( - uint32_t num_events_in_wait_list, const ur_event_handle_t *event_wait_list, - ur_stream_guard_ &guard, uint32_t *stream_token) { - for (uint32_t i = 0; i < num_events_in_wait_list; i++) { - uint32_t token = event_wait_list[i]->get_compute_stream_token(); - if (reinterpret_cast(event_wait_list[i]->get_queue()) == +CUstream ur_queue_handle_t_::getNextComputeStream( + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_stream_guard_ &Guard, uint32_t *StreamToken) { + for (uint32_t i = 0; i < NumEventsInWaitList; i++) { + uint32_t Token = EventWaitList[i]->getComputeStreamToken(); + if (reinterpret_cast(EventWaitList[i]->getQueue()) == this && - can_reuse_stream(token)) { - std::unique_lock compute_sync_guard( - compute_stream_sync_mutex_); + canReuseStream(Token)) { + std::unique_lock ComputeSyncGuard(ComputeStreamSyncMutex); // redo the check after lock to avoid data races on - // last_sync_compute_streams_ - if (can_reuse_stream(token)) { - uint32_t stream_i = token % delay_compute_.size(); - delay_compute_[stream_i] = true; - if (stream_token) { - *stream_token = token; + // LastSyncComputeStreams + if (canReuseStream(Token)) { + uint32_t StreamI = Token % DelayCompute.size(); + DelayCompute[StreamI] = true; + if (StreamToken) { + *StreamToken = Token; } - guard = ur_stream_guard_{std::move(compute_sync_guard)}; - CUstream res = event_wait_list[i]->get_stream(); - 
compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; + Guard = ur_stream_guard_{std::move(ComputeSyncGuard)}; + CUstream Result = EventWaitList[i]->getStream(); + computeStreamWaitForBarrierIfNeeded(Result, StreamI); + return Result; } } } - guard = {}; - return get_next_compute_stream(stream_token); + Guard = {}; + return getNextComputeStream(StreamToken); } -CUstream ur_queue_handle_t_::get_next_transfer_stream() { - if (transfer_streams_.empty()) { // for example in in-order queue - return get_next_compute_stream(); +CUstream ur_queue_handle_t_::getNextTransferStream() { + if (TransferStreams.empty()) { // for example in in-order queue + return getNextComputeStream(); } - if (num_transfer_streams_ < transfer_streams_.size()) { + if (NumTransferStreams < TransferStreams.size()) { // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(transfer_stream_mutex_); + std::lock_guard Guuard(TransferStreamMutex); // The second check is done after mutex is locked so other threads can not - // change num_transfer_streams_ after that - if (num_transfer_streams_ < transfer_streams_.size()) { + // change NumTransferStreams after that + if (NumTransferStreams < TransferStreams.size()) { UR_CHECK_ERROR( - cuStreamCreate(&transfer_streams_[num_transfer_streams_++], flags_)); + cuStreamCreate(&TransferStreams[NumTransferStreams++], Flags)); } } - uint32_t stream_i = transfer_stream_idx_++ % transfer_streams_.size(); - CUstream res = transfer_streams_[stream_i]; - transfer_stream_wait_for_barrier_if_needed(res, stream_i); - return res; + uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size(); + CUstream Result = TransferStreams[StreamI]; + transferStreamWaitForBarrierIfNeeded(Result, StreamI); + return Result; } /// Creates a `ur_queue_handle_t` object on the CUDA backend. @@ -121,47 +120,47 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { try { - std::unique_ptr queueImpl{nullptr}; + std::unique_ptr Queue{nullptr}; UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - if (hContext->get_device() != hDevice) { + if (hContext->getDevice() != hDevice) { *phQueue = nullptr; return UR_RESULT_ERROR_INVALID_DEVICE; } - unsigned int flags = CU_STREAM_NON_BLOCKING; - ur_queue_flags_t urFlags = 0; - bool is_out_of_order = false; + unsigned int Flags = CU_STREAM_NON_BLOCKING; + ur_queue_flags_t URFlags = 0; + bool IsOutOfOrder = false; if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { - urFlags = pProps->flags; - if (urFlags == __SYCL_UR_CUDA_USE_DEFAULT_STREAM) { - flags = CU_STREAM_DEFAULT; - } else if (urFlags == __SYCL_UR_CUDA_SYNC_WITH_DEFAULT) { - flags = 0; + URFlags = pProps->flags; + if (URFlags == __SYCL_UR_CUDA_USE_DEFAULT_STREAM) { + Flags = CU_STREAM_DEFAULT; + } else if (URFlags == __SYCL_UR_CUDA_SYNC_WITH_DEFAULT) { + Flags = 0; } - if (urFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { - is_out_of_order = true; + if (URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + IsOutOfOrder = true; } } - std::vector computeCuStreams( - is_out_of_order ? ur_queue_handle_t_::default_num_compute_streams : 1); - std::vector transferCuStreams( - is_out_of_order ? ur_queue_handle_t_::default_num_transfer_streams : 0); + std::vector ComputeCuStreams( + IsOutOfOrder ? 
ur_queue_handle_t_::DefaultNumComputeStreams : 1); + std::vector TransferCuStreams( + IsOutOfOrder ? ur_queue_handle_t_::DefaultNumTransferStreams : 0); - queueImpl = std::unique_ptr(new ur_queue_handle_t_{ - std::move(computeCuStreams), std::move(transferCuStreams), hContext, - hDevice, flags, urFlags}); + Queue = std::unique_ptr(new ur_queue_handle_t_{ + std::move(ComputeCuStreams), std::move(TransferCuStreams), hContext, + hDevice, Flags, URFlags}); - *phQueue = queueImpl.release(); + *phQueue = Queue.release(); return UR_RESULT_SUCCESS; - } catch (ur_result_t err) { + } catch (ur_result_t Err) { - return err; + return Err; } catch (...) { @@ -171,61 +170,61 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - assert(hQueue->get_reference_count() > 0); + assert(hQueue->getReferenceCount() > 0); - hQueue->increment_reference_count(); + hQueue->incrementReferenceCount(); return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - if (hQueue->decrement_reference_count() > 0) { + if (hQueue->decrementReferenceCount() > 0) { return UR_RESULT_SUCCESS; } try { - std::unique_ptr queueImpl(hQueue); + std::unique_ptr Queue(hQueue); - if (!hQueue->backend_has_ownership()) + if (!hQueue->backendHasOwnership()) return UR_RESULT_SUCCESS; - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - hQueue->for_each_stream([](CUstream s) { - UR_CHECK_ERROR(cuStreamSynchronize(s)); - UR_CHECK_ERROR(cuStreamDestroy(s)); + hQueue->forEachStream([](CUstream S) { + UR_CHECK_ERROR(cuStreamSynchronize(S)); + UR_CHECK_ERROR(cuStreamDestroy(S)); }); return UR_RESULT_SUCCESS; - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; } } UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ScopedContext active(hQueue->get_context()); + ScopedContext active(hQueue->getContext()); - hQueue->sync_streams([&result](CUstream s) { - result = UR_CHECK_ERROR(cuStreamSynchronize(s)); + hQueue->syncStreams([&Result](CUstream s) { + Result = UR_CHECK_ERROR(cuStreamSynchronize(s)); }); - } catch (ur_result_t err) { + } catch (ur_result_t Err) { - result = err; + Result = Err; } catch (...) 
{ - result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return result; + return Result; } // There is no CUDA counterpart for queue flushing and we don't run into the @@ -242,9 +241,9 @@ urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); *phNativeQueue = - reinterpret_cast(hQueue->get_next_compute_stream()); + reinterpret_cast(hQueue->getNextComputeStream()); return UR_RESULT_SUCCESS; } @@ -254,35 +253,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ur_queue_handle_t *phQueue) { (void)pProperties; - unsigned int cuFlags; - CUstream cuStream = reinterpret_cast(hNativeQueue); - UR_ASSERT(hContext->get_device() == hDevice, UR_RESULT_ERROR_INVALID_DEVICE); + unsigned int CuFlags; + CUstream CuStream = reinterpret_cast(hNativeQueue); + UR_ASSERT(hContext->getDevice() == hDevice, UR_RESULT_ERROR_INVALID_DEVICE); - auto retErr = UR_CHECK_ERROR(cuStreamGetFlags(cuStream, &cuFlags)); + auto Return = UR_CHECK_ERROR(cuStreamGetFlags(CuStream, &CuFlags)); - ur_queue_flags_t flags = 0; - if (cuFlags == CU_STREAM_DEFAULT) - flags = __SYCL_UR_CUDA_USE_DEFAULT_STREAM; - else if (cuFlags == CU_STREAM_NON_BLOCKING) - flags = __SYCL_UR_CUDA_SYNC_WITH_DEFAULT; + ur_queue_flags_t Flags = 0; + if (CuFlags == CU_STREAM_DEFAULT) + Flags = __SYCL_UR_CUDA_USE_DEFAULT_STREAM; + else if (CuFlags == CU_STREAM_NON_BLOCKING) + Flags = __SYCL_UR_CUDA_SYNC_WITH_DEFAULT; else sycl::detail::ur::die("Unknown cuda stream"); - std::vector computeCuStreams(1, cuStream); - std::vector transferCuStreams(0); + std::vector ComputeCuStreams(1, CuStream); + std::vector TransferCuStreams(0); // Create queue and set num_compute_streams to 1, as computeCuStreams has // valid stream - *phQueue = new ur_queue_handle_t_{std::move(computeCuStreams), - std::move(transferCuStreams), + *phQueue = new ur_queue_handle_t_{std::move(ComputeCuStreams), + std::move(TransferCuStreams), hContext, hDevice, - cuFlags, - flags, + CuFlags, + Flags, /*backend_owns*/ false}; - (*phQueue)->num_compute_streams_ = 1; + (*phQueue)->NumComputeStreams = 1; - return retErr; + return Return; } UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, @@ -297,29 +296,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, switch (uint32_t{propName}) { case UR_QUEUE_INFO_CONTEXT: - return ReturnValue(hQueue->context_); + return ReturnValue(hQueue->Context); case UR_QUEUE_INFO_DEVICE: - return ReturnValue(hQueue->device_); + return ReturnValue(hQueue->Device); case UR_QUEUE_INFO_REFERENCE_COUNT: - return ReturnValue(hQueue->get_reference_count()); + return ReturnValue(hQueue->getReferenceCount()); case UR_QUEUE_INFO_FLAGS: - return ReturnValue(hQueue->ur_flags_); + return ReturnValue(hQueue->URFlags); case UR_QUEUE_INFO_EMPTY: { try { - bool IsReady = hQueue->all_of([](CUstream s) -> bool { - const CUresult ret = cuStreamQuery(s); - if (ret == CUDA_SUCCESS) + bool IsReady = hQueue->allOf([](CUstream S) -> bool { + const CUresult Ret = cuStreamQuery(S); + if (Ret == CUDA_SUCCESS) return true; - if (ret == CUDA_ERROR_NOT_READY) + if (Ret == CUDA_ERROR_NOT_READY) return false; - UR_CHECK_ERROR(ret); + UR_CHECK_ERROR(Ret); return false; }); return ReturnValue(IsReady); - } catch (ur_result_t err) { - return err; + } catch 
(ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp index daa1017d0f0aa..bfb8f6606b645 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp @@ -19,104 +19,100 @@ using ur_stream_guard_ = std::unique_lock; struct ur_queue_handle_t_ { using native_type = CUstream; - static constexpr int default_num_compute_streams = 128; - static constexpr int default_num_transfer_streams = 64; + static constexpr int DefaultNumComputeStreams = 128; + static constexpr int DefaultNumTransferStreams = 64; - std::vector compute_streams_; - std::vector transfer_streams_; + std::vector ComputeStreams; + std::vector TransferStreams; // delay_compute_ keeps track of which streams have been recently reused and // their next use should be delayed. If a stream has been recently reused it // will be skipped the next time it would be selected round-robin style. When // skipped, its delay flag is cleared. - std::vector delay_compute_; + std::vector DelayCompute; // keep track of which streams have applied barrier - std::vector compute_applied_barrier_; - std::vector transfer_applied_barrier_; - ur_context_handle_t_ *context_; - ur_device_handle_t_ *device_; - // ur_queue_properties_t properties_; - CUevent barrier_event_ = nullptr; - CUevent barrier_tmp_event_ = nullptr; - std::atomic_uint32_t refCount_; - std::atomic_uint32_t eventCount_; - std::atomic_uint32_t compute_stream_idx_; - std::atomic_uint32_t transfer_stream_idx_; - unsigned int num_compute_streams_; - unsigned int num_transfer_streams_; - unsigned int last_sync_compute_streams_; - unsigned int last_sync_transfer_streams_; - unsigned int flags_; - ur_queue_flags_t ur_flags_; - // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be - // locked at the same time, compute_stream_sync_mutex_ should be locked first + std::vector ComputeAppliedBarrier; + std::vector TransferAppliedBarrier; + ur_context_handle_t_ *Context; + ur_device_handle_t_ *Device; + CUevent BarrierEvent = nullptr; + CUevent BarrierTmpEvent = nullptr; + std::atomic_uint32_t RefCount; + std::atomic_uint32_t EventCount; + std::atomic_uint32_t ComputeStreamIndex; + std::atomic_uint32_t TransferStreamIndex; + unsigned int NumComputeStreams; + unsigned int NumTransferStreams; + unsigned int LastSyncComputeStreams; + unsigned int LastSyncTransferStreams; + unsigned int Flags; + ur_queue_flags_t URFlags; + // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be + // locked at the same time, ComputeStreamSyncMutex should be locked first // to avoid deadlocks - std::mutex compute_stream_sync_mutex_; - std::mutex compute_stream_mutex_; - std::mutex transfer_stream_mutex_; - std::mutex barrier_mutex_; - bool has_ownership_; - - ur_queue_handle_t_(std::vector &&compute_streams, - std::vector &&transfer_streams, - ur_context_handle_t_ *context, ur_device_handle_t_ *device, - unsigned int flags, ur_queue_flags_t ur_flags, - bool backend_owns = true) - : compute_streams_{std::move(compute_streams)}, - transfer_streams_{std::move(transfer_streams)}, - delay_compute_(compute_streams_.size(), false), - compute_applied_barrier_(compute_streams_.size()), - transfer_applied_barrier_(transfer_streams_.size()), context_{context}, - device_{device}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, - transfer_stream_idx_{0}, 
num_compute_streams_{0}, - num_transfer_streams_{0}, last_sync_compute_streams_{0}, - last_sync_transfer_streams_{0}, flags_(flags), ur_flags_(ur_flags), - has_ownership_{backend_owns} { - urContextRetain(context_); - urDeviceRetain(device_); + std::mutex ComputeStreamSyncMutex; + std::mutex ComputeStreamMutex; + std::mutex TransferStreamMutex; + std::mutex BarrierMutex; + bool HasOwnership; + + ur_queue_handle_t_(std::vector &&ComputeStreams, + std::vector &&TransferStreams, + ur_context_handle_t_ *Context, ur_device_handle_t_ *Device, + unsigned int Flags, ur_queue_flags_t URFlags, + bool BackendOwns = true) + : ComputeStreams{std::move(ComputeStreams)}, + TransferStreams{std::move(TransferStreams)}, + DelayCompute(this->ComputeStreams.size(), false), + ComputeAppliedBarrier(this->ComputeStreams.size()), + TransferAppliedBarrier(this->TransferStreams.size()), Context{Context}, + Device{Device}, RefCount{1}, EventCount{0}, ComputeStreamIndex{0}, + TransferStreamIndex{0}, NumComputeStreams{0}, NumTransferStreams{0}, + LastSyncComputeStreams{0}, LastSyncTransferStreams{0}, Flags(Flags), + URFlags(URFlags), HasOwnership{BackendOwns} { + urContextRetain(Context); + urDeviceRetain(Device); } ~ur_queue_handle_t_() { - urContextRelease(context_); - urDeviceRelease(device_); + urContextRelease(Context); + urDeviceRelease(Device); } - void compute_stream_wait_for_barrier_if_needed(CUstream stream, - uint32_t stream_i); - void transfer_stream_wait_for_barrier_if_needed(CUstream stream, - uint32_t stream_i); + void computeStreamWaitForBarrierIfNeeded(CUstream Strean, uint32_t StreamI); + void transferStreamWaitForBarrierIfNeeded(CUstream Stream, uint32_t StreamI); // get_next_compute/transfer_stream() functions return streams from // appropriate pools in round-robin fashion - native_type get_next_compute_stream(uint32_t *stream_token = nullptr); + native_type getNextComputeStream(uint32_t *StreamToken = nullptr); // this overload tries select a stream that was used by one of dependancies. // If that is not possible returns a new stream. 
If a stream is reused it // returns a lock that needs to remain locked as long as the stream is in use - native_type get_next_compute_stream(uint32_t num_events_in_wait_list, - const ur_event_handle_t *event_wait_list, - ur_stream_guard_ &guard, - uint32_t *stream_token = nullptr); - native_type get_next_transfer_stream(); - native_type get() { return get_next_compute_stream(); }; - - bool has_been_synchronized(uint32_t stream_token) { + native_type getNextComputeStream(uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_stream_guard_ &Guard, + uint32_t *StreamToken = nullptr); + native_type getNextTransferStream(); + native_type get() { return getNextComputeStream(); }; + + bool hasBeenSynchronized(uint32_t StreamToken) { // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { + if (StreamToken == std::numeric_limits::max()) { return false; } - return last_sync_compute_streams_ >= stream_token; + return LastSyncComputeStreams >= StreamToken; } - bool can_reuse_stream(uint32_t stream_token) { + bool canReuseStream(uint32_t StreamToken) { // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { + if (StreamToken == std::numeric_limits::max()) { return false; } // If the command represented by the stream token was not the last command // enqueued to the stream we can not reuse the stream - we need to allow for // commands enqueued after it and the one we are about to enqueue to run // concurrently - bool is_last_command = - (compute_stream_idx_ - stream_token) <= compute_streams_.size(); + bool IsLastCommand = + (ComputeStreamIndex - StreamToken) <= ComputeStreams.size(); // If there was a barrier enqueued to the queue after the command // represented by the stream token we should not reuse the stream, as we can // not take that stream into account for the bookkeeping for the next @@ -125,129 +121,124 @@ struct ur_queue_handle_t_ { // represented by the stream token is guaranteed to be complete by the // barrier before any work we are about to enqueue to the stream will start, // so the event does not need to be synchronized with. 
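    // Worked example (illustrative numbers, not taken from this patch): with
    // ComputeStreams.size() == 128, a token T == 40 and ComputeStreamIndex == 100,
    // 100 - 40 <= 128 holds, so T still denotes the last command enqueued to its
    // stream and the stream may be reused, provided LastSyncComputeStreams is
    // still below 40 (i.e. no barrier has synchronized that stream since).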
- return is_last_command && !has_been_synchronized(stream_token); + return IsLastCommand && !hasBeenSynchronized(StreamToken); } - template bool all_of(T &&f) { + template bool allOf(T &&F) { { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - if (!std::all_of(compute_streams_.begin(), compute_streams_.begin() + end, - f)) + std::lock_guard ComputeGuard(ComputeStreamMutex); + unsigned int End = std::min( + static_cast(ComputeStreams.size()), NumComputeStreams); + if (!std::all_of(ComputeStreams.begin(), ComputeStreams.begin() + End, F)) return false; } { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - if (!std::all_of(transfer_streams_.begin(), - transfer_streams_.begin() + end, f)) + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int End = + std::min(static_cast(TransferStreams.size()), + NumTransferStreams); + if (!std::all_of(TransferStreams.begin(), TransferStreams.begin() + End, + F)) return false; } return true; } - template void for_each_stream(T &&f) { + template void forEachStream(T &&F) { { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - for (unsigned int i = 0; i < end; i++) { - f(compute_streams_[i]); + std::lock_guard compute_guard(ComputeStreamMutex); + unsigned int End = std::min( + static_cast(ComputeStreams.size()), NumComputeStreams); + for (unsigned int i = 0; i < End; i++) { + F(ComputeStreams[i]); } } { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - for (unsigned int i = 0; i < end; i++) { - f(transfer_streams_[i]); + std::lock_guard transfer_guard(TransferStreamMutex); + unsigned int End = + std::min(static_cast(TransferStreams.size()), + NumTransferStreams); + for (unsigned int i = 0; i < End; i++) { + F(TransferStreams[i]); } } } - template void sync_streams(T &&f) { - auto sync_compute = [&f, &streams = compute_streams_, - &delay = delay_compute_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); - delay[i] = false; + template void syncStreams(T &&F) { + auto SyncCompute = [&F, &Streams = ComputeStreams, &Delay = DelayCompute]( + unsigned int Start, unsigned int Stop) { + for (unsigned int i = Start; i < Stop; i++) { + F(Streams[i]); + Delay[i] = false; } }; - auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); + auto SyncTransfer = [&F, &streams = TransferStreams](unsigned int Start, + unsigned int Stop) { + for (unsigned int i = Start; i < Stop; i++) { + F(streams[i]); } }; { - unsigned int size = static_cast(compute_streams_.size()); - std::lock_guard compute_sync_guard(compute_stream_sync_mutex_); - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int start = last_sync_compute_streams_; - unsigned int end = num_compute_streams_ < size - ? 
num_compute_streams_ - : compute_stream_idx_.load(); + unsigned int Size = static_cast(ComputeStreams.size()); + std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex); + std::lock_guard ComputeGuard(ComputeStreamMutex); + unsigned int Start = LastSyncComputeStreams; + unsigned int End = NumComputeStreams < Size ? NumComputeStreams + : ComputeStreamIndex.load(); if (ResetUsed) { - last_sync_compute_streams_ = end; + LastSyncComputeStreams = End; } - if (end - start >= size) { - sync_compute(0, size); + if (End - Start >= Size) { + SyncCompute(0, Size); } else { - start %= size; - end %= size; - if (start <= end) { - sync_compute(start, end); + Start %= Size; + End %= Size; + if (Start <= End) { + SyncCompute(Start, End); } else { - sync_compute(start, size); - sync_compute(0, end); + SyncCompute(Start, Size); + SyncCompute(0, End); } } } { - unsigned int size = static_cast(transfer_streams_.size()); - if (size > 0) { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int start = last_sync_transfer_streams_; - unsigned int end = num_transfer_streams_ < size - ? num_transfer_streams_ - : transfer_stream_idx_.load(); + unsigned int Size = static_cast(TransferStreams.size()); + if (Size > 0) { + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int Start = LastSyncTransferStreams; + unsigned int End = NumTransferStreams < Size + ? NumTransferStreams + : TransferStreamIndex.load(); if (ResetUsed) { - last_sync_transfer_streams_ = end; + LastSyncTransferStreams = End; } - if (end - start >= size) { - sync_transfer(0, size); + if (End - Start >= Size) { + SyncTransfer(0, Size); } else { - start %= size; - end %= size; - if (start <= end) { - sync_transfer(start, end); + Start %= Size; + End %= Size; + if (Start <= End) { + SyncTransfer(Start, End); } else { - sync_transfer(start, size); - sync_transfer(0, end); + SyncTransfer(Start, Size); + SyncTransfer(0, End); } } } } } - ur_context_handle_t_ *get_context() const { return context_; }; + ur_context_handle_t_ *getContext() const { return Context; }; - ur_device_handle_t_ *get_device() const { return device_; }; + ur_device_handle_t_ *get_device() const { return Device; }; - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - uint32_t get_next_event_id() noexcept { return ++eventCount_; } + uint32_t getNextEventID() noexcept { return ++EventCount; } - bool backend_has_ownership() const noexcept { return has_ownership_; } + bool backendHasOwnership() const noexcept { return HasOwnership; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp index c07f548c92a26..464bd783b4646 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp @@ -12,20 +12,20 @@ ur_result_t urSamplerCreate(ur_context_handle_t hContext, const ur_sampler_desc_t *pDesc, ur_sampler_handle_t *phSampler) { - std::unique_ptr retImplSampl{ + std::unique_ptr Sampler{ new ur_sampler_handle_t_(hContext)}; if (pDesc && pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { - retImplSampl->props_ |= pDesc->normalizedCoords; - 
retImplSampl->props_ |= (pDesc->filterMode << 1); - retImplSampl->props_ |= (pDesc->addressingMode << 2); + Sampler->Props |= pDesc->normalizedCoords; + Sampler->Props |= (pDesc->filterMode << 1); + Sampler->Props |= (pDesc->addressingMode << 2); } else { // Set default values - retImplSampl->props_ |= true; // Normalized Coords - retImplSampl->props_ |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; + Sampler->Props |= true; // Normalized Coords + Sampler->Props |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; } - *phSampler = retImplSampl.release(); + *phSampler = Sampler.release(); return UR_RESULT_SUCCESS; } @@ -37,22 +37,22 @@ ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, switch (propName) { case UR_SAMPLER_INFO_REFERENCE_COUNT: - return ReturnValue(hSampler->get_reference_count()); + return ReturnValue(hSampler->getReferenceCount()); case UR_SAMPLER_INFO_CONTEXT: - return ReturnValue(hSampler->context_); + return ReturnValue(hSampler->Context); case UR_SAMPLER_INFO_NORMALIZED_COORDS: { - bool norm_coords_prop = static_cast(hSampler->props_); - return ReturnValue(norm_coords_prop); + bool NormCoordsProp = static_cast(hSampler->Props); + return ReturnValue(NormCoordsProp); } case UR_SAMPLER_INFO_FILTER_MODE: { - auto filter_prop = - static_cast(((hSampler->props_ >> 1) & 0x1)); - return ReturnValue(filter_prop); + auto FilterProp = + static_cast(((hSampler->Props >> 1) & 0x1)); + return ReturnValue(FilterProp); } case UR_SAMPLER_INFO_ADDRESSING_MODE: { - auto addressing_prop = - static_cast(hSampler->props_ >> 2); - return ReturnValue(addressing_prop); + auto AddressingProp = + static_cast(hSampler->Props >> 2); + return ReturnValue(AddressingProp); } default: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; @@ -62,7 +62,7 @@ ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler) { UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - hSampler->increment_reference_count(); + hSampler->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -72,11 +72,11 @@ ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. sycl::detail::ur::assertion( - hSampler->get_reference_count() != 0, + hSampler->getReferenceCount() != 0, "Reference count overflow detected in urSamplerRelease."); // decrement ref count. If it is 0, delete the sampler. - if (hSampler->decrement_reference_count() == 0) { + if (hSampler->decrementReferenceCount() == 0) { delete hSampler; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp index 61ed98325a5ed..6dbbb124ffc3e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp @@ -14,16 +14,16 @@ /// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | /// | N/A | addressing mode | fiter mode | normalize coords | struct ur_sampler_handle_t_ { - std::atomic_uint32_t refCount_; - uint32_t props_; - ur_context_handle_t context_; + std::atomic_uint32_t RefCount; + uint32_t Props; + ur_context_handle_t Context; - ur_sampler_handle_t_(ur_context_handle_t context) - : refCount_(1), props_(0), context_(context) {} + ur_sampler_handle_t_(ur_context_handle_t Context) + : RefCount(1), Props(0), Context(Context) {} - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp index 0309d4a7b627a..67b98f5c30319 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp @@ -25,32 +25,32 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t device_max_mem_alloc_size = 0; - UR_ASSERT(urDeviceGetInfo(hContext->get_device(), + size_t DeviceMaxMemAllocSize = 0; + UR_ASSERT(urDeviceGetInfo(hContext->getDevice(), UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), - static_cast(&device_max_mem_alloc_size), + static_cast(&DeviceMaxMemAllocSize), nullptr) == UR_RESULT_SUCCESS, UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, UR_RESULT_ERROR_INVALID_USM_SIZE); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - result = UR_CHECK_ERROR(cuMemAllocHost(ppMem, size)); - } catch (ur_result_t error) { - result = error; + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR(cuMemAllocHost(ppMem, size)); + } catch (ur_result_t Err) { + Result = Err; } UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), UR_RESULT_ERROR_INVALID_VALUE); - assert(result == UR_RESULT_SUCCESS && + assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); - return result; + return Result; } /// USM: Implements USM device allocations using a normal CUDA device pointer @@ -63,31 +63,31 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t device_max_mem_alloc_size = 0; + size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), - static_cast(&device_max_mem_alloc_size), + static_cast(&DeviceMaxMemAllocSize), nullptr) == UR_RESULT_SUCCESS, UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, UR_RESULT_ERROR_INVALID_USM_SIZE); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - result = UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr 
*)ppMem, size)); - } catch (ur_result_t error) { - result = error; + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ppMem, size)); + } catch (ur_result_t Err) { + Result = Err; } UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), UR_RESULT_ERROR_INVALID_VALUE); - assert(result == UR_RESULT_SUCCESS && + assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); - return result; + return Result; } /// USM: Implements USM Shared allocations using CUDA Managed Memory @@ -100,32 +100,32 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t device_max_mem_alloc_size = 0; + size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), - static_cast(&device_max_mem_alloc_size), + static_cast(&DeviceMaxMemAllocSize), nullptr) == UR_RESULT_SUCCESS, UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, UR_RESULT_ERROR_INVALID_USM_SIZE); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - result = UR_CHECK_ERROR( + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR( cuMemAllocManaged((CUdeviceptr *)ppMem, size, CU_MEM_ATTACH_GLOBAL)); - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), UR_RESULT_ERROR_INVALID_VALUE); - assert(result == UR_RESULT_SUCCESS && + assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); - return result; + return Result; } /// USM: Frees the given USM pointer associated with the context. 
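The three USM allocation entry points above (host, device, and shared) all validate the requested alignment with the expression (pUSMDesc->align & (pUSMDesc->align - 1)) == 0, which holds exactly when the value is zero or a power of two. Below is a minimal standalone sketch of that check; the helper name isValidUsmAlignment is illustrative only and is not part of this patch.

    // Returns true when Align is 0 (meaning "use the default alignment") or a
    // power of two, mirroring the UR_ASSERT alignment checks in the USM
    // allocation functions above.
    static bool isValidUsmAlignment(uint32_t Align) {
      return Align == 0 || (Align & (Align - 1)) == 0;
    }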
@@ -134,30 +134,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, void *pMem) { UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - bool is_managed; - unsigned int type; - void *attribute_values[2] = {&is_managed, &type}; - CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, + ScopedContext Active(hContext); + bool IsManaged; + unsigned int Type; + void *AttributeValues[2] = {&IsManaged, &Type}; + CUpointer_attribute Attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; - result = UR_CHECK_ERROR(cuPointerGetAttributes( - 2, attributes, attribute_values, (CUdeviceptr)pMem)); - UR_ASSERT(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST, + Result = UR_CHECK_ERROR(cuPointerGetAttributes( + 2, Attributes, AttributeValues, (CUdeviceptr)pMem)); + UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (is_managed || type == CU_MEMORYTYPE_DEVICE) { + if (IsManaged || Type == CU_MEMORYTYPE_DEVICE) { // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed // with cuMemFree - result = UR_CHECK_ERROR(cuMemFree((CUdeviceptr)pMem)); + Result = UR_CHECK_ERROR(cuMemFree((CUdeviceptr)pMem)); } else { // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost - result = UR_CHECK_ERROR(cuMemFreeHost(pMem)); + Result = UR_CHECK_ERROR(cuMemFreeHost(pMem)); } - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL @@ -167,36 +167,36 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); try { - ScopedContext active(hContext); + ScopedContext Active(hContext); switch (propName) { case UR_USM_ALLOC_INFO_TYPE: { - unsigned int value; + unsigned int Value; // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE - CUresult ret = cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem); - if (ret == CUDA_ERROR_INVALID_VALUE) { + CUresult Ret = cuPointerGetAttribute( + &Value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem); + if (Ret == CUDA_ERROR_INVALID_VALUE) { // pointer not known to the CUDA subsystem return ReturnValue(UR_USM_TYPE_UNKNOWN); } - result = check_error_ur(ret, __func__, __LINE__ - 5, __FILE__); - if (value) { + Result = checkErrorUR(Ret, __func__, __LINE__ - 5, __FILE__); + if (Value) { // pointer to managed memory return ReturnValue(UR_USM_TYPE_SHARED); } - result = UR_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pMem)); - UR_ASSERT(value == CU_MEMORYTYPE_DEVICE || value == CU_MEMORYTYPE_HOST, + Result = UR_CHECK_ERROR(cuPointerGetAttribute( + &Value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pMem)); + UR_ASSERT(Value == CU_MEMORYTYPE_DEVICE || Value == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (value == CU_MEMORYTYPE_DEVICE) { + if (Value == CU_MEMORYTYPE_DEVICE) { // pointer to device memory return 
ReturnValue(UR_USM_TYPE_DEVICE); } - if (value == CU_MEMORYTYPE_HOST) { + if (Value == CU_MEMORYTYPE_HOST) { // pointer to host memory return ReturnValue(UR_USM_TYPE_HOST); } @@ -211,10 +211,10 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, case UR_USM_ALLOC_INFO_BASE_PTR: { #if __CUDA_API_VERSION >= 10020 // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was introduced in CUDA 10.2 - unsigned int value; + unsigned int Value; result = UR_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)pMem)); - return ReturnValue(value); + &Value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)pMem)); + return ReturnValue(Value); #else return UR_RESULT_ERROR_INVALID_VALUE; #endif @@ -222,35 +222,36 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, case UR_USM_ALLOC_INFO_SIZE: { #if __CUDA_API_VERSION >= 10020 // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2 - unsigned int value; + unsigned int Value; result = UR_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); - return ReturnValue(value); + &Value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); + return ReturnValue(Value); #else return UR_RESULT_ERROR_INVALID_VALUE; #endif } case UR_USM_ALLOC_INFO_DEVICE: { // get device index associated with this pointer - unsigned int device_idx; - result = UR_CHECK_ERROR(cuPointerGetAttribute( - &device_idx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)pMem)); + unsigned int DeviceIndex; + Result = UR_CHECK_ERROR(cuPointerGetAttribute( + &DeviceIndex, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, + (CUdeviceptr)pMem)); // currently each device is in its own platform, so find the platform at // the same index - std::vector platforms; - platforms.resize(device_idx + 1); - result = urPlatformGet(device_idx + 1, platforms.data(), nullptr); + std::vector Platforms; + Platforms.resize(DeviceIndex + 1); + Result = urPlatformGet(DeviceIndex + 1, Platforms.data(), nullptr); // get the device from the platform - ur_device_handle_t device = platforms[device_idx]->devices_[0].get(); - return ReturnValue(device); + ur_device_handle_t Device = Platforms[DeviceIndex]->Devices[0].get(); + return ReturnValue(Device); } default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } From a0de2d72c877ae6c182f54def6817f214d8d56db Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Thu, 8 Jun 2023 09:22:28 +0100 Subject: [PATCH 42/45] [SYCL][CUDA][PI][UR] Fix PR review comments --- .../ur/adapters/cuda/context.hpp | 19 ++++----- .../ur/adapters/cuda/event.hpp | 32 +++++++-------- .../ur/adapters/cuda/kernel.cpp | 6 ++- .../ur/adapters/cuda/kernel.hpp | 12 +++--- .../ur/adapters/cuda/memory.cpp | 4 +- .../ur/adapters/cuda/memory.hpp | 15 +++---- .../ur/adapters/cuda/platform.cpp | 3 +- .../ur/adapters/cuda/program.cpp | 12 ++---- .../ur/adapters/cuda/queue.cpp | 5 +-- .../ur/adapters/cuda/queue.hpp | 40 +++++++++---------- .../ur/adapters/cuda/sampler.cpp | 6 +-- .../ur/adapters/cuda/ur_interface_loader.cpp | 6 +-- .../unified_runtime/ur/adapters/cuda/usm.cpp | 19 +++++---- 13 files changed, 85 insertions(+), 94 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp index 96103d4d52c14..e13c48fa003b9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp +++ 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp @@ -21,8 +21,8 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// UR context mapping to a CUDA context object. /// -/// There is no direct mapping between a CUDA context and a UR context, -/// main differences described below: +/// There is no direct mapping between a CUDA context and a UR context. +/// The main differences are described below: /// /// CUDA context vs UR context /// @@ -32,21 +32,21 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// with a given device and control access to said device from the user side. /// UR API context are objects that are passed to functions, and not bound /// to threads. -/// The _ur_context object doesn't implement this behavior, only holds the -/// CUDA context data. The RAII object \ref ScopedContext implements the active -/// context behavior. +/// The ur_context_handle_t_ object doesn't implement this behavior. It only +/// holds the CUDA context data. The RAII object \ref ScopedContext implements +/// the active context behavior. /// /// Primary vs User-defined context /// /// CUDA has two different types of context, the Primary context, /// which is usable by all threads on a given process for a given device, and /// the aforementioned custom contexts. -/// CUDA documentation, and performance analysis, indicates it is recommended -/// to use Primary context whenever possible. -/// Primary context is used as well by the CUDA Runtime API. +/// The CUDA documentation, confirmed with performance analysis, suggests using +/// the Primary context whenever possible. +/// The Primary context is also used by the CUDA Runtime API. /// For UR applications to interop with CUDA Runtime API, they have to use /// the primary context - and make that active in the thread. -/// The `_ur_context` object can be constructed with a `kind` parameter +/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter /// that allows to construct a Primary or `user-defined` context, so that /// the UR object interface is always the same. /// @@ -56,6 +56,7 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// the PI Context can store a number of callback functions that will be /// called upon destruction of the UR Context. /// See proposal for details. +/// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md /// struct ur_context_handle_t_ { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp index b1e0f939940ca..fe56c1e1ab501 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp @@ -101,7 +101,7 @@ struct ur_event_handle_t_ { uint32_t StreamToken); // This constructor is private to force programmers to use the - // makeWithNative for event introp + // makeWithNative for event interop ur_event_handle_t_(ur_context_handle_t Context, CUevent EventNative); ur_command_t CommandType; // The type of command associated with event. @@ -117,33 +117,34 @@ struct ur_event_handle_t_ { bool IsRecorded; // Signifies wether a native CUDA event has been recorded // yet. bool IsStarted; // Signifies wether the operation associated with the - // PI event has started or not + // UR event has started or not uint32_t StreamToken; uint32_t EventID; // Queue identifier of the event. - native_type EvEnd; // CUDA event handle. 
If this _pi_event represents a user - // event, this will be nullptr. + native_type EvEnd; // CUDA event handle. If this ur_event_handle_t represents + // a user event, this will be nullptr. native_type EvStart; // CUDA event handle associated with the start native_type EvQueued; // CUDA event handle associated with the time // the command was enqueued - ur_queue_handle_t Queue; // pi_queue associated with the event. If this is a - // user event, this will be nullptr. + ur_queue_handle_t Queue; // ur_queue_handle_t associated with the event. If + // this is a user event, this will be nullptr. CUstream Stream; // CUstream associated with the event. If this is a user // event, this will be uninitialized. - ur_context_handle_t Context; // pi_context associated with the event. If this - // is a native event, this will be the same - // context associated with the queue_ member. + ur_context_handle_t Context; // ur_context_handle_t associated with the event. + // If this is a native event, this will be the + // same context associated with the queue member. }; -// Iterates over the event wait list, returns correct ur_result_t error codes. -// Invokes the callback for the latest event of each queue in the wait list. -// The callback must take a single pi_event argument and return a ur_result_t. +// Iterate over `event_wait_list` and apply the given callback `f` to the +// latest event on each queue therein. The callback must take a single +// ur_event_handle_t argument and return a ur_result_t. If the callback returns +// an error, the iteration terminates and the error is returned. template ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList, std::size_t NumEventsInWaitList, Func &&F) { @@ -169,14 +170,13 @@ ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList, Event0->getEventID() > Event1->getEventID()); }); - bool First = true; CUstream LastSeenStream = 0; - for (ur_event_handle_t Event : Events) { - if (!Event || (!First && Event->getStream() == LastSeenStream)) { + for (size_t i = 0; i < Events.size(); i++) { + auto Event = Events[i]; + if (!Event || (i != 0 && Event->getStream() == LastSeenStream)) { continue; } - First = false; LastSeenStream = Event->getStream(); auto Result = F(Event); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index f3c05e016e441..69c02392fa522 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -66,7 +66,6 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, void *pPropValue, size_t *pPropSizeRet) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // Here we want to query about a kernel's cuda blocks! 
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { @@ -356,6 +355,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_program_handle_t hProgram, const ur_kernel_native_properties_t *pProperties, ur_kernel_handle_t *phKernel) { + std::ignore = hNativeKernel; + std::ignore = hContext; + std::ignore = hProgram; + std::ignore = pProperties; + std::ignore = phKernel; return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp index 3707cab1d1e0f..040f74ba6b403 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp @@ -23,7 +23,7 @@ /// invocation. This is not the case of CUFunction objects, /// which are simply passed together with the arguments on the invocation. /// The UR Kernel implementation for CUDA stores the list of arguments, -/// argument sizes and offsets to emulate the interface of UR Kernel, +/// argument sizes, and offsets to emulate the interface of UR Kernel, /// saving the arguments for the later dispatch. /// Note that in UR API, the Local memory is specified as a size per /// individual argument, but in CUDA only the total usage of shared @@ -31,7 +31,6 @@ /// A compiler pass converts the UR API local memory model into the /// CUDA shared model. This object simply calculates the total of /// shared memory, and the initial offsets of each parameter. -/// struct ur_kernel_handle_t_ { using native_type = CUfunction; @@ -68,7 +67,7 @@ struct ur_kernel_handle_t_ { Indices.emplace_back(&ImplicitOffsetArgs); } - /// Adds an argument to the kernel. + /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. @@ -104,8 +103,9 @@ struct ur_kernel_handle_t_ { // align the argument size_t AlignedLocalOffset = LocalOffset; - if (LocalOffset % Alignment != 0) { - AlignedLocalOffset += Alignment - (LocalOffset % Alignment); + size_t Pad = LocalOffset % Alignment; + if (Pad != 0) { + AlignedLocalOffset += Alignment - Pad; } addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), @@ -171,7 +171,7 @@ struct ur_kernel_handle_t_ { const char *getName() const noexcept { return Name.c_str(); } - /// Returns the number of arguments, excluding the implicit global offset. + /// Get the number of kernel arguments, excluding the implicit global offset. /// Note this only returns the current known number of arguments, not the /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp index b88d5307f4711..c8ecf9d5ddf12 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp @@ -14,7 +14,7 @@ /// Creates a UR Memory object using a CUDA memory allocation. /// Can trigger a manual copy depending on the mode. -/// \TODO Implement USE_HOST_PTR using cuHostRegister +/// \TODO Implement USE_HOST_PTR using cuHostRegister - See #9789 /// UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, @@ -109,7 +109,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { /// Decreases the reference count of the Mem object. 
/// If this is zero, calls the relevant CUDA Free function /// \return UR_RESULT_SUCCESS unless deallocation error -/// UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -435,7 +434,6 @@ urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType, /// Implements a buffer partition in the CUDA backend. /// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented /// as an offset over an existing CUDA allocation. -/// UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( ur_mem_handle_t hBuffer, ur_mem_flags_t flags, ur_buffer_create_type_t bufferCreateType, const ur_buffer_region_t *pRegion, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp index a1b484e3212bf..a986607a65d5e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp @@ -18,7 +18,7 @@ /// Keeps tracks of all mapped regions used for Map/Unmap calls. /// Only one region can be active at the same time per allocation. struct ur_mem_handle_t_ { - // Context where the memory object is accessibles + // Context where the memory object is accessible ur_context_handle_t Context; /// Reference counting of the handler @@ -31,7 +31,7 @@ struct ur_mem_handle_t_ { /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). /// In CUDA their API handlers are different. Whereas "Buffers" are allocated - /// as pointer-like structs, "Images" are stored in Textures or Surfaces + /// as pointer-like structs, "Images" are stored in Textures or Surfaces. /// This union allows implementation to use either from the same handler. union MemImpl { // Handler for plain, pointer-based CUDA allocations @@ -80,7 +80,6 @@ struct ur_mem_handle_t_ { /// Returns a pointer to data visible on the host that contains /// the data on the device associated with this allocation. /// The offset is used to index into the CUDA allocation. - /// void *mapToPtr(size_t Offset, ur_map_flags_t Flags) noexcept { assert(MapPtr == nullptr); MapOffset = Offset; @@ -152,7 +151,6 @@ struct ur_mem_handle_t_ { ur_mem_type_t ImageType, void *HostPtr) : Context{Context}, RefCount{1}, MemType{Type::Surface}, MemFlags{MemFlags} { - // Ignore unused parameter (void)HostPtr; Mem.SurfaceMem.Array = Array; @@ -162,16 +160,13 @@ struct ur_mem_handle_t_ { } ~ur_mem_handle_t_() { - if (MemType == Type::Buffer) { - if (isSubBuffer()) { - urMemRelease(Mem.BufferMem.Parent); - return; - } + if (isBuffer() && isSubBuffer()) { + urMemRelease(Mem.BufferMem.Parent); + return; } urContextRelease(Context); } - // TODO: Move as many shared funcs up as possible bool isBuffer() const noexcept { return MemType == Type::Buffer; } bool isSubBuffer() const noexcept { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index f28f76c2a95df..c0150df284cc5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -56,7 +56,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo( /// /// However because multiple devices in a context is not currently supported, /// place each device in a separate platform. 
-/// UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { @@ -183,7 +182,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { return UR_RESULT_SUCCESS; } -// Returns plugin specific backend option. +// Get CUDA plugin specific backend option. // Current support is only for optimization options. // Return empty string for cuda. // TODO: Determine correct string to be passed. diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index ce8d7c705ae83..f359b24eb68b6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -159,7 +159,7 @@ ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { /// CUDA driver API doesn't expose an operation for this. /// Note: This is currently only being used by the SYCL program class for the /// has_kernel method, so an alternative would be to move the has_kernel -/// query to PI and use cuModuleGetFunction to check for a kernel. +/// query to UR and use cuModuleGetFunction to check for a kernel. /// Note: Another alternative is to add kernel names as metadata, like with /// reqd_work_group_size. ur_result_t getKernelNames(ur_program_handle_t) { @@ -169,7 +169,6 @@ ur_result_t getKernelNames(ur_program_handle_t) { /// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object. /// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in /// terms of CUDA adapter. See \ref urProgramCreateWithBinary. -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, size_t length, const ur_program_properties_t *pProperties, @@ -186,7 +185,6 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, /// CUDA will handle the PTX/CUBIN binaries internally through a call to /// cuModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent /// in terms of CUDA adapter. \TODO Implement asynchronous compilation -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, const char *pOptions) { @@ -196,7 +194,6 @@ urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, /// Loads the images from a UR program into a CUmodule that can be /// used later on to extract functions (kernels). /// See \ref ur_program_handle_t for implementation details. -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, ur_program_handle_t hProgram, const char *pOptions) { @@ -218,7 +215,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, /// Creates a new UR program object that is the outcome of linking all input /// programs. /// \TODO Implement linker options, requires mapping of OpenCL to CUDA -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramLink(ur_context_handle_t hContext, uint32_t count, const ur_program_handle_t *phPrograms, const char *pOptions, @@ -390,10 +386,10 @@ urProgramRelease(ur_program_handle_t hProgram) { /// Gets the native CUDA handle of a UR program object /// -/// \param[in] program The PI program to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI program object. +/// \param[in] program The UR program handle to get the native CUDA object of. 
+/// \param[out] nativeHandle Set to the native handle of the UR program object. /// -/// \return TBD +/// \return ur_result_t UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( ur_program_handle_t program, ur_native_handle_t *nativeHandle) { UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp index 82edf55612669..7eac0144f1e21 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -115,7 +115,6 @@ CUstream ur_queue_handle_t_::getNextTransferStream() { /// Valid properties /// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT /// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING -/// UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { @@ -294,7 +293,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); - switch (uint32_t{propName}) { + switch (propName) { case UR_QUEUE_INFO_CONTEXT: return ReturnValue(hQueue->Context); case UR_QUEUE_INFO_DEVICE: @@ -324,7 +323,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, } } default: - break; + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } return UR_RESULT_ERROR_INVALID_ENUMERATION; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp index bfb8f6606b645..5b37f750cb520 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp @@ -84,7 +84,7 @@ struct ur_queue_handle_t_ { // get_next_compute/transfer_stream() functions return streams from // appropriate pools in round-robin fashion native_type getNextComputeStream(uint32_t *StreamToken = nullptr); - // this overload tries select a stream that was used by one of dependancies. + // this overload tries to select a stream that was used by one of dependencies. // If that is not possible returns a new stream. If a stream is reused it // returns a lock that needs to remain locked as long as the stream is in use native_type getNextComputeStream(uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_stream_guard_ &Guard, uint32_t *StreamToken = nullptr); native_type getNextTransferStream(); native_type get() { return getNextComputeStream(); }; @@ -203,26 +203,26 @@ struct ur_queue_handle_t_ { } { unsigned int Size = static_cast(TransferStreams.size()); - if (Size > 0) { - std::lock_guard TransferGuard(TransferStreamMutex); - unsigned int Start = LastSyncTransferStreams; - unsigned int End = NumTransferStreams < Size - ? NumTransferStreams - : TransferStreamIndex.load(); - if (ResetUsed) { - LastSyncTransferStreams = End; - } - if (End - Start >= Size) { - SyncTransfer(0, Size); + if (!Size) { + return; + } + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int Start = LastSyncTransferStreams; + unsigned int End = NumTransferStreams < Size ? 
NumTransferStreams + : TransferStreamIndex.load(); + if (ResetUsed) { + LastSyncTransferStreams = End; + } + if (End - Start >= Size) { + SyncTransfer(0, Size); + } else { + Start %= Size; + End %= Size; + if (Start <= End) { + SyncTransfer(Start, End); } else { - Start %= Size; - End %= Size; - if (Start <= End) { - SyncTransfer(Start, End); - } else { - SyncTransfer(Start, Size); - SyncTransfer(0, End); - } + SyncTransfer(Start, Size); + SyncTransfer(0, End); } } } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp index 464bd783b4646..decb3c1fd519a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp @@ -17,8 +17,8 @@ ur_result_t urSamplerCreate(ur_context_handle_t hContext, if (pDesc && pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { Sampler->Props |= pDesc->normalizedCoords; - Sampler->Props |= (pDesc->filterMode << 1); - Sampler->Props |= (pDesc->addressingMode << 2); + Sampler->Props |= pDesc->filterMode << 1; + Sampler->Props |= pDesc->addressingMode << 2; } else { // Set default values Sampler->Props |= true; // Normalized Coords @@ -46,7 +46,7 @@ ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, } case UR_SAMPLER_INFO_FILTER_MODE: { auto FilterProp = - static_cast(((hSampler->Props >> 1) & 0x1)); + static_cast((hSampler->Props >> 1) & 0x1); return ReturnValue(FilterProp); } case UR_SAMPLER_INFO_ADDRESSING_MODE: { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index f0eb6008d8a36..c7258ad241373 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -14,11 +14,11 @@ namespace { // TODO - this is a duplicate of what is in the L0 plugin // We should move this to somewhere common ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { - if (nullptr == pDdiTable) { + if (pDdiTable == nullptr) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - // Pre 1.0 we enforce loader and adapter must have same version. - // Post 1.0 only major version match should be required. + // Pre 1.0 we enforce that loader and adapter must have the same version. + // Post 1.0 only a major version match should be required. 
if (version != UR_API_VERSION_CURRENT) { return UR_RESULT_ERROR_UNSUPPORTED_VERSION; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp index 67b98f5c30319..7584e79a7c774 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp @@ -18,7 +18,7 @@ #include /// USM: Implements USM Host allocations using CUDA Pinned Memory -/// +/// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#page-locked-host-memory UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, size_t size, void **ppMem) { @@ -62,6 +62,9 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, @@ -77,11 +80,8 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, ScopedContext Active(hContext); Result = UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ppMem, size)); } catch (ur_result_t Err) { - Result = Err; + return Err; } - UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || - ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), - UR_RESULT_ERROR_INVALID_VALUE); assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || @@ -99,6 +99,9 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, @@ -115,11 +118,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, Result = UR_CHECK_ERROR( cuMemAllocManaged((CUdeviceptr *)ppMem, size, CU_MEM_ATTACH_GLOBAL)); } catch (ur_result_t Err) { - Result = Err; + return Err; } - UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || - ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), - UR_RESULT_ERROR_INVALID_VALUE); assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || @@ -206,7 +206,6 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, #else __builtin_unreachable(); #endif - return ReturnValue(UR_USM_TYPE_UNKNOWN); } case UR_USM_ALLOC_INFO_BASE_PTR: { #if __CUDA_API_VERSION >= 10020 From 2a50972775de4337226548e2392338c39030e08d Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 9 Jun 2023 14:44:42 +0100 Subject: [PATCH 43/45] [SYCL][CUDA] Tidy CMakeLists.txt --- sycl/plugins/cuda/CMakeLists.txt | 14 +++++++------- sycl/plugins/unified_runtime/CMakeLists.txt | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 70e4e1a200e1a..2570b6f7e7348 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -64,21 +64,21 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/enqueue.cpp" 
"../unified_runtime/ur/adapters/cuda/event.cpp" "../unified_runtime/ur/adapters/cuda/event.hpp" + "../unified_runtime/ur/adapters/cuda/kernel.cpp" + "../unified_runtime/ur/adapters/cuda/kernel.hpp" + "../unified_runtime/ur/adapters/cuda/memory.cpp" + "../unified_runtime/ur/adapters/cuda/memory.hpp" "../unified_runtime/ur/adapters/cuda/platform.cpp" "../unified_runtime/ur/adapters/cuda/platform.hpp" "../unified_runtime/ur/adapters/cuda/program.cpp" "../unified_runtime/ur/adapters/cuda/program.hpp" - "../unified_runtime/ur/adapters/cuda/kernel.cpp" - "../unified_runtime/ur/adapters/cuda/kernel.hpp" - "../unified_runtime/ur/adapters/cuda/queue.hpp" "../unified_runtime/ur/adapters/cuda/queue.cpp" + "../unified_runtime/ur/adapters/cuda/queue.hpp" "../unified_runtime/ur/adapters/cuda/sampler.cpp" "../unified_runtime/ur/adapters/cuda/sampler.hpp" - "../unified_runtime/ur/adapters/cuda/usm.cpp" - "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" "../unified_runtime/ur/adapters/cuda/tracing.cpp" - "../unified_runtime/ur/adapters/cuda/memory.cpp" - "../unified_runtime/ur/adapters/cuda/memory.hpp" + "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" + "../unified_runtime/ur/adapters/cuda/usm.cpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 9ceb01b670b98..372d7b5f82910 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -140,21 +140,21 @@ if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/cuda/enqueue.cpp" "ur/adapters/cuda/event.cpp" "ur/adapters/cuda/event.hpp" + "ur/adapters/cuda/kernel.cpp" + "ur/adapters/cuda/kernel.hpp" + "ur/adapters/cuda/memory.cpp" + "ur/adapters/cuda/memory.hpp" "ur/adapters/cuda/platform.cpp" "ur/adapters/cuda/platform.hpp" "ur/adapters/cuda/program.cpp" "ur/adapters/cuda/program.hpp" - "ur/adapters/cuda/kernel.cpp" - "ur/adapters/cuda/kernel.hpp" "ur/adapters/cuda/queue.cpp" "ur/adapters/cuda/queue.hpp" "ur/adapters/cuda/sampler.cpp" "ur/adapters/cuda/sampler.hpp" - "ur/adapters/cuda/memory.cpp" - "ur/adapters/cuda/memory.hpp" - "ur/adapters/cuda/usm.cpp" - "ur/adapters/cuda/ur_interface_loader.cpp" "ur/adapters/cuda/tracing.cpp" + "ur/adapters/cuda/ur_interface_loader.cpp" + "ur/adapters/cuda/usm.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES From c39e7942f554b77f9a2c6b547ec659000bf63fb5 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Mon, 12 Jun 2023 16:41:37 +0100 Subject: [PATCH 44/45] Fix various build warnings --- .../ur/adapters/cuda/context.cpp | 9 +++++++-- .../ur/adapters/cuda/device.cpp | 1 + .../ur/adapters/cuda/enqueue.cpp | 20 ++++++++++++------- .../ur/adapters/cuda/kernel.cpp | 1 + .../ur/adapters/cuda/kernel.hpp | 5 +++-- .../ur/adapters/cuda/memory.cpp | 16 +++++++-------- .../ur/adapters/cuda/program.cpp | 6 +++--- .../ur/adapters/cuda/queue.cpp | 1 + .../ur/adapters/cuda/queue.hpp | 2 +- 9 files changed, 38 insertions(+), 23 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp index c922e8a3ddad6..74a32bdac2748 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp @@ -22,6 +22,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, const ur_context_properties_t *pProperties, ur_context_handle_t *phContext) 
{ + std::ignore = DeviceCount; + std::ignore = pProperties; UR_ASSERT(phDevices, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(phContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -132,8 +134,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( const ur_device_handle_t *phDevices, const ur_context_native_properties_t *pProperties, ur_context_handle_t *phContext) { - (void)hNativeContext; - (void)phContext; + std::ignore = hNativeContext; + std::ignore = numDevices; + std::ignore = phDevices; + std::ignore = pProperties; + std::ignore = phContext; return UR_RESULT_ERROR_INVALID_OPERATION; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index c3028a58717c6..51ceab14db3d2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -1098,6 +1098,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { + std::ignore = pProperties; UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER); // We can't cast between ur_native_handle_t and CUdevice, so memcpy the bits diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index ef87dab96d2fa..242a419407030 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -561,7 +561,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( Result = commonEnqueueMemBufferCopyRect( CuStream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin, - hostRowPitch, bufferSlicePitch); + hostRowPitch, hostSlicePitch); if (phEvent) { Result = RetImplEvent->record(); @@ -905,8 +905,11 @@ static ur_result_t commonEnqueueMemImageNDCopy( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingRead, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, - size_t phEventWaitListslicePitch, void *pDst, uint32_t numEventsInWaitList, + size_t slicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = rowPitch; + std::ignore = slicePitch; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, @@ -972,6 +975,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = blockingWrite; + std::ignore = rowPitch; + std::ignore = slicePitch; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, @@ -1456,10 +1463,8 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, // TODO: Implement this. Remember to return true for // PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented. 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D(
-    ur_queue_handle_t hQueue, void *pMem, size_t pitch, size_t patternSize,
-    const void *pPattern, size_t width, size_t height,
-    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
-    ur_event_handle_t *phEvent) {
+    ur_queue_handle_t, void *, size_t, size_t, const void *, size_t, size_t,
+    uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
@@ -1484,7 +1489,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
 
     // Determine the direction of copy using cuPointerGetAttribute
     // for both the SrcPtr and DstPtr
-    CUDA_MEMCPY2D CpyDesc = {0};
+    CUDA_MEMCPY2D CpyDesc = {};
+    memset(&CpyDesc, 0, sizeof(CpyDesc));
 
     getUSMHostOrDevicePtr(pSrc, &CpyDesc.srcMemoryType, &CpyDesc.srcDevice,
                           &CpyDesc.srcHost);
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp
index 69c02392fa522..e1d6f9f9a2cd3 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp
@@ -338,6 +338,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj(
 UR_APIEXPORT ur_result_t UR_APICALL
 urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName,
                     size_t propSize, const void *pPropValue) {
+  std::ignore = propSize;
   UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   UR_ASSERT(pPropValue, UR_RESULT_ERROR_INVALID_NULL_POINTER);
   switch (propName) {
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp
index 040f74ba6b403..8b6a617126b08 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp
@@ -137,11 +137,12 @@ struct ur_kernel_handle_t_ {
     urProgramRetain(Program);
     urContextRetain(Context);
     /// Note: this code assumes that there is only one device per context
-    ur_result_t retError = urKernelGetGroupInfo(
+    ur_result_t RetError = urKernelGetGroupInfo(
         this, Context->getDevice(),
         UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
         sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr);
-    assert(retError == UR_RESULT_SUCCESS);
+    (void)RetError;
+    assert(RetError == UR_RESULT_SUCCESS);
   }
 
   ~ur_kernel_handle_t_() {
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp
index c8ecf9d5ddf12..b19acea3159f2 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp
@@ -221,15 +221,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory,
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle(
-    ur_native_handle_t hNativeMem, ur_context_handle_t hContext,
-    const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) {
+    ur_native_handle_t, ur_context_handle_t, const ur_mem_native_properties_t *,
+    ur_mem_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle(
-    ur_native_handle_t hNativeMem, ur_context_handle_t hContext,
-    const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc,
-    const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) {
+    ur_native_handle_t, ur_context_handle_t, const ur_image_format_t *,
+    const ur_image_desc_t *, const ur_mem_native_properties_t *,
+    ur_mem_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
@@ -425,9 +425,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
 }
 
 /// \TODO Not implemented
-UR_APIEXPORT ur_result_t UR_APICALL
-urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType,
-                  size_t propSize, void *pImgInfo, size_t *pPropSizeRet) {
+UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t,
+                                                      ur_image_info_t, size_t,
+                                                      void *, size_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp
index f359b24eb68b6..e7467af0b8cbf 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp
@@ -197,6 +197,7 @@ urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
 UR_APIEXPORT ur_result_t UR_APICALL
 urProgramBuild(ur_context_handle_t hContext, ur_program_handle_t hProgram,
                const char *pOptions) {
+  std::ignore = hContext;
   UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
 
   ur_result_t Result = UR_RESULT_SUCCESS;
@@ -274,9 +275,8 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count,
 ///
 /// \return TBD
 UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle(
-    ur_native_handle_t hNativeProgram, ur_context_handle_t hContext,
-    const ur_program_native_properties_t *pProperties,
-    ur_program_handle_t *phProgram) {
+    ur_native_handle_t, ur_context_handle_t,
+    const ur_program_native_properties_t *, ur_program_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp
index 7eac0144f1e21..1aded75fb0741 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp
@@ -237,6 +237,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) {
 UR_APIEXPORT ur_result_t UR_APICALL
 urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc,
                        ur_native_handle_t *phNativeQueue) {
+  std::ignore = pDesc;
   UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER);
 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp
index 5b37f750cb520..69232efcc77e6 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp
@@ -180,7 +180,7 @@ struct ur_queue_handle_t_ {
     };
     {
      unsigned int Size = static_cast<unsigned int>(ComputeStreams.size());
-      std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex);
+      std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex);
       std::lock_guard ComputeGuard(ComputeStreamMutex);
       unsigned int Start = LastSyncComputeStreams;
       unsigned int End = NumComputeStreams < Size ? NumComputeStreams

From b64fcbd5be136f2fb32d45f1e4d2adc8d983818d Mon Sep 17 00:00:00 2001
From: Callum Fare
Date: Wed, 14 Jun 2023 10:26:13 +0100
Subject: [PATCH 45/45] Address more review feedback

---
 sycl/plugins/unified_runtime/CMakeLists.txt |  5 +++++
 .../ur/adapters/cuda/device.cpp             |  2 +-
 .../ur/adapters/cuda/enqueue.cpp            |  6 +++---
 .../ur/adapters/cuda/platform.cpp           | 10 +++++-----
 .../ur/adapters/cuda/sampler.cpp            | 18 ++++++++++--------
 5 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index 372d7b5f82910..8cff5b2848b0f 100755
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -162,6 +162,11 @@ if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS)
       Threads::Threads
       cudadrv
   )
+
+  set_target_properties("ur_adapter_cuda" PROPERTIES
+    VERSION "0.0.0"
+    SOVERSION "0"
+  )
 endif()
 
 if (TARGET UnifiedRuntimeLoader)
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp
index 51ceab14db3d2..c364c6f384a49 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp
@@ -1026,7 +1026,7 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *,
 
 /// \return UR_RESULT_SUCCESS always since CUDA devices are always root
 ///         devices.
-UR_DLLEXPORT ur_result_t UR_APICALL
+UR_APIEXPORT ur_result_t UR_APICALL
 urDeviceRelease(ur_device_handle_t hDevice) {
   UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
index 242a419407030..8dbd6ee2a27fe 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
@@ -223,7 +223,7 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device,
 /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued
 /// wait will wait on all previous events in the queue.
 ///
-UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
     ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
   // This function makes one stream work on the previous work (or work
@@ -303,14 +303,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
 /// TODO: Add support for multiple streams once the Event class is properly
 /// refactored.
 ///
-UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
     ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
   return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
                                         phEventWaitList, phEvent);
 }
 
-UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
     const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp
index c0150df284cc5..600512d0b01c7 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp
@@ -18,7 +18,7 @@
 void enableCUDATracing();
 void disableCUDATracing();
 
-UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo(
+UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo(
     ur_platform_handle_t hPlatform, ur_platform_info_t PlatformInfoType,
     size_t Size, void *pPlatformInfo, size_t *pSizeRet) {
 
@@ -56,7 +56,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo(
 ///
 /// However because multiple devices in a context is not currently supported,
 /// place each device in a separate platform.
-UR_DLLEXPORT ur_result_t UR_APICALL
+UR_APIEXPORT ur_result_t UR_APICALL
 urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms,
               uint32_t *pNumPlatforms) {
 
@@ -163,7 +163,7 @@ urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms,
   }
 }
 
-UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion(
+UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion(
     ur_platform_handle_t hDriver, ur_api_version_t *pVersion) {
   UR_ASSERT(hDriver, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   UR_ASSERT(pVersion, UR_RESULT_ERROR_INVALID_NULL_POINTER);
@@ -172,12 +172,12 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion(
   return UR_RESULT_SUCCESS;
 }
 
-UR_DLLEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t) {
+UR_APIEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t) {
   enableCUDATracing();
   return UR_RESULT_SUCCESS;
 }
 
-UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) {
+UR_APIEXPORT ur_result_t UR_APICALL urTearDown(void *) {
   disableCUDATracing();
   return UR_RESULT_SUCCESS;
 }
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp
index decb3c1fd519a..36ec89fb9da3c 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp
@@ -9,9 +9,9 @@
 #include "sampler.hpp"
 #include "common.hpp"
 
-ur_result_t urSamplerCreate(ur_context_handle_t hContext,
-                            const ur_sampler_desc_t *pDesc,
-                            ur_sampler_handle_t *phSampler) {
+UR_APIEXPORT ur_result_t UR_APICALL
+urSamplerCreate(ur_context_handle_t hContext, const ur_sampler_desc_t *pDesc,
+                ur_sampler_handle_t *phSampler) {
   std::unique_ptr<ur_sampler_handle_t_> Sampler{
       new ur_sampler_handle_t_(hContext)};
 
@@ -29,9 +29,9 @@ ur_result_t urSamplerCreate(ur_context_handle_t hContext,
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler,
-                             ur_sampler_info_t propName, size_t propValueSize,
-                             void *pPropValue, size_t *pPropSizeRet) {
+UR_APIEXPORT ur_result_t UR_APICALL
+urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_sampler_info_t propName,
+                 size_t propValueSize, void *pPropValue, size_t *pPropSizeRet) {
   UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
 
   UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet);
@@ -60,13 +60,15 @@ ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler,
   return {};
 }
 
-ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler) {
+UR_APIEXPORT ur_result_t UR_APICALL
+urSamplerRetain(ur_sampler_handle_t hSampler) {
   UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   hSampler->incrementReferenceCount();
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler) {
+UR_APIEXPORT ur_result_t UR_APICALL
+urSamplerRelease(ur_sampler_handle_t hSampler) {
   UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
 
   // double delete or someone is messing with the ref count.