diff --git a/buildbot/configure.py b/buildbot/configure.py index 195eb168a1420..19b36171b2bff 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -27,6 +27,7 @@ def do_configure(args): sycl_build_pi_cuda = 'OFF' sycl_build_pi_esimd_cpu = 'ON' sycl_build_pi_rocm = 'OFF' + sycl_build_pi_rocm_platform = 'AMD' sycl_werror = 'ON' llvm_enable_assertions = 'ON' llvm_enable_doxygen = 'OFF' @@ -40,21 +41,26 @@ def do_configure(args): if args.arm: llvm_targets_to_build = 'ARM;AArch64' + if args.disable_esimd_cpu: + sycl_build_pi_esimd_cpu = 'OFF' + + if args.cuda or args.rocm: + llvm_enable_projects += ';libclc' + if args.cuda: llvm_targets_to_build += ';NVPTX' - llvm_enable_projects += ';libclc' libclc_targets_to_build = 'nvptx64--;nvptx64--nvidiacl' sycl_build_pi_cuda = 'ON' - if args.disable_esimd_cpu: - sycl_build_pi_esimd_cpu = 'OFF' - if args.rocm: - llvm_targets_to_build += ';AMDGPU' - # TODO libclc should be added once, - # TODO when we build DPC++ with both CUDA and ROCM support - llvm_enable_projects += ';libclc' - libclc_targets_to_build = 'amdgcn--;amdgcn--amdhsa' + if args.rocm_platform == 'AMD': + llvm_targets_to_build += ';AMDGPU' + libclc_targets_to_build += ';amdgcn--;amdgcn--amdhsa' + elif args.rocm_platform == 'NVIDIA' and not args.cuda: + llvm_targets_to_build += ';NVPTX' + libclc_targets_to_build += ';nvptx64--;nvptx64--nvidiacl' + + sycl_build_pi_rocm_platform = args.rocm_platform sycl_build_pi_rocm = 'ON' if args.no_werror: @@ -92,6 +98,7 @@ def do_configure(args): "-DLIBCLC_TARGETS_TO_BUILD={}".format(libclc_targets_to_build), "-DSYCL_BUILD_PI_CUDA={}".format(sycl_build_pi_cuda), "-DSYCL_BUILD_PI_ROCM={}".format(sycl_build_pi_rocm), + "-DSYCL_BUILD_PI_ROCM_PLATFORM={}".format(sycl_build_pi_rocm_platform), "-DLLVM_BUILD_TOOLS=ON", "-DSYCL_ENABLE_WERROR={}".format(sycl_werror), "-DCMAKE_INSTALL_PREFIX={}".format(install_dir), @@ -161,7 +168,8 @@ def main(): parser.add_argument("-t", "--build-type", metavar="BUILD_TYPE", default="Release", help="build type: Debug, Release") parser.add_argument("--cuda", action='store_true', help="switch from OpenCL to CUDA") - parser.add_argument("--rocm", action='store_true', help="swith from OpenCL to ROCM") + parser.add_argument("--rocm", action='store_true', help="switch from OpenCL to ROCm") + parser.add_argument("--rocm-platform", type=str, choices=['AMD', 'NVIDIA'], default='AMD', help="choose ROCm backend") parser.add_argument("--arm", action='store_true', help="build ARM support rather than x86") parser.add_argument("--disable-esimd-cpu", action='store_true', help="build without ESIMD_CPU support") parser.add_argument("--no-assertions", action='store_true', help="build without assertions") diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index fac0d53fc0c7a..35ff90a6d87af 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -10,6 +10,7 @@ and a wide range of compute accelerators such as GPU and FPGA. - [Build DPC++ toolchain with libc++ library](#build-dpc-toolchain-with-libc-library) - [Build DPC++ toolchain with support for NVIDIA CUDA](#build-dpc-toolchain-with-support-for-nvidia-cuda) - [Build DPC++ toolchain with support for AMD ROCm](#build-dpc-toolchain-with-support-for-amd-rocm) + - [Build DPC++ toolchain with support for NVIDIA ROCm](#build-dpc-toolchain-with-support-for-nvidia-rocm) - [Build Doxygen documentation](#build-doxygen-documentation) - [Deployment](#deployment) - [Use DPC++ toolchain](#use-dpc-toolchain) @@ -107,6 +108,7 @@ flags can be found by launching the script with `--help`): * `--no-werror` -> Don't treat warnings as errors when compiling llvm * `--cuda` -> use the cuda backend (see [Nvidia CUDA](#build-dpc-toolchain-with-support-for-nvidia-cuda)) * `--rocm` -> use the rocm backend (see [AMD ROCm](#build-dpc-toolchain-with-support-for-amd-rocm)) +* `--rocm-platform` -> select the platform used by the rocm backend, `AMD` or `NVIDIA` (see [AMD ROCm](#build-dpc-toolchain-with-support-for-amd-rocm) or see [NVIDIA ROCm](#build-dpc-toolchain-with-support-for-nvidia-rocm)) * `--shared-libs` -> Build shared libraries * `-t` -> Build type (debug or release) * `-o` -> Path to build directory @@ -175,6 +177,34 @@ produce a standard ELF shared code object which can be loaded and executed on an So if you want to support AMD ROCm, you should also build the lld project. [LLD Build Guide](https://lld.llvm.org/) +The following CMake variables can be updated to change where CMake is looking +for the ROCm installation: + +* `SYCL_BUILD_PI_ROCM_INCLUDE_DIR`: Path to HIP include directory (default + `/opt/rocm/hip/include`). +* `SYCL_BUILD_PI_ROCM_HSA_INCLUDE_DIR`: Path to HSA include directory (default + `/opt/rocm/hsa/include`). +* `SYCL_BUILD_PI_ROCM_AMD_LIBRARY`: Path to HIP runtime library (default + `/opt/rocm/hip/lib/libamdhip64.so`). + +### Build DPC++ toolchain with support for NVIDIA ROCm + +There is experimental support for DPC++ for using ROCm on NVIDIA devices. + +This is a compatibility feature and the [CUDA backend](#build-dpc-toolchain-with-support-for-nvidia-cuda) +should be preferred to run on NVIDIA GPUs. + +To enable support for NVIDIA ROCm devices, follow the instructions for the Linux +DPC++ toolchain, but add the `--rocm` and `--rocm-platform NVIDIA` flags to +`configure.py`. + +Enabling this flag requires ROCm to be installed, more specifically +[HIP NVCC](https://rocmdocs.amd.com/en/latest/Installation_Guide/HIP-Installation.html#nvidia-platform), +as well as CUDA to be installed, see +[NVIDIA CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html). + +Currently this was only tested on Linux with ROCm 4.2, CUDA 11 and a GeForce GTX +1060 card. ### Build Doxygen documentation @@ -510,7 +540,7 @@ and run following command: clang++ -fsycl simple-sycl-app.cpp -o simple-sycl-app.exe ``` -When building for CUDA, use the CUDA target triple as follows: +When building for CUDA or NVIDIA ROCm, use the CUDA target triple as follows: ```bash clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \ diff --git a/sycl/plugins/rocm/CMakeLists.txt b/sycl/plugins/rocm/CMakeLists.txt index edf593113aaab..bfe5700ee7b80 100644 --- a/sycl/plugins/rocm/CMakeLists.txt +++ b/sycl/plugins/rocm/CMakeLists.txt @@ -1,51 +1,76 @@ -message(STATUS "Including the PI API ROCM backend.") +# Set default PI ROCm platform to AMD +set(SYCL_BUILD_PI_ROCM_PLATFORM "AMD" CACHE STRING "PI ROCm platform, AMD or NVIDIA") - # cannot rely on cmake support for ROCM; it assumes runtime API is being used. - # we only require the ROCM driver API to be used - # rocm_rocm_LIBRARY variable defines the path to libhsa-runtime64.so, the ROCM Driver API library. +message(STATUS "Including the PI API ROCM backend for ${SYCL_BUILD_PI_ROCM_PLATFORM}.") -#find_package(ROCM 4.0 REQUIRED) - -# Make imported library global to use it within the project. -add_library(rocmdrv SHARED IMPORTED GLOBAL) - - -set(ROCM_ROCM_LIBRARY "/opt/rocm/hip/lib/libamdhip64.so") -set(ROCM_INCLUDE_DIRS "/opt/rocm/hip/include") -set(hsa_inc_dir "/opt/rocm/hsa/include") - - -add_definitions(-D__HIP_PLATFORM_HCC__) - -set_target_properties( - rocmdrv PROPERTIES - IMPORTED_LOCATION ${ROCM_ROCM_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${ROCM_INCLUDE_DIRS} -) +# Set default ROCm include dirs +set(SYCL_BUILD_PI_ROCM_INCLUDE_DIR "/opt/rocm/hip/include" CACHE STRING "HIP include dir") +set(SYCL_BUILD_PI_ROCM_HSA_INCLUDE_DIR "/opt/rocm/hsa/include" CACHE STRING "HSA include dir") +set(HIP_HEADERS "${SYCL_BUILD_PI_ROCM_INCLUDE_DIR};${SYCL_BUILD_PI_ROCM_HSA_INCLUDE_DIR}") +# Create pi_rocm library add_library(pi_rocm SHARED - "${sycl_inc_dir}/CL/sycl/detail/pi.h" - "${sycl_inc_dir}/CL/sycl/detail/pi.hpp" - "pi_rocm.hpp" - "pi_rocm.cpp" + "${sycl_inc_dir}/CL/sycl/detail/pi.h" + "${sycl_inc_dir}/CL/sycl/detail/pi.hpp" + "pi_rocm.hpp" + "pi_rocm.cpp" ) - - add_dependencies(sycl-toolchain pi_rocm) - set_target_properties(pi_rocm PROPERTIES LINKER_LANGUAGE CXX) +target_link_libraries(pi_rocm PUBLIC OpenCL-Headers) - +# Setup include directories target_include_directories(pi_rocm - PRIVATE - ${sycl_inc_dir} - ${sycl_plugin_dir} - ${ROCM_INCLUDE_DIRS} - ${hsa_inc_dir} + PRIVATE + ${sycl_inc_dir} + ${sycl_plugin_dir} ) - -target_link_libraries(pi_rocm PUBLIC OpenCL-Headers rocmdrv) +if("${SYCL_BUILD_PI_ROCM_PLATFORM}" STREQUAL "AMD") + # Import HIP runtime library + set(SYCL_BUILD_PI_ROCM_AMD_LIBRARY "/opt/rocm/hip/lib/libamdhip64.so" CACHE STRING "HIP AMD runtime library") + add_library(rocmdrv SHARED IMPORTED GLOBAL) + + set_target_properties( + rocmdrv PROPERTIES + IMPORTED_LOCATION ${SYCL_BUILD_PI_ROCM_AMD_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + target_link_libraries(pi_rocm PUBLIC rocmdrv) + + # Set HIP define to select AMD platform + target_compile_definitions(pi_rocm PRIVATE __HIP_PLATFORM_AMD__) + elseif("${SYCL_BUILD_PI_ROCM_PLATFORM}" STREQUAL "NVIDIA") + # Import CUDA libraries + find_package(CUDA REQUIRED) + list(APPEND HIP_HEADERS ${CUDA_INCLUDE_DIRS}) + + # cudadrv may be defined by the CUDA plugin + if(NOT TARGET cudadrv) + add_library(cudadrv SHARED IMPORTED GLOBAL) + set_target_properties( + cudadrv PROPERTIES + IMPORTED_LOCATION ${CUDA_CUDA_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + endif() + + add_library(cudart SHARED IMPORTED GLOBAL) + set_target_properties( + cudart PROPERTIES + IMPORTED_LOCATION ${CUDA_CUDART_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + target_link_libraries(pi_rocm PUBLIC cudadrv cudart) + + # Set HIP define to select NVIDIA platform + target_compile_definitions(pi_rocm PRIVATE __HIP_PLATFORM_NVIDIA__) +else() + message(FATAL_ERROR "Unspecified PI ROCM platform please set SYCL_BUILD_PI_ROCM_PLATFORM to 'AMD' or 'NVIDIA'") +endif() add_common_options(pi_rocm) diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index 687bd2ed935d8..c238f337f5a8c 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -23,8 +23,64 @@ #include #include #include +#include namespace { +// Hipify doesn't support cuArrayGetDescriptor, on AMD the hipArray can just be +// indexed, but on NVidia it is an opaque type and needs to go through +// cuArrayGetDescriptor so implement a utility function to get the array +// properties +inline void getArrayDesc(hipArray *array, hipArray_Format &format, + size_t &channels) { +#if defined(__HIP_PLATFORM_AMD__) + format = array->Format; + channels = array->NumChannels; +#elif defined(__HIP_PLATFORM_NVIDIA__) + CUDA_ARRAY_DESCRIPTOR arrayDesc; + cuArrayGetDescriptor(&arrayDesc, (CUarray)array); + + format = arrayDesc.Format; + channels = arrayDesc.NumChannels; +#else +#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); +#endif +} + +// NVidia HIP headers guard hipArray3DCreate behind __CUDACC__, this does not +// seem to be required and we're not using nvcc to build the HIP PI plugin so +// add the translation function here +#if defined(__HIP_PLATFORM_NVIDIA__) && !defined(__CUDACC__) +inline static hipError_t +hipArray3DCreate(hiparray *pHandle, + const HIP_ARRAY3D_DESCRIPTOR *pAllocateArray) { + return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray)); +} +#endif + +// hipArray gets turned into cudaArray when using the HIP NVIDIA platform, and +// some CUDA APIs use cudaArray* and others use CUarray, these two represent the +// same type, however when building cudaArray appears as an opaque type, so it +// needs to be explicitly casted to CUarray. In order for this to work for both +// AMD and NVidia we introduce an second hipArray type that will be CUarray for +// NVIDIA and hipArray* for AMD so that we can place the explicit casts when +// necessary for NVIDIA and they will be no-ops for AMD. +#if defined(__HIP_PLATFORM_NVIDIA__) +typedef CUarray hipCUarray; +#elif defined(__HIP_PLATFORM_AMD__) +typedef hipArray *hipCUarray; +#else +#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); +#endif + +// Add missing HIP to CUDA defines +#if defined(__HIP_PLATFORM_NVIDIA__) +#define hipMemoryType CUmemorytype +#define hipMemoryTypeHost CU_MEMORYTYPE_HOST +#define hipMemoryTypeDevice CU_MEMORYTYPE_DEVICE +#define hipMemoryTypeArray CU_MEMORYTYPE_ARRAY +#define hipMemoryTypeUnified CU_MEMORYTYPE_UNIFIED +#endif + std::string getHipVersionString() { int driver_version = 0; if (hipDriverGetVersion(&driver_version) != hipSuccess) { @@ -216,15 +272,20 @@ pi_result getInfo(size_t param_value_size, void *param_value, }; return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - sizeof(T), assignment); + sizeof(T), std::move(assignment)); } template pi_result getInfoArray(size_t array_length, size_t param_value_size, void *param_value, size_t *param_value_size_ret, T *value) { + + auto assignment = [](void *param_value, T *value, size_t value_size) { + memcpy(param_value, static_cast(value), value_size); + }; + return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - array_length * sizeof(T), memcpy); + array_length * sizeof(T), std::move(assignment)); } template <> @@ -802,11 +863,18 @@ pi_result rocm_piextDeviceSelectBinary(pi_device device, cl::sycl::detail::pi::die("No binary images in the list"); } - // Look for an image for the AMDGCN target, and return the first one that is + // Look for an image for the ROCm target, and return the first one that is // found +#if defined(__HIP_PLATFORM_AMD__) + const char *binary_type = __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN; +#elif defined(__HIP_PLATFORM_NVIDIA__) + const char *binary_type = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64; +#else +#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); +#endif + for (pi_uint32 i = 0; i < num_binaries; i++) { - if (strcmp(binaries[i]->DeviceTargetSpec, - __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN) == 0) { + if (strcmp(binaries[i]->DeviceTargetSpec, binary_type) == 0) { *selected_binary = i; return PI_SUCCESS; } @@ -1699,7 +1767,7 @@ pi_result rocm_piMemBufferCreate(pi_context context, pi_mem_flags flags, try { ScopedContext active(context); - hipDevPtr ptr; + void *ptr; _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::classic; @@ -1722,13 +1790,15 @@ pi_result rocm_piMemBufferCreate(pi_context context, pi_mem_flags flags, if (retErr == PI_SUCCESS) { pi_mem parentBuffer = nullptr; - auto piMemObj = std::unique_ptr<_pi_mem>( - new _pi_mem{context, parentBuffer, allocMode, ptr, host_ptr, size}); + auto devPtr = + reinterpret_cast<_pi_mem::mem_::mem_::buffer_mem_::native_type>(ptr); + auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ + context, parentBuffer, allocMode, devPtr, host_ptr, size}); if (piMemObj != nullptr) { retMemObj = piMemObj.release(); if (performInitialCopy) { // Operates on the default stream of the current HIP context. - retErr = PI_CHECK_ERROR(hipMemcpyHtoD(ptr, host_ptr, size)); + retErr = PI_CHECK_ERROR(hipMemcpyHtoD(devPtr, host_ptr, size)); // Synchronize with default stream implicitly used by cuMemcpyHtoD // to make buffer data available on device before any other PI call // uses it. @@ -1781,7 +1851,8 @@ pi_result rocm_piMemRelease(pi_mem memObj) { switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { case _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in: case _pi_mem::mem_::buffer_mem_::alloc_mode::classic: - ret = PI_CHECK_ERROR(hipFree(uniqueMemObj->mem_.buffer_mem_.ptr_)); + ret = PI_CHECK_ERROR( + hipFree((void *)uniqueMemObj->mem_.buffer_mem_.ptr_)); break; case _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr: ret = PI_CHECK_ERROR( @@ -1797,7 +1868,7 @@ pi_result rocm_piMemRelease(pi_mem memObj) { ret = PI_CHECK_ERROR(hipDestroySurfaceObject( uniqueMemObj->mem_.surface_mem_.get_surface())); auto array = uniqueMemObj->mem_.surface_mem_.get_array(); - ret = PI_CHECK_ERROR(hipFreeArray(&array)); + ret = PI_CHECK_ERROR(hipFreeArray(array)); } } catch (pi_result err) { @@ -1857,7 +1928,7 @@ pi_result rocm_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags, assert(parent_buffer->mem_.buffer_mem_.ptr_ != _pi_mem::mem_::buffer_mem_::native_type{0}); _pi_mem::mem_::buffer_mem_::native_type ptr = - (uint8_t *)(parent_buffer->mem_.buffer_mem_.ptr_) + bufferRegion.origin; + parent_buffer->mem_.buffer_mem_.get_with_offset(bufferRegion.origin); void *hostPtr = nullptr; if (parent_buffer->mem_.buffer_mem_.hostPtr_) { @@ -2079,7 +2150,6 @@ pi_result rocm_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; hipStream_t hipStream = command_queue->get(); - hipDevPtr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { @@ -2095,7 +2165,8 @@ pi_result rocm_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, } retErr = PI_CHECK_ERROR( - hipMemcpyHtoDAsync((uint8_t *)devPtr + offset, ptr, size, hipStream)); + hipMemcpyHtoDAsync(buffer->mem_.buffer_mem_.get_with_offset(offset), + ptr, size, hipStream)); if (event) { retErr = retImplEv->record(); @@ -2125,7 +2196,6 @@ pi_result rocm_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; hipStream_t hipStream = command_queue->get(); - hipDevPtr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { @@ -2140,8 +2210,9 @@ pi_result rocm_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, retImplEv->start(); } - retErr = PI_CHECK_ERROR( - hipMemcpyDtoHAsync(ptr, (uint8_t *)devPtr + offset, size, hipStream)); + retErr = PI_CHECK_ERROR(hipMemcpyDtoHAsync( + ptr, buffer->mem_.buffer_mem_.get_with_offset(offset), size, + hipStream)); if (event) { retErr = retImplEv->record(); @@ -2265,10 +2336,12 @@ pi_result rocm_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) { auto array = arg_mem->mem_.surface_mem_.get_array(); - if (array.Format != HIP_AD_FORMAT_UNSIGNED_INT32 && - array.Format != HIP_AD_FORMAT_SIGNED_INT32 && - array.Format != HIP_AD_FORMAT_HALF && - array.Format != HIP_AD_FORMAT_FLOAT) { + hipArray_Format Format; + size_t NumChannels; + getArrayDesc(array, Format, NumChannels); + if (Format != HIP_AD_FORMAT_UNSIGNED_INT32 && + Format != HIP_AD_FORMAT_SIGNED_INT32 && + Format != HIP_AD_FORMAT_HALF && Format != HIP_AD_FORMAT_FLOAT) { cl::sycl::detail::pi::die( "PI HIP kernels only support images with channel types int32, " "uint32, float, and half."); @@ -2278,8 +2351,8 @@ pi_result rocm_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, } else { - hipDevPtr hipPtr = arg_mem->mem_.buffer_mem_.get(); - kernel->set_kernel_arg(arg_index, sizeof(hipDevPtr), (void *)&hipPtr); + void *hipPtr = arg_mem->mem_.buffer_mem_.get_void(); + kernel->set_kernel_arg(arg_index, sizeof(void *), (void *)&hipPtr); } } catch (pi_result err) { retErr = err; @@ -2525,7 +2598,8 @@ pi_result rocm_piMemImageCreate(pi_context context, pi_mem_flags flags, ScopedContext active(context); hipArray *image_array; - retErr = PI_CHECK_ERROR(hipArray3DCreate(&image_array, &array_desc)); + retErr = PI_CHECK_ERROR(hipArray3DCreate( + reinterpret_cast(&image_array), &array_desc)); try { if (performInitialCopy) { @@ -2539,7 +2613,7 @@ pi_result rocm_piMemImageCreate(pi_context context, pi_mem_flags flags, cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; cpy_desc.srcHost = host_ptr; cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; - cpy_desc.dstArray = image_array; + cpy_desc.dstArray = reinterpret_cast(image_array); cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; cpy_desc.Height = image_desc->image_height; retErr = PI_CHECK_ERROR(hipMemcpyParam2D(&cpy_desc)); @@ -2549,7 +2623,7 @@ pi_result rocm_piMemImageCreate(pi_context context, pi_mem_flags flags, cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; cpy_desc.srcHost = host_ptr; cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; - cpy_desc.dstArray = image_array; + cpy_desc.dstArray = reinterpret_cast(image_array); cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; cpy_desc.Height = image_desc->image_height; cpy_desc.Depth = image_desc->image_depth; @@ -2572,7 +2646,7 @@ pi_result rocm_piMemImageCreate(pi_context context, pi_mem_flags flags, retErr = PI_CHECK_ERROR(hipCreateSurfaceObject(&surface, &image_res_desc)); auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ - context, *image_array, surface, image_desc->image_type, host_ptr}); + context, image_array, surface, image_desc->image_type, host_ptr}); if (piMemObj == nullptr) { return PI_OUT_OF_HOST_MEMORY; @@ -3393,7 +3467,7 @@ static pi_result commonEnqueueMemBufferCopyRect( params.srcMemoryType = src_type; params.srcDevice = src_type == hipMemoryTypeDevice - ? *static_cast(src_ptr) + ? *static_cast(src_ptr) : 0; params.srcHost = src_type == hipMemoryTypeHost ? src_ptr : nullptr; params.srcXInBytes = src_offset->x_bytes; @@ -3403,8 +3477,9 @@ static pi_result commonEnqueueMemBufferCopyRect( params.srcHeight = src_slice_pitch / src_row_pitch; params.dstMemoryType = dst_type; - params.dstDevice = - dst_type == hipMemoryTypeDevice ? *static_cast(dst_ptr) : 0; + params.dstDevice = dst_type == hipMemoryTypeDevice + ? *reinterpret_cast(dst_ptr) + : 0; params.dstHost = dst_type == hipMemoryTypeHost ? dst_ptr : nullptr; params.dstXInBytes = dst_offset->x_bytes; params.dstY = dst_offset->y_scalar; @@ -3430,7 +3505,7 @@ pi_result rocm_piEnqueueMemBufferReadRect( pi_result retErr = PI_SUCCESS; hipStream_t hipStream = command_queue->get(); - hipDevPtr devPtr = buffer->mem_.buffer_mem_.get(); + void *devPtr = buffer->mem_.buffer_mem_.get_void(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { @@ -3481,7 +3556,7 @@ pi_result rocm_piEnqueueMemBufferWriteRect( pi_result retErr = PI_SUCCESS; hipStream_t hipStream = command_queue->get(); - hipDevPtr devPtr = buffer->mem_.buffer_mem_.get(); + void *devPtr = buffer->mem_.buffer_mem_.get_void(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { @@ -3548,8 +3623,8 @@ pi_result rocm_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, } auto stream = command_queue->get(); - auto src = (uint8_t *)(src_buffer->mem_.buffer_mem_.get()) + src_offset; - auto dst = (uint8_t *)(dst_buffer->mem_.buffer_mem_.get()) + dst_offset; + auto src = src_buffer->mem_.buffer_mem_.get_with_offset(src_offset); + auto dst = dst_buffer->mem_.buffer_mem_.get_with_offset(dst_offset); result = PI_CHECK_ERROR(hipMemcpyDtoDAsync(dst, src, size, stream)); @@ -3580,8 +3655,8 @@ pi_result rocm_piEnqueueMemBufferCopyRect( pi_result retErr = PI_SUCCESS; hipStream_t hipStream = command_queue->get(); - hipDevPtr srcPtr = src_buffer->mem_.buffer_mem_.get(); - hipDevPtr dstPtr = dst_buffer->mem_.buffer_mem_.get(); + void *srcPtr = src_buffer->mem_.buffer_mem_.get_void(); + void *dstPtr = dst_buffer->mem_.buffer_mem_.get_void(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { @@ -3653,7 +3728,7 @@ pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, result = retImplEv->start(); } - auto dstDevice = (uint8_t *)(buffer->mem_.buffer_mem_.get()) + offset; + auto dstDevice = buffer->mem_.buffer_mem_.get_with_offset(offset); auto stream = command_queue->get(); auto N = size / pattern_size; @@ -3694,7 +3769,7 @@ pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, } } -static size_t imageElementByteSize(enum hipArray_Format array_format) { +static size_t imageElementByteSize(hipArray_Format array_format) { switch (array_format) { case HIP_AD_FORMAT_UNSIGNED_INT8: case HIP_AD_FORMAT_SIGNED_INT8: @@ -3735,7 +3810,7 @@ static pi_result commonEnqueueMemImageNDCopy( cpyDesc.srcMemoryType = src_type; if (src_type == hipMemoryTypeArray) { cpyDesc.srcArray = - const_cast(static_cast(src_ptr)); + reinterpret_cast(const_cast(src_ptr)); cpyDesc.srcXInBytes = src_offset[0]; cpyDesc.srcY = src_offset[1]; } else { @@ -3744,7 +3819,7 @@ static pi_result commonEnqueueMemImageNDCopy( cpyDesc.dstMemoryType = dst_type; if (dst_type == hipMemoryTypeArray) { cpyDesc.dstArray = - const_cast(static_cast(dst_ptr)); + reinterpret_cast(const_cast(dst_ptr)); cpyDesc.dstXInBytes = dst_offset[0]; cpyDesc.dstY = dst_offset[1]; } else { @@ -3762,7 +3837,7 @@ static pi_result commonEnqueueMemImageNDCopy( cpyDesc.srcMemoryType = src_type; if (src_type == hipMemoryTypeArray) { cpyDesc.srcArray = - const_cast(static_cast(src_ptr)); + reinterpret_cast(const_cast(src_ptr)); cpyDesc.srcXInBytes = src_offset[0]; cpyDesc.srcY = src_offset[1]; cpyDesc.srcZ = src_offset[2]; @@ -3771,7 +3846,7 @@ static pi_result commonEnqueueMemImageNDCopy( } cpyDesc.dstMemoryType = dst_type; if (dst_type == hipMemoryTypeArray) { - cpyDesc.dstArray = static_cast(dst_ptr); + cpyDesc.dstArray = reinterpret_cast(dst_ptr); cpyDesc.dstXInBytes = dst_offset[0]; cpyDesc.dstY = dst_offset[1]; cpyDesc.dstZ = dst_offset[2]; @@ -3809,12 +3884,16 @@ pi_result rocm_piEnqueueMemImageRead( event_wait_list, nullptr); } - hipArray array = image->mem_.surface_mem_.get_array(); + hipArray *array = image->mem_.surface_mem_.get_array(); - int elementByteSize = imageElementByteSize(array.Format); + hipArray_Format Format; + size_t NumChannels; + getArrayDesc(array, Format, NumChannels); - size_t byteOffsetX = origin[0] * elementByteSize * array.NumChannels; - size_t bytesToCopy = elementByteSize * array.NumChannels * region[0]; + int elementByteSize = imageElementByteSize(Format); + + size_t byteOffsetX = origin[0] * elementByteSize * NumChannels; + size_t bytesToCopy = elementByteSize * NumChannels * region[0]; pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); @@ -3822,7 +3901,7 @@ pi_result rocm_piEnqueueMemImageRead( size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]}; retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion, - &array, hipMemoryTypeArray, srcOffset, + array, hipMemoryTypeArray, srcOffset, ptr, hipMemoryTypeHost, nullptr); if (retErr != PI_SUCCESS) { @@ -3871,12 +3950,16 @@ rocm_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, event_wait_list, nullptr); } - hipArray array = image->mem_.surface_mem_.get_array(); + hipArray *array = image->mem_.surface_mem_.get_array(); + + hipArray_Format Format; + size_t NumChannels; + getArrayDesc(array, Format, NumChannels); - int elementByteSize = imageElementByteSize(array.Format); + int elementByteSize = imageElementByteSize(Format); - size_t byteOffsetX = origin[0] * elementByteSize * array.NumChannels; - size_t bytesToCopy = elementByteSize * array.NumChannels * region[0]; + size_t byteOffsetX = origin[0] * elementByteSize * NumChannels; + size_t bytesToCopy = elementByteSize * NumChannels * region[0]; pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); @@ -3884,8 +3967,8 @@ rocm_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]}; retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion, - ptr, hipMemoryTypeHost, nullptr, - &array, hipMemoryTypeArray, dstOffset); + ptr, hipMemoryTypeHost, nullptr, array, + hipMemoryTypeArray, dstOffset); if (retErr != PI_SUCCESS) { return retErr; @@ -3932,19 +4015,24 @@ pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, event_wait_list, nullptr); } - hipArray srcArray = src_image->mem_.surface_mem_.get_array(); - hipArray dstArray = dst_image->mem_.surface_mem_.get_array(); + hipArray *srcArray = src_image->mem_.surface_mem_.get_array(); + hipArray_Format srcFormat; + size_t srcNumChannels; + getArrayDesc(srcArray, srcFormat, srcNumChannels); - assert(srcArray.Format == dstArray.Format); - assert(srcArray.NumChannels == dstArray.NumChannels); + hipArray *dstArray = dst_image->mem_.surface_mem_.get_array(); + hipArray_Format dstFormat; + size_t dstNumChannels; + getArrayDesc(dstArray, dstFormat, dstNumChannels); - int elementByteSize = imageElementByteSize(srcArray.Format); + assert(srcFormat == dstFormat); + assert(srcNumChannels == dstNumChannels); - size_t dstByteOffsetX = - dst_origin[0] * elementByteSize * srcArray.NumChannels; - size_t srcByteOffsetX = - src_origin[0] * elementByteSize * dstArray.NumChannels; - size_t bytesToCopy = elementByteSize * srcArray.NumChannels * region[0]; + int elementByteSize = imageElementByteSize(srcFormat); + + size_t dstByteOffsetX = dst_origin[0] * elementByteSize * srcNumChannels; + size_t srcByteOffsetX = src_origin[0] * elementByteSize * dstNumChannels; + size_t bytesToCopy = elementByteSize * srcNumChannels * region[0]; pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type(); @@ -3953,8 +4041,8 @@ pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; retErr = commonEnqueueMemImageNDCopy( - hipStream, imgType, adjustedRegion, &srcArray, hipMemoryTypeArray, - srcOffset, &dstArray, hipMemoryTypeArray, dstOffset); + hipStream, imgType, adjustedRegion, srcArray, hipMemoryTypeArray, + srcOffset, dstArray, hipMemoryTypeArray, dstOffset); if (retErr != PI_SUCCESS) { return retErr; @@ -4139,7 +4227,7 @@ pi_result rocm_piextUSMDeviceAlloc(void **result_ptr, pi_context context, pi_result result = PI_SUCCESS; try { ScopedContext active(context); - result = PI_CHECK_ERROR(hipMalloc((hipDevPtr *)result_ptr, size)); + result = PI_CHECK_ERROR(hipMalloc(result_ptr, size)); } catch (pi_result error) { result = error; } @@ -4163,8 +4251,8 @@ pi_result rocm_piextUSMSharedAlloc(void **result_ptr, pi_context context, pi_result result = PI_SUCCESS; try { ScopedContext active(context); - result = PI_CHECK_ERROR( - hipMallocManaged((hipDevPtr *)result_ptr, size, hipMemAttachGlobal)); + result = + PI_CHECK_ERROR(hipMallocManaged(result_ptr, size, hipMemAttachGlobal)); } catch (pi_result error) { result = error; } @@ -4185,12 +4273,12 @@ pi_result rocm_piextUSMFree(pi_context context, void *ptr) { ScopedContext active(context); unsigned int type; hipPointerAttribute_t hipPointerAttributeType; - result = PI_CHECK_ERROR( - hipPointerGetAttributes(&hipPointerAttributeType, (hipDevPtr)ptr)); + result = + PI_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, ptr)); type = hipPointerAttributeType.memoryType; assert(type == hipMemoryTypeDevice or type == hipMemoryTypeHost); if (type == hipMemoryTypeDevice) { - result = PI_CHECK_ERROR(hipFree((hipDevPtr)ptr)); + result = PI_CHECK_ERROR(hipFree(ptr)); } if (type == hipMemoryTypeHost) { result = PI_CHECK_ERROR(hipFreeHost(ptr)); @@ -4222,8 +4310,9 @@ pi_result rocm_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue)); event_ptr->start(); } - result = PI_CHECK_ERROR(hipMemsetD8Async( - (hipDevPtr)ptr, (unsigned char)value & 0xFF, count, hipStream)); + result = PI_CHECK_ERROR( + hipMemsetD8Async(reinterpret_cast(ptr), + (unsigned char)value & 0xFF, count, hipStream)); if (event) { result = event_ptr->record(); *event = event_ptr.release(); diff --git a/sycl/plugins/rocm/pi_rocm.hpp b/sycl/plugins/rocm/pi_rocm.hpp index 931b54c90bbc2..9888651809675 100644 --- a/sycl/plugins/rocm/pi_rocm.hpp +++ b/sycl/plugins/rocm/pi_rocm.hpp @@ -32,8 +32,6 @@ #include #include -typedef void *hipDevPtr; - extern "C" { /// \cond INGORE_BLOCK_IN_DOXYGEN @@ -237,6 +235,13 @@ struct _pi_mem { native_type get() const noexcept { return ptr_; } + native_type get_with_offset(size_t offset) const noexcept { + return reinterpret_cast(reinterpret_cast(ptr_) + + offset); + } + + void *get_void() const noexcept { return reinterpret_cast(ptr_); } + size_t get_size() const noexcept { return size_; } void *get_map_ptr() const noexcept { return mapPtr_; } @@ -279,11 +284,11 @@ struct _pi_mem { // Handler data for surface object (i.e. Images) struct surface_mem_ { - hipArray array_; + hipArray *array_; hipSurfaceObject_t surfObj_; pi_mem_type imageType_; - hipArray get_array() const noexcept { return array_; } + hipArray *get_array() const noexcept { return array_; } hipSurfaceObject_t get_surface() const noexcept { return surfObj_; } @@ -311,7 +316,7 @@ struct _pi_mem { }; /// Constructs the PI allocation for an Image object - _pi_mem(pi_context ctxt, hipArray array, hipSurfaceObject_t surf, + _pi_mem(pi_context ctxt, hipArray *array, hipSurfaceObject_t surf, pi_mem_type image_type, void *host_ptr) : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface} { mem_.surface_mem_.array_ = array; diff --git a/sycl/tools/CMakeLists.txt b/sycl/tools/CMakeLists.txt index a21ae9c8f9de9..85b9069eb5fd3 100644 --- a/sycl/tools/CMakeLists.txt +++ b/sycl/tools/CMakeLists.txt @@ -20,8 +20,12 @@ target_link_libraries(get_device_count_by_type LevelZeroLoader::Headers OpenCL-ICD ${LEVEL_ZERO_LIBRARY} - $<$:cudadrv> - $<$:rocmdrv> + # The CUDA and ROCm for NVIDA plugins need cudadrv + $<$,$,$>>:cudadrv> + # The ROCm for AMD plugin needs rocmdrv + $<$,$>:rocmdrv> + # The ROCm for NVIDIA plugin also needs cudart + $<$,$>:cudart> ) target_compile_definitions(get_device_count_by_type PRIVATE