diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index c27be2edb35e8..b7c15f4aa288e 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -1635,7 +1635,9 @@ __SYCL_EXPORT pi_result piextUSMSharedAlloc(void **result_ptr, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment); -/// Frees allocated USM memory +/// Indicates that the allocated USM memory is no longer needed on the runtime +/// side. The actual freeing of the memory may be done in a blocking or deferred +/// manner, e.g. to avoid issues with indirect memory access from kernels. /// /// \param context is the pi_context of the allocation /// \param ptr is the memory to be freed diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index e259ac5e942cf..9474b20fdf863 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -57,6 +57,7 @@ CONSTFIX char clHostMemAllocName[] = "clHostMemAllocINTEL"; CONSTFIX char clDeviceMemAllocName[] = "clDeviceMemAllocINTEL"; CONSTFIX char clSharedMemAllocName[] = "clSharedMemAllocINTEL"; CONSTFIX char clMemFreeName[] = "clMemFreeINTEL"; +CONSTFIX char clMemBlockingFreeName[] = "clMemBlockingFreeINTEL"; CONSTFIX char clCreateBufferWithPropertiesName[] = "clCreateBufferWithPropertiesINTEL"; CONSTFIX char clSetKernelArgMemPointerName[] = "clSetKernelArgMemPointerINTEL"; @@ -968,11 +969,56 @@ pi_result piextUSMSharedAlloc(void **result_ptr, pi_context context, /// \param context is the pi_context of the allocation /// \param ptr is the memory to be freed pi_result piextUSMFree(pi_context context, void *ptr) { + // Use a blocking free to avoid issues with indirect access from kernels that + // might be still running. + clMemBlockingFreeINTEL_fn FuncPtr = nullptr; + + // We need to use clMemBlockingFreeINTEL here, however, due to a bug in OpenCL + // CPU runtime this call fails with CL_INVALID_EVENT on CPU devices in certain + // cases. As a temporary workaround, this function replicates caching of + // extension function pointers in getExtFuncFromContext, while choosing + // clMemBlockingFreeINTEL for GPU and clMemFreeINTEL for other device types. + // TODO remove this workaround when the new OpenCL CPU runtime version is + // uplifted in CI. + static_assert( + std::is_same::value); + cl_uint deviceCount; + cl_int ret_err = + clGetContextInfo(cast(context), CL_CONTEXT_NUM_DEVICES, + sizeof(cl_uint), &deviceCount, nullptr); + + if (ret_err != CL_SUCCESS || deviceCount < 1) { + return PI_INVALID_CONTEXT; + } + + std::vector devicesInCtx(deviceCount); + ret_err = clGetContextInfo(cast(context), CL_CONTEXT_DEVICES, + deviceCount * sizeof(cl_device_id), + devicesInCtx.data(), nullptr); + + if (ret_err != CL_SUCCESS) { + return PI_INVALID_CONTEXT; + } + + bool useBlockingFree = true; + for (const cl_device_id &dev : devicesInCtx) { + cl_device_type devType = CL_DEVICE_TYPE_DEFAULT; + ret_err = clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(cl_device_type), + &devType, nullptr); + if (ret_err != CL_SUCCESS) { + return PI_INVALID_DEVICE; + } + useBlockingFree &= devType == CL_DEVICE_TYPE_GPU; + } - clMemFreeINTEL_fn FuncPtr = nullptr; pi_result RetVal = PI_INVALID_OPERATION; - RetVal = getExtFuncFromContext(context, - &FuncPtr); + if (useBlockingFree) + RetVal = + getExtFuncFromContext( + context, &FuncPtr); + else + RetVal = getExtFuncFromContext(context, + &FuncPtr); if (FuncPtr) { RetVal = cast(FuncPtr(cast(context), ptr));