diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h
index c2ab0bcaccf80..7df5e5b17f2fb 100644
--- a/sycl/include/CL/sycl/detail/pi.h
+++ b/sycl/include/CL/sycl/detail/pi.h
@@ -296,7 +296,8 @@ typedef enum {
   PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026,
   PI_DEVICE_INFO_IMAGE_SRGB = 0x10027,
   PI_DEVICE_INFO_ATOMIC_64 = 0x10110,
-  PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 0x10111
+  PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 0x10111,
+  PI_DEVICE_INFO_P2P_READ_DEVICES = 0x10112
 } _pi_device_info;
 
 typedef enum {
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 20d45b55998fe..2f1a3bae6fab4 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1637,6 +1637,15 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     }
     return getInfo(param_value_size, param_value, param_value_size_ret, value);
   }
+  case PI_DEVICE_INFO_P2P_READ_DEVICES: {
+
+    std::vector<pi_device> devs;
+
+    for (const auto &dev : device->get_platform()->devices_) {
+      devs.emplace_back(dev.get());
+    }
+    return getInfo(param_value_size, param_value, param_value_size_ret, devs);
+  }
 
   // TODO: Investigate if this information is available on CUDA.
   case PI_DEVICE_INFO_PCI_ADDRESS:
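For orientation, a PI client retrieves an array-valued query like this one with the usual two-call idiom: call once with a null buffer to learn the payload size, then call again to fetch the handles. A minimal caller-side sketch (hypothetical code, not part of this patch; it assumes only the standard piDeviceGetInfo signature and omits error checking):

    #include <CL/sycl/detail/pi.h>
    #include <vector>

    // Query which devices `dev` can read from over P2P.
    std::vector<pi_device> queryP2PReadDevices(pi_device dev) {
      size_t Size = 0;
      // First call: ask only for the required size in bytes.
      piDeviceGetInfo(dev, PI_DEVICE_INFO_P2P_READ_DEVICES, 0, nullptr, &Size);
      std::vector<pi_device> Peers(Size / sizeof(pi_device));
      // Second call: fill the caller-provided buffer.
      piDeviceGetInfo(dev, PI_DEVICE_INFO_P2P_READ_DEVICES, Size, Peers.data(),
                      nullptr);
      return Peers;
    }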
@@ -3648,6 +3657,19 @@ pi_result cuda_piSamplerRelease(pi_sampler sampler) {
   return PI_SUCCESS;
 }
 
+void copyRectAsserts(const pi_buff_rect_region &region,
+                     const pi_buff_rect_offset &src_offset,
+                     const pi_buff_rect_offset &dst_offset,
+                     const CUmemorytype_enum &src_type,
+                     const CUmemorytype_enum &dst_type) {
+  assert(region != nullptr);
+  assert(src_offset != nullptr);
+  assert(dst_offset != nullptr);
+
+  assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST);
+  assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST);
+}
+
 /// General 3D memory copy operation.
 /// This function requires the corresponding CUDA context to be at the top of
 /// the context stack
@@ -3660,12 +3682,7 @@ static pi_result commonEnqueueMemBufferCopyRect(
     const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset,
     size_t dst_row_pitch, size_t dst_slice_pitch) {
 
-  assert(region != nullptr);
-  assert(src_offset != nullptr);
-  assert(dst_offset != nullptr);
-
-  assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST);
-  assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST);
+  copyRectAsserts(region, src_offset, dst_offset, src_type, dst_type);
 
   src_row_pitch = (!src_row_pitch) ? region->width_bytes + src_offset->x_bytes
                                    : src_row_pitch;
@@ -3711,6 +3728,60 @@ static pi_result commonEnqueueMemBufferCopyRect(
   return PI_CHECK_ERROR(cuMemcpy3DAsync(&params, cu_stream));
 }
 
+/// General 3D memory peer copy operation.
+/// Similar to commonEnqueueMemBufferCopyRect with the addition that two
+/// contexts must be specified.
+static pi_result commonEnqueueMemBufferCopyRectPeer(
+    CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr,
+    const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset,
+    size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr,
+    const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset,
+    size_t dst_row_pitch, size_t dst_slice_pitch, CUcontext dst_context,
+    CUcontext src_context) {
+
+  copyRectAsserts(region, src_offset, dst_offset, src_type, dst_type);
+
+  src_row_pitch = (!src_row_pitch) ? region->width_bytes : src_row_pitch;
+  src_slice_pitch = (!src_slice_pitch) ? (region->height_scalar * src_row_pitch)
+                                       : src_slice_pitch;
+  dst_row_pitch = (!dst_row_pitch) ? region->width_bytes : dst_row_pitch;
+  dst_slice_pitch = (!dst_slice_pitch) ? (region->height_scalar * dst_row_pitch)
+                                       : dst_slice_pitch;
+
+  CUDA_MEMCPY3D_PEER params = {};
+
+  params.WidthInBytes = region->width_bytes;
+  params.Height = region->height_scalar;
+  params.Depth = region->depth_scalar;
+
+  params.srcMemoryType = src_type;
+  params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE
+                         ? *static_cast<const CUdeviceptr *>(src_ptr)
+                         : 0;
+  params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr;
+  params.srcXInBytes = src_offset->x_bytes;
+  params.srcY = src_offset->y_scalar;
+  params.srcZ = src_offset->z_scalar;
+  params.srcPitch = src_row_pitch;
+  params.srcHeight = src_slice_pitch / src_row_pitch;
+
+  params.dstMemoryType = dst_type;
+  params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE
+                         ? *static_cast<CUdeviceptr *>(dst_ptr)
+                         : 0;
+  params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr;
+  params.dstXInBytes = dst_offset->x_bytes;
+  params.dstY = dst_offset->y_scalar;
+  params.dstZ = dst_offset->z_scalar;
+  params.dstPitch = dst_row_pitch;
+  params.dstHeight = dst_slice_pitch / dst_row_pitch;
+
+  params.dstContext = dst_context;
+  params.srcContext = src_context;
+
+  return PI_CHECK_ERROR(cuMemcpy3DPeerAsync(&params, cu_stream));
+}
+
 pi_result cuda_piEnqueueMemBufferReadRect(
     pi_queue command_queue, pi_mem buffer, pi_bool blocking_read,
     pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
@@ -3845,7 +3916,17 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer,
     auto src = src_buffer->mem_.buffer_mem_.get() + src_offset;
     auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset;
 
-    result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream));
+    if (src_buffer->context_ == dst_buffer->context_) {
+      result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream));
+    } else {
+      auto dst_context = dst_buffer->context_->get();
+      auto src_context = src_buffer->context_->get();
+
+      cuCtxEnablePeerAccess(dst_context, 0);
+
+      result = PI_CHECK_ERROR(
+          cuMemcpyPeerAsync(dst, dst_context, src, src_context, size, stream));
+    }
 
     if (event) {
       result = retImplEv->record();
@@ -3889,11 +3970,22 @@ pi_result cuda_piEnqueueMemBufferCopyRect(
           PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue));
       retImplEv->start();
     }
+    if (src_buffer->context_ == dst_buffer->context_) {
+      retErr = commonEnqueueMemBufferCopyRect(
+          cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin,
+          src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE,
+          dst_origin, dst_row_pitch, dst_slice_pitch);
+    } else {
+      auto dstContext = dst_buffer->context_->get();
+      auto srcContext = src_buffer->context_->get();
 
-    retErr = commonEnqueueMemBufferCopyRect(
-        cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin,
-        src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE,
-        dst_origin, dst_row_pitch, dst_slice_pitch);
+      cuCtxEnablePeerAccess(dstContext, 0);
+
+      retErr = commonEnqueueMemBufferCopyRectPeer(
+          cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin,
+          src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE,
+          dst_origin, dst_row_pitch, dst_slice_pitch, dstContext, srcContext);
+    }
 
     if (event) {
       retImplEv->record();
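The cross-context branches above rest on two driver-API calls: cuCtxEnablePeerAccess, which grants the current context access to allocations owned by the given peer context, and cuMemcpyPeerAsync / cuMemcpy3DPeerAsync, which carry both contexts explicitly. A standalone sketch of the pattern (hypothetical helper, not part of the patch; it assumes the source context is current, and unlike the patch it checks the enable call, treating a repeated enable as success):

    #include <cuda.h>

    // Copy `bytes` from `src` (owned by srcCtx, assumed current) to `dst`
    // (owned by dstCtx). Flags to cuCtxEnablePeerAccess must be 0;
    // CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED only means a prior call already
    // set the mapping up, so it is not treated as a failure here.
    static CUresult peerCopy(CUdeviceptr dst, CUcontext dstCtx, CUdeviceptr src,
                             CUcontext srcCtx, size_t bytes, CUstream stream) {
      CUresult err = cuCtxEnablePeerAccess(dstCtx, 0);
      if (err != CUDA_SUCCESS && err != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED)
        return err;
      return cuMemcpyPeerAsync(dst, dstCtx, src, srcCtx, bytes, stream);
    }

Note that cuMemcpyPeerAsync also works when direct peer access is unavailable, with the driver staging through host memory, so enabling peer access is an optimization for the direct path rather than a strict prerequisite.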
@@ -4095,6 +4187,52 @@ static pi_result commonEnqueueMemImageNDCopy(
   return PI_INVALID_VALUE;
 }
 
+/// Similar to commonEnqueueMemImageNDCopy for peer-to-peer copies.
+static pi_result commonEnqueueMemImageNDCopyPeer(
+    CUstream cu_stream, pi_mem_type img_type, const size_t *region,
+    const void *src_ptr, const CUmemorytype_enum src_type,
+    const size_t *src_offset, void *dst_ptr, const CUmemorytype_enum dst_type,
+    const size_t *dst_offset, CUcontext dst_context, CUcontext src_context) {
+  assert(region != nullptr);
+
+  assert(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST);
+  assert(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST);
+
+  CUDA_MEMCPY3D_PEER cpyDesc;
+  memset(&cpyDesc, 0, sizeof(cpyDesc));
+  cpyDesc.srcMemoryType = src_type;
+  if (src_type == CU_MEMORYTYPE_ARRAY) {
+    cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
+    cpyDesc.srcXInBytes = src_offset[0];
+    cpyDesc.srcY = src_offset[1];
+    cpyDesc.srcZ = src_offset[2];
+  } else {
+    cpyDesc.srcDevice = src_type == CU_MEMORYTYPE_DEVICE
+                            ? *static_cast<const CUdeviceptr *>(src_ptr)
+                            : 0;
+    cpyDesc.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr;
+  }
+  cpyDesc.dstMemoryType = dst_type;
+  if (dst_type == CU_MEMORYTYPE_ARRAY) {
+    cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
+    cpyDesc.dstXInBytes = dst_offset[0];
+    cpyDesc.dstY = dst_offset[1];
+    cpyDesc.dstZ = dst_offset[2];
+  } else {
+    cpyDesc.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE
+                            ? *static_cast<CUdeviceptr *>(dst_ptr)
+                            : 0;
+    cpyDesc.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr;
+  }
+  cpyDesc.WidthInBytes = region[0];
+  cpyDesc.Height = region[1];
+  cpyDesc.Depth = region[2];
+  cpyDesc.dstContext = dst_context;
+  cpyDesc.srcContext = src_context;
+
+  return PI_CHECK_ERROR(cuMemcpy3DPeerAsync(&cpyDesc, cu_stream));
+}
+
 pi_result cuda_piEnqueueMemImageRead(
     pi_queue command_queue, pi_mem image, pi_bool blocking_read,
     const size_t *origin, const size_t *region, size_t row_pitch,
@@ -4277,17 +4415,37 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image,
     size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0];
 
     pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type();
-    if (imgType == PI_MEM_TYPE_IMAGE1D) {
-      retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray,
-                                           srcByteOffsetX, bytesToCopy));
+    if (src_image->context_ == dst_image->context_) {
+      if (imgType == PI_MEM_TYPE_IMAGE1D) {
+        retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray,
+                                             srcByteOffsetX, bytesToCopy));
+      } else {
+        size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
+        size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
+        size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
+
+        retErr = commonEnqueueMemImageNDCopy(
+            cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY,
+            srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset);
+
+        if (retErr != PI_SUCCESS) {
+          return retErr;
+        }
+      }
     } else {
       size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
       size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
       size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
 
-      retErr = commonEnqueueMemImageNDCopy(
+      auto dstContext = dst_image->context_->get();
+      auto srcContext = src_image->context_->get();
+
+      cuCtxEnablePeerAccess(dstContext, 0);
+
+      retErr = commonEnqueueMemImageNDCopyPeer(
           cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY,
-          srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset);
+          srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset, dstContext,
+          srcContext);
 
       if (retErr != PI_SUCCESS) {
         return retErr;
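One thing to note about the CUDA implementation: PI_DEVICE_INFO_P2P_READ_DEVICES simply reports every device in the platform, leaving it to the driver to stage copies through host memory where no direct peer path exists. A stricter variant could consult the driver first; a hypothetical sketch using cuDeviceCanAccessPeer (this is not what the patch does, only an illustration of the refinement):

    #include <cuda.h>
    #include <vector>

    // Hypothetical refinement: keep only the devices the driver reports as
    // directly peer-accessible from `self` (plus `self` itself).
    static std::vector<CUdevice> p2pReadPeers(CUdevice self,
                                              const std::vector<CUdevice> &all) {
      std::vector<CUdevice> peers;
      for (CUdevice other : all) {
        int canAccess = 0;
        if (other == self ||
            (cuDeviceCanAccessPeer(&canAccess, self, other) == CUDA_SUCCESS &&
             canAccess != 0))
          peers.push_back(other);
      }
      return peers;
    }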
diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp
index 3a24d1be3906d..1a119abf832a6 100644
--- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp
+++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp
@@ -513,6 +513,9 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
     //                      cl_khr_fp64, cl_khr_int64_base_atomics,
     //                      cl_khr_int64_extended_atomics
     return ReturnValue("");
+  // P2P is currently unsupported in the ESIMD CPU plugin
+  case PI_DEVICE_INFO_P2P_READ_DEVICES:
+    return ReturnValue(std::vector<pi_device>{});
 
 #define UNSUPPORTED_INFO(info)                                                 \
   case info:                                                                   \
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index c959b789486c0..4974c05a8b474 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -1563,6 +1563,10 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     }
     return getInfo(param_value_size, param_value, param_value_size_ret, value);
   }
+  // P2P is currently unsupported in the HIP plugin
+  case PI_DEVICE_INFO_P2P_READ_DEVICES:
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   std::vector<pi_device>{});
 
   // TODO: Implement.
   case PI_DEVICE_INFO_ATOMIC_64:
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 6a013df6c8cc6..3a07e25a111f7 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -2448,6 +2448,9 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
     // currently not supported in level zero runtime
     return PI_INVALID_VALUE;
+  // P2P is currently unsupported in Level Zero
+  case PI_DEVICE_INFO_P2P_READ_DEVICES:
+    return ReturnValue(std::vector<pi_device>{});
 
   default:
     zePrint("Unsupported ParamName in piGetDeviceInfo\n");
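All three stubs funnel through each plugin's ReturnValue / getInfo machinery, which follows the OpenCL-style info protocol: report the required byte size, and copy the payload only when the caller supplied a large-enough buffer. A hypothetical standalone equivalent for an array-valued query (the real helpers' names and template plumbing differ per plugin):

    #include <CL/sycl/detail/pi.h>
    #include <cstring>

    // Mirror of the info-query contract: always report the size if asked,
    // copy only into a sufficiently large caller buffer.
    static pi_result returnArrayInfo(const pi_device *src, size_t count,
                                     size_t param_value_size, void *param_value,
                                     size_t *param_value_size_ret) {
      const size_t bytes = count * sizeof(pi_device);
      if (param_value_size_ret)
        *param_value_size_ret = bytes;
      if (param_value) {
        if (param_value_size < bytes)
          return PI_INVALID_VALUE;
        std::memcpy(param_value, src, bytes);
      }
      return PI_SUCCESS;
    }

An empty list (count == 0) is therefore a well-formed answer meaning "no P2P peers", which is exactly what the scheduler-side change below relies on.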
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 8a2c03db023f0..5e576e4bb7d33 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -517,7 +517,6 @@ void Command::makeTraceEventEpilog() {
 
 Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) {
   const QueueImplPtr &WorkerQueue = getWorkerQueue();
-  const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();
 
   // 1. Async work is not supported for host device.
   // 2. Some types of commands do not produce PI events after they are enqueued
@@ -543,13 +542,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) {
       getType() != CommandType::HOST_TASK)
     return nullptr;
 
-  ContextImplPtr DepEventContext = DepEvent->getContextImpl();
-  // If contexts don't match we'll connect them using host task
-  if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
-    Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder;
-    ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep);
-  } else
-    MPreparedDepsEvents.push_back(std::move(DepEvent));
+  MPreparedDepsEvents.push_back(std::move(DepEvent));
 
   return ConnectionCmd;
 }
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 9b0f79d24f731..e7a774ee1102c 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -52,6 +53,20 @@ static bool IsSuitableSubReq(const Requirement *Req) {
   return Req->MIsSubBuffer;
 }
 
+/// Finds the correct AllocaCommand matching the context of Record.
+AllocaCommandBase *findAllocaCmd(MemObjRecord *Record) {
+  auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
+    bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(),
+                       Record->MCurContext) &&
+               // Looking for a parent buffer alloca command
+               AllocaCmd->getType() == Command::CommandType::ALLOCA;
+    return Res;
+  };
+  const auto It = std::find_if(Record->MAllocaCommands.begin(),
+                               Record->MAllocaCommands.end(), IsSuitableAlloca);
+  return (Record->MAllocaCommands.end() != It) ? *It : nullptr;
+}
+
 /// Checks if the required access mode is allowed under the current one.
 static bool isAccessModeAllowed(access::mode Required, access::mode Current) {
   switch (Current) {
@@ -328,17 +343,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove(
     // Since no alloca command for the sub buffer requirement was found in the
     // current context, need to find a parent alloca command for it (it must be
    // there)
-    auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
-      bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(),
-                         Record->MCurContext) &&
-                 // Looking for a parent buffer alloca command
-                 AllocaCmd->getType() == Command::CommandType::ALLOCA;
-      return Res;
-    };
-    const auto It =
-        std::find_if(Record->MAllocaCommands.begin(),
-                     Record->MAllocaCommands.end(), IsSuitableAlloca);
-    AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It : nullptr;
+    AllocaCmdSrc = findAllocaCmd(Record);
   }
   if (!AllocaCmdSrc)
     throw runtime_error("Cannot find buffer allocation", PI_INVALID_VALUE);
@@ -941,9 +946,26 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
         NeedMemMoveToHost = true;
         MemMoveTargetQueue = HT.MQueue;
       }
-    } else if (!Queue->is_host() && !Record->MCurContext->is_host())
-      NeedMemMoveToHost = true;
+    } else if (!Queue->is_host() && !Record->MCurContext->is_host()) {
+      if (detail::getImplBackend(Queue) !=
+          detail::getImplBackend(Record->MCurContext))
+        NeedMemMoveToHost = true;
+      else {
+        std::vector<pi_device> Devs;
+
+        Queue->getPlugin().call_nocheck<PiApiKind::piDeviceGetInfo>(
+            Queue->getDeviceImplPtr()->getHandleRef(),
+            PI_DEVICE_INFO_P2P_READ_DEVICES, sizeof(Devs), &Devs, nullptr);
+        _pi_device *SrcDev = findAllocaCmd(Record)
+                                 ->getQueue()
+                                 ->getDeviceImplPtr()
+                                 ->getHandleRef();
+
+        NeedMemMoveToHost =
+            std::find(Devs.begin(), Devs.end(), SrcDev) == Devs.end();
+      }
+    }
 
     if (NeedMemMoveToHost)
       insertMemoryMove(Record, Req,
                        Scheduler::getInstance().getDefaultHostQueue(),
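In short, the scheduler's new rule for a device-to-device dependency is: stay on the device path only when both contexts belong to the same backend and the source device appears in the destination device's P2P read list; otherwise fall back to moving the data through host memory. A distilled, self-contained sketch of that decision (simplified names, not the runtime's actual types):

    #include <algorithm>
    #include <vector>

    // `Backend` and `Device` stand in for the runtime's backend enum and
    // pi_device handle; `DstPeers` is what PI_DEVICE_INFO_P2P_READ_DEVICES
    // returned for the destination device.
    template <typename Backend, typename Device>
    bool needMemMoveToHost(Backend SrcBackend, Backend DstBackend, Device SrcDev,
                           const std::vector<Device> &DstPeers) {
      if (SrcBackend != DstBackend)
        return true; // cross-backend copies always stage through host
      // Same backend: a direct device-to-device copy is possible only if the
      // source device is one of the destination's P2P read devices.
      return std::find(DstPeers.begin(), DstPeers.end(), SrcDev) ==
             DstPeers.end();
    }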