diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index b6d9ec6e013b4..873ada24b9b2e 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -167,7 +167,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern { "mgpuMemAlloc", llvmPointerType /* void * */, {llvmIntPtrType /* intptr_t sizeBytes */, - llvmPointerType /* void *stream */}}; + llvmPointerType /* void *stream */, + llvmInt8Type /* bool isHostShared */}}; FunctionCallBuilder deallocCallBuilder = { "mgpuMemFree", llvmVoidType, @@ -786,19 +787,23 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite( LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::AllocOp allocOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { - if (adaptor.getHostShared()) - return rewriter.notifyMatchFailure( - allocOp, "host_shared allocation is not supported"); MemRefType memRefType = allocOp.getType(); if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) || - !isConvertibleAndHasIdentityMaps(memRefType) || - failed(isAsyncWithOneDependency(rewriter, allocOp))) + !isConvertibleAndHasIdentityMaps(memRefType)) return failure(); auto loc = allocOp.getLoc(); + bool isShared = allocOp.getHostShared(); + + if (isShared && allocOp.getAsyncToken()) + return rewriter.notifyMatchFailure( + allocOp, "Host Shared allocation cannot be done async"); + else if (!isShared && failed(isAsyncWithOneDependency(rewriter, allocOp))) + return failure(); + // Get shape of the memref as values: static sizes are constant // values and dynamic sizes are passed to 'alloc' as operands. SmallVector shape; @@ -811,8 +816,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( // descriptor. Type elementPtrType = this->getElementPtrType(memRefType); auto stream = adaptor.getAsyncDependencies().front(); + + auto isHostShared = rewriter.create( + loc, llvmInt8Type, rewriter.getI8IntegerAttr(isShared)); + Value allocatedPtr = - allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(); + allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared}) + .getResult(); if (!getTypeConverter()->useOpaquePointers()) allocatedPtr = rewriter.create(loc, elementPtrType, allocatedPtr); diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index 2c04821303a49..b4321622687f9 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -231,7 +231,8 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event, CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); } -extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) { +extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/, + bool /*isHostShared*/) { ScopedContext scopedContext; CUdeviceptr ptr = 0; if (sizeBytes != 0) diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp index da2ae87fef671..4ebabc793db3d 100644 --- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -104,7 +104,8 @@ extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) { HIP_REPORT_IF_ERROR(hipEventRecord(event, stream)); } -extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) { +extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/, + bool /*isHostShared*/) { void *ptr; HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes)); return ptr; diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir index 2506c6ceb990e..f365dcb02daf4 100644 --- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir @@ -8,7 +8,8 @@ module attributes {gpu.container_module} { %0 = gpu.wait async // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]] // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]] - // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]]) + // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant + // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]]) %1, %2 = gpu.alloc async [%0] (%size) : memref // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0] // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]]) diff --git a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir index 2fa6c854c5678..e27162c7dbc19 100644 --- a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir +++ b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir @@ -8,7 +8,8 @@ module attributes {gpu.container_module} { %0 = gpu.wait async // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]] // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]] - // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]]) + // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant + // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]]) %1, %2 = gpu.alloc async [%0] (%size) : memref // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0] // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]