diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
index e220fc52738d..b328c5a4ac1a 100644
--- a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
@@ -703,8 +703,22 @@ static void shareByValParams(Function &F, const Triple &TT) {
         spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
 
     // 3) replace argument with shadow in all uses
+    Value *RepVal = Shadow;
+    if (TT.isNVPTX()) {
+      // For the NVPTX target, address space inference for kernel arguments
+      // and allocas happens in the backend (NVPTXLowerArgs and
+      // NVPTXLowerAlloca passes). After the frontend these pointers are in
+      // LLVM's default address space 0, which is the generic address space
+      // for NVPTX.
+      assert(Arg.getType()->getPointerAddressSpace() == 0);
+
+      // Cast a pointer in the shared address space to the generic address
+      // space.
+      RepVal =
+          ConstantExpr::getPointerBitCastOrAddrSpaceCast(Shadow, Arg.getType());
+    }
     for (auto *U : Arg.users())
-      U->replaceUsesOfWith(&Arg, Shadow);
+      U->replaceUsesOfWith(&Arg, RepVal);
 
     // 4) fill the shadow from the argument for the leader WI only
     LLVMContext &Ctx = At.getContext();
diff --git a/llvm/test/SYCLLowerIR/cast_shadow.ll b/llvm/test/SYCLLowerIR/cast_shadow.ll
new file mode 100644
index 000000000000..1822f1c5e814
--- /dev/null
+++ b/llvm/test/SYCLLowerIR/cast_shadow.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -LowerWGScope -verify -S | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda-sycldevice"
+
+%struct.baz = type { i8 }
+%struct.spam = type { %struct.wobble, %struct.wobble, %struct.wobble, %struct.wombat.0 }
+%struct.wobble = type { %struct.wombat }
+%struct.wombat = type { [1 x i64] }
+%struct.wombat.0 = type { %struct.wombat }
+%struct.quux = type { i8 }
+
+; CHECK: @[[SHADOW:[a-zA-Z0-9]+]] = internal unnamed_addr addrspace(3) global %struct.spam undef
+
+define internal void @wobble(%struct.baz* %arg, %struct.spam* byval(%struct.spam) %arg1) !work_group_scope !0 {
+; CHECK: [[TMP10:%.*]] = bitcast %struct.spam* [[ARG1:%.*]] to i8*
+; CHECK: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.spam addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i8* [[TMP10]], i64 32, i1 false)
+; CHECK: call void @widget(%struct.spam* addrspacecast (%struct.spam addrspace(3)* @[[SHADOW]] to %struct.spam*), %struct.quux* byval(%struct.quux) [[TMP2:%.*]])
+;
+bb:
+  %tmp = alloca %struct.baz*
+  %tmp2 = alloca %struct.quux
+  store %struct.baz* %arg, %struct.baz** %tmp
+  %tmp3 = load %struct.baz*, %struct.baz** %tmp
+  call void @widget(%struct.spam* %arg1, %struct.quux* byval(%struct.quux) %tmp2)
+  ret void
+}
+
+define internal void @widget(%struct.spam* %arg, %struct.quux* byval(%struct.quux) %arg1) !work_item_scope !0 !parallel_for_work_item !0 {
+bb:
+  ret void
+}
+
+!0 = !{}