From 498b16919f57778e38e39b07d9471ca15962d8c2 Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Mon, 21 Aug 2023 17:31:01 +0100 Subject: [PATCH 1/6] [SYCL][Fusion] Adapt internalization to opaque pointers. Signed-off-by: Julian Oppermann --- .../lib/fusion/FusionPipeline.cpp | 3 + .../internalization/Internalization.cpp | 80 ++++++++++--------- .../internalize_array_wrapper.cpp | 3 - 3 files changed, 46 insertions(+), 40 deletions(-) diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp index 965202fe5bbfa..a6c5569d87d22 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp @@ -105,6 +105,9 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, // Ideally, the static compiler should have performed that job. const unsigned FlatAddressSpace = getFlatAddressSpace(Mod); FPM.addPass(InferAddressSpacesPass(FlatAddressSpace)); + // Run CFG simplification to prevent unreachable code from obscuring + // internalization opportunities. + FPM.addPass(SimplifyCFGPass{}); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } // Run dataflow internalization and runtime constant propagation. diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp index 3d5c38c799e8f..1d4c9bc82596e 100644 --- a/sycl-fusion/passes/internalization/Internalization.cpp +++ b/sycl-fusion/passes/internalization/Internalization.cpp @@ -58,10 +58,10 @@ struct SYCLInternalizerImpl { /// - Promote the function and call the new function instead, /// keeping the original function. /// - The value appears in a load/store operation: Do nothing - void promoteValue(Value *Val, std::size_t LocalSize) const; + void promoteValue(Value *Val, std::size_t LocalSize, bool InAggregate) const; void promoteGEPI(GetElementPtrInst *GEPI, const Value *Val, - std::size_t LocalSize) const; + std::size_t LocalSize, bool InAggregate) const; void promoteCall(CallBase *C, const Value *Val, std::size_t LocalSize) const; @@ -81,23 +81,26 @@ struct SYCLInternalizerImpl { /// /// Check that an value can be promoted. /// For GEP and Call instructions, delegate to the specific implementations. + /// \p InAggregate indicates that at least one GEP instruction addressing into + /// an aggregate object was encountered, hence \p Val no longer represents a + /// pure offset computation on the original candidate argument. /// For address-space casts, pointer-to-int conversions and unknown users, /// return an error. - Error canPromoteValue(Value *Val, size_t LocalSize) const; + Error canPromoteValue(Value *Val, size_t LocalSize, bool InAggregate) const; /// - /// Check that the operand of a GEP can be promoted. - /// If the GEP uses more than one index, return an error. - /// Otherwise, check if the GEP itself can be promoted in its users. + /// Check that the operand of a GEP can be promoted to its users, and + /// propagate whether it represents a pointer into an aggregate object. Error canPromoteGEP(GetElementPtrInst *GEPI, const Value *Val, - size_t LocalSize) const; + size_t LocalSize, bool InAggregate) const; /// /// Check if operand to a function call can be promoted. - /// If the function returns a pointer, return an error. - /// Otherwise, check if the corresponding formal parameter can be promoted in - /// the function body. - Error canPromoteCall(CallBase *C, const Value *Val, size_t LocalSize) const; + /// If the function returns a pointer, or the operand points into an aggregate + /// object, return an error. Otherwise, check if the corresponding formal + /// parameter can be promoted in the function body. + Error canPromoteCall(CallBase *C, const Value *Val, size_t LocalSize, + bool InAggregate) const; Error checkArgsPromotable(Function *F, SmallVectorImpl &PromoteArgSizes) const; @@ -212,7 +215,8 @@ getUsagesInternalization(const User *U, const Value *V, std::size_t LocalSize) { } Error SYCLInternalizerImpl::canPromoteCall(CallBase *C, const Value *Val, - size_t LocalSize) const { + size_t LocalSize, + bool InAggregate) const { if (isa(C->getType())) { // With opaque pointers, we do not have the necessary information to compare // the element-type of the pointer returned by the function and the element @@ -222,6 +226,11 @@ Error SYCLInternalizerImpl::canPromoteCall(CallBase *C, const Value *Val, inconvertibleErrorCode(), "It is not safe to promote a called function which returns a pointer."); } + if (InAggregate) { + return createStringError(inconvertibleErrorCode(), + "It is not safe to promote a pointer into an " + "aggregate object to a called function."); + } SmallVector InternInfo = getUsagesInternalization(C, Val, LocalSize); assert(!InternInfo.empty() && "Value must be used at least once"); @@ -232,27 +241,20 @@ Error SYCLInternalizerImpl::canPromoteCall(CallBase *C, const Value *Val, } Error SYCLInternalizerImpl::canPromoteGEP(GetElementPtrInst *GEPI, - const Value *Val, - size_t LocalSize) const { + const Value *Val, size_t LocalSize, + bool InAggregate) const { if (cast(GEPI->getType())->getAddressSpace() == AS) { // If the GEPI is already using the correct address-space, no change is // required. return Error::success(); } - if (GEPI->getNumIndices() != 1 && - std::any_of(GEPI->user_begin(), GEPI->user_end(), [](const auto *User) { - return isa(User); - })) { - return createStringError(inconvertibleErrorCode(), - "Only one index expected in source of " - "promotable GEP instruction pointer argument"); - } // Recurse to check all users of the GEP. - return canPromoteValue(GEPI, LocalSize); + return canPromoteValue(GEPI, LocalSize, + InAggregate || GEPI->getNumIndices() >= 2); } -Error SYCLInternalizerImpl::canPromoteValue(Value *Val, - size_t LocalSize) const { +Error SYCLInternalizerImpl::canPromoteValue(Value *Val, size_t LocalSize, + bool InAggregate) const { for (auto *U : Val->users()) { auto *I = dyn_cast(U); if (!I) { @@ -272,13 +274,14 @@ Error SYCLInternalizerImpl::canPromoteValue(Value *Val, case Instruction::Call: case Instruction::Invoke: case Instruction::CallBr: - if (auto Err = canPromoteCall(cast(I), Val, LocalSize)) { + if (auto Err = + canPromoteCall(cast(I), Val, LocalSize, InAggregate)) { return Err; } break; case Instruction::GetElementPtr: - if (auto Err = - canPromoteGEP(cast(I), Val, LocalSize)) { + if (auto Err = canPromoteGEP(cast(I), Val, LocalSize, + InAggregate)) { return Err; } break; @@ -316,7 +319,7 @@ Error SYCLInternalizerImpl::checkArgsPromotable( PromoteArgSizes[Index] = 0; continue; } - if (auto Err = canPromoteValue(Arg, LocalSize)) { + if (auto Err = canPromoteValue(Arg, LocalSize, /*InAggregate=*/false)) { // Set the local size to 0 to indicate that this argument should not be // promoted. PromoteArgSizes[Index] = 0; @@ -360,30 +363,33 @@ void SYCLInternalizerImpl::promoteCall(CallBase *C, const Value *Val, } void SYCLInternalizerImpl::promoteGEPI(GetElementPtrInst *GEPI, - const Value *Val, - std::size_t LocalSize) const { + const Value *Val, std::size_t LocalSize, + bool InAggregate) const { // Not PointerType is unreachable. Other case is catched in caller. if (cast(GEPI->getType())->getAddressSpace() != AS) { - remapIndices(GEPI, LocalSize); + if (!InAggregate) + remapIndices(GEPI, LocalSize); auto *ValTy = cast(Val->getType()); GEPI->mutateType(PointerType::getWithSamePointeeType( cast(GEPI->getType()), ValTy->getAddressSpace())); - return promoteValue(GEPI, LocalSize); + return promoteValue(GEPI, LocalSize, + InAggregate || GEPI->getNumIndices() >= 2); } } -void SYCLInternalizerImpl::promoteValue(Value *Val, - std::size_t LocalSize) const { +void SYCLInternalizerImpl::promoteValue(Value *Val, std::size_t LocalSize, + bool InAggregate) const { for (auto *U : Val->users()) { auto *I = cast(U); switch (I->getOpcode()) { case Instruction::Call: case Instruction::Invoke: case Instruction::CallBr: + assert(!InAggregate); promoteCall(cast(I), Val, LocalSize); break; case Instruction::GetElementPtr: - promoteGEPI(cast(I), Val, LocalSize); + promoteGEPI(cast(I), Val, LocalSize, InAggregate); break; case Instruction::Load: case Instruction::Store: @@ -535,7 +541,7 @@ Function *SYCLInternalizerImpl::promoteFunctionArgs( if (CreateAllocas) { Arg = replaceByNewAlloca(cast(Arg), AS, LocalSize); } - promoteValue(Arg, LocalSize); + promoteValue(Arg, LocalSize, /*InAggregate=*/false); } TargetInfo.updateAddressSpaceMetadata(NewF, PromoteToLocal, AS); diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp index b968b48af9497..e90f42d023616 100644 --- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp @@ -2,9 +2,6 @@ // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out // RUN: %{run} %t.out -// FIXME: enable opaque pointers support -// REQUIRES: TEMPORARY_DISABLED - // Test internalization of a nested array type. #include From abc4bb3bed5bd397203e5df07763b495a4ccf97b Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Tue, 22 Aug 2023 09:10:41 +0100 Subject: [PATCH 2/6] Explain magic number. Signed-off-by: Julian Oppermann --- sycl-fusion/passes/internalization/Internalization.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp index 1d4c9bc82596e..23b899374f402 100644 --- a/sycl-fusion/passes/internalization/Internalization.cpp +++ b/sycl-fusion/passes/internalization/Internalization.cpp @@ -248,7 +248,10 @@ Error SYCLInternalizerImpl::canPromoteGEP(GetElementPtrInst *GEPI, // required. return Error::success(); } - // Recurse to check all users of the GEP. + // Recurse to check all users of the GEP. We are either already in + // `InAggregate` mode, or inspect the current instruction. Recall that a GEP's + // first index is used to step through the base pointer, whereas any + // additional indices represent addressing into an aggregrate type. return canPromoteValue(GEPI, LocalSize, InAggregate || GEPI->getNumIndices() >= 2); } @@ -372,6 +375,10 @@ void SYCLInternalizerImpl::promoteGEPI(GetElementPtrInst *GEPI, auto *ValTy = cast(Val->getType()); GEPI->mutateType(PointerType::getWithSamePointeeType( cast(GEPI->getType()), ValTy->getAddressSpace())); + // Recurse to promote to all users of the GEP. We are either already in + // `InAggregate` mode, or inspect the current instruction. Recall that a + // GEP's first index is used to step through the base pointer, whereas any + // additional indices represent addressing into an aggregrate type. return promoteValue(GEPI, LocalSize, InAggregate || GEPI->getNumIndices() >= 2); } From 3f94da5072732da55bdc563b69ef9e8b5fc3a33b Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Thu, 24 Aug 2023 10:22:23 +0100 Subject: [PATCH 3/6] Add tests. Signed-off-by: Julian Oppermann --- .../internalization/promote-local-nested.ll | 214 ++++++++++++++++++ .../internalization/promote-private-nested.ll | 204 +++++++++++++++++ .../internalize_array_wrapper_local.cpp | 139 ++++++++++++ 3 files changed, 557 insertions(+) create mode 100644 sycl-fusion/test/internalization/promote-local-nested.ll create mode 100644 sycl-fusion/test/internalization/promote-private-nested.ll create mode 100644 sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp diff --git a/sycl-fusion/test/internalization/promote-local-nested.ll b/sycl-fusion/test/internalization/promote-local-nested.ll new file mode 100644 index 0000000000000..5def5e7706fff --- /dev/null +++ b/sycl-fusion/test/internalization/promote-local-nested.ll @@ -0,0 +1,214 @@ +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: -passes=sycl-internalization --sycl-info-path %S/../kernel-fusion/kernel-info.yaml -S %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::id" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [1 x i64] } +%struct.array_wrapper = type { %"struct.std::array" } +%"struct.std::array" = type { [2 x %"struct.std::array.0"] } +%"struct.std::array.0" = type { [2 x %"class.sycl::_V1::vec"] } +%"class.sycl::_V1::vec" = type { <2 x i32> } + +; Function Attrs: alwaysinline nounwind +define spir_func void @__itt_offload_wi_start_wrapper() #0 { +entry: + %GroupID = alloca [3 x i64], align 8 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: noinline nounwind +define spir_func void @__itt_offload_wi_start_stub(ptr addrspace(4) %group_id, i64 %wi_id, i32 %wg_size) #2 { +entry: + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.assume(i1 noundef) #3 + +; Function Attrs: alwaysinline nounwind +define spir_func void @__itt_offload_wi_finish_wrapper() #0 { +entry: + %GroupID = alloca [3 x i64], align 8 + ret void +} + +; Function Attrs: noinline nounwind +define spir_func void @__itt_offload_wi_finish_stub(ptr addrspace(4) %group_id, i64 %wi_id) #2 { +entry: + ret void +} + +; Function Attrs: nounwind willreturn memory(none) +declare spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32) #4 + +; Function Attrs: nounwind willreturn memory(none) +declare spir_func i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32) #4 + +; Function Attrs: nounwind willreturn memory(none) +declare spir_func i64 @_Z29__spirv_BuiltInGlobalLinearIdv() #4 + +; Function Attrs: nounwind willreturn memory(none) +declare spir_func i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32) #4 + +define spir_kernel void @fused_0(ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 !kernel_arg_name !10 !sycl.kernel.promote !11 !sycl.kernel.promote.localsize !12 !sycl.kernel.constants !13 { +; Scenario: Test the successful local internalization of the pointer argument +; `...KernelOne__arg_accTmp`. This means the pointer argument has been replaced +; by a pointer to the local address space (address space 3), and offset-wrapping +; instructions have been introduced. This test is based on an IR dump of +; `sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp`. + +; CHECK-LABEL: define spir_kernel void @fused_0 +; CHECK-SAME: (ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN1:%.*]], ptr byval(%"class.sycl::_V1::id") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN13:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN2:%.*]], ptr byval(%"class.sycl::_V1::id") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN26:%.*]], ptr addrspace(3) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP:%.*]], ptr byval(%"class.sycl::_V1::id") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP9:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCIN3:%.*]], ptr byval(%"class.sycl::_V1::id") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCIN36:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCOUT:%.*]], ptr byval(%"class.sycl::_V1::id") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCOUT9:%.*]]) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 !kernel_arg_name !10 !sycl.kernel.constants !11 { +; CHECK-NEXT: entry: +; CHECK: [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP93_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP9]], align 1 +; CHECK: [[TMP0:%.*]] = urem i64 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP93_SROA_0_0_COPYLOAD]], 4 +; CHECK: [[ADD_PTR_I43_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP]], i64 [[TMP0]] +; CHECK: [[TMP1:%.*]] = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #[[ATTR4:[0-9]+]] +; CHECK: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]] +; CHECK: [[TMP3:%.*]] = urem i64 [[TMP2]], 4 +; CHECK: [[ARRAYIDX_I34_I_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[ADD_PTR_I43_I]], i64 [[TMP3]] +; CHECK: [[I_0_I_I:%.*]] = phi i64 {{.*}} +; CHECK: [[TMP4:%.*]] = add i64 0, [[TMP3]] +; CHECK: [[TMP5:%.*]] = add i64 [[TMP4]], [[TMP0]] +; CHECK: [[TMP6:%.*]] = urem i64 [[TMP5]], 4 +; CHECK: [[ARRAYIDX_I_I_I37_I_I:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(3) [[ARRAYIDX_I34_I_I]], i64 [[TMP6]], i64 [[I_0_I_I]] +; CHECK: store <2 x i32> {{%.*}}, ptr addrspace(3) [[ARRAYIDX_I_I_I37_I_I]], align 8 +; CHECK: [[ARRAYIDX_I_I39_I_I_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I_I_I37_I_I]], i64 0, i64 1 +; CHECK: store <2 x i32> {{%.*}}, ptr addrspace(3) [[ARRAYIDX_I_I39_I_I_1]], align 8 +; CHECK: [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP94_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP9]], align 1 +; CHECK: [[TMP11:%.*]] = urem i64 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP94_SROA_0_0_COPYLOAD]], 4 +; CHECK: [[ADD_PTR_I_I7:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP]], i64 [[TMP11]] +; CHECK: [[TMP12:%.*]] = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #[[ATTR4]] +; CHECK: [[TMP13:%.*]] = add i64 [[TMP12]], [[TMP11]] +; CHECK: [[TMP14:%.*]] = urem i64 [[TMP13]], 4 +; CHECK: [[ARRAYIDX_I_I_I11:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[ADD_PTR_I_I7]], i64 [[TMP14]] +; CHECK: [[I_0_I_I15:%.*]] = phi i64 {{.*}} +; CHECK: [[TMP15:%.*]] = add i64 0, [[TMP14]] +; CHECK: [[TMP16:%.*]] = add i64 [[TMP15]], [[TMP11]] +; CHECK: [[TMP17:%.*]] = urem i64 [[TMP16]], 4 +; CHECK: [[ARRAYIDX_I_I_I_I_I18:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(3) [[ARRAYIDX_I_I_I11]], i64 [[TMP17]], i64 [[I_0_I_I15]] +; CHECK: [[TMP18:%.*]] = load <2 x i32>, ptr addrspace(3) [[ARRAYIDX_I_I_I_I_I18]], align 8 +; CHECK: [[ARRAYIDX_I_I_I_I27_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I_I_I_I_I18]], i64 0, i64 1 +; CHECK: ret void +; +entry: + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp93.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn262.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn131.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13, align 1 + %add.ptr.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn131.sroa.0.0.copyload + %add.ptr.i34.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn262.sroa.0.0.copyload + %add.ptr.i43.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp93.sroa.0.0.copyload + %0 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #4 + %cmp.i.i.i = icmp ult i64 %0, 2147483648 + call void @llvm.assume(i1 %cmp.i.i.i) + %arrayidx.i.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i, i64 %0 + %arrayidx.i30.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i34.i, i64 %0 + %arrayidx.i34.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i43.i, i64 %0 + br label %for.cond.i.i + +for.cond.i.i: ; preds = %for.body.i.i, %entry + %i.0.i.i = phi i64 [ 0, %entry ], [ %inc17.i.i, %for.body.i.i ] + %cmp.i.i = icmp ult i64 %i.0.i.i, 16 + br i1 %cmp.i.i, label %for.body.i.i, label %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne.exit + +for.body.i.i: ; preds = %for.cond.i.i + %arrayidx.i.i.i.i.i = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i, i64 0, i64 %i.0.i.i + %arrayidx.i.i.i36.i.i = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i, i64 0, i64 %i.0.i.i + %arrayidx.i.i.i37.i.i = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i, i64 0, i64 %i.0.i.i + %1 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i, align 8 + %2 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i, align 8 + %add.i.i.i = add <2 x i32> %1, %2 + store <2 x i32> %add.i.i.i, ptr addrspace(1) %arrayidx.i.i.i37.i.i, align 8 + %arrayidx.i.i.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i, i64 0, i64 1 + %arrayidx.i.i38.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i, i64 0, i64 1 + %3 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.1, align 8 + %4 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i.1, align 8 + %add.i.i.i.1 = add <2 x i32> %3, %4 + %arrayidx.i.i39.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i, i64 0, i64 1 + store <2 x i32> %add.i.i.i.1, ptr addrspace(1) %arrayidx.i.i39.i.i.1, align 8 + %inc17.i.i = add nuw nsw i64 %i.0.i.i, 1 + br label %for.cond.i.i + +_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne.exit: ; preds = %for.cond.i.i + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut96.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn365.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp94.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, align 1 + %add.ptr.i.i7 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp94.sroa.0.0.copyload + %add.ptr.i34.i8 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn365.sroa.0.0.copyload + %add.ptr.i43.i9 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut96.sroa.0.0.copyload + %5 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #4 + %cmp.i.i.i10 = icmp ult i64 %5, 2147483648 + call void @llvm.assume(i1 %cmp.i.i.i10) + %arrayidx.i.i.i11 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i7, i64 %5 + %arrayidx.i30.i.i12 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i34.i8, i64 %5 + %arrayidx.i34.i.i13 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i43.i9, i64 %5 + br label %for.cond.i.i14 + +for.cond.i.i14: ; preds = %for.body.i.i17, %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne.exit + %i.0.i.i15 = phi i64 [ 0, %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne.exit ], [ %inc17.i.i25, %for.body.i.i17 ] + %cmp.i.i16 = icmp ult i64 %i.0.i.i15, 16 + br i1 %cmp.i.i16, label %for.body.i.i17, label %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo.exit + +for.body.i.i17: ; preds = %for.cond.i.i14 + %arrayidx.i.i.i.i.i18 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i11, i64 0, i64 %i.0.i.i15 + %arrayidx.i.i.i36.i.i19 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i12, i64 0, i64 %i.0.i.i15 + %arrayidx.i.i.i37.i.i20 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i13, i64 0, i64 %i.0.i.i15 + %6 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i18, align 8 + %7 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i19, align 8 + %mul.i.i.i = mul <2 x i32> %6, %7 + store <2 x i32> %mul.i.i.i, ptr addrspace(1) %arrayidx.i.i.i37.i.i20, align 8 + %arrayidx.i.i.i.i27.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i18, i64 0, i64 1 + %arrayidx.i.i38.i.i28.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i19, i64 0, i64 1 + %8 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i27.1, align 8 + %9 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i28.1, align 8 + %mul.i.i.i.1 = mul <2 x i32> %8, %9 + %arrayidx.i.i39.i.i29.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i20, i64 0, i64 1 + store <2 x i32> %mul.i.i.i.1, ptr addrspace(1) %arrayidx.i.i39.i.i29.1, align 8 + %inc17.i.i25 = add nuw nsw i64 %i.0.i.i15, 1 + br label %for.cond.i.i14 + +_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo.exit: ; preds = %for.cond.i.i14 + ret void +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #5 + +attributes #0 = { alwaysinline nounwind } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { noinline nounwind } +attributes #3 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #4 = { nounwind willreturn memory(none) } +attributes #5 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } + +!spirv.MemoryModel = !{!0} +!opencl.enable.FP_CONTRACT = !{} +!spirv.Source = !{!1} +!opencl.spir.version = !{!2} +!opencl.ocl.version = !{!3} +!opencl.used.extensions = !{!4} +!opencl.used.optional.core.features = !{!4} +!spirv.Generator = !{!5} + +!0 = !{i32 2, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 1, i32 2} +!3 = !{i32 1, i32 0} +!4 = !{} +!5 = !{i16 6, i16 14} +!6 = !{i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0} +!7 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none"} +!8 = !{!"struct array_wrapper*", !"class.sycl::_V1::id", !"struct array_wrapper*", !"class.sycl::_V1::id", !"struct array_wrapper*", !"class.sycl::_V1::id", !"struct array_wrapper*", !"class.sycl::_V1::id", !"struct array_wrapper*", !"class.sycl::_V1::id"} +!9 = !{!"", !"", !"", !"", !"", !"", !"", !"", !"", !""} +!10 = !{!"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9"} +!11 = !{!"none", !"none", !"none", !"none", !"local", !"none", !"none", !"none", !"none", !"none"} +!12 = !{!"", !"", !"", !"", i64 4, !"", !"", !"", !"", !""} +!13 = !{!"", !"\00\00\00\00\00\00\00\00", !"", !"\00\00\00\00\00\00\00\00", !"", !"", !"", !"\00\00\00\00\00\00\00\00", !"", !"\00\00\00\00\00\00\00\00"} diff --git a/sycl-fusion/test/internalization/promote-private-nested.ll b/sycl-fusion/test/internalization/promote-private-nested.ll new file mode 100644 index 0000000000000..54be0fc1ae40b --- /dev/null +++ b/sycl-fusion/test/internalization/promote-private-nested.ll @@ -0,0 +1,204 @@ +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: -passes=sycl-internalization --sycl-info-path %S/../kernel-fusion/kernel-info.yaml -S %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [1 x i64] } +%struct.array_wrapper = type { %"struct.std::array" } +%"struct.std::array" = type { [2 x %"struct.std::array.0"] } +%"struct.std::array.0" = type { [2 x %"class.sycl::_V1::vec"] } +%"class.sycl::_V1::vec" = type { <2 x i32> } + +; Function Attrs: alwaysinline nounwind +define spir_func void @__itt_offload_wi_start_wrapper() #0 { +entry: + %GroupID = alloca [3 x i64], align 8 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: noinline nounwind +define spir_func void @__itt_offload_wi_start_stub(ptr addrspace(4) %group_id, i64 %wi_id, i32 %wg_size) #2 { +entry: + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.assume(i1 noundef) #3 + +; Function Attrs: alwaysinline nounwind +define spir_func void @__itt_offload_wi_finish_wrapper() #0 { +entry: + %GroupID = alloca [3 x i64], align 8 + ret void +} + +; Function Attrs: noinline nounwind +define spir_func void @__itt_offload_wi_finish_stub(ptr addrspace(4) %group_id, i64 %wi_id) #2 { +entry: + ret void +} + +; Function Attrs: nounwind willreturn memory(none) +declare spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32) #4 + +; Function Attrs: nounwind willreturn memory(none) +declare spir_func i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32) #4 + +; Function Attrs: nounwind willreturn memory(none) +declare spir_func i64 @_Z29__spirv_BuiltInGlobalLinearIdv() #4 + +; Function Attrs: nounwind willreturn memory(none) +declare spir_func i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32) #4 + +define spir_kernel void @fused_0(ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 !kernel_arg_name !10 !sycl.kernel.promote !11 !sycl.kernel.promote.localsize !12 !sycl.kernel.constants !13 { +; Scenario: Test the successful private internalization of the pointer argument +; `...KernelOne__arg_accTmp`. This means the pointer argument has been replaced +; by a function-local alloca and all accesses have been updated to use this +; alloca (and the default address space) instead. This test is based on an IR +; dump of `sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp`. + +; CHECK-LABEL: define spir_kernel void @fused_0 +; CHECK-SAME: (ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN1:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN13:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN2:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN26:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP9:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCIN3:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCIN36:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCOUT:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCOUT9:%.*]]) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 !kernel_arg_name !10 !sycl.kernel.constants !11 { +; CHECK: entry: +; CHECK: [[TMP0:%.*]] = alloca [1 x %struct.array_wrapper], align 8 +; CHECK: [[TMP1:%.*]] = getelementptr inbounds [1 x %struct.array_wrapper], ptr [[TMP0]], i64 0, i64 0 +; CHECK: [[ADD_PTR_I43_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[TMP1]], i64 0 +; CHECK: [[TMP2:%.*]] = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #[[ATTR4:[0-9]+]] +; CHECK: [[ARRAYIDX_I34_I_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[ADD_PTR_I43_I]], i64 0 +; CHECK: store <2 x i32> {{%.*}}, ptr [[ARRAYIDX_I34_I_I]], align 8 +; CHECK: [[ARRAYIDX_I_I39_I_I_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr [[ARRAYIDX_I34_I_I]], i64 0, i64 1 +; CHECK: store <2 x i32> {{%.*}}, ptr [[ARRAYIDX_I_I39_I_I_1]], align 8 +; CHECK: [[ARRAYIDX_I_I_I37_I_I_1:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr [[ARRAYIDX_I34_I_I]], i64 0, i64 1 +; CHECK: store <2 x i32> {{%.*}}, ptr [[ARRAYIDX_I_I_I37_I_I_1]], align 8 +; CHECK: [[ARRAYIDX_I_I39_I_I_1_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr [[ARRAYIDX_I_I_I37_I_I_1]], i64 0, i64 1 +; CHECK: store <2 x i32> {{%.*}}, ptr [[ARRAYIDX_I_I39_I_I_1_1]], align 8 +; CHECK: [[ADD_PTR_I_I7:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[TMP1]], i64 0 +; CHECK: [[ARRAYIDX_I_I_I11:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[ADD_PTR_I_I7]], i64 0 +; CHECK: [[TMP12:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_I_I_I11]], align 8 +; CHECK: [[ARRAYIDX_I_I_I_I27_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr [[ARRAYIDX_I_I_I11]], i64 0, i64 1 +; CHECK: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_I_I_I_I27_1]], align 8 +; CHECK: [[ARRAYIDX_I_I_I_I_I18_1:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr [[ARRAYIDX_I_I_I11]], i64 0, i64 1 +; CHECK: [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_I_I_I_I_I18_1]], align 8 +; CHECK: [[ARRAYIDX_I_I_I_I27_1_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr [[ARRAYIDX_I_I_I_I_I18_1]], i64 0, i64 1 +; CHECK: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_I_I_I_I27_1_1]], align 8 +; CHECK: ret void +; +entry: + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp93.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn262.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn131.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13, align 1 + %add.ptr.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn131.sroa.0.0.copyload + %add.ptr.i34.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn262.sroa.0.0.copyload + %add.ptr.i43.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp93.sroa.0.0.copyload + %0 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #4 + %cmp.i.i.i = icmp ult i64 %0, 2147483648 + call void @llvm.assume(i1 %cmp.i.i.i) + %arrayidx.i.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i, i64 %0 + %arrayidx.i30.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i34.i, i64 %0 + %arrayidx.i34.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i43.i, i64 %0 + %1 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i, align 8 + %2 = load <2 x i32>, ptr addrspace(1) %arrayidx.i30.i.i, align 8 + %add.i.i.i = add <2 x i32> %1, %2 + store <2 x i32> %add.i.i.i, ptr addrspace(1) %arrayidx.i34.i.i, align 8 + %arrayidx.i.i.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i, i64 0, i64 1 + %arrayidx.i.i38.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i30.i.i, i64 0, i64 1 + %3 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.1, align 8 + %4 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i.1, align 8 + %add.i.i.i.1 = add <2 x i32> %3, %4 + %arrayidx.i.i39.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i34.i.i, i64 0, i64 1 + store <2 x i32> %add.i.i.i.1, ptr addrspace(1) %arrayidx.i.i39.i.i.1, align 8 + %arrayidx.i.i.i.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i, i64 0, i64 1 + %arrayidx.i.i.i36.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i, i64 0, i64 1 + %arrayidx.i.i.i37.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i, i64 0, i64 1 + %5 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i.1, align 8 + %6 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i.1, align 8 + %add.i.i.i.131 = add <2 x i32> %5, %6 + store <2 x i32> %add.i.i.i.131, ptr addrspace(1) %arrayidx.i.i.i37.i.i.1, align 8 + %arrayidx.i.i.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i.1, i64 0, i64 1 + %arrayidx.i.i38.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i.1, i64 0, i64 1 + %7 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.1.1, align 8 + %8 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i.1.1, align 8 + %add.i.i.i.1.1 = add <2 x i32> %7, %8 + %arrayidx.i.i39.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i.1, i64 0, i64 1 + store <2 x i32> %add.i.i.i.1.1, ptr addrspace(1) %arrayidx.i.i39.i.i.1.1, align 8 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut96.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn365.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp94.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, align 1 + %add.ptr.i.i7 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp94.sroa.0.0.copyload + %add.ptr.i34.i8 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn365.sroa.0.0.copyload + %add.ptr.i43.i9 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut96.sroa.0.0.copyload + %9 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #4 + %cmp.i.i.i10 = icmp ult i64 %9, 2147483648 + call void @llvm.assume(i1 %cmp.i.i.i10) + %arrayidx.i.i.i11 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i7, i64 %9 + %arrayidx.i30.i.i12 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i34.i8, i64 %9 + %arrayidx.i34.i.i13 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i43.i9, i64 %9 + %10 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i11, align 8 + %11 = load <2 x i32>, ptr addrspace(1) %arrayidx.i30.i.i12, align 8 + %mul.i.i.i = mul <2 x i32> %10, %11 + store <2 x i32> %mul.i.i.i, ptr addrspace(1) %arrayidx.i34.i.i13, align 8 + %arrayidx.i.i.i.i27.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i11, i64 0, i64 1 + %arrayidx.i.i38.i.i28.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i30.i.i12, i64 0, i64 1 + %12 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i27.1, align 8 + %13 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i28.1, align 8 + %mul.i.i.i.1 = mul <2 x i32> %12, %13 + %arrayidx.i.i39.i.i29.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i34.i.i13, i64 0, i64 1 + store <2 x i32> %mul.i.i.i.1, ptr addrspace(1) %arrayidx.i.i39.i.i29.1, align 8 + %arrayidx.i.i.i.i.i18.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i11, i64 0, i64 1 + %arrayidx.i.i.i36.i.i19.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i12, i64 0, i64 1 + %arrayidx.i.i.i37.i.i20.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i13, i64 0, i64 1 + %14 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i18.1, align 8 + %15 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i19.1, align 8 + %mul.i.i.i.135 = mul <2 x i32> %14, %15 + store <2 x i32> %mul.i.i.i.135, ptr addrspace(1) %arrayidx.i.i.i37.i.i20.1, align 8 + %arrayidx.i.i.i.i27.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i18.1, i64 0, i64 1 + %arrayidx.i.i38.i.i28.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i19.1, i64 0, i64 1 + %16 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i27.1.1, align 8 + %17 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i28.1.1, align 8 + %mul.i.i.i.1.1 = mul <2 x i32> %16, %17 + %arrayidx.i.i39.i.i29.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i20.1, i64 0, i64 1 + store <2 x i32> %mul.i.i.i.1.1, ptr addrspace(1) %arrayidx.i.i39.i.i29.1.1, align 8 + ret void +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #5 + +attributes #0 = { alwaysinline nounwind } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { noinline nounwind } +attributes #3 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #4 = { nounwind willreturn memory(none) } +attributes #5 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } + +!spirv.MemoryModel = !{!0} +!opencl.enable.FP_CONTRACT = !{} +!spirv.Source = !{!1} +!opencl.spir.version = !{!2} +!opencl.ocl.version = !{!3} +!opencl.used.extensions = !{!4} +!opencl.used.optional.core.features = !{!4} +!spirv.Generator = !{!5} + +!0 = !{i32 2, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 1, i32 2} +!3 = !{i32 1, i32 0} +!4 = !{} +!5 = !{i16 6, i16 14} +!6 = !{i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0} +!7 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none"} +!8 = !{!"struct array_wrapper*", !"class.sycl::_V1::range", !"struct array_wrapper*", !"class.sycl::_V1::range", !"struct array_wrapper*", !"class.sycl::_V1::range", !"struct array_wrapper*", !"class.sycl::_V1::range", !"struct array_wrapper*", !"class.sycl::_V1::range"} +!9 = !{!"", !"", !"", !"", !"", !"", !"", !"", !"", !""} +!10 = !{!"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9"} +!11 = !{!"none", !"none", !"none", !"none", !"private", !"none", !"none", !"none", !"none", !"none"} +!12 = !{!"", !"", !"", !"", i64 1, !"", !"", !"", !"", !""} +!13 = !{!"", !"\00\00\00\00\00\00\00\00", !"", !"\00\00\00\00\00\00\00\00", !"", !"", !"", !"\00\00\00\00\00\00\00\00", !"", !"\00\00\00\00\00\00\00\00"} diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp new file mode 100644 index 0000000000000..650580768977b --- /dev/null +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp @@ -0,0 +1,139 @@ +// REQUIRES: fusion +// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out +// RUN: %{run} %t.out + +// Test local internalization of a nested array type. + +#include + +#include + +using namespace sycl; + +template struct array_wrapper { + static constexpr size_t rows{N}; + static constexpr size_t columns{M}; + static constexpr size_t vec_width{2}; + + using value_type = vec; + using reference_type = value_type &; + using const_reference_type = const value_type &; + + std::array, rows> vs; + + explicit array_wrapper(const_reference_type v) { + std::array el; + el.fill(v); + vs.fill(el); + } + + array_wrapper() : array_wrapper{value_type{}} {} + + constexpr std::array &operator[](size_t i) { + return vs[i]; + } + + constexpr const std::array &operator[](size_t i) const { + return vs[i]; + } +}; + +int main() { + constexpr size_t dataSize = 16; + constexpr size_t rows = 2; + constexpr size_t columns = 2; + + using array_type = array_wrapper; + + array_type in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], + out[dataSize]; + + for (size_t id = 0; id < dataSize; ++id) { + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < columns; ++j) { + in1[id][i][j].s0() = in1[id][i][j].s1() = id * 2; + in2[id][i][j].s0() = in2[id][i][j].s1() = id * 3; + in3[id][i][j].s0() = in3[id][i][j].s1() = id * 4; + tmp[id][i][j].s0() = tmp[id][i][j].s1() = -1; + out[id][i][j].s0() = out[id][i][j].s1() = -1; + } + } + } + + queue q{default_selector_v, + {ext::codeplay::experimental::property::queue::enable_fusion{}}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + cgh.parallel_for( + nd_range<1>{{dataSize}, {4}}, [=](id<1> id) { + const auto &accIn1Wrapp = accIn1[id]; + const auto &accIn2Wrapp = accIn2[id]; + auto &accTmpWrapp = accTmp[id]; + for (size_t i = 0; i < dataSize; ++i) { + const auto &in1 = accIn1Wrapp[i]; + const auto &in2 = accIn2Wrapp[i]; + auto &tmp = accTmpWrapp[i]; + for (size_t j = 0; j < columns; ++j) { + tmp[j] = in1[j] + in2[j]; + } + } + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{dataSize}, {4}}, [=](id<1> id) { + const auto &tmpWrapp = accTmp[id]; + const auto &accIn3Wrapp = accIn3[id]; + auto &accOutWrapp = accOut[id]; + for (size_t i = 0; i < dataSize; ++i) { + const auto &tmp = tmpWrapp[i]; + const auto &in3 = accIn3Wrapp[i]; + auto &out = accOutWrapp[i]; + for (size_t j = 0; j < columns; ++j) { + out[j] = tmp[j] * in3[j]; + } + } + }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + constexpr array_type::value_type not_written{-1, -1}; + for (size_t id = 0; id < dataSize; ++id) { + const array_type::value_type expected{20 * id * id, 20 * id * id}; + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < columns; ++j) { + assert(all(out[id][i][j] == expected) && "Computation error"); + assert(all(tmp[id][i][j] == not_written) && "Not internalizing"); + } + } + } + + return 0; +} From a88f3cba49f9e0fa37df8b3bcc5a92d26c739753 Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Thu, 24 Aug 2023 14:46:45 +0100 Subject: [PATCH 4/6] Fix bug in testcase. Signed-off-by: Julian Oppermann --- .../internalization/promote-local-nested.ll | 222 ++++++++---------- .../internalization/promote-private-nested.ll | 83 ++----- .../internalize_array_wrapper.cpp | 4 +- .../internalize_array_wrapper_local.cpp | 4 +- 4 files changed, 129 insertions(+), 184 deletions(-) diff --git a/sycl-fusion/test/internalization/promote-local-nested.ll b/sycl-fusion/test/internalization/promote-local-nested.ll index 5def5e7706fff..1293c72dc62f9 100644 --- a/sycl-fusion/test/internalization/promote-local-nested.ll +++ b/sycl-fusion/test/internalization/promote-local-nested.ll @@ -11,52 +11,17 @@ target triple = "spir64-unknown-unknown" %"struct.std::array.0" = type { [2 x %"class.sycl::_V1::vec"] } %"class.sycl::_V1::vec" = type { <2 x i32> } -; Function Attrs: alwaysinline nounwind -define spir_func void @__itt_offload_wi_start_wrapper() #0 { -entry: - %GroupID = alloca [3 x i64], align 8 - ret void -} +; Function Attrs: nounwind +declare spir_func void @__itt_offload_wi_start_wrapper() #0 -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 - -; Function Attrs: noinline nounwind -define spir_func void @__itt_offload_wi_start_stub(ptr addrspace(4) %group_id, i64 %wi_id, i32 %wg_size) #2 { -entry: - ret void -} - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 +; Function Attrs: nounwind +declare spir_func void @__itt_offload_wi_finish_wrapper() #0 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) -declare void @llvm.assume(i1 noundef) #3 - -; Function Attrs: alwaysinline nounwind -define spir_func void @__itt_offload_wi_finish_wrapper() #0 { -entry: - %GroupID = alloca [3 x i64], align 8 - ret void -} - -; Function Attrs: noinline nounwind -define spir_func void @__itt_offload_wi_finish_stub(ptr addrspace(4) %group_id, i64 %wi_id) #2 { -entry: - ret void -} - -; Function Attrs: nounwind willreturn memory(none) -declare spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32) #4 - -; Function Attrs: nounwind willreturn memory(none) -declare spir_func i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32) #4 +declare void @llvm.assume(i1 noundef) #1 ; Function Attrs: nounwind willreturn memory(none) -declare spir_func i64 @_Z29__spirv_BuiltInGlobalLinearIdv() #4 - -; Function Attrs: nounwind willreturn memory(none) -declare spir_func i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32) #4 +declare spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32) #2 define spir_kernel void @fused_0(ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 !kernel_arg_name !10 !sycl.kernel.promote !11 !sycl.kernel.promote.localsize !12 !sycl.kernel.constants !13 { ; Scenario: Test the successful local internalization of the pointer argument @@ -71,32 +36,43 @@ define spir_kernel void @fused_0(ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sy ; CHECK: [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP93_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP9]], align 1 ; CHECK: [[TMP0:%.*]] = urem i64 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP93_SROA_0_0_COPYLOAD]], 4 ; CHECK: [[ADD_PTR_I43_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP]], i64 [[TMP0]] -; CHECK: [[TMP1:%.*]] = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #[[ATTR4:[0-9]+]] +; CHECK: [[TMP1:%.*]] = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #[[ATTR2:[0-9]+]] ; CHECK: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]] ; CHECK: [[TMP3:%.*]] = urem i64 [[TMP2]], 4 ; CHECK: [[ARRAYIDX_I34_I_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[ADD_PTR_I43_I]], i64 [[TMP3]] -; CHECK: [[I_0_I_I:%.*]] = phi i64 {{.*}} -; CHECK: [[TMP4:%.*]] = add i64 0, [[TMP3]] -; CHECK: [[TMP5:%.*]] = add i64 [[TMP4]], [[TMP0]] -; CHECK: [[TMP6:%.*]] = urem i64 [[TMP5]], 4 -; CHECK: [[ARRAYIDX_I_I_I37_I_I:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(3) [[ARRAYIDX_I34_I_I]], i64 [[TMP6]], i64 [[I_0_I_I]] -; CHECK: store <2 x i32> {{%.*}}, ptr addrspace(3) [[ARRAYIDX_I_I_I37_I_I]], align 8 -; CHECK: [[ARRAYIDX_I_I39_I_I_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I_I_I37_I_I]], i64 0, i64 1 +; CHECK: store <2 x i32> {{%.*}}, ptr addrspace(3) [[ARRAYIDX_I34_I_I]], align 8 +; CHECK: [[TMP8:%.*]] = add i64 0, [[TMP3]] +; CHECK: [[TMP9:%.*]] = add i64 [[TMP8]], [[TMP0]] +; CHECK: [[TMP10:%.*]] = urem i64 [[TMP9]], 4 +; CHECK: [[ARRAYIDX_I_I39_I_I_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I34_I_I]], i64 [[TMP10]], i64 1 ; CHECK: store <2 x i32> {{%.*}}, ptr addrspace(3) [[ARRAYIDX_I_I39_I_I_1]], align 8 +; CHECK: [[TMP11:%.*]] = add i64 0, [[TMP3]] +; CHECK: [[TMP12:%.*]] = add i64 [[TMP11]], [[TMP0]] +; CHECK: [[TMP13:%.*]] = urem i64 [[TMP12]], 4 +; CHECK: [[ARRAYIDX_I_I_I37_I_I_1:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(3) [[ARRAYIDX_I34_I_I]], i64 [[TMP13]], i64 1 +; CHECK: store <2 x i32> {{%.*}}, ptr addrspace(3) [[ARRAYIDX_I_I_I37_I_I_1]], align 8 +; CHECK: [[ARRAYIDX_I_I39_I_I_1_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I_I_I37_I_I_1]], i64 0, i64 1 +; CHECK: store <2 x i32> {{%.*}}, ptr addrspace(3) [[ARRAYIDX_I_I39_I_I_1_1]], align 8 ; CHECK: [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP94_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP9]], align 1 -; CHECK: [[TMP11:%.*]] = urem i64 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP94_SROA_0_0_COPYLOAD]], 4 -; CHECK: [[ADD_PTR_I_I7:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP]], i64 [[TMP11]] -; CHECK: [[TMP12:%.*]] = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #[[ATTR4]] -; CHECK: [[TMP13:%.*]] = add i64 [[TMP12]], [[TMP11]] -; CHECK: [[TMP14:%.*]] = urem i64 [[TMP13]], 4 -; CHECK: [[ARRAYIDX_I_I_I11:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[ADD_PTR_I_I7]], i64 [[TMP14]] -; CHECK: [[I_0_I_I15:%.*]] = phi i64 {{.*}} -; CHECK: [[TMP15:%.*]] = add i64 0, [[TMP14]] -; CHECK: [[TMP16:%.*]] = add i64 [[TMP15]], [[TMP11]] -; CHECK: [[TMP17:%.*]] = urem i64 [[TMP16]], 4 -; CHECK: [[ARRAYIDX_I_I_I_I_I18:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(3) [[ARRAYIDX_I_I_I11]], i64 [[TMP17]], i64 [[I_0_I_I15]] -; CHECK: [[TMP18:%.*]] = load <2 x i32>, ptr addrspace(3) [[ARRAYIDX_I_I_I_I_I18]], align 8 -; CHECK: [[ARRAYIDX_I_I_I_I27_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I_I_I_I_I18]], i64 0, i64 1 +; CHECK: [[TMP18:%.*]] = urem i64 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP94_SROA_0_0_COPYLOAD]], 4 +; CHECK: [[ADD_PTR_I_I7:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP]], i64 [[TMP18]] +; CHECK: [[TMP19:%.*]] = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #[[ATTR2]] +; CHECK: [[TMP20:%.*]] = add i64 [[TMP19]], [[TMP18]] +; CHECK: [[TMP21:%.*]] = urem i64 [[TMP20]], 4 +; CHECK: [[ARRAYIDX_I_I_I11:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[ADD_PTR_I_I7]], i64 [[TMP21]] +; CHECK: [[TMP22:%.*]] = load <2 x i32>, ptr addrspace(3) [[ARRAYIDX_I_I_I11]], align 8 +; CHECK: [[TMP24:%.*]] = add i64 0, [[TMP21]] +; CHECK: [[TMP25:%.*]] = add i64 [[TMP24]], [[TMP18]] +; CHECK: [[TMP26:%.*]] = urem i64 [[TMP25]], 4 +; CHECK: [[ARRAYIDX_I_I_I_I27_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I_I_I11]], i64 [[TMP26]], i64 1 +; CHECK: [[TMP27:%.*]] = load <2 x i32>, ptr addrspace(3) [[ARRAYIDX_I_I_I_I27_1]], align 8 +; CHECK: [[TMP29:%.*]] = add i64 0, [[TMP21]] +; CHECK: [[TMP30:%.*]] = add i64 [[TMP29]], [[TMP18]] +; CHECK: [[TMP31:%.*]] = urem i64 [[TMP30]], 4 +; CHECK: [[ARRAYIDX_I_I_I_I_I18_1:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(3) [[ARRAYIDX_I_I_I11]], i64 [[TMP31]], i64 1 +; CHECK: [[TMP32:%.*]] = load <2 x i32>, ptr addrspace(3) [[ARRAYIDX_I_I_I_I_I18_1]], align 8 +; CHECK: [[ARRAYIDX_I_I_I_I27_1_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I_I_I_I_I18_1]], i64 0, i64 1 +; CHECK: [[TMP34:%.*]] = load <2 x i32>, ptr addrspace(3) [[ARRAYIDX_I_I_I_I27_1_1]], align 8 ; CHECK: ret void ; entry: @@ -106,91 +82,93 @@ entry: %add.ptr.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn131.sroa.0.0.copyload %add.ptr.i34.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn262.sroa.0.0.copyload %add.ptr.i43.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp93.sroa.0.0.copyload - %0 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #4 + %0 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #2 %cmp.i.i.i = icmp ult i64 %0, 2147483648 call void @llvm.assume(i1 %cmp.i.i.i) %arrayidx.i.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i, i64 %0 %arrayidx.i30.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i34.i, i64 %0 %arrayidx.i34.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i43.i, i64 %0 - br label %for.cond.i.i - -for.cond.i.i: ; preds = %for.body.i.i, %entry - %i.0.i.i = phi i64 [ 0, %entry ], [ %inc17.i.i, %for.body.i.i ] - %cmp.i.i = icmp ult i64 %i.0.i.i, 16 - br i1 %cmp.i.i, label %for.body.i.i, label %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne.exit - -for.body.i.i: ; preds = %for.cond.i.i - %arrayidx.i.i.i.i.i = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i, i64 0, i64 %i.0.i.i - %arrayidx.i.i.i36.i.i = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i, i64 0, i64 %i.0.i.i - %arrayidx.i.i.i37.i.i = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i, i64 0, i64 %i.0.i.i - %1 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i, align 8 - %2 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i, align 8 + %1 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i, align 8 + %2 = load <2 x i32>, ptr addrspace(1) %arrayidx.i30.i.i, align 8 %add.i.i.i = add <2 x i32> %1, %2 - store <2 x i32> %add.i.i.i, ptr addrspace(1) %arrayidx.i.i.i37.i.i, align 8 - %arrayidx.i.i.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i, i64 0, i64 1 - %arrayidx.i.i38.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i, i64 0, i64 1 + store <2 x i32> %add.i.i.i, ptr addrspace(1) %arrayidx.i34.i.i, align 8 + %arrayidx.i.i.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i, i64 0, i64 1 + %arrayidx.i.i38.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i30.i.i, i64 0, i64 1 %3 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.1, align 8 %4 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i.1, align 8 %add.i.i.i.1 = add <2 x i32> %3, %4 - %arrayidx.i.i39.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i, i64 0, i64 1 + %arrayidx.i.i39.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i34.i.i, i64 0, i64 1 store <2 x i32> %add.i.i.i.1, ptr addrspace(1) %arrayidx.i.i39.i.i.1, align 8 - %inc17.i.i = add nuw nsw i64 %i.0.i.i, 1 - br label %for.cond.i.i - -_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne.exit: ; preds = %for.cond.i.i + %arrayidx.i.i.i.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i, i64 0, i64 1 + %arrayidx.i.i.i36.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i, i64 0, i64 1 + %arrayidx.i.i.i37.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i, i64 0, i64 1 + %5 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i.1, align 8 + %6 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i.1, align 8 + %add.i.i.i.131 = add <2 x i32> %5, %6 + store <2 x i32> %add.i.i.i.131, ptr addrspace(1) %arrayidx.i.i.i37.i.i.1, align 8 + %arrayidx.i.i.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i.1, i64 0, i64 1 + %arrayidx.i.i38.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i.1, i64 0, i64 1 + %7 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.1.1, align 8 + %8 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i.1.1, align 8 + %add.i.i.i.1.1 = add <2 x i32> %7, %8 + %arrayidx.i.i39.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i.1, i64 0, i64 1 + store <2 x i32> %add.i.i.i.1.1, ptr addrspace(1) %arrayidx.i.i39.i.i.1.1, align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut96.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9, align 1 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn365.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36, align 1 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp94.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, align 1 %add.ptr.i.i7 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp94.sroa.0.0.copyload %add.ptr.i34.i8 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn365.sroa.0.0.copyload %add.ptr.i43.i9 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut96.sroa.0.0.copyload - %5 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #4 - %cmp.i.i.i10 = icmp ult i64 %5, 2147483648 + %9 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #2 + %cmp.i.i.i10 = icmp ult i64 %9, 2147483648 call void @llvm.assume(i1 %cmp.i.i.i10) - %arrayidx.i.i.i11 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i7, i64 %5 - %arrayidx.i30.i.i12 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i34.i8, i64 %5 - %arrayidx.i34.i.i13 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i43.i9, i64 %5 - br label %for.cond.i.i14 - -for.cond.i.i14: ; preds = %for.body.i.i17, %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne.exit - %i.0.i.i15 = phi i64 [ 0, %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne.exit ], [ %inc17.i.i25, %for.body.i.i17 ] - %cmp.i.i16 = icmp ult i64 %i.0.i.i15, 16 - br i1 %cmp.i.i16, label %for.body.i.i17, label %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo.exit - -for.body.i.i17: ; preds = %for.cond.i.i14 - %arrayidx.i.i.i.i.i18 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i11, i64 0, i64 %i.0.i.i15 - %arrayidx.i.i.i36.i.i19 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i12, i64 0, i64 %i.0.i.i15 - %arrayidx.i.i.i37.i.i20 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i13, i64 0, i64 %i.0.i.i15 - %6 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i18, align 8 - %7 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i19, align 8 - %mul.i.i.i = mul <2 x i32> %6, %7 - store <2 x i32> %mul.i.i.i, ptr addrspace(1) %arrayidx.i.i.i37.i.i20, align 8 - %arrayidx.i.i.i.i27.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i18, i64 0, i64 1 - %arrayidx.i.i38.i.i28.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i19, i64 0, i64 1 - %8 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i27.1, align 8 - %9 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i28.1, align 8 - %mul.i.i.i.1 = mul <2 x i32> %8, %9 - %arrayidx.i.i39.i.i29.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i20, i64 0, i64 1 + %arrayidx.i.i.i11 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i7, i64 %9 + %arrayidx.i30.i.i12 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i34.i8, i64 %9 + %arrayidx.i34.i.i13 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i43.i9, i64 %9 + %10 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i11, align 8 + %11 = load <2 x i32>, ptr addrspace(1) %arrayidx.i30.i.i12, align 8 + %mul.i.i.i = mul <2 x i32> %10, %11 + store <2 x i32> %mul.i.i.i, ptr addrspace(1) %arrayidx.i34.i.i13, align 8 + %arrayidx.i.i.i.i27.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i11, i64 0, i64 1 + %arrayidx.i.i38.i.i28.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i30.i.i12, i64 0, i64 1 + %12 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i27.1, align 8 + %13 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i28.1, align 8 + %mul.i.i.i.1 = mul <2 x i32> %12, %13 + %arrayidx.i.i39.i.i29.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i34.i.i13, i64 0, i64 1 store <2 x i32> %mul.i.i.i.1, ptr addrspace(1) %arrayidx.i.i39.i.i29.1, align 8 - %inc17.i.i25 = add nuw nsw i64 %i.0.i.i15, 1 - br label %for.cond.i.i14 - -_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo.exit: ; preds = %for.cond.i.i14 + %arrayidx.i.i.i.i.i18.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i11, i64 0, i64 1 + %arrayidx.i.i.i36.i.i19.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i12, i64 0, i64 1 + %arrayidx.i.i.i37.i.i20.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i13, i64 0, i64 1 + %14 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i18.1, align 8 + %15 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i19.1, align 8 + %mul.i.i.i.135 = mul <2 x i32> %14, %15 + store <2 x i32> %mul.i.i.i.135, ptr addrspace(1) %arrayidx.i.i.i37.i.i20.1, align 8 + %arrayidx.i.i.i.i27.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i18.1, i64 0, i64 1 + %arrayidx.i.i38.i.i28.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i19.1, i64 0, i64 1 + %16 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i27.1.1, align 8 + %17 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i28.1.1, align 8 + %mul.i.i.i.1.1 = mul <2 x i32> %16, %17 + %arrayidx.i.i39.i.i29.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i20.1, i64 0, i64 1 + store <2 x i32> %mul.i.i.i.1.1, ptr addrspace(1) %arrayidx.i.i39.i.i29.1.1, align 8 ret void } ; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) -declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #5 +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #4 -attributes #0 = { alwaysinline nounwind } -attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -attributes #2 = { noinline nounwind } -attributes #3 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } -attributes #4 = { nounwind willreturn memory(none) } -attributes #5 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #0 = { nounwind } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #2 = { nounwind willreturn memory(none) } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } !spirv.MemoryModel = !{!0} -!opencl.enable.FP_CONTRACT = !{} !spirv.Source = !{!1} !opencl.spir.version = !{!2} !opencl.ocl.version = !{!3} diff --git a/sycl-fusion/test/internalization/promote-private-nested.ll b/sycl-fusion/test/internalization/promote-private-nested.ll index 54be0fc1ae40b..81a20490e0f2b 100644 --- a/sycl-fusion/test/internalization/promote-private-nested.ll +++ b/sycl-fusion/test/internalization/promote-private-nested.ll @@ -1,6 +1,5 @@ ; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ ; RUN: -passes=sycl-internalization --sycl-info-path %S/../kernel-fusion/kernel-info.yaml -S %s | FileCheck %s - target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" @@ -11,52 +10,17 @@ target triple = "spir64-unknown-unknown" %"struct.std::array.0" = type { [2 x %"class.sycl::_V1::vec"] } %"class.sycl::_V1::vec" = type { <2 x i32> } -; Function Attrs: alwaysinline nounwind -define spir_func void @__itt_offload_wi_start_wrapper() #0 { -entry: - %GroupID = alloca [3 x i64], align 8 - ret void -} +; Function Attrs: nounwind +declare spir_func void @__itt_offload_wi_start_wrapper() #0 -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 - -; Function Attrs: noinline nounwind -define spir_func void @__itt_offload_wi_start_stub(ptr addrspace(4) %group_id, i64 %wi_id, i32 %wg_size) #2 { -entry: - ret void -} - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 +; Function Attrs: nounwind +declare spir_func void @__itt_offload_wi_finish_wrapper() #0 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) -declare void @llvm.assume(i1 noundef) #3 - -; Function Attrs: alwaysinline nounwind -define spir_func void @__itt_offload_wi_finish_wrapper() #0 { -entry: - %GroupID = alloca [3 x i64], align 8 - ret void -} - -; Function Attrs: noinline nounwind -define spir_func void @__itt_offload_wi_finish_stub(ptr addrspace(4) %group_id, i64 %wi_id) #2 { -entry: - ret void -} - -; Function Attrs: nounwind willreturn memory(none) -declare spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32) #4 - -; Function Attrs: nounwind willreturn memory(none) -declare spir_func i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32) #4 - -; Function Attrs: nounwind willreturn memory(none) -declare spir_func i64 @_Z29__spirv_BuiltInGlobalLinearIdv() #4 +declare void @llvm.assume(i1 noundef) #1 ; Function Attrs: nounwind willreturn memory(none) -declare spir_func i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32) #4 +declare spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32) #2 define spir_kernel void @fused_0(ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 !kernel_arg_name !10 !sycl.kernel.promote !11 !sycl.kernel.promote.localsize !12 !sycl.kernel.constants !13 { ; Scenario: Test the successful private internalization of the pointer argument @@ -67,19 +31,18 @@ define spir_kernel void @fused_0(ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sy ; CHECK-LABEL: define spir_kernel void @fused_0 ; CHECK-SAME: (ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN1:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN13:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN2:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN26:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP9:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCIN3:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCIN36:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCOUT:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCOUT9:%.*]]) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 !kernel_arg_name !10 !sycl.kernel.constants !11 { -; CHECK: entry: +; CHECK-NEXT: entry: ; CHECK: [[TMP0:%.*]] = alloca [1 x %struct.array_wrapper], align 8 ; CHECK: [[TMP1:%.*]] = getelementptr inbounds [1 x %struct.array_wrapper], ptr [[TMP0]], i64 0, i64 0 ; CHECK: [[ADD_PTR_I43_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[TMP1]], i64 0 -; CHECK: [[TMP2:%.*]] = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #[[ATTR4:[0-9]+]] ; CHECK: [[ARRAYIDX_I34_I_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[ADD_PTR_I43_I]], i64 0 -; CHECK: store <2 x i32> {{%.*}}, ptr [[ARRAYIDX_I34_I_I]], align 8 +; CHECK: store <2 x i32> {{.*}}, ptr [[ARRAYIDX_I34_I_I]], align 8 ; CHECK: [[ARRAYIDX_I_I39_I_I_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr [[ARRAYIDX_I34_I_I]], i64 0, i64 1 -; CHECK: store <2 x i32> {{%.*}}, ptr [[ARRAYIDX_I_I39_I_I_1]], align 8 +; CHECK: store <2 x i32> {{.*}}, ptr [[ARRAYIDX_I_I39_I_I_1]], align 8 ; CHECK: [[ARRAYIDX_I_I_I37_I_I_1:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr [[ARRAYIDX_I34_I_I]], i64 0, i64 1 -; CHECK: store <2 x i32> {{%.*}}, ptr [[ARRAYIDX_I_I_I37_I_I_1]], align 8 +; CHECK: store <2 x i32> {{.*}}, ptr [[ARRAYIDX_I_I_I37_I_I_1]], align 8 ; CHECK: [[ARRAYIDX_I_I39_I_I_1_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr [[ARRAYIDX_I_I_I37_I_I_1]], i64 0, i64 1 -; CHECK: store <2 x i32> {{%.*}}, ptr [[ARRAYIDX_I_I39_I_I_1_1]], align 8 +; CHECK: store <2 x i32> {{.*}}, ptr [[ARRAYIDX_I_I39_I_I_1_1]], align 8 ; CHECK: [[ADD_PTR_I_I7:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[TMP1]], i64 0 ; CHECK: [[ARRAYIDX_I_I_I11:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[ADD_PTR_I_I7]], i64 0 ; CHECK: [[TMP12:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_I_I_I11]], align 8 @@ -98,7 +61,7 @@ entry: %add.ptr.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn131.sroa.0.0.copyload %add.ptr.i34.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn262.sroa.0.0.copyload %add.ptr.i43.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp93.sroa.0.0.copyload - %0 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #4 + %0 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #2 %cmp.i.i.i = icmp ult i64 %0, 2147483648 call void @llvm.assume(i1 %cmp.i.i.i) %arrayidx.i.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i, i64 %0 @@ -135,7 +98,7 @@ entry: %add.ptr.i.i7 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp94.sroa.0.0.copyload %add.ptr.i34.i8 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn365.sroa.0.0.copyload %add.ptr.i43.i9 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut96.sroa.0.0.copyload - %9 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #4 + %9 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #2 %cmp.i.i.i10 = icmp ult i64 %9, 2147483648 call void @llvm.assume(i1 %cmp.i.i.i10) %arrayidx.i.i.i11 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i7, i64 %9 @@ -170,17 +133,21 @@ entry: } ; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) -declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #5 +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #4 -attributes #0 = { alwaysinline nounwind } -attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -attributes #2 = { noinline nounwind } -attributes #3 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } -attributes #4 = { nounwind willreturn memory(none) } -attributes #5 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #0 = { nounwind } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #2 = { nounwind willreturn memory(none) } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } !spirv.MemoryModel = !{!0} -!opencl.enable.FP_CONTRACT = !{} !spirv.Source = !{!1} !opencl.spir.version = !{!2} !opencl.ocl.version = !{!3} diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp index e90f42d023616..4d2f9c4b72b94 100644 --- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp @@ -84,7 +84,7 @@ int main() { const auto &accIn1Wrapp = accIn1[id]; const auto &accIn2Wrapp = accIn2[id]; auto &accTmpWrapp = accTmp[id]; - for (size_t i = 0; i < dataSize; ++i) { + for (size_t i = 0; i < rows; ++i) { const auto &in1 = accIn1Wrapp[i]; const auto &in2 = accIn2Wrapp[i]; auto &tmp = accTmpWrapp[i]; @@ -104,7 +104,7 @@ int main() { const auto &tmpWrapp = accTmp[id]; const auto &accIn3Wrapp = accIn3[id]; auto &accOutWrapp = accOut[id]; - for (size_t i = 0; i < dataSize; ++i) { + for (size_t i = 0; i < rows; ++i) { const auto &tmp = tmpWrapp[i]; const auto &in3 = accIn3Wrapp[i]; auto &out = accOutWrapp[i]; diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp index 650580768977b..f500c6ce52eaa 100644 --- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp @@ -85,7 +85,7 @@ int main() { const auto &accIn1Wrapp = accIn1[id]; const auto &accIn2Wrapp = accIn2[id]; auto &accTmpWrapp = accTmp[id]; - for (size_t i = 0; i < dataSize; ++i) { + for (size_t i = 0; i < rows; ++i) { const auto &in1 = accIn1Wrapp[i]; const auto &in2 = accIn2Wrapp[i]; auto &tmp = accTmpWrapp[i]; @@ -106,7 +106,7 @@ int main() { const auto &tmpWrapp = accTmp[id]; const auto &accIn3Wrapp = accIn3[id]; auto &accOutWrapp = accOut[id]; - for (size_t i = 0; i < dataSize; ++i) { + for (size_t i = 0; i < rows; ++i) { const auto &tmp = tmpWrapp[i]; const auto &in3 = accIn3Wrapp[i]; auto &out = accOutWrapp[i]; From 8942ee0f4508834bd40e369c1812adbc68be469f Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Fri, 25 Aug 2023 11:44:21 +0100 Subject: [PATCH 5/6] Add lit test for conservative call behavior. Signed-off-by: Julian Oppermann --- .../internalization/Internalization.cpp | 7 ++-- .../internalization/abort-promote-call.ll | 36 +++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) create mode 100644 sycl-fusion/test/internalization/abort-promote-call.ll diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp index 23b899374f402..75aff209a600d 100644 --- a/sycl-fusion/passes/internalization/Internalization.cpp +++ b/sycl-fusion/passes/internalization/Internalization.cpp @@ -227,9 +227,10 @@ Error SYCLInternalizerImpl::canPromoteCall(CallBase *C, const Value *Val, "It is not safe to promote a called function which returns a pointer."); } if (InAggregate) { - return createStringError(inconvertibleErrorCode(), - "It is not safe to promote a pointer into an " - "aggregate object to a called function."); + return createStringError( + inconvertibleErrorCode(), + "Promotion of a pointer into an aggregate object to a called function " + "is currently not supported."); } SmallVector InternInfo = getUsagesInternalization(C, Val, LocalSize); diff --git a/sycl-fusion/test/internalization/abort-promote-call.ll b/sycl-fusion/test/internalization/abort-promote-call.ll new file mode 100644 index 0000000000000..bcd2b38143ac5 --- /dev/null +++ b/sycl-fusion/test/internalization/abort-promote-call.ll @@ -0,0 +1,36 @@ +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: -passes=sycl-internalization --sycl-info-path %S/abort-kernel-info.yaml -S %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: noinline +define void @fun(ptr addrspace(1) %arg) #0 { + ret void +} + +%struct = type { i32, i32, i32 } + +; CHECK-LABEL: define {{[^@]+}}@fused_0 +; CHECK-SAME: (ptr addrspace(1) align 4 %[[ACC:.*]]) +define spir_kernel void @fused_0(ptr addrspace(1) align 4 %acc) !kernel_arg_addr_space !12 !kernel_arg_access_qual !13 !kernel_arg_type !14 !kernel_arg_type_qual !15 !kernel_arg_base_type !14 !kernel_arg_name !16 !sycl.kernel.promote !17 !sycl.kernel.promote.localsize !18 { +; Scenario: Test private internalization is not performed when pointers into +; aggregate object are passed to function calls. + +; CHECK-NOT: alloca [1 x %struct] + %gep1 = getelementptr %struct, ptr addrspace(1) %acc, i64 17 + %gep2 = getelementptr %struct, ptr addrspace(1) %gep1, i64 0, i32 2 + call void @fun(ptr addrspace(1) %gep2) + store i32 42, ptr addrspace(1) %gep2 + ret void +} + +attributes #0 = { noinline } + +!12 = !{i32 1} +!13 = !{!"none"} +!14 = !{!"ptr"} +!15 = !{!""} +!16 = !{!"acc"} +!17 = !{!"private"} +!18 = !{i64 1} From 50167272f99f5aec05b1c41f0c06e11d50aaa65a Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Mon, 28 Aug 2023 13:12:52 +0100 Subject: [PATCH 6/6] Mark new local promotion test as unsupported on CUDA. Signed-off-by: Julian Oppermann --- sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp index f500c6ce52eaa..daa3909b705e8 100644 --- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp @@ -1,6 +1,7 @@ // REQUIRES: fusion // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out // RUN: %{run} %t.out +// UNSUPPORTED: cuda // Test local internalization of a nested array type.