diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp index 965202fe5bbfa..a6c5569d87d22 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp @@ -105,6 +105,9 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, // Ideally, the static compiler should have performed that job. const unsigned FlatAddressSpace = getFlatAddressSpace(Mod); FPM.addPass(InferAddressSpacesPass(FlatAddressSpace)); + // Run CFG simplification to prevent unreachable code from obscuring + // internalization opportunities. + FPM.addPass(SimplifyCFGPass{}); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } // Run dataflow internalization and runtime constant propagation. diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp index 3d5c38c799e8f..75aff209a600d 100644 --- a/sycl-fusion/passes/internalization/Internalization.cpp +++ b/sycl-fusion/passes/internalization/Internalization.cpp @@ -58,10 +58,10 @@ struct SYCLInternalizerImpl { /// - Promote the function and call the new function instead, /// keeping the original function. /// - The value appears in a load/store operation: Do nothing - void promoteValue(Value *Val, std::size_t LocalSize) const; + void promoteValue(Value *Val, std::size_t LocalSize, bool InAggregate) const; void promoteGEPI(GetElementPtrInst *GEPI, const Value *Val, - std::size_t LocalSize) const; + std::size_t LocalSize, bool InAggregate) const; void promoteCall(CallBase *C, const Value *Val, std::size_t LocalSize) const; @@ -81,23 +81,26 @@ struct SYCLInternalizerImpl { /// /// Check that an value can be promoted. /// For GEP and Call instructions, delegate to the specific implementations. + /// \p InAggregate indicates that at least one GEP instruction addressing into + /// an aggregate object was encountered, hence \p Val no longer represents a + /// pure offset computation on the original candidate argument. /// For address-space casts, pointer-to-int conversions and unknown users, /// return an error. - Error canPromoteValue(Value *Val, size_t LocalSize) const; + Error canPromoteValue(Value *Val, size_t LocalSize, bool InAggregate) const; /// - /// Check that the operand of a GEP can be promoted. - /// If the GEP uses more than one index, return an error. - /// Otherwise, check if the GEP itself can be promoted in its users. + /// Check that the operand of a GEP can be promoted to its users, and + /// propagate whether it represents a pointer into an aggregate object. Error canPromoteGEP(GetElementPtrInst *GEPI, const Value *Val, - size_t LocalSize) const; + size_t LocalSize, bool InAggregate) const; /// /// Check if operand to a function call can be promoted. - /// If the function returns a pointer, return an error. - /// Otherwise, check if the corresponding formal parameter can be promoted in - /// the function body. - Error canPromoteCall(CallBase *C, const Value *Val, size_t LocalSize) const; + /// If the function returns a pointer, or the operand points into an aggregate + /// object, return an error. Otherwise, check if the corresponding formal + /// parameter can be promoted in the function body. + Error canPromoteCall(CallBase *C, const Value *Val, size_t LocalSize, + bool InAggregate) const; Error checkArgsPromotable(Function *F, SmallVectorImpl &PromoteArgSizes) const; @@ -212,7 +215,8 @@ getUsagesInternalization(const User *U, const Value *V, std::size_t LocalSize) { } Error SYCLInternalizerImpl::canPromoteCall(CallBase *C, const Value *Val, - size_t LocalSize) const { + size_t LocalSize, + bool InAggregate) const { if (isa(C->getType())) { // With opaque pointers, we do not have the necessary information to compare // the element-type of the pointer returned by the function and the element @@ -222,6 +226,12 @@ Error SYCLInternalizerImpl::canPromoteCall(CallBase *C, const Value *Val, inconvertibleErrorCode(), "It is not safe to promote a called function which returns a pointer."); } + if (InAggregate) { + return createStringError( + inconvertibleErrorCode(), + "Promotion of a pointer into an aggregate object to a called function " + "is currently not supported."); + } SmallVector InternInfo = getUsagesInternalization(C, Val, LocalSize); assert(!InternInfo.empty() && "Value must be used at least once"); @@ -232,27 +242,23 @@ Error SYCLInternalizerImpl::canPromoteCall(CallBase *C, const Value *Val, } Error SYCLInternalizerImpl::canPromoteGEP(GetElementPtrInst *GEPI, - const Value *Val, - size_t LocalSize) const { + const Value *Val, size_t LocalSize, + bool InAggregate) const { if (cast(GEPI->getType())->getAddressSpace() == AS) { // If the GEPI is already using the correct address-space, no change is // required. return Error::success(); } - if (GEPI->getNumIndices() != 1 && - std::any_of(GEPI->user_begin(), GEPI->user_end(), [](const auto *User) { - return isa(User); - })) { - return createStringError(inconvertibleErrorCode(), - "Only one index expected in source of " - "promotable GEP instruction pointer argument"); - } - // Recurse to check all users of the GEP. - return canPromoteValue(GEPI, LocalSize); + // Recurse to check all users of the GEP. We are either already in + // `InAggregate` mode, or inspect the current instruction. Recall that a GEP's + // first index is used to step through the base pointer, whereas any + // additional indices represent addressing into an aggregrate type. + return canPromoteValue(GEPI, LocalSize, + InAggregate || GEPI->getNumIndices() >= 2); } -Error SYCLInternalizerImpl::canPromoteValue(Value *Val, - size_t LocalSize) const { +Error SYCLInternalizerImpl::canPromoteValue(Value *Val, size_t LocalSize, + bool InAggregate) const { for (auto *U : Val->users()) { auto *I = dyn_cast(U); if (!I) { @@ -272,13 +278,14 @@ Error SYCLInternalizerImpl::canPromoteValue(Value *Val, case Instruction::Call: case Instruction::Invoke: case Instruction::CallBr: - if (auto Err = canPromoteCall(cast(I), Val, LocalSize)) { + if (auto Err = + canPromoteCall(cast(I), Val, LocalSize, InAggregate)) { return Err; } break; case Instruction::GetElementPtr: - if (auto Err = - canPromoteGEP(cast(I), Val, LocalSize)) { + if (auto Err = canPromoteGEP(cast(I), Val, LocalSize, + InAggregate)) { return Err; } break; @@ -316,7 +323,7 @@ Error SYCLInternalizerImpl::checkArgsPromotable( PromoteArgSizes[Index] = 0; continue; } - if (auto Err = canPromoteValue(Arg, LocalSize)) { + if (auto Err = canPromoteValue(Arg, LocalSize, /*InAggregate=*/false)) { // Set the local size to 0 to indicate that this argument should not be // promoted. PromoteArgSizes[Index] = 0; @@ -360,30 +367,37 @@ void SYCLInternalizerImpl::promoteCall(CallBase *C, const Value *Val, } void SYCLInternalizerImpl::promoteGEPI(GetElementPtrInst *GEPI, - const Value *Val, - std::size_t LocalSize) const { + const Value *Val, std::size_t LocalSize, + bool InAggregate) const { // Not PointerType is unreachable. Other case is catched in caller. if (cast(GEPI->getType())->getAddressSpace() != AS) { - remapIndices(GEPI, LocalSize); + if (!InAggregate) + remapIndices(GEPI, LocalSize); auto *ValTy = cast(Val->getType()); GEPI->mutateType(PointerType::getWithSamePointeeType( cast(GEPI->getType()), ValTy->getAddressSpace())); - return promoteValue(GEPI, LocalSize); + // Recurse to promote to all users of the GEP. We are either already in + // `InAggregate` mode, or inspect the current instruction. Recall that a + // GEP's first index is used to step through the base pointer, whereas any + // additional indices represent addressing into an aggregrate type. + return promoteValue(GEPI, LocalSize, + InAggregate || GEPI->getNumIndices() >= 2); } } -void SYCLInternalizerImpl::promoteValue(Value *Val, - std::size_t LocalSize) const { +void SYCLInternalizerImpl::promoteValue(Value *Val, std::size_t LocalSize, + bool InAggregate) const { for (auto *U : Val->users()) { auto *I = cast(U); switch (I->getOpcode()) { case Instruction::Call: case Instruction::Invoke: case Instruction::CallBr: + assert(!InAggregate); promoteCall(cast(I), Val, LocalSize); break; case Instruction::GetElementPtr: - promoteGEPI(cast(I), Val, LocalSize); + promoteGEPI(cast(I), Val, LocalSize, InAggregate); break; case Instruction::Load: case Instruction::Store: @@ -535,7 +549,7 @@ Function *SYCLInternalizerImpl::promoteFunctionArgs( if (CreateAllocas) { Arg = replaceByNewAlloca(cast(Arg), AS, LocalSize); } - promoteValue(Arg, LocalSize); + promoteValue(Arg, LocalSize, /*InAggregate=*/false); } TargetInfo.updateAddressSpaceMetadata(NewF, PromoteToLocal, AS); diff --git a/sycl-fusion/test/internalization/abort-promote-call.ll b/sycl-fusion/test/internalization/abort-promote-call.ll new file mode 100644 index 0000000000000..bcd2b38143ac5 --- /dev/null +++ b/sycl-fusion/test/internalization/abort-promote-call.ll @@ -0,0 +1,36 @@ +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: -passes=sycl-internalization --sycl-info-path %S/abort-kernel-info.yaml -S %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: noinline +define void @fun(ptr addrspace(1) %arg) #0 { + ret void +} + +%struct = type { i32, i32, i32 } + +; CHECK-LABEL: define {{[^@]+}}@fused_0 +; CHECK-SAME: (ptr addrspace(1) align 4 %[[ACC:.*]]) +define spir_kernel void @fused_0(ptr addrspace(1) align 4 %acc) !kernel_arg_addr_space !12 !kernel_arg_access_qual !13 !kernel_arg_type !14 !kernel_arg_type_qual !15 !kernel_arg_base_type !14 !kernel_arg_name !16 !sycl.kernel.promote !17 !sycl.kernel.promote.localsize !18 { +; Scenario: Test private internalization is not performed when pointers into +; aggregate object are passed to function calls. + +; CHECK-NOT: alloca [1 x %struct] + %gep1 = getelementptr %struct, ptr addrspace(1) %acc, i64 17 + %gep2 = getelementptr %struct, ptr addrspace(1) %gep1, i64 0, i32 2 + call void @fun(ptr addrspace(1) %gep2) + store i32 42, ptr addrspace(1) %gep2 + ret void +} + +attributes #0 = { noinline } + +!12 = !{i32 1} +!13 = !{!"none"} +!14 = !{!"ptr"} +!15 = !{!""} +!16 = !{!"acc"} +!17 = !{!"private"} +!18 = !{i64 1} diff --git a/sycl-fusion/test/internalization/promote-local-nested.ll b/sycl-fusion/test/internalization/promote-local-nested.ll new file mode 100644 index 0000000000000..1293c72dc62f9 --- /dev/null +++ b/sycl-fusion/test/internalization/promote-local-nested.ll @@ -0,0 +1,192 @@ +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: -passes=sycl-internalization --sycl-info-path %S/../kernel-fusion/kernel-info.yaml -S %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::id" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [1 x i64] } +%struct.array_wrapper = type { %"struct.std::array" } +%"struct.std::array" = type { [2 x %"struct.std::array.0"] } +%"struct.std::array.0" = type { [2 x %"class.sycl::_V1::vec"] } +%"class.sycl::_V1::vec" = type { <2 x i32> } + +; Function Attrs: nounwind +declare spir_func void @__itt_offload_wi_start_wrapper() #0 + +; Function Attrs: nounwind +declare spir_func void @__itt_offload_wi_finish_wrapper() #0 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.assume(i1 noundef) #1 + +; Function Attrs: nounwind willreturn memory(none) +declare spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32) #2 + +define spir_kernel void @fused_0(ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, ptr byval(%"class.sycl::_V1::id") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 !kernel_arg_name !10 !sycl.kernel.promote !11 !sycl.kernel.promote.localsize !12 !sycl.kernel.constants !13 { +; Scenario: Test the successful local internalization of the pointer argument +; `...KernelOne__arg_accTmp`. This means the pointer argument has been replaced +; by a pointer to the local address space (address space 3), and offset-wrapping +; instructions have been introduced. This test is based on an IR dump of +; `sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp`. + +; CHECK-LABEL: define spir_kernel void @fused_0 +; CHECK-SAME: (ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN1:%.*]], ptr byval(%"class.sycl::_V1::id") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN13:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN2:%.*]], ptr byval(%"class.sycl::_V1::id") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN26:%.*]], ptr addrspace(3) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP:%.*]], ptr byval(%"class.sycl::_V1::id") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP9:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCIN3:%.*]], ptr byval(%"class.sycl::_V1::id") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCIN36:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCOUT:%.*]], ptr byval(%"class.sycl::_V1::id") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCOUT9:%.*]]) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 !kernel_arg_name !10 !sycl.kernel.constants !11 { +; CHECK-NEXT: entry: +; CHECK: [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP93_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP9]], align 1 +; CHECK: [[TMP0:%.*]] = urem i64 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP93_SROA_0_0_COPYLOAD]], 4 +; CHECK: [[ADD_PTR_I43_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP]], i64 [[TMP0]] +; CHECK: [[TMP1:%.*]] = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #[[ATTR2:[0-9]+]] +; CHECK: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]] +; CHECK: [[TMP3:%.*]] = urem i64 [[TMP2]], 4 +; CHECK: [[ARRAYIDX_I34_I_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[ADD_PTR_I43_I]], i64 [[TMP3]] +; CHECK: store <2 x i32> {{%.*}}, ptr addrspace(3) [[ARRAYIDX_I34_I_I]], align 8 +; CHECK: [[TMP8:%.*]] = add i64 0, [[TMP3]] +; CHECK: [[TMP9:%.*]] = add i64 [[TMP8]], [[TMP0]] +; CHECK: [[TMP10:%.*]] = urem i64 [[TMP9]], 4 +; CHECK: [[ARRAYIDX_I_I39_I_I_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I34_I_I]], i64 [[TMP10]], i64 1 +; CHECK: store <2 x i32> {{%.*}}, ptr addrspace(3) [[ARRAYIDX_I_I39_I_I_1]], align 8 +; CHECK: [[TMP11:%.*]] = add i64 0, [[TMP3]] +; CHECK: [[TMP12:%.*]] = add i64 [[TMP11]], [[TMP0]] +; CHECK: [[TMP13:%.*]] = urem i64 [[TMP12]], 4 +; CHECK: [[ARRAYIDX_I_I_I37_I_I_1:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(3) [[ARRAYIDX_I34_I_I]], i64 [[TMP13]], i64 1 +; CHECK: store <2 x i32> {{%.*}}, ptr addrspace(3) [[ARRAYIDX_I_I_I37_I_I_1]], align 8 +; CHECK: [[ARRAYIDX_I_I39_I_I_1_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I_I_I37_I_I_1]], i64 0, i64 1 +; CHECK: store <2 x i32> {{%.*}}, ptr addrspace(3) [[ARRAYIDX_I_I39_I_I_1_1]], align 8 +; CHECK: [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP94_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP9]], align 1 +; CHECK: [[TMP18:%.*]] = urem i64 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP94_SROA_0_0_COPYLOAD]], 4 +; CHECK: [[ADD_PTR_I_I7:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP]], i64 [[TMP18]] +; CHECK: [[TMP19:%.*]] = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #[[ATTR2]] +; CHECK: [[TMP20:%.*]] = add i64 [[TMP19]], [[TMP18]] +; CHECK: [[TMP21:%.*]] = urem i64 [[TMP20]], 4 +; CHECK: [[ARRAYIDX_I_I_I11:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr addrspace(3) [[ADD_PTR_I_I7]], i64 [[TMP21]] +; CHECK: [[TMP22:%.*]] = load <2 x i32>, ptr addrspace(3) [[ARRAYIDX_I_I_I11]], align 8 +; CHECK: [[TMP24:%.*]] = add i64 0, [[TMP21]] +; CHECK: [[TMP25:%.*]] = add i64 [[TMP24]], [[TMP18]] +; CHECK: [[TMP26:%.*]] = urem i64 [[TMP25]], 4 +; CHECK: [[ARRAYIDX_I_I_I_I27_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I_I_I11]], i64 [[TMP26]], i64 1 +; CHECK: [[TMP27:%.*]] = load <2 x i32>, ptr addrspace(3) [[ARRAYIDX_I_I_I_I27_1]], align 8 +; CHECK: [[TMP29:%.*]] = add i64 0, [[TMP21]] +; CHECK: [[TMP30:%.*]] = add i64 [[TMP29]], [[TMP18]] +; CHECK: [[TMP31:%.*]] = urem i64 [[TMP30]], 4 +; CHECK: [[ARRAYIDX_I_I_I_I_I18_1:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(3) [[ARRAYIDX_I_I_I11]], i64 [[TMP31]], i64 1 +; CHECK: [[TMP32:%.*]] = load <2 x i32>, ptr addrspace(3) [[ARRAYIDX_I_I_I_I_I18_1]], align 8 +; CHECK: [[ARRAYIDX_I_I_I_I27_1_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(3) [[ARRAYIDX_I_I_I_I_I18_1]], i64 0, i64 1 +; CHECK: [[TMP34:%.*]] = load <2 x i32>, ptr addrspace(3) [[ARRAYIDX_I_I_I_I27_1_1]], align 8 +; CHECK: ret void +; +entry: + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp93.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn262.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn131.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13, align 1 + %add.ptr.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn131.sroa.0.0.copyload + %add.ptr.i34.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn262.sroa.0.0.copyload + %add.ptr.i43.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp93.sroa.0.0.copyload + %0 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #2 + %cmp.i.i.i = icmp ult i64 %0, 2147483648 + call void @llvm.assume(i1 %cmp.i.i.i) + %arrayidx.i.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i, i64 %0 + %arrayidx.i30.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i34.i, i64 %0 + %arrayidx.i34.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i43.i, i64 %0 + %1 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i, align 8 + %2 = load <2 x i32>, ptr addrspace(1) %arrayidx.i30.i.i, align 8 + %add.i.i.i = add <2 x i32> %1, %2 + store <2 x i32> %add.i.i.i, ptr addrspace(1) %arrayidx.i34.i.i, align 8 + %arrayidx.i.i.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i, i64 0, i64 1 + %arrayidx.i.i38.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i30.i.i, i64 0, i64 1 + %3 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.1, align 8 + %4 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i.1, align 8 + %add.i.i.i.1 = add <2 x i32> %3, %4 + %arrayidx.i.i39.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i34.i.i, i64 0, i64 1 + store <2 x i32> %add.i.i.i.1, ptr addrspace(1) %arrayidx.i.i39.i.i.1, align 8 + %arrayidx.i.i.i.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i, i64 0, i64 1 + %arrayidx.i.i.i36.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i, i64 0, i64 1 + %arrayidx.i.i.i37.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i, i64 0, i64 1 + %5 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i.1, align 8 + %6 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i.1, align 8 + %add.i.i.i.131 = add <2 x i32> %5, %6 + store <2 x i32> %add.i.i.i.131, ptr addrspace(1) %arrayidx.i.i.i37.i.i.1, align 8 + %arrayidx.i.i.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i.1, i64 0, i64 1 + %arrayidx.i.i38.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i.1, i64 0, i64 1 + %7 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.1.1, align 8 + %8 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i.1.1, align 8 + %add.i.i.i.1.1 = add <2 x i32> %7, %8 + %arrayidx.i.i39.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i.1, i64 0, i64 1 + store <2 x i32> %add.i.i.i.1.1, ptr addrspace(1) %arrayidx.i.i39.i.i.1.1, align 8 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut96.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn365.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp94.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, align 1 + %add.ptr.i.i7 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp94.sroa.0.0.copyload + %add.ptr.i34.i8 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn365.sroa.0.0.copyload + %add.ptr.i43.i9 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut96.sroa.0.0.copyload + %9 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #2 + %cmp.i.i.i10 = icmp ult i64 %9, 2147483648 + call void @llvm.assume(i1 %cmp.i.i.i10) + %arrayidx.i.i.i11 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i7, i64 %9 + %arrayidx.i30.i.i12 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i34.i8, i64 %9 + %arrayidx.i34.i.i13 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i43.i9, i64 %9 + %10 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i11, align 8 + %11 = load <2 x i32>, ptr addrspace(1) %arrayidx.i30.i.i12, align 8 + %mul.i.i.i = mul <2 x i32> %10, %11 + store <2 x i32> %mul.i.i.i, ptr addrspace(1) %arrayidx.i34.i.i13, align 8 + %arrayidx.i.i.i.i27.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i11, i64 0, i64 1 + %arrayidx.i.i38.i.i28.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i30.i.i12, i64 0, i64 1 + %12 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i27.1, align 8 + %13 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i28.1, align 8 + %mul.i.i.i.1 = mul <2 x i32> %12, %13 + %arrayidx.i.i39.i.i29.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i34.i.i13, i64 0, i64 1 + store <2 x i32> %mul.i.i.i.1, ptr addrspace(1) %arrayidx.i.i39.i.i29.1, align 8 + %arrayidx.i.i.i.i.i18.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i11, i64 0, i64 1 + %arrayidx.i.i.i36.i.i19.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i12, i64 0, i64 1 + %arrayidx.i.i.i37.i.i20.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i13, i64 0, i64 1 + %14 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i18.1, align 8 + %15 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i19.1, align 8 + %mul.i.i.i.135 = mul <2 x i32> %14, %15 + store <2 x i32> %mul.i.i.i.135, ptr addrspace(1) %arrayidx.i.i.i37.i.i20.1, align 8 + %arrayidx.i.i.i.i27.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i18.1, i64 0, i64 1 + %arrayidx.i.i38.i.i28.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i19.1, i64 0, i64 1 + %16 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i27.1.1, align 8 + %17 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i28.1.1, align 8 + %mul.i.i.i.1.1 = mul <2 x i32> %16, %17 + %arrayidx.i.i39.i.i29.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i20.1, i64 0, i64 1 + store <2 x i32> %mul.i.i.i.1.1, ptr addrspace(1) %arrayidx.i.i39.i.i29.1.1, align 8 + ret void +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #4 + +attributes #0 = { nounwind } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #2 = { nounwind willreturn memory(none) } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } + +!spirv.MemoryModel = !{!0} +!spirv.Source = !{!1} +!opencl.spir.version = !{!2} +!opencl.ocl.version = !{!3} +!opencl.used.extensions = !{!4} +!opencl.used.optional.core.features = !{!4} +!spirv.Generator = !{!5} + +!0 = !{i32 2, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 1, i32 2} +!3 = !{i32 1, i32 0} +!4 = !{} +!5 = !{i16 6, i16 14} +!6 = !{i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0} +!7 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none"} +!8 = !{!"struct array_wrapper*", !"class.sycl::_V1::id", !"struct array_wrapper*", !"class.sycl::_V1::id", !"struct array_wrapper*", !"class.sycl::_V1::id", !"struct array_wrapper*", !"class.sycl::_V1::id", !"struct array_wrapper*", !"class.sycl::_V1::id"} +!9 = !{!"", !"", !"", !"", !"", !"", !"", !"", !"", !""} +!10 = !{!"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9"} +!11 = !{!"none", !"none", !"none", !"none", !"local", !"none", !"none", !"none", !"none", !"none"} +!12 = !{!"", !"", !"", !"", i64 4, !"", !"", !"", !"", !""} +!13 = !{!"", !"\00\00\00\00\00\00\00\00", !"", !"\00\00\00\00\00\00\00\00", !"", !"", !"", !"\00\00\00\00\00\00\00\00", !"", !"\00\00\00\00\00\00\00\00"} diff --git a/sycl-fusion/test/internalization/promote-private-nested.ll b/sycl-fusion/test/internalization/promote-private-nested.ll new file mode 100644 index 0000000000000..81a20490e0f2b --- /dev/null +++ b/sycl-fusion/test/internalization/promote-private-nested.ll @@ -0,0 +1,171 @@ +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: -passes=sycl-internalization --sycl-info-path %S/../kernel-fusion/kernel-info.yaml -S %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [1 x i64] } +%struct.array_wrapper = type { %"struct.std::array" } +%"struct.std::array" = type { [2 x %"struct.std::array.0"] } +%"struct.std::array.0" = type { [2 x %"class.sycl::_V1::vec"] } +%"class.sycl::_V1::vec" = type { <2 x i32> } + +; Function Attrs: nounwind +declare spir_func void @__itt_offload_wi_start_wrapper() #0 + +; Function Attrs: nounwind +declare spir_func void @__itt_offload_wi_finish_wrapper() #0 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.assume(i1 noundef) #1 + +; Function Attrs: nounwind willreturn memory(none) +declare spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32) #2 + +define spir_kernel void @fused_0(ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36, ptr addrspace(1) align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, ptr byval(%"class.sycl::_V1::range") align 8 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 !kernel_arg_name !10 !sycl.kernel.promote !11 !sycl.kernel.promote.localsize !12 !sycl.kernel.constants !13 { +; Scenario: Test the successful private internalization of the pointer argument +; `...KernelOne__arg_accTmp`. This means the pointer argument has been replaced +; by a function-local alloca and all accesses have been updated to use this +; alloca (and the default address space) instead. This test is based on an IR +; dump of `sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp`. + +; CHECK-LABEL: define spir_kernel void @fused_0 +; CHECK-SAME: (ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN1:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN13:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN2:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCIN26:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_E9KERNELONE__ARG_ACCTMP9:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCIN3:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCIN36:%.*]], ptr addrspace(1) align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCOUT:%.*]], ptr byval(%"class.sycl::_V1::range") align 8 [[_ZTSZZ4MAINENKULRN4SYCL3_V17HANDLEREE0_CLES2_E9KERNELTWO__ARG_ACCOUT9:%.*]]) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_type_qual !9 !kernel_arg_base_type !8 !kernel_arg_name !10 !sycl.kernel.constants !11 { +; CHECK-NEXT: entry: +; CHECK: [[TMP0:%.*]] = alloca [1 x %struct.array_wrapper], align 8 +; CHECK: [[TMP1:%.*]] = getelementptr inbounds [1 x %struct.array_wrapper], ptr [[TMP0]], i64 0, i64 0 +; CHECK: [[ADD_PTR_I43_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[TMP1]], i64 0 +; CHECK: [[ARRAYIDX_I34_I_I:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[ADD_PTR_I43_I]], i64 0 +; CHECK: store <2 x i32> {{.*}}, ptr [[ARRAYIDX_I34_I_I]], align 8 +; CHECK: [[ARRAYIDX_I_I39_I_I_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr [[ARRAYIDX_I34_I_I]], i64 0, i64 1 +; CHECK: store <2 x i32> {{.*}}, ptr [[ARRAYIDX_I_I39_I_I_1]], align 8 +; CHECK: [[ARRAYIDX_I_I_I37_I_I_1:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr [[ARRAYIDX_I34_I_I]], i64 0, i64 1 +; CHECK: store <2 x i32> {{.*}}, ptr [[ARRAYIDX_I_I_I37_I_I_1]], align 8 +; CHECK: [[ARRAYIDX_I_I39_I_I_1_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr [[ARRAYIDX_I_I_I37_I_I_1]], i64 0, i64 1 +; CHECK: store <2 x i32> {{.*}}, ptr [[ARRAYIDX_I_I39_I_I_1_1]], align 8 +; CHECK: [[ADD_PTR_I_I7:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[TMP1]], i64 0 +; CHECK: [[ARRAYIDX_I_I_I11:%.*]] = getelementptr inbounds %struct.array_wrapper, ptr [[ADD_PTR_I_I7]], i64 0 +; CHECK: [[TMP12:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_I_I_I11]], align 8 +; CHECK: [[ARRAYIDX_I_I_I_I27_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr [[ARRAYIDX_I_I_I11]], i64 0, i64 1 +; CHECK: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_I_I_I_I27_1]], align 8 +; CHECK: [[ARRAYIDX_I_I_I_I_I18_1:%.*]] = getelementptr inbounds [2 x %"struct.std::array.0"], ptr [[ARRAYIDX_I_I_I11]], i64 0, i64 1 +; CHECK: [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_I_I_I_I_I18_1]], align 8 +; CHECK: [[ARRAYIDX_I_I_I_I27_1_1:%.*]] = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr [[ARRAYIDX_I_I_I_I_I18_1]], i64 0, i64 1 +; CHECK: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_I_I_I_I27_1_1]], align 8 +; CHECK: ret void +; +entry: + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp93.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn262.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn131.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13, align 1 + %add.ptr.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn131.sroa.0.0.copyload + %add.ptr.i34.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn262.sroa.0.0.copyload + %add.ptr.i43.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp93.sroa.0.0.copyload + %0 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #2 + %cmp.i.i.i = icmp ult i64 %0, 2147483648 + call void @llvm.assume(i1 %cmp.i.i.i) + %arrayidx.i.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i, i64 %0 + %arrayidx.i30.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i34.i, i64 %0 + %arrayidx.i34.i.i = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i43.i, i64 %0 + %1 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i, align 8 + %2 = load <2 x i32>, ptr addrspace(1) %arrayidx.i30.i.i, align 8 + %add.i.i.i = add <2 x i32> %1, %2 + store <2 x i32> %add.i.i.i, ptr addrspace(1) %arrayidx.i34.i.i, align 8 + %arrayidx.i.i.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i, i64 0, i64 1 + %arrayidx.i.i38.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i30.i.i, i64 0, i64 1 + %3 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.1, align 8 + %4 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i.1, align 8 + %add.i.i.i.1 = add <2 x i32> %3, %4 + %arrayidx.i.i39.i.i.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i34.i.i, i64 0, i64 1 + store <2 x i32> %add.i.i.i.1, ptr addrspace(1) %arrayidx.i.i39.i.i.1, align 8 + %arrayidx.i.i.i.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i, i64 0, i64 1 + %arrayidx.i.i.i36.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i, i64 0, i64 1 + %arrayidx.i.i.i37.i.i.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i, i64 0, i64 1 + %5 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i.1, align 8 + %6 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i.1, align 8 + %add.i.i.i.131 = add <2 x i32> %5, %6 + store <2 x i32> %add.i.i.i.131, ptr addrspace(1) %arrayidx.i.i.i37.i.i.1, align 8 + %arrayidx.i.i.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i.1, i64 0, i64 1 + %arrayidx.i.i38.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i.1, i64 0, i64 1 + %7 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.1.1, align 8 + %8 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i.1.1, align 8 + %add.i.i.i.1.1 = add <2 x i32> %7, %8 + %arrayidx.i.i39.i.i.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i.1, i64 0, i64 1 + store <2 x i32> %add.i.i.i.1.1, ptr addrspace(1) %arrayidx.i.i39.i.i.1.1, align 8 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut96.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn365.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36, align 1 + %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp94.sroa.0.0.copyload = load i64, ptr %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9, align 1 + %add.ptr.i.i7 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp94.sroa.0.0.copyload + %add.ptr.i34.i8 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn365.sroa.0.0.copyload + %add.ptr.i43.i9 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut, i64 %_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut96.sroa.0.0.copyload + %9 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #2 + %cmp.i.i.i10 = icmp ult i64 %9, 2147483648 + call void @llvm.assume(i1 %cmp.i.i.i10) + %arrayidx.i.i.i11 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i.i7, i64 %9 + %arrayidx.i30.i.i12 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i34.i8, i64 %9 + %arrayidx.i34.i.i13 = getelementptr inbounds %struct.array_wrapper, ptr addrspace(1) %add.ptr.i43.i9, i64 %9 + %10 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i11, align 8 + %11 = load <2 x i32>, ptr addrspace(1) %arrayidx.i30.i.i12, align 8 + %mul.i.i.i = mul <2 x i32> %10, %11 + store <2 x i32> %mul.i.i.i, ptr addrspace(1) %arrayidx.i34.i.i13, align 8 + %arrayidx.i.i.i.i27.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i11, i64 0, i64 1 + %arrayidx.i.i38.i.i28.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i30.i.i12, i64 0, i64 1 + %12 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i27.1, align 8 + %13 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i28.1, align 8 + %mul.i.i.i.1 = mul <2 x i32> %12, %13 + %arrayidx.i.i39.i.i29.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i34.i.i13, i64 0, i64 1 + store <2 x i32> %mul.i.i.i.1, ptr addrspace(1) %arrayidx.i.i39.i.i29.1, align 8 + %arrayidx.i.i.i.i.i18.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i.i.i11, i64 0, i64 1 + %arrayidx.i.i.i36.i.i19.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i30.i.i12, i64 0, i64 1 + %arrayidx.i.i.i37.i.i20.1 = getelementptr inbounds [2 x %"struct.std::array.0"], ptr addrspace(1) %arrayidx.i34.i.i13, i64 0, i64 1 + %14 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i.i18.1, align 8 + %15 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i36.i.i19.1, align 8 + %mul.i.i.i.135 = mul <2 x i32> %14, %15 + store <2 x i32> %mul.i.i.i.135, ptr addrspace(1) %arrayidx.i.i.i37.i.i20.1, align 8 + %arrayidx.i.i.i.i27.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i.i.i18.1, i64 0, i64 1 + %arrayidx.i.i38.i.i28.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i36.i.i19.1, i64 0, i64 1 + %16 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i.i.i27.1.1, align 8 + %17 = load <2 x i32>, ptr addrspace(1) %arrayidx.i.i38.i.i28.1.1, align 8 + %mul.i.i.i.1.1 = mul <2 x i32> %16, %17 + %arrayidx.i.i39.i.i29.1.1 = getelementptr inbounds [2 x %"class.sycl::_V1::vec"], ptr addrspace(1) %arrayidx.i.i.i37.i.i20.1, i64 0, i64 1 + store <2 x i32> %mul.i.i.i.1.1, ptr addrspace(1) %arrayidx.i.i39.i.i29.1.1, align 8 + ret void +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #4 + +attributes #0 = { nounwind } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #2 = { nounwind willreturn memory(none) } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } + +!spirv.MemoryModel = !{!0} +!spirv.Source = !{!1} +!opencl.spir.version = !{!2} +!opencl.ocl.version = !{!3} +!opencl.used.extensions = !{!4} +!opencl.used.optional.core.features = !{!4} +!spirv.Generator = !{!5} + +!0 = !{i32 2, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 1, i32 2} +!3 = !{i32 1, i32 0} +!4 = !{} +!5 = !{i16 6, i16 14} +!6 = !{i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0} +!7 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none"} +!8 = !{!"struct array_wrapper*", !"class.sycl::_V1::range", !"struct array_wrapper*", !"class.sycl::_V1::range", !"struct array_wrapper*", !"class.sycl::_V1::range", !"struct array_wrapper*", !"class.sycl::_V1::range", !"struct array_wrapper*", !"class.sycl::_V1::range"} +!9 = !{!"", !"", !"", !"", !"", !"", !"", !"", !"", !""} +!10 = !{!"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn1", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn13", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn2", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accIn26", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E9KernelOne__arg_accTmp9", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn3", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accIn36", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut", !"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E9KernelTwo__arg_accOut9"} +!11 = !{!"none", !"none", !"none", !"none", !"private", !"none", !"none", !"none", !"none", !"none"} +!12 = !{!"", !"", !"", !"", i64 1, !"", !"", !"", !"", !""} +!13 = !{!"", !"\00\00\00\00\00\00\00\00", !"", !"\00\00\00\00\00\00\00\00", !"", !"", !"", !"\00\00\00\00\00\00\00\00", !"", !"\00\00\00\00\00\00\00\00"} diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp index b968b48af9497..4d2f9c4b72b94 100644 --- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp @@ -2,9 +2,6 @@ // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out // RUN: %{run} %t.out -// FIXME: enable opaque pointers support -// REQUIRES: TEMPORARY_DISABLED - // Test internalization of a nested array type. #include @@ -87,7 +84,7 @@ int main() { const auto &accIn1Wrapp = accIn1[id]; const auto &accIn2Wrapp = accIn2[id]; auto &accTmpWrapp = accTmp[id]; - for (size_t i = 0; i < dataSize; ++i) { + for (size_t i = 0; i < rows; ++i) { const auto &in1 = accIn1Wrapp[i]; const auto &in2 = accIn2Wrapp[i]; auto &tmp = accTmpWrapp[i]; @@ -107,7 +104,7 @@ int main() { const auto &tmpWrapp = accTmp[id]; const auto &accIn3Wrapp = accIn3[id]; auto &accOutWrapp = accOut[id]; - for (size_t i = 0; i < dataSize; ++i) { + for (size_t i = 0; i < rows; ++i) { const auto &tmp = tmpWrapp[i]; const auto &in3 = accIn3Wrapp[i]; auto &out = accOutWrapp[i]; diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp new file mode 100644 index 0000000000000..daa3909b705e8 --- /dev/null +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp @@ -0,0 +1,140 @@ +// REQUIRES: fusion +// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out +// RUN: %{run} %t.out +// UNSUPPORTED: cuda + +// Test local internalization of a nested array type. + +#include + +#include + +using namespace sycl; + +template struct array_wrapper { + static constexpr size_t rows{N}; + static constexpr size_t columns{M}; + static constexpr size_t vec_width{2}; + + using value_type = vec; + using reference_type = value_type &; + using const_reference_type = const value_type &; + + std::array, rows> vs; + + explicit array_wrapper(const_reference_type v) { + std::array el; + el.fill(v); + vs.fill(el); + } + + array_wrapper() : array_wrapper{value_type{}} {} + + constexpr std::array &operator[](size_t i) { + return vs[i]; + } + + constexpr const std::array &operator[](size_t i) const { + return vs[i]; + } +}; + +int main() { + constexpr size_t dataSize = 16; + constexpr size_t rows = 2; + constexpr size_t columns = 2; + + using array_type = array_wrapper; + + array_type in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], + out[dataSize]; + + for (size_t id = 0; id < dataSize; ++id) { + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < columns; ++j) { + in1[id][i][j].s0() = in1[id][i][j].s1() = id * 2; + in2[id][i][j].s0() = in2[id][i][j].s1() = id * 3; + in3[id][i][j].s0() = in3[id][i][j].s1() = id * 4; + tmp[id][i][j].s0() = tmp[id][i][j].s1() = -1; + out[id][i][j].s0() = out[id][i][j].s1() = -1; + } + } + } + + queue q{default_selector_v, + {ext::codeplay::experimental::property::queue::enable_fusion{}}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + cgh.parallel_for( + nd_range<1>{{dataSize}, {4}}, [=](id<1> id) { + const auto &accIn1Wrapp = accIn1[id]; + const auto &accIn2Wrapp = accIn2[id]; + auto &accTmpWrapp = accTmp[id]; + for (size_t i = 0; i < rows; ++i) { + const auto &in1 = accIn1Wrapp[i]; + const auto &in2 = accIn2Wrapp[i]; + auto &tmp = accTmpWrapp[i]; + for (size_t j = 0; j < columns; ++j) { + tmp[j] = in1[j] + in2[j]; + } + } + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{dataSize}, {4}}, [=](id<1> id) { + const auto &tmpWrapp = accTmp[id]; + const auto &accIn3Wrapp = accIn3[id]; + auto &accOutWrapp = accOut[id]; + for (size_t i = 0; i < rows; ++i) { + const auto &tmp = tmpWrapp[i]; + const auto &in3 = accIn3Wrapp[i]; + auto &out = accOutWrapp[i]; + for (size_t j = 0; j < columns; ++j) { + out[j] = tmp[j] * in3[j]; + } + } + }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + constexpr array_type::value_type not_written{-1, -1}; + for (size_t id = 0; id < dataSize; ++id) { + const array_type::value_type expected{20 * id * id, 20 * id * id}; + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < columns; ++j) { + assert(all(out[id][i][j] == expected) && "Computation error"); + assert(all(tmp[id][i][j] == not_written) && "Not internalizing"); + } + } + } + + return 0; +}