Skip to content

[SYCL][SYCLLowerWGLocalMemoryPass] Remove implicit dependency on AlwaysInlinerPass and move to PipelineStart #16356

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions clang/lib/CodeGen/BackendUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1042,6 +1042,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
/*FP64ConvEmu=*/CodeGenOpts.FP64ConvEmu,
/*ExcludeAspects=*/{"fp64"}));
MPM.addPass(SYCLPropagateJointMatrixUsagePass());
// Lowers static/dynamic local memory builtin calls.
MPM.addPass(SYCLLowerWGLocalMemoryPass());
});
else if (LangOpts.SYCLIsHost && !LangOpts.SYCLESIMDBuildHostCode)
PB.registerPipelineStartEPCallback(
Expand Down Expand Up @@ -1191,10 +1193,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
MPM.addPass(SPIRITTAnnotationsPass());
}

// Allocate static local memory in SYCL kernel scope for each allocation
// call.
MPM.addPass(SYCLLowerWGLocalMemoryPass());

// Process properties and annotations
MPM.addPass(CompileTimePropertiesPass());

Expand Down
29 changes: 0 additions & 29 deletions clang/test/CodeGenSYCL/group-local-memory.cpp

This file was deleted.

31 changes: 20 additions & 11 deletions clang/test/CodeGenSYCL/kernel-early-optimization-pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,24 @@
// SYCL device target, and can be disabled with -fno-sycl-early-optimizations.
// New pass manager doesn't print all passes tree, only module level.
//
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NEWPM-EARLYOPT
// CHECK-NEWPM-EARLYOPT: ConstantMergePass
// CHECK-NEWPM-EARLYOPT: SYCLMutatePrintfAddrspacePass
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s
// CHECK: SYCLVirtualFunctionsAnalysisPass
// CHECK: ESIMDVerifierPass
// CHECK: SYCLConditionalCallOnDevicePass
// CHECK: SYCLPropagateAspectsUsagePass
// CHECK: SYCLPropagateJointMatrixUsagePass
// CHECK: SYCLLowerWGLocalMemoryPass
// CHECK: InferFunctionAttrsPass
// CHECK: AlwaysInlinerPass
// CHECK: ModuleInlinerWrapperPass
// CHECK: ConstantMergePass
// CHECK: SYCLMutatePrintfAddrspacePass
// CHECK: SYCLPropagateAspectsUsagePass
// CHECK: SYCLAddOptLevelAttributePass
// CHECK: CompileTimePropertiesPass
// CHECK: RecordSYCLAspectNamesPass
// CHECK: CleanupSYCLMetadataPass
//
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -fno-sycl-early-optimizations -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NEWPM-NOEARLYOPT
// CHECK-NEWPM-NOEARLYOPT-NOT: ConstantMergePass
// CHECK-NEWPM-NOEARLYOPT: SYCLMutatePrintfAddrspacePass

// Checks that the compile time properties pass is added into the compilation pipeline
//
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-COMPTIMEPROPS
// CHECK-COMPTIMEPROPS: Running pass: CompileTimePropertiesPass on [module]
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -fno-sycl-early-optimizations -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NOEARLYOPT
// CHECK-NOEARLYOPT-NOT: ConstantMergePass1
// CHECK-NOEARLYOPT: SYCLMutatePrintfAddrspacePass
43 changes: 37 additions & 6 deletions llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,17 @@
//===----------------------------------------------------------------------===//

#include "llvm/SYCLLowerIR/LowerWGLocalMemory.h"
#include "llvm/Demangle/Demangle.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Pass.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/Cloning.h"

using namespace llvm;

#define DEBUG_TYPE "LowerWGLocalMemory"
#define DEBUG_TYPE "sycllowerwglocalmemory"

static constexpr char SYCL_ALLOCLOCALMEM_CALL[] = "__sycl_allocateLocalMemory";
static constexpr char SYCL_DYNAMIC_LOCALMEM_CALL[] =
Expand Down Expand Up @@ -84,6 +86,36 @@ ModulePass *llvm::createSYCLLowerWGLocalMemoryLegacyPass() {
return new SYCLLowerWGLocalMemoryLegacy();
}

// In sycl header __sycl_allocateLocalMemory builtin call is wrapped in
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can we not rewrite the SYCL headers to 'inline' these calls? Is there a specific reason? Thanks

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can we not rewrite the SYCL headers to 'inline' these calls? Is there a specific reason? Thanks

We can't ask users to call __sycl_allocateLocalMemory internal intrsinsic when documented interface is sycl::ext::something::something::group_local_memory<T>

// group_local_memory/group_local_memory_for_overwrite functions, which must be
// inlined first before each __sycl_allocateLocalMemory call can be lowered to a
// unique global variable. Inlining them here so that this pass doesn't have
// implicit dependency on AlwaysInlinerPass.
static bool inlineGroupLocalMemoryFunc(Module &M) {
Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL);
if (!ALMFunc || ALMFunc->use_empty())
return false;

bool Changed = false;
for (auto *U : ALMFunc->users()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need to use a work list here rather than the simple loop.

This function https://github.com/intel/llvm/blob/sycl/sycl/include/syclcompat/memory.hpp#L71 needs to be updated as well, and this function won't be able to handle the nesting. The CI is currently green because there is no test requesting 2 distinct local memory objects using this function in the same kernel.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done, thank you for the suggestion. Now I understand what you mean by syclcompat::local_mem.
Also added a new test sycl/test/check_device_code/syclcompat_local_mem.cpp that has two calls to syclcompat::local_mem in a kernel.

auto *Caller = cast<CallInst>(U)->getFunction();
if (!Caller->hasFnAttribute("sycl_forceinline")) {
// Already inlined.
continue;
}
for (auto *U2 : make_early_inc_range(Caller->users())) {
auto *CI = cast<CallInst>(U2);
InlineFunctionInfo IFI;
[[maybe_unused]] auto Result = InlineFunction(*CI, IFI);
assert(Result.isSuccess() && "inlining failed");
}
Caller->eraseFromParent();
Changed = true;
}

return Changed;
}

// TODO: It should be checked that __sycl_allocateLocalMemory (or its source
// form - group_local_memory) does not occur:
// - in a function (other than user lambda/functor)
Expand Down Expand Up @@ -317,9 +349,8 @@ static bool dynamicWGLocalMemory(Module &M) {

PreservedAnalyses SYCLLowerWGLocalMemoryPass::run(Module &M,
ModuleAnalysisManager &) {
bool MadeChanges = allocaWGLocalMemory(M);
MadeChanges = dynamicWGLocalMemory(M) || MadeChanges;
if (MadeChanges)
return PreservedAnalyses::none();
return PreservedAnalyses::all();
bool Changed = inlineGroupLocalMemoryFunc(M);
Changed |= allocaWGLocalMemory(M);
Changed |= dynamicWGLocalMemory(M);
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
66 changes: 66 additions & 0 deletions llvm/test/SYCLLowerIR/group_local_memory_inline.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
; RUN: opt < %s -passes=sycllowerwglocalmemory -S | FileCheck %s

; Check group_local_memory_for_overwrite and group_local_memory functions are inlined.
; Check __sycl_allocateLocalMemory calls are lowered to four separate allocations.

target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
target triple = "spir64-unknown-unknown"

%"class.sycl::_V1::multi_ptr" = type { ptr addrspace(3) }
%"class.sycl::_V1::group" = type { %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::id" }
%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" }
%"class.sycl::_V1::detail::array" = type { [1 x i64] }
%"class.sycl::_V1::id" = type { %"class.sycl::_V1::detail::array" }

; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4

; Function Attrs: alwaysinline
define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_() #0 {
entry:
; CHECK: define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_(
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8

%Ptr = alloca %"class.sycl::_V1::multi_ptr", align 8
%agg = alloca %"class.sycl::_V1::group", align 8
%Ptr.ascast = addrspacecast ptr %Ptr to ptr addrspace(4)
call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
ret void
}

; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(

; Function Attrs: alwaysinline
define spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 {
entry:
%AllocatedMem = alloca ptr addrspace(3), align 8
%AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4)
%call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4)
store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8
ret void
}

; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(

; Function Attrs: alwaysinline
define spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 {
entry:
%AllocatedMem = alloca ptr addrspace(3), align 8
%AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4)
%call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4)
store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8
ret void
}

declare spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 noundef, i64 noundef)

attributes #0 = { alwaysinline }
attributes #1 = { "sycl_forceinline"="true" }
10 changes: 8 additions & 2 deletions sycl/include/sycl/ext/oneapi/group_local_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,13 @@ namespace sycl {
inline namespace _V1 {
namespace ext::oneapi {
template <typename T, typename Group>
#ifdef __SYCL_DEVICE_ONLY__
[[__sycl_detail__::add_ir_attributes_function("sycl_forceinline", true)]]
#endif
std::enable_if_t<
std::is_trivially_destructible_v<T> && sycl::detail::is_group<Group>::value,
multi_ptr<T, access::address_space::local_space, access::decorated::legacy>>
__SYCL_ALWAYS_INLINE group_local_memory_for_overwrite(Group g) {
group_local_memory_for_overwrite(Group g) {
(void)g;
#ifdef __SYCL_DEVICE_ONLY__
__attribute__((opencl_local)) std::uint8_t *AllocatedMem =
Expand All @@ -44,10 +47,13 @@ std::enable_if_t<
}

template <typename T, typename Group, typename... Args>
#ifdef __SYCL_DEVICE_ONLY__
[[__sycl_detail__::add_ir_attributes_function("sycl_forceinline", true)]]
#endif
std::enable_if_t<
std::is_trivially_destructible_v<T> && sycl::detail::is_group<Group>::value,
multi_ptr<T, access::address_space::local_space, access::decorated::legacy>>
__SYCL_ALWAYS_INLINE group_local_memory(Group g, Args &&...args) {
group_local_memory(Group g, Args &&...args) {
#ifdef __SYCL_DEVICE_ONLY__
__attribute__((opencl_local)) std::uint8_t *AllocatedMem =
__sycl_allocateLocalMemory(sizeof(T), alignof(T));
Expand Down
38 changes: 38 additions & 0 deletions sycl/test/check_device_code/extensions/group_local_memory.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -fno-sycl-early-optimizations -o - | FileCheck %s
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -O0 -o - | FileCheck %s

// The test checks that multiple calls to the same template instantiation of a
// group local memory function result in separate allocations.

#include <sycl/detail/core.hpp>
#include <sycl/ext/oneapi/group_local_memory.hpp>
#include <sycl/usm.hpp>

using namespace sycl;

int main() {
queue Q;

int **Out = malloc_shared<int *>(4, Q);

Q.submit([&](handler &Cgh) {
Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) {
auto Ptr0 =
ext::oneapi::group_local_memory_for_overwrite<int>(Item.get_group());
auto Ptr1 =
ext::oneapi::group_local_memory_for_overwrite<int>(Item.get_group());
auto Ptr2 = ext::oneapi::group_local_memory<int>(Item.get_group());
auto Ptr3 = ext::oneapi::group_local_memory<int>(Item.get_group());
Out[0] = Ptr0;
Out[1] = Ptr1;
Out[2] = Ptr2;
Out[3] = Ptr3;
});
});
}

// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
Loading