[SYCL][CUDA][HIP] Fix enable-global-offset flag #11674

Merged · 8 commits · Nov 13, 2023

81 changes: 60 additions & 21 deletions llvm/lib/SYCLLowerIR/GlobalOffset.cpp
@@ -59,11 +59,21 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() {
return new GlobalOffsetLegacy();
}

// Recursive helper function to collect Loads from GEPs in a BFS fashion.
static void getLoads(Instruction *P, SmallVectorImpl<Instruction *> &Traversed,
Contributor:
nit: the name 'Traversed' was a bit confusing. Can we not just call this 'PtrUses' as well?
Also, the function name seems a bit incomplete. Maybe getLoadsAndGEPs?

Contributor (Author):
I honestly prefer the name Traversed here, since it captures the approach this function follows a bit better:
it does a BFS to find Loads and GEPs while keeping track of the already traversed instructions.
The function name focuses on the goal of this function: to collect the Loads in a small vector.
The traversal of the GEPs is more of a byproduct. (A small IR sketch of such a use-chain follows the GlobalOffset.cpp diff below.)

SmallVectorImpl<LoadInst *> &Loads) {
Traversed.push_back(P);
Contributor:
Maybe an assert at the very top of the function body, checking that *P is either a Load or a GEP, would be better?

Contributor (Author) @MartinWehking, Nov 8, 2023:
The check is already taking place in the dyn_cast<LoadInst>(P) and, if the else branch is taken, in the assertion assert(isa<GetElementPtrInst>(*P));.
If the instruction is neither a Load nor a GEP, the code already fails there without another assert on the first line, so adding one would be a sort of redundancy.

if (auto *L = dyn_cast<LoadInst>(P)) // Base case for recursion
Loads.push_back(L);
else {
assert(isa<GetElementPtrInst>(*P));
for (Value *V : P->users())
getLoads(cast<Instruction>(V), Traversed, Loads);
}
}

// New PM implementation.
PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
if (!EnableGlobalOffset)
return PreservedAnalyses::all();

AT = TargetHelpers::getArchType(M);
Function *ImplicitOffsetIntrinsic = M.getFunction(Intrinsic::getName(
AT == ArchType::Cuda
@@ -73,33 +83,62 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty())
return PreservedAnalyses::all();

// For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is
// happy with ADDRESS_SPACE_GENERIC (0).
TargetAS = AT == ArchType::Cuda ? 0 : 5;
/// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen
/// to be 3, use it for the implicit argument pointer type.
KernelImplicitArgumentType =
ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
ImplicitOffsetPtrType =
Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS);
assert((ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) &&
"Implicit offset intrinsic does not return the expected type");
if (!EnableGlobalOffset) {
SmallVector<CallInst *, 4> Worklist;
SmallVector<LoadInst *, 4> LI;
SmallVector<Instruction *, 4> PtrUses;

SmallVector<KernelPayload, 4> KernelPayloads;
TargetHelpers::populateKernels(M, KernelPayloads, AT);
// Collect all GEPs and Loads from the intrinsic's CallInsts
for (Value *V : ImplicitOffsetIntrinsic->users()) {
Worklist.push_back(cast<CallInst>(V));
for (Value *V2 : V->users())
getLoads(cast<Instruction>(V2), PtrUses, LI);
}

// Replace each use of a collected Load with a Constant 0
for (LoadInst *L : LI)
L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));

// Validate kernels and populate entry map
EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads);
// Remove all collected Loads and GEPs from the kernel.
// PtrUses is returned by `getLoads` in topological order.
// Walk it backwards so we don't violate users.
for (auto *I : reverse(PtrUses))
I->eraseFromParent();

// Add implicit parameters to all direct and indirect users of the offset
addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
// Remove all collected CallInsts from the kernel.
for (CallInst *CI : Worklist) {
auto *I = cast<Instruction>(CI);
I->eraseFromParent();
}
} else {
// For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is
// happy with ADDRESS_SPACE_GENERIC (0).
TargetAS = AT == ArchType::Cuda ? 0 : 5;
/// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen
/// to be 3, use it for the implicit argument pointer type.
KernelImplicitArgumentType =
ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
ImplicitOffsetPtrType =
Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS);
assert(
(ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) &&
"Implicit offset intrinsic does not return the expected type");

SmallVector<KernelPayload, 4> KernelPayloads;
TargetHelpers::populateKernels(M, KernelPayloads, AT);

// Validate kernels and populate entry map
EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads);

// Add implicit parameters to all direct and indirect users of the offset
addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
}

// Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
// it.
assert(ImplicitOffsetIntrinsic->use_empty() &&
"Not all uses of intrinsic removed");
ImplicitOffsetIntrinsic->eraseFromParent();

return PreservedAnalyses::none();
}

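To make the traversal above concrete, here is a small, hypothetical IR module (not taken from the PR; it only mirrors the pattern used by the new tests, with one extra GEP level) with a nested GEP chain rooted at the implicit-offset intrinsic call, together with what the removal path does to it when -enable-global-offset=false.

; Hypothetical input module, for illustration only.
declare ptr @llvm.nvvm.implicit.offset()

define i64 @example() {
entry:
  ; Root of the chain: the implicit-offset intrinsic call.
  %offset = tail call ptr @llvm.nvvm.implicit.offset()
  ; Two nested GEPs on the call's result.
  %gep1 = getelementptr inbounds [3 x i32], ptr %offset, i64 0, i64 1
  %gep2 = getelementptr inbounds i32, ptr %gep1, i64 1
  ; The load at the leaf of the chain.
  %val = load i32, ptr %gep2, align 4
  %ext = zext i32 %val to i64
  ret i64 %ext
}

Starting from the users of the call, getLoads records %gep1, %gep2 and %val in Traversed/PtrUses (parents before children) and %val in Loads. The pass then replaces every use of %val with i32 0 (so %ext becomes zext i32 0 to i64), erases %val, %gep2 and %gep1 in reverse order, erases the call, and finally erases the now-unused intrinsic declaration.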
20 changes: 20 additions & 0 deletions llvm/test/CodeGen/AMDGPU/global-offset-removal.ll
@@ -0,0 +1,20 @@
; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s

; This test checks that the implicit offset intrinsic is correctly removed

declare ptr addrspace(5) @llvm.amdgcn.implicit.offset()
; CHECK-NOT: llvm.amdgcn.implicit.offset

define weak_odr dso_local i64 @_ZTS14example_kernel() {
entry:
; CHECK-NOT: @llvm.amdgcn.implicit.offset()
; CHECK-NOT: getelementptr
; CHECK-NOT: load
; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}}
; CHECK: ret i{{[0-9]+}} [[REG]]
%0 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
%1 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 1
%2 = load i32, ptr addrspace(5) %1, align 4
%3 = zext i32 %2 to i64
ret i64 %3
}
21 changes: 21 additions & 0 deletions llvm/test/CodeGen/NVPTX/global-offset-removal.ll
@@ -0,0 +1,21 @@
; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s
target triple = "nvptx64-nvidia-cuda"

; This test checks that the implicit offset intrinsic is correctly removed

declare ptr @llvm.nvvm.implicit.offset()
; CHECK-NOT: llvm.nvvm.implicit.offset

define weak_odr dso_local i64 @_ZTS14example_kernel() {
entry:
; CHECK-NOT: @llvm.nvvm.implicit.offset()
; CHECK-NOT: getelementptr
; CHECK-NOT: load
; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}}
; CHECK: ret i{{[0-9]+}} [[REG]]
%0 = tail call ptr @llvm.nvvm.implicit.offset()
%1 = getelementptr inbounds i32, ptr %0, i64 1
%2 = load i32, ptr %1, align 4
%3 = zext i32 %2 to i64
ret i64 %3
}
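
For reference, both new tests expect the same reduced kernel body once the pass has run with -enable-global-offset=false. A sketch of that output, reconstructed from the CHECK lines (the exact value numbering may differ in the actual pass output):

define weak_odr dso_local i64 @_ZTS14example_kernel() {
entry:
  %0 = zext i32 0 to i64
  ret i64 %0
}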