Skip to content

[SIFoldOperands] Folding immediate into a copy invalidates candidates in the fold list #148187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1761,7 +1761,9 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);

for (FoldCandidate &Fold : FoldList) {
for (auto FoldIt = FoldList.begin(), End = FoldList.end(); FoldIt != End;
++FoldIt) {
FoldCandidate &Fold = *FoldIt;
assert(!Fold.isReg() || Fold.Def.OpToFold);
if (Fold.isReg() && Fold.getReg().isVirtual()) {
Register Reg = Fold.getReg();
Expand All @@ -1785,9 +1787,13 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,

if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) {
LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI);
// The instruction was folded into a copy, we have to skip any other
// occurence of UseMI in the fold list
End = std::remove_if(FoldIt + 1, End, [&](const FoldCandidate &F) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We really need to rewrite this pass. The fold list is garbage. It would be a lot simpler to just track a map of foldable sources like peephole opt does

return F.UseMI == Fold.UseMI;
});
Changed = true;
}

} else if (Fold.Commuted) {
// Restoring instruction's original operand order if fold has failed.
TII->commuteInstruction(*Fold.UseMI, false);
Expand Down
62 changes: 62 additions & 0 deletions llvm/test/CodeGen/AMDGPU/si-fold-operands-swdev-542372.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this really need -O3

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems. I cannot trigger the bug with O0/1/2. And I was not able on the mir.


@lds = internal addrspace(3) global [5 x i32] poison

define amdgpu_kernel void @kernel() {
; CHECK-LABEL: kernel:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: ds_read_b64 v[2:3], v0
; CHECK-NEXT: ds_write_b32 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 2, v2
; CHECK-NEXT: s_cbranch_vccnz .LBB0_3
; CHECK-NEXT: ; %bb.1: ; %land.rhs49
; CHECK-NEXT: ds_read_b64 v[0:1], v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
; CHECK-NEXT: s_cbranch_vccnz .LBB0_3
; CHECK-NEXT: ; %bb.2: ; %land.rhs57
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ds_write_b64 v2, v[0:1]
; CHECK-NEXT: .LBB0_3: ; %land.end59
; CHECK-NEXT: s_endpgm
entry:
%load.lds.0 = load <2 x i32>, ptr addrspace(3) @lds
%vecext.i55 = extractelement <2 x i32> %load.lds.0, i64 0
%cmp3.i57 = icmp eq i32 %vecext.i55, 2
store i32 0, ptr addrspace(3) @lds
br i1 %cmp3.i57, label %land.rhs49, label %land.end59

land.rhs49: ; preds = %entry
%load.lds.1 = load <2 x i32>, ptr addrspace(3) @lds
%vecext.i67 = extractelement <2 x i32> %load.lds.1, i64 0
%cmp3.i69 = icmp eq i32 %vecext.i67, 1
br i1 %cmp3.i69, label %land.rhs57, label %land.end59

land.rhs57: ; preds = %land.rhs49
%rem.i.i.i = srem <2 x i32> %load.lds.0, %load.lds.1
%ref.tmp.sroa.0.0.vec.extract.i.i = extractelement <2 x i32> %rem.i.i.i, i64 0
store i32 %ref.tmp.sroa.0.0.vec.extract.i.i, ptr addrspace(3) @lds
store i32 %ref.tmp.sroa.0.0.vec.extract.i.i, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @lds, i32 4)
%load.lds.2 = load <2 x i32>, ptr addrspace(3) @lds
%vecext.i.i.i = extractelement <2 x i32> %load.lds.2, i64 0
%cmp3.i.i.i = icmp ne i32 %vecext.i.i.i, 0
%vecext.1.i.i.i = extractelement <2 x i32> %load.lds.2, i64 1
%cmp3.1.i.i.i = icmp ne i32 %vecext.1.i.i.i, 0
%.not.i.i = select i1 %cmp3.i.i.i, i1 true, i1 %cmp3.1.i.i.i
br i1 %.not.i.i, label %land.end59, label %if.end.i

if.end.i: ; preds = %land.rhs57
%and.i.i.i = and <2 x i32> %load.lds.2, splat (i32 1)
%ref.tmp.sroa.0.0.vec.extract.i20.i = extractelement <2 x i32> %and.i.i.i, i64 0
br label %land.end59

land.end59: ; preds = %if.end.i, %land.rhs57, %land.rhs49, %entry
ret void
}