-
Notifications
You must be signed in to change notification settings - Fork 14.5k
AMDGPU: Fix assert when multi operands to update after folding imm #148205
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Fix assert when multi operands to update after folding imm #148205
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: None (macurtis-amd) ChangesIn the original motivating test case, FoldList had entries:
After calling updateOperand(#0), tryConstantFoldOp(#0.UseMI) removed operand 1, and entry #​1.UseOpNo was no longer valid, resulting in an assert. This change defers constant folding until all operands have been updated so that UseOpNo values remain stable. Full diff: https://github.com/llvm/llvm-project/pull/148205.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0ed06c37507af..0f2a932f984b1 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1761,6 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);
+ SmallVector<MachineInstr *, 4> ConstantFoldCandidates;
for (FoldCandidate &Fold : FoldList) {
assert(!Fold.isReg() || Fold.Def.OpToFold);
if (Fold.isReg() && Fold.getReg().isVirtual()) {
@@ -1783,16 +1784,21 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
<< static_cast<int>(Fold.UseOpNo) << " of "
<< *Fold.UseMI);
- if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) {
- LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI);
- Changed = true;
- }
+ if (Fold.isImm() && !is_contained(ConstantFoldCandidates, Fold.UseMI))
+ ConstantFoldCandidates.push_back(Fold.UseMI);
} else if (Fold.Commuted) {
// Restoring instruction's original operand order if fold has failed.
TII->commuteInstruction(*Fold.UseMI, false);
}
}
+
+ for (MachineInstr *MI : ConstantFoldCandidates) {
+ if (tryConstantFoldOp(MI)) {
+ LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
+ Changed = true;
+ }
+ }
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
new file mode 100644
index 0000000000000..a81fc6a25e43e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O3 -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -o - < %s | FileCheck %s
+
+%struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 }
+%struct.bar.0 = type { %struct.blam }
+%struct.blam = type { i32, i32, i32, i32 }
+
+@global = external addrspace(3) global %struct.bar
+
+define void @snork() {
+; CHECK-LABEL: snork:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_mov_b32_e32 v4, global@abs32@lo
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: s_mov_b32 s6, s4
+; CHECK-NEXT: s_mov_b32 s7, s4
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
+; CHECK-NEXT: s_cmp_lg_u32 0, 0
+; CHECK-NEXT: ds_write_b128 v4, v[0:3] offset:32
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00)
+ %fmul = fmul ninf float %call, 0.000000e+00
+ %fptoui = fptoui float %fmul to i32
+ %zext = zext i32 %fptoui to i64
+ %mul = mul i64 2, %zext
+ %trunc = trunc i64 %mul to i32
+ store i32 %trunc, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16
+ store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 36), align 4
+ store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 40), align 8
+ store i32 %trunc, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 44), align 4
+ %load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16
+ %extractelement = extractelement <4 x i32> %load, i64 0
+ %icmp = icmp ne i32 %extractelement, 0
+ %extractelement1 = extractelement <4 x i32> %load, i64 3
+ %icmp2 = icmp ne i32 %extractelement1, 0
+ %select = select i1 %icmp, i1 true, i1 %icmp2
+ br i1 %select, label %bb5, label %bb3
+
+bb3: ; preds = %bb
+ %and = and <4 x i32> %load, splat (i32 1)
+ %extractelement4 = extractelement <4 x i32> %and, i64 0
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.amdgcn.rcp.f32(float) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
probably good to wait for @arsenm to review |
llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
Outdated
Show resolved
Hide resolved
@@ -0,0 +1,58 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | |||
; RUN: llc -O3 -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -o - < %s | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
; RUN: llc -O3 -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -o - < %s | FileCheck %s | |
; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 < %s | FileCheck %s |
Does this really need -O3? An additional mir test would also be good
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Apparently it does which is unusual, if you can pre-run the extra optimizations and still reproduce with the default that would be better
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added mir test and removed ll test which I could not get to fail without -O3.
Is that good enough?
(My first mir test. Let me know what I got wrong)
It seems it's the same issue as #148187 . You can avoid the extra |
@arsenm ok to land ? |
@@ -0,0 +1,127 @@ | |||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 | |||
# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s | |||
--- | |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need to keep the IR.
Well, is this IR minimal? The store to the global might need the IR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Without it, llc complained about global
, but I wasn't sure trying to remove things. I'll see about pruning it down.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed most of the IR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I meant to trim the IR and then regenerate the MIR. If we can get rid of the global, that'd be great because in that case it is very likely you would not need the IR. Currently you just trimmed the IR but keep the MIR the same. That is the reason that you will still need to keep the IR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now that the MIR does not refer to @global
, you do not need the IR section at all.
isSSA: true | ||
noVRegs: false | ||
hasFakeUses: false | ||
registers: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Almost all of these things can go
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed.
@shiltian landable ? |
I'm not sure if the MIR test can be trimmed further (and I hope it can), but if @arsenm is happy with it, I'm totally fine with it. |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 | ||
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec | ||
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] | ||
; CHECK-NEXT: DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can remove the MMO:
; CHECK-NEXT: DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3) | |
; CHECK-NEXT: DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec |
name: snork | ||
body: | | ||
; CHECK-LABEL: name: snork | ||
; CHECK: bb.0.bb: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The final suffix on the label name here refers back to the IR name for the block. So if you remove it (and the same for the other labels below) then you can reduce the IR function definition to just ret void
:
; CHECK: bb.0.bb: | |
; CHECK: bb.0: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
llvm-reduce will also do this on mir
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Did not realize that I could run llvm-reduce on mir!
Latest test is now substantially smaller.
bb.0.bb: | ||
successors: %bb.1, %bb.2 | ||
|
||
%9:sreg_32 = S_MOV_B32 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should also run this through -run-pass=none to compact the register numbers
@arsenm are you ok with the latest test? |
; CHECK-NEXT: SI_RETURN | ||
%0:sreg_32 = S_MOV_B32 0 | ||
%1:sgpr_128 = REG_SEQUENCE undef %0, %subreg.sub0, undef %0, %subreg.sub1, undef %0, %subreg.sub2, undef %0, %subreg.sub3 | ||
%2:sreg_32 = S_OR_B32 undef %1.sub0, undef %1.sub3, implicit-def dead $scc |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
%2:sreg_32 = S_OR_B32 undef %1.sub0, undef %1.sub3, implicit-def dead $scc | |
%2:sreg_32 = S_OR_B32 %1.sub0, %1.sub3, implicit-def dead $scc |
Not actually undef
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE undef [[S_MOV_B32_]], %subreg.sub0, undef [[S_MOV_B32_]], %subreg.sub1, undef [[S_MOV_B32_]], %subreg.sub2, undef [[S_MOV_B32_]], %subreg.sub3 | ||
; CHECK-NEXT: SI_RETURN | ||
%0:sreg_32 = S_MOV_B32 0 | ||
%1:sgpr_128 = REG_SEQUENCE undef %0, %subreg.sub0, undef %0, %subreg.sub1, undef %0, %subreg.sub2, undef %0, %subreg.sub3 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
%1:sgpr_128 = REG_SEQUENCE undef %0, %subreg.sub0, undef %0, %subreg.sub1, undef %0, %subreg.sub2, undef %0, %subreg.sub3 | |
%1:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3 |
Not actually undef
In the original motivating test case, FoldList had entries: #0: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc UseOpNo: 1 llvm#1: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc UseOpNo: 2 After calling updateOperand(#0), tryConstantFoldOp(#0.UseMI) removed operand 1, and entry llvm#1.UseOpNo was no longer valid, resulting in an assert. This change defers constant folding until after all operands have been updated so that UseOpNo values remain valid.
5bd778b
to
4c44451
Compare
In the original motivating test case, FoldList had entries:
After calling updateOperand(#0), tryConstantFoldOp(#0.UseMI) removed operand 1, and entry #1.UseOpNo was no longer valid, resulting in an assert.
This change defers constant folding until all operands have been updated so that UseOpNo values remain stable.