AMDGPU: Fix assert when multi operands to update after folding imm #148205


Merged

Conversation

macurtis-amd
Contributor

In the original motivating test case, FoldList had entries:

  #0: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc
      UseOpNo: 1

  #1: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc
      UseOpNo: 2

After calling updateOperand(#0), tryConstantFoldOp(#0.UseMI) removed operand 1, and entry #1.UseOpNo was no longer valid, resulting in an assert.

This change defers constant folding until all operands have been updated so that UseOpNo values remain stable.
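
To make the hazard concrete, here is a minimal standalone C++ sketch (hypothetical simplified types, not the actual SIFoldOperands structures): two fold entries name the same instruction by operand index, and shrinking the instruction while processing the first entry leaves the second entry's index stale.

  #include <cassert>
  #include <vector>

  // Hypothetical stand-ins: an "instruction" is just a vector of operand
  // values, and a candidate names an instruction plus an operand index.
  struct FoldCandidate {
    std::vector<int> *UseMI;
    unsigned UseOpNo;
  };

  int main() {
    std::vector<int> Inst = {0 /*def*/, 10 /*src0*/, 20 /*src1*/};
    // Two candidates target the same instruction at operands 1 and 2,
    // mirroring entries #0 and #1 above.
    std::vector<FoldCandidate> FoldList = {{&Inst, 1}, {&Inst, 2}};

    for (FoldCandidate &Fold : FoldList) {
      // Fires on the second entry: the first iteration erased operand 1,
      // so UseOpNo == 2 now indexes past the end.
      assert(Fold.UseOpNo < Fold.UseMI->size() && "stale operand index");
      // Stand-in for tryConstantFoldOp shrinking the instruction in place.
      Fold.UseMI->erase(Fold.UseMI->begin() + Fold.UseOpNo);
    }
  }

Deferring the erase until every index has been consumed, as the patch below does, removes the hazard.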

@llvmbot
Member

llvmbot commented Jul 11, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: None (macurtis-amd)


Full diff: https://github.com/llvm/llvm-project/pull/148205.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+10-4)
  • (added) llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll (+58)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0ed06c37507af..0f2a932f984b1 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1761,6 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
   for (MachineInstr *Copy : CopiesToReplace)
     Copy->addImplicitDefUseOperands(*MF);
 
+  SmallVector<MachineInstr *, 4> ConstantFoldCandidates;
   for (FoldCandidate &Fold : FoldList) {
     assert(!Fold.isReg() || Fold.Def.OpToFold);
     if (Fold.isReg() && Fold.getReg().isVirtual()) {
@@ -1783,16 +1784,21 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
                         << static_cast<int>(Fold.UseOpNo) << " of "
                         << *Fold.UseMI);
 
-      if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) {
-        LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI);
-        Changed = true;
-      }
+      if (Fold.isImm() && !is_contained(ConstantFoldCandidates, Fold.UseMI))
+        ConstantFoldCandidates.push_back(Fold.UseMI);
 
     } else if (Fold.Commuted) {
       // Restoring instruction's original operand order if fold has failed.
       TII->commuteInstruction(*Fold.UseMI, false);
     }
   }
+
+  for (MachineInstr *MI : ConstantFoldCandidates) {
+    if (tryConstantFoldOp(MI)) {
+      LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
+      Changed = true;
+    }
+  }
   return true;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
new file mode 100644
index 0000000000000..a81fc6a25e43e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O3 -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -o - < %s | FileCheck %s
+
+%struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 }
+%struct.bar.0 = type { %struct.blam }
+%struct.blam = type { i32, i32, i32, i32 }
+
+@global = external addrspace(3) global %struct.bar
+
+define void @snork() {
+; CHECK-LABEL: snork:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    v_mov_b32_e32 v4, global@abs32@lo
+; CHECK-NEXT:    s_mov_b32 s5, s4
+; CHECK-NEXT:    s_mov_b32 s6, s4
+; CHECK-NEXT:    s_mov_b32 s7, s4
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    s_cmp_lg_u32 0, 0
+; CHECK-NEXT:    ds_write_b128 v4, v[0:3] offset:32
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00)
+  %fmul = fmul ninf float %call, 0.000000e+00
+  %fptoui = fptoui float %fmul to i32
+  %zext = zext i32 %fptoui to i64
+  %mul = mul i64 2, %zext
+  %trunc = trunc i64 %mul to i32
+  store i32 %trunc, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16
+  store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 36), align 4
+  store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 40), align 8
+  store i32 %trunc, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 44), align 4
+  %load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16
+  %extractelement = extractelement <4 x i32> %load, i64 0
+  %icmp = icmp ne i32 %extractelement, 0
+  %extractelement1 = extractelement <4 x i32> %load, i64 3
+  %icmp2 = icmp ne i32 %extractelement1, 0
+  %select = select i1 %icmp, i1 true, i1 %icmp2
+  br i1 %select, label %bb5, label %bb3
+
+bb3:                                              ; preds = %bb
+  %and = and <4 x i32> %load, splat (i32 1)
+  %extractelement4 = extractelement <4 x i32> %and, i64 0
+  br label %bb5
+
+bb5:                                              ; preds = %bb3, %bb
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.amdgcn.rcp.f32(float) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

@ronlieb ronlieb self-requested a review July 11, 2025 11:11
@ronlieb
Contributor

ronlieb commented Jul 11, 2025

probably good to wait for @arsenm to review

@arsenm arsenm requested a review from jmmartinez July 11, 2025 11:37
@@ -0,0 +1,58 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -O3 -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -o - < %s | FileCheck %s
Contributor


Suggested change
; RUN: llc -O3 -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -o - < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 < %s | FileCheck %s

Does this really need -O3? An additional mir test would also be good

Contributor


Apparently it does, which is unusual. If you can pre-run the extra optimizations and still reproduce with the default, that would be better.
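
For what it's worth, pre-running the middle-end separately would look something like this (file names are illustrative):

  opt -O3 -S repro.ll -o repro-preopt.ll
  llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 repro-preopt.ll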

Contributor Author

@macurtis-amd macurtis-amd Jul 11, 2025


Added a mir test and removed the ll test, which I could not get to fail without -O3.
Is that good enough?
(My first mir test. Let me know what I got wrong)

@jmmartinez
Contributor

It seems it's the same issue as #148187.

You can avoid the extra SmallVector if you iterate in the same way as the PR above, or at least use a set (I think a SetVector, to preserve insertion order when iterating) to avoid the scan through is_contained.
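
For reference, a tiny standalone demo of the SetVector behavior being suggested (SmallSetVector is from llvm/ADT/SetVector.h; the int elements stand in for the MachineInstr pointers):

  #include "llvm/ADT/SetVector.h"
  #include <cstdio>

  int main() {
    // SmallSetVector deduplicates on insert while preserving insertion
    // order, so iteration stays deterministic and the is_contained scan
    // before each push becomes unnecessary.
    llvm::SmallSetVector<int, 4> Candidates;
    Candidates.insert(3);
    Candidates.insert(1);
    Candidates.insert(3); // duplicate; insert returns false and is a no-op
    for (int C : Candidates)
      std::printf("%d\n", C); // prints 3, then 1
  }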

@ronlieb
Contributor

ronlieb commented Jul 11, 2025

@arsenm ok to land?

@@ -0,0 +1,127 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s
--- |
Contributor

@shiltian shiltian Jul 11, 2025


No need to keep the IR.

Well, is this IR minimal? The store to the global might need the IR.

Contributor Author


Without it, llc complained about global, but I wasn't sure what was safe to remove. I'll see about pruning it down.

Contributor Author


Removed most of the IR.

Contributor


I meant to trim the IR and then regenerate the MIR. If we can get rid of the global, that'd be great, because in that case it is very likely you would not need the IR. Currently you just trimmed the IR but kept the MIR the same; that is why you still need to keep the IR.

Contributor


Now that the MIR does not refer to @global, you do not need the IR section at all.

isSSA: true
noVRegs: false
hasFakeUses: false
registers:
Contributor


Almost all of these things can go

Contributor Author


Removed.

@ronlieb
Contributor

ronlieb commented Jul 11, 2025

@shiltian landable?

@shiltian
Contributor

I'm not sure if the MIR test can be trimmed further (and I hope it can), but if @arsenm is happy with it, I'm totally fine with it.

; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; CHECK-NEXT: DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3)
Contributor


You can remove the MMO:

Suggested change
; CHECK-NEXT: DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3)
; CHECK-NEXT: DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec

name: snork
body: |
; CHECK-LABEL: name: snork
; CHECK: bb.0.bb:
Contributor


The final suffix on the label name here refers back to the IR name for the block. So if you remove it (and do the same for the other labels below), you can reduce the IR function definition to just ret void:

Suggested change
; CHECK: bb.0.bb:
; CHECK: bb.0:

Contributor


llvm-reduce will also do this on mir
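
For anyone unfamiliar, a MIR reduction typically looks something like this; interesting.sh is a hypothetical interestingness script that exits 0 when the input still triggers the assert, and exact options may vary by LLVM version:

  llvm-reduce --test=interesting.sh repro.mir -o reduced.mir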

Contributor Author


Did not realize that I could run llvm-reduce on mir!
Latest test is now substantially smaller.

bb.0.bb:
successors: %bb.1, %bb.2

%9:sreg_32 = S_MOV_B32 0
Contributor


Should also run this through -run-pass=none to compact the register numbers
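
For reference, an invocation along these lines does that (mirroring the RUN line above; file names are illustrative):

  llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=none repro.mir -o compacted.mir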

@macurtis-amd
Contributor Author

@arsenm are you ok with the latest test?

; CHECK-NEXT: SI_RETURN
%0:sreg_32 = S_MOV_B32 0
%1:sgpr_128 = REG_SEQUENCE undef %0, %subreg.sub0, undef %0, %subreg.sub1, undef %0, %subreg.sub2, undef %0, %subreg.sub3
%2:sreg_32 = S_OR_B32 undef %1.sub0, undef %1.sub3, implicit-def dead $scc
Contributor


Suggested change
%2:sreg_32 = S_OR_B32 undef %1.sub0, undef %1.sub3, implicit-def dead $scc
%2:sreg_32 = S_OR_B32 %1.sub0, %1.sub3, implicit-def dead $scc

Not actually undef

; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE undef [[S_MOV_B32_]], %subreg.sub0, undef [[S_MOV_B32_]], %subreg.sub1, undef [[S_MOV_B32_]], %subreg.sub2, undef [[S_MOV_B32_]], %subreg.sub3
; CHECK-NEXT: SI_RETURN
%0:sreg_32 = S_MOV_B32 0
%1:sgpr_128 = REG_SEQUENCE undef %0, %subreg.sub0, undef %0, %subreg.sub1, undef %0, %subreg.sub2, undef %0, %subreg.sub3
Contributor


Suggested change
%1:sgpr_128 = REG_SEQUENCE undef %0, %subreg.sub0, undef %0, %subreg.sub1, undef %0, %subreg.sub2, undef %0, %subreg.sub3
%1:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3

Not actually undef

In the original motivating test case, FoldList had entries:

  #0: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc
      UseOpNo: 1

  #1: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc
      UseOpNo: 2

After calling updateOperand(#0), tryConstantFoldOp(#0.UseMI) removed operand 1, and
entry #1.UseOpNo was no longer valid, resulting in an assert.

This change defers constant folding until after all operands have been updated
so that UseOpNo values remain valid.
@macurtis-amd macurtis-amd force-pushed the bug-multi-operands-to-update-after-fold branch from 5bd778b to 4c44451 Compare July 16, 2025 10:22
@macurtis-amd macurtis-amd merged commit 402b989 into llvm:main Jul 16, 2025
9 checks passed