llvm · arsenm · Mar 6, 2025 · Feb 19, 2025 · Mar 20, 2024 · Mar 5, 2025
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
@@ -1707,6 +1707,24 @@ The AMDGPU backend supports the following LLVM IR attributes.
                                                       as hidden. Hidden arguments are managed by the compiler and are not part of
                                                       the explicit arguments supplied by the user.
 
+     "amdgpu-agpr-alloc"="min(,max)"                  Indicates a minimum and maximum range for the number of AGPRs to make
+                                                      available to allocate. The values will be rounded up to the next multiple
+                                                      of the allocation granularity (4). The minimum value is interpreted as the
+                                                      minimum required number of AGPRs for the function to allocate (that is, the
+                                                      function requires no more than min registers). If only one value is specified,
+                                                      it is interpreted as the minimum register budget. The maximum will restrict
+                                                      allocation to use no more than max AGPRs.
+
+                                                      The values may be ignored if satisfying it would violate other allocation
+                                                      constraints.
+
+                                                      The behavior is undefined if a function which requires more AGPRs than the
+                                                      lower bound is reached through any function marked with a higher value of this
+                                                      attribute. A minimum value of 0 indicates the function does not require
+                                                      any AGPRs. A minimum of 0 is equivalent to "amdgpu-no-agpr".
+
+                                                      This is only relevant on targets with AGPRs which support accum_offset (gfx90a+).
+
      "amdgpu-sgpr-hazard-wait"                        Disabled SGPR hazard wait insertion if set to 0.
                                                       Exists for testing performance impact of SGPR hazard waits only.
 

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -572,9 +572,10 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
 std::pair<unsigned, unsigned>
 SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
-  unsigned MaxNumAGPRs = MaxNumVGPRs;
-  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+  const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF);
+
+  unsigned MaxNumVGPRs = MaxVectorRegs;
+  unsigned MaxNumAGPRs = 0;
 
   // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
   // a wave may have up to 512 total vector registers combining together both
@@ -585,16 +586,49 @@ SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
   // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
   //       register file accordingly.
   if (ST.hasGFX90AInsts()) {
-    if (MFI->mayNeedAGPRs()) {
-      MaxNumVGPRs /= 2;
-      MaxNumAGPRs = MaxNumVGPRs;
+    unsigned MinNumAGPRs = 0;
+    const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
+    const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+
+    const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
+
+    // TODO: Replace amdgpu-no-agpr with amdgpu-agpr-alloc=0
+    // TODO: Move this logic into subtarget on IR function
+    //
+    // TODO: The lower bound should probably force the number of required
+    // registers up, overriding amdgpu-waves-per-eu.
+    std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute(
+        MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR,
+        /*OnlyFirstRequired=*/true);
+
+    if (MinNumAGPRs == DefaultNumAGPR.first) {
+      // Default to splitting half the registers if AGPRs are required.
+
+      if (MFI->mayNeedAGPRs())
+        MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
+      else
+        MinNumAGPRs = 0;
     } else {
-      if (MaxNumVGPRs > TotalNumVGPRs) {
-        MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
-        MaxNumVGPRs = TotalNumVGPRs;
-      } else
-        MaxNumAGPRs = 0;
+      // Align to accum_offset's allocation granularity.
+      MinNumAGPRs = alignTo(MinNumAGPRs, 4);
+
+      MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
     }
+
+    // Clamp values to be inbounds of our limits, and ensure min <= max.
+
+    MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
+    MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
+
+    MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
+    MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
+
+    assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
+           MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
+           "invalid register counts");
+  } else if (ST.hasMAIInsts()) {
+    // On gfx908 the number of AGPRs always equals the number of VGPRs.
+    MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
   }
 
   return std::pair(MaxNumVGPRs, MaxNumAGPRs);

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s 2> %t.err | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: FileCheck -check-prefix=ERR < %t.err %s
+
+; Test undefined behavior where a function ends up needing AGPRs that
+; was marked with "amdgpu-agpr-alloc="="0". There should be no asserts.
+
+; TODO: Should this be an error, or let UB happen?
+
+; ERR: error: <unknown>:0:0: no registers from class available to allocate in function 'kernel_illegal_agpr_use_asm'
+; ERR: error: <unknown>:0:0: no registers from class available to allocate in function 'func_illegal_agpr_use_asm'
+; ERR: error: <unknown>:0:0: no registers from class available to allocate in function 'kernel_calls_mfma.f32.32x32x1f32'
+
+; CHECK: {{^}}kernel_illegal_agpr_use_asm:
+; CHECK: ; use a0
+
+; CHECK: NumVgprs: 0
+; CHECK: NumAgprs: 1
+define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
+  call void asm sideeffect "; use $0", "a"(i32 poison)
+  ret void
+}
+
+; CHECK: {{^}}func_illegal_agpr_use_asm:
+; CHECK: ; use a0
+
+; CHECK: NumVgprs: 0
+; CHECK: NumAgprs: 1
+define void @func_illegal_agpr_use_asm() #0 {
+  call void asm sideeffect "; use $0", "a"(i32 poison)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}kernel_calls_mfma.f32.32x32x1f32:
+; CHECK: v_accvgpr_write_b32
+
+; GFX908: NumVgprs: 5
+; GFX90A: NumVgprs: 36
+; CHECK: NumAgprs: 32
+
+; GFX908: TotalNumVgprs: 32
+; GFX90A: TotalNumVgprs: 68
+define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(ptr addrspace(1) %out, float %a, float %b, <32 x float> %c) #0 {
+  %result = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
+  store <32 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+attributes #0 = { "amdgpu-agpr-alloc"="0" }