diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp index 93ed77bb6f7ef..f6a63502ec2cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp @@ -64,13 +64,19 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass { bool InstructionEmitted = false; for (MachineBasicBlock &MBB : MF) { - DenseMap RegisterUseCount; // TODO: MCRegUnits + DenseMap RegisterUseCount; // Handle boundaries at the end of basic block separately to avoid // false positives. If they are live at the end of a basic block then // assume it has more uses later on. - for (const auto &Liveouts : MBB.liveouts()) - RegisterUseCount[Liveouts.PhysReg] = 2; + for (const auto &Liveout : MBB.liveouts()) { + for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid(); + ++Units) { + const auto [Unit, Mask] = *Units; + if ((Mask & Liveout.LaneMask).any()) + RegisterUseCount[Unit] = 2; + } + } for (MachineInstr &MI : reverse(MBB.instrs())) { // All registers in all operands need to be single use for an @@ -84,7 +90,8 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass { // Count the number of times each register is read. if (Operand.readsReg()) - RegisterUseCount[Reg]++; + for (const MCRegUnit &Unit : TRI->regunits(Reg)) + RegisterUseCount[Unit]++; // Do not attempt to optimise across exec mask changes. if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { @@ -96,10 +103,16 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass { // check if the operands are single use. if (!MI.modifiesRegister(Reg, TRI)) continue; - if (RegisterUseCount[Reg] > 1) + + const auto RegUnits = TRI->regunits(Reg); + if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit &Unit) { + return RegisterUseCount[Unit] > 1; + })) AllProducerOperandsAreSingleUse = false; + // Reset uses count when a register is no longer live. 
- RegisterUseCount.erase(Reg); + for (const MCRegUnit &Unit : RegUnits) + RegisterUseCount.erase(Unit); } if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) { // TODO: Replace with candidate logging for instruction grouping diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir index 833699b4656b6..135a101822bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir @@ -521,9 +521,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_SINGLEUSE_VDST 1 ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 ; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 ; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -582,6 +580,31 @@ body: | liveins: $vgpr1 ... +# Write low 16-bits and then read 32-bit vgpr twice. +--- +name: write_lo_read_full_twice +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: write_lo_read_full_twice + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 + bb.0: + liveins: $vgpr0 + $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr1, $vgpr2 +... + # Write high 16-bits and then read 32-bit vgpr. --- name: write_hi_read_full @@ -605,3 +628,78 @@ body: | bb.1: liveins: $vgpr1 ... + +# Write high 16-bits and then read 32-bit vgpr twice. 
+--- +name: write_hi_read_full_twice +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: write_hi_read_full_twice + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 + bb.0: + liveins: $vgpr0 + $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr1, $vgpr2 +... + +# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr. The full register is read only once, so both 16-bit writes are expected to be preceded by S_SINGLEUSE_VDST. +--- +name: write_both_read_full +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: write_both_read_full + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1 + bb.0: + $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec + $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr1 +... + +# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr twice. The second full read means neither 16-bit write is single use, so no S_SINGLEUSE_VDST is expected.
+--- +name: write_both_read_full_twice +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: write_both_read_full_twice + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 + bb.0: + $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec + $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr1, $vgpr2 +...