diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bbecc7a6ddaee..6f7b64e663e84 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5659,6 +5659,13 @@ class TargetLowering : public TargetLoweringBase {
                                        LoadSDNode *OriginalLoad,
                                        SelectionDAG &DAG) const;
 
+  /// Return the SelectionDAG constant that represents a null pointer in
+  /// address space \p AS. The default implementation returns zero in the
+  /// pointer type of \p AS; targets whose null pointer is not the all-zero
+  /// bit pattern in some address space (e.g. all ones) should override this.
+  virtual SDValue getNullPtrValue(unsigned AS, const SDLoc &DL,
+                                  SelectionDAG &DAG) const;
+
 private:
   SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
                            const SDLoc &DL, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 7178f6398bede..adc1d531826e7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1807,8 +1807,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
 
     if (isa<ConstantPointerNull>(C)) {
       unsigned AS = V->getType()->getPointerAddressSpace();
-      return DAG.getConstant(0, getCurSDLoc(),
-                             TLI.getPointerTy(DAG.getDataLayout(), AS));
+      return TLI.getNullPtrValue(AS, getCurSDLoc(), DAG);
     }
 
     if (match(C, m_VScale()))
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index adfb96041c5c0..13220a8e9cf12 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12188,3 +12188,9 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT,
 
   return Load;
 }
+
+SDValue TargetLowering::getNullPtrValue(unsigned AS, const SDLoc &DL,
+                                        SelectionDAG &DAG) const {
+  // Generic default: null is the zero constant in the pointer type of AS.
+  return DAG.getConstant(0, DL, getPointerTy(DAG.getDataLayout(), AS));
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 792e17eeedab1..021f602a56ed7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -6050,3 +6050,15 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                                Register N0, Register N1) const {
   return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
 }
+
+/// Return the null pointer constant for address space \p AS: all ones
+/// (0xffffffff for a 32-bit pointer type) for the private and local address
+/// spaces, and the generic zero-valued default for every other address space.
+SDValue AMDGPUTargetLowering::getNullPtrValue(unsigned AS, const SDLoc &DL,
+                                              SelectionDAG &DAG) const {
+  // NOTE(review): AMDGPUTargetMachine::getNullPointerValue appears to treat
+  // REGION_ADDRESS as all-ones as well -- confirm whether it belongs here.
+  if (AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::LOCAL_ADDRESS)
+    return DAG.getAllOnesConstant(DL, getPointerTy(DAG.getDataLayout(), AS));
+  return TargetLowering::getNullPtrValue(AS, DL, DAG);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index c74dc7942f52c..9e6b2eecb5c28 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -387,6 +387,9 @@ class AMDGPUTargetLowering : public TargetLowering {
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
+
+  SDValue getNullPtrValue(unsigned AS, const SDLoc &DL,
+                          SelectionDAG &DAG) const override;
 };
 
 namespace AMDGPUISD {
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 4ce46bbaf45ac..9c4fb346a1fb7 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -521,11 +521,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX908-NEXT: s_sub_i32 s8, 0, s3 -; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s7 -; GFX908-NEXT: v_mov_b32_e32 v19, 0 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s7 +; GFX908-NEXT: v_mov_b32_e32 v17, 0 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; 
GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mov_b32_e32 v20, -1 ; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -544,7 +545,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 ; GFX908-NEXT: s_cselect_b32 s8, s10, s8 ; GFX908-NEXT: s_lshr_b32 s7, s7, 16 -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s7 +; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s7 ; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 ; GFX908-NEXT: s_lshl_b64 s[12:13], s[8:9], 5 ; GFX908-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 @@ -611,37 +612,37 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: s_add_u32 s20, s18, s7 ; GFX908-NEXT: s_addc_u32 s21, s19, s9 -; GFX908-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc +; GFX908-NEXT: global_load_dword v22, v17, s[20:21] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc +; GFX908-NEXT: global_load_dword v21, v17, s[20:21] offset:-8 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v19, s[20:21] offset:-4 glc +; GFX908-NEXT: global_load_dword v12, v17, s[20:21] offset:-4 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v19, s[20:21] glc +; GFX908-NEXT: global_load_dword v12, v17, s[20:21] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: ds_read_b64 v[12:13], v19 +; GFX908-NEXT: ds_read_b64 v[12:13], v20 ; GFX908-NEXT: ds_read_b64 v[14:15], v0 ; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: 
v_cvt_f32_f16_sdwa v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX908-NEXT: v_cvt_f32_f16_sdwa v24, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX908-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX908-NEXT: v_add_f32_e32 v24, v17, v12 -; GFX908-NEXT: v_add_f32_e32 v25, v18, v13 -; GFX908-NEXT: v_add_f32_e32 v26, 0, v12 -; GFX908-NEXT: v_add_f32_e32 v27, 0, v13 -; GFX908-NEXT: v_add_f32_e32 v15, v22, v15 -; GFX908-NEXT: v_add_f32_e32 v14, v21, v14 -; GFX908-NEXT: v_add_f32_e32 v13, v23, v13 -; GFX908-NEXT: v_add_f32_e32 v12, v20, v12 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v25 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v24 -; GFX908-NEXT: v_add_f32_e32 v7, v7, v27 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v26 +; GFX908-NEXT: v_add_f32_e32 v25, v18, v12 +; GFX908-NEXT: v_add_f32_e32 v26, v19, v13 +; GFX908-NEXT: v_add_f32_e32 v27, 0, v12 +; GFX908-NEXT: v_add_f32_e32 v28, 0, v13 +; GFX908-NEXT: v_add_f32_e32 v15, v23, v15 +; GFX908-NEXT: v_add_f32_e32 v14, v22, v14 +; GFX908-NEXT: v_add_f32_e32 v13, v24, v13 +; GFX908-NEXT: v_add_f32_e32 v12, v21, v12 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v26 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v25 +; GFX908-NEXT: v_add_f32_e32 v7, v7, v28 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v27 ; GFX908-NEXT: v_add_f32_e32 v8, v8, v14 ; GFX908-NEXT: v_add_f32_e32 v9, v9, v15 ; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 @@ -686,6 +687,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX90A-NEXT: s_sub_i32 s8, 0, s3 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, -1 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -770,15 +772,15 @@ define amdgpu_kernel void 
@introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: s_add_u32 s20, s18, s7 ; GFX90A-NEXT: s_addc_u32 s21, s19, s9 -; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc +; GFX90A-NEXT: global_load_dword v22, v19, s[20:21] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc +; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-8 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] offset:-4 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: ds_read_b64 v[14:15], v19 +; GFX90A-NEXT: ds_read_b64 v[14:15], v20 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21 @@ -786,16 +788,16 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX90A-NEXT: ; %bb.6: ; %bb51 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v21 -; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[2:3], v[14:15] -; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[14:15], 0 op_sel_hi:[1,0] +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v25, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v24, v21 +; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[2:3], v[14:15] +; GFX90A-NEXT: v_pk_add_f32 v[28:29], v[14:15], 0 op_sel_hi:[1,0] ; GFX90A-NEXT: v_pk_add_f32 v[16:17], 
v[22:23], v[16:17] -; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15] -; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25] -; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[24:25], v[14:15] +; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[28:29] ; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] ; GFX90A-NEXT: s_mov_b64 s[20:21], -1 diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index f9ffa5ae57f3e..cd38381d3520d 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -73,11 +73,12 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_mov_b32 s38, s36 ; CHECK-NEXT: s_mov_b32 s39, s36 ; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CHECK-NEXT: v_mov_b32_e32 v0, -1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:11 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:7 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:3 ; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20 ; CHECK-NEXT: v_mov_b32_e32 v0, s36 diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 98136347ab702..6de9fc050c821 100644 --- 
a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -25,8 +25,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 8, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr28_sgpr29, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc @@ -143,8 +144,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.10(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, 
$vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $vgpr2 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN renamable $vgpr8, killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: @@ -160,8 +162,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.12(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, 
$sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $vgpr2 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN renamable $vgpr6, killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: @@ -177,8 +180,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.14(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, 
$sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $vgpr2 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN renamable $vgpr4, killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: @@ -217,8 +221,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.18(0x80000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: 
renamable $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN renamable $vgpr46, killed renamable $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.18.Flow37: @@ -234,8 +239,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.20(0x80000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN renamable $vgpr62, killed renamable $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store 
(s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.20.Flow38: @@ -251,8 +257,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.22(0x80000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN renamable $vgpr60, killed renamable $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.22.Flow39: @@ -268,8 +275,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.24(0x80000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, 
$sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN renamable $vgpr58, killed renamable $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.24.Flow40: @@ -285,8 +293,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.26(0x80000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable 
$vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN renamable $vgpr56, killed renamable $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.26.Flow41: @@ -302,8 +311,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.28(0x80000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN renamable $vgpr44, killed renamable $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, 
addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.28.Flow42: @@ -327,8 +337,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.31(0x80000000) ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN renamable $vgpr40, killed renamable $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.31.Flow44: @@ -353,8 +364,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.29(0x80000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; 
GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN renamable $vgpr42, killed renamable $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.29 ; GFX90A-NEXT: {{ $}} @@ -766,7 +778,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable 
$vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 -1, implicit $exec ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) @@ -823,7 +835,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 -1, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3) @@ -933,21 +945,22 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, 
killed $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec - ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr10, killed renamable $vgpr2, killed renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) ; GFX90A-NEXT: S_BRANCH %bb.65 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, 
$sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec - ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr20, implicit $exec - ; GFX90A-NEXT: renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec - ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr10, implicit $exec - ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr36, $vgpr12, implicit $exec - ; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr50 = V_OR_B32_e32 1, $vgpr24, implicit $exec + ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr50, $vgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr38, $vgpr20, implicit $exec + ; GFX90A-NEXT: renamable $vgpr26 = V_CNDMASK_B32_e64 0, $vgpr32, 0, 0, $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr26, $vgpr18, implicit $exec + ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr36, $vgpr10, implicit $exec + ; GFX90A-NEXT: renamable $vgpr28 = V_OR_B32_e32 $vgpr34, $vgpr12, implicit $exec + ; GFX90A-NEXT: renamable $vgpr48 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr28, killed $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc @@ -967,26 +980,27 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr2, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr3, dead 
renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr27 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr29 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr33 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr10, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed 
renamable $vgpr27, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $vgpr51 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr51, implicit $exec + ; GFX90A-NEXT: renamable $vgpr33 = COPY renamable $vgpr51, implicit $exec + ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr51, implicit $exec + ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr51, implicit $exec + ; GFX90A-NEXT: renamable $vgpr27 = COPY renamable $vgpr51, implicit $exec + ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr51, implicit $exec + ; GFX90A-NEXT: renamable $vgpr29 = COPY renamable $vgpr51, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr12, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr14 = COPY killed renamable $sgpr22, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr14, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; 
GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr12, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFEN killed renamable $vgpr2, killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.Flow9: ; GFX90A-NEXT: successors: %bb.63(0x80000000) @@ -999,10 +1013,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.69(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, 
$vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr48, killed $vgpr16, implicit $exec ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec ; GFX90A-NEXT: renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 -1, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr10, killed renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.69 bb: diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index fa4e82da1d18e..004261768ede7 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -365,7 +365,7 @@ for.body: define 
amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_arg_0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v0, -1 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v0, v0 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x9 @@ -401,7 +401,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, 0 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, -1 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 ; GCN_DBG-NEXT: ds_read_u8 v0, v0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 21a2ae80574e0..90ef3737e4aa8 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -11,8 +11,8 @@ define <2 x half> @chain_hi_to_lo_private() { ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 +; GFX900-NEXT: v_mov_b32_e32 v1, -1 +; GFX900-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -21,7 +21,7 @@ define <2 x half> @chain_hi_to_lo_private() { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s0, 2 ; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 -; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: s_mov_b32 s0, -1 ; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -29,9 +29,9 @@ define <2 x half> @chain_hi_to_lo_private() { ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private: ; GFX10_DEFAULT: ; %bb.0: ; %bb ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX10_DEFAULT-NEXT: s_clause 0x1 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 -; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 +; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, -1 +; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) ; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] ; @@ -41,7 +41,7 @@ define <2 x half> @chain_hi_to_lo_private() { ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 2 ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0 +; FLATSCR_GFX10-NEXT: s_mov_b32 s0, -1 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] @@ -51,7 +51,7 @@ define <2 x half> @chain_hi_to_lo_private() { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 2 ; GFX11-NEXT: scratch_load_u16 v0, off, s0 -; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -178,8 +178,9 @@ define <2 x half> @chain_hi_to_lo_group() { ; GCN-LABEL: chain_hi_to_lo_group: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: ds_read_u16 v0, v1 offset:2 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: ds_read_u16 v0, v0 offset:2 +; GCN-NEXT: v_mov_b32_e32 v1, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_read_u16_d16_hi v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -188,8 +189,9 @@ define <2 x half> @chain_hi_to_lo_group() { ; GFX10-LABEL: chain_hi_to_lo_group: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: ds_read_u16 v0, v1 offset:2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, -1 +; GFX10-NEXT: 
ds_read_u16 v0, v0 offset:2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -198,8 +200,8 @@ define <2 x half> @chain_hi_to_lo_group() { ; GFX11-LABEL: chain_hi_to_lo_group: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: ds_load_u16 v0, v1 offset:2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1 +; GFX11-NEXT: ds_load_u16 v0, v0 offset:2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_load_u16_d16_hi v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 50c9c0cb64ccd..d1cadc3131441 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -35,7 +35,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: .LBB0_3: ; %bb.outer.end ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v0, 3 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, -1 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: s_endpgm @@ -142,7 +142,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_readlane_b32 s1, v4, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, -1 ; GCN-O0-NEXT: s_mov_b32 m0, -1 ; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm @@ -204,7 +204,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 3 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, -1 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: s_endpgm @@ -332,7 +332,7 @@ define amdgpu_kernel void 
@uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_branch .LBB1_3 ; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end ; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, -1 ; GCN-O0-NEXT: s_mov_b32 m0, -1 ; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm @@ -378,9 +378,10 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB2_5 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-NEXT: v_mov_b32_e32 v4, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v1 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v1 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 ; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -391,8 +392,8 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: v_mov_b32_e32 v0, 2 -; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: .LBB2_3: ; %Flow ; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GCN-NEXT: s_cbranch_execz .LBB2_5 @@ -403,13 +404,14 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:4 ; GCN-NEXT: .LBB2_5: ; %bb.outer.end ; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 3 +; GCN-NEXT: v_mov_b32_e32 v1, -1 ; 
GCN-NEXT: s_mov_b32 m0, -1 -; GCN-NEXT: ds_write_b32 v2, v0 +; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: s_endpgm ; ; GCN-O0-LABEL: nested_if_if_else: @@ -560,7 +562,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_readlane_b32 s1, v4, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, -1 ; GCN-O0-NEXT: s_mov_b32 m0, -1 ; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm @@ -649,7 +651,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 3 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, -1 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: s_endpgm @@ -842,7 +844,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_readlane_b32 s1, v6, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, -1 ; GCN-O0-NEXT: s_mov_b32 m0, -1 ; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 9104dc68eb9b4..8dacf832d7502 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -31,7 +31,8 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_and_b64 s[6:7], exec, s[10:11] ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, -1 +; CHECK-NEXT: v_mov_b32_e32 
v1, 0 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_1: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[18:19], 0 @@ -98,8 +99,8 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_15 ; CHECK-NEXT: ; %bb.14: ; %bb15 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 -; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0 +; CHECK-NEXT: buffer_store_dword v1, v0, s[24:27], 0 offen +; CHECK-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:3 ; CHECK-NEXT: .LBB0_15: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[20:21], 0 diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll index 4aa49f2c9296d..5e95ac9b619fe 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll +++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll @@ -4,11 +4,12 @@ define amdgpu_kernel void @foo() { ; CHECK-LABEL: foo: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1 -; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0 -; CHECK-NEXT: flat_store_b64 v[0:1], v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: flat_store_b64 v[2:3], v[0:1] ; CHECK-NEXT: s_endpgm entry: br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index 6cb2d6d55ea32..1f274c9589313 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -451,7 +451,7 @@ 
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -472,7 +472,7 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll index 0c61c58ef0619..6cd704e4a08bc 100644 --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -12,7 +12,7 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias % ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, -1 ; GFX900-NEXT: ds_write_b16 v0, v2 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: v_mov_b32_e32 v0, v1 @@ -25,7 +25,7 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias % ; GFX906-NEXT: ds_read_u16 v1, v0 ; GFX906-NEXT: ds_read_u16 v0, v0 offset:16 ; GFX906-NEXT: s_mov_b32 s4, 0x5040100 -; GFX906-NEXT: v_mov_b32_e32 v2, 0 +; GFX906-NEXT: v_mov_b32_e32 v2, -1 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: ds_write_b16 v2, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) @@ -39,7 +39,7 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias % ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: 
ds_read_u16 v1, v0 ; GFX803-NEXT: ds_read_u16 v0, v0 offset:16 -; GFX803-NEXT: v_mov_b32_e32 v2, 0 +; GFX803-NEXT: v_mov_b32_e32 v2, -1 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) ; GFX803-NEXT: ds_write_b16 v2, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) @@ -55,7 +55,7 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias % ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0 offset:16 -; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, -1 ; GFX900-FLATSCR-NEXT: ds_write_b16 v0, v2 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v1 @@ -78,7 +78,7 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(ptr addrspace(3) noalias % ; GFX900-NEXT: ds_read_u16 v1, v0 offset:16 ; GFX900-NEXT: ds_read_u16 v0, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, -1 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: ds_write_b16 v2, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) @@ -92,7 +92,7 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(ptr addrspace(3) noalias % ; GFX906-NEXT: ds_read_u16 v1, v0 offset:16 ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: s_mov_b32 s4, 0x5040100 -; GFX906-NEXT: v_mov_b32_e32 v2, 0 +; GFX906-NEXT: v_mov_b32_e32 v2, -1 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: ds_write_b16 v2, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) @@ -106,7 +106,7 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(ptr addrspace(3) noalias % ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v1, v0 offset:16 ; GFX803-NEXT: ds_read_u16 v0, v0 -; GFX803-NEXT: v_mov_b32_e32 v2, 0 +; GFX803-NEXT: v_mov_b32_e32 v2, -1 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) ; GFX803-NEXT: ds_write_b16 v2, v1 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -121,7 +121,7 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(ptr addrspace(3) noalias 
% ; GFX900-FLATSCR-NEXT: ds_read_u16 v1, v0 offset:16 ; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 ; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 -; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, -1 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v1 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll index 3ef86c13e150a..680006523b9fa 100644 --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -593,7 +593,7 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, < ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: ds_read_u16 v0, v0 -; GFX900-MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v2, -1 ; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0xffff ; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-MUBUF-NEXT: ds_write_b16 v2, v0 @@ -606,7 +606,7 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, < ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_mov_b32_e32 v2, 0 +; GFX906-NEXT: v_mov_b32_e32 v2, -1 ; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: ds_write_b16 v2, v0 @@ -620,7 +620,7 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, < ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 -; GFX803-NEXT: v_mov_b32_e32 v2, 0 +; GFX803-NEXT: v_mov_b32_e32 v2, -1 ; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: ds_write_b16 v2, v0 @@ -633,7 +633,7 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, < ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 -; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, -1 ; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0xffff ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v0 @@ -656,7 +656,7 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(ptr addrspace(3) %in, < ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX900-NEXT: ds_read_u16_d16 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, -1 ; GFX900-NEXT: ds_write_b16 v0, v2 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: global_store_dword v[0:1], v1, off @@ -669,7 +669,7 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(ptr addrspace(3) %in, < ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX906-NEXT: v_mov_b32_e32 v3, 0 +; GFX906-NEXT: v_mov_b32_e32 v3, -1 ; GFX906-NEXT: ds_write_b16 v3, v2 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 @@ -684,7 +684,7 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(ptr addrspace(3) %in, < ; GFX803-NEXT: ds_read_u16 v0, v0 ; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX803-NEXT: v_mov_b32_e32 v3, 0 +; GFX803-NEXT: v_mov_b32_e32 v3, -1 ; GFX803-NEXT: ds_write_b16 v3, v2 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll index b8e74bc7db09a..9fda59ceb2675 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll @@ -12,7 +12,7 @@ define void 
@machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: v_and_b32_e32 v3, 1, v3 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: v_cmp_eq_u32_e64 s4, 1, v1 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, -1 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; CHECK-NEXT: s_xor_b32 s6, s4, -1 ; CHECK-NEXT: s_inst_prefetch 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/nullptr-lowering.ll b/llvm/test/CodeGen/AMDGPU/nullptr-lowering.ll new file mode 100644 index 0000000000000..691cfd6270869 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/nullptr-lowering.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s + +define i32 @nullptr_p0(ptr %p) { +; CHECK-LABEL: nullptr_p0: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: ; %bb.1: ; %bb.1 +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq ptr %p, null + br i1 %cmp, label %bb.0, label %bb.1 +bb.0: + ret i32 0 +bb.1: + ret i32 1 +} + +define i32 @nullptr_p1(ptr addrspace(1) %p) { +; CHECK-LABEL: nullptr_p1: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: ; %bb.1: ; %bb.1 +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq ptr addrspace(1) %p, null + br i1 %cmp, label %bb.0, label %bb.1 +bb.0: + ret i32 0 +bb.1: + ret i32 1 +} + +define i32 @nullptr_p3(ptr addrspace(3) %p) { +; 
CHECK-LABEL: nullptr_p3: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: ; %bb.1: ; %bb.1 +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq ptr addrspace(3) %p, null + br i1 %cmp, label %bb.0, label %bb.1 +bb.0: + ret i32 0 +bb.1: + ret i32 1 +} + +define i32 @nullptr_p4(ptr addrspace(4) %p) { +; CHECK-LABEL: nullptr_p4: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: ; %bb.1: ; %bb.1 +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq ptr addrspace(4) %p, null + br i1 %cmp, label %bb.0, label %bb.1 +bb.0: + ret i32 0 +bb.1: + ret i32 1 +} + +define i32 @nullptr_p5(ptr addrspace(5) %p) { +; CHECK-LABEL: nullptr_p5: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: ; %bb.1: ; %bb.1 +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq ptr addrspace(5) %p, null + br i1 %cmp, label %bb.0, label %bb.1 +bb.0: + ret i32 0 +bb.1: + ret i32 1 +} diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index a1197aeace86f..01c2e179d0c41 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ 
b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -2560,12 +2560,8 @@ entry: define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX8-LABEL: negativeoffsetnullptr: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s1, s[4:5], 0xec -; GFX8-NEXT: s_add_u32 s0, 0, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_addc_u32 s1, s1, -1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, -1 +; GFX8-NEXT: v_mov_b32_e32 v1, -1 ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2579,32 +2575,27 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX8-NEXT: ; %bb.2: ; %end ; GFX8-NEXT: s_endpgm ; -; GFX9-LABEL: negativeoffsetnullptr: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, -1, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 -; GFX9-NEXT: .LBB8_1: ; %branch -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_and_b64 s[2:3], exec, vcc -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB8_1 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_endpgm +; GFX900-LABEL: negativeoffsetnullptr: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: v_mov_b32_e32 v0, -1 +; GFX900-NEXT: v_mov_b32_e32 v1, -1 +; GFX900-NEXT: flat_load_ubyte v0, v[0:1] +; GFX900-NEXT: s_mov_b64 s[0:1], 0 +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX900-NEXT: .LBB8_1: ; %branch +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_and_b64 s[2:3], exec, vcc +; GFX900-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX900-NEXT: s_andn2_b64 
exec, exec, s[0:1] +; GFX900-NEXT: s_cbranch_execnz .LBB8_1 +; GFX900-NEXT: ; %bb.2: ; %end +; GFX900-NEXT: s_endpgm ; ; GFX10-LABEL: negativeoffsetnullptr: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX10-NEXT: s_add_u32 s0, 0, -1 -; GFX10-NEXT: s_addc_u32 s1, s1, -1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, -1 +; GFX10-NEXT: v_mov_b32_e32 v1, -1 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2618,12 +2609,26 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX10-NEXT: ; %bb.2: ; %end ; GFX10-NEXT: s_endpgm ; +; GFX90A-LABEL: negativeoffsetnullptr: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], -1, -1 +; GFX90A-NEXT: flat_load_ubyte v0, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX90A-NEXT: .LBB8_1: ; %branch +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_and_b64 s[2:3], exec, vcc +; GFX90A-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: ; %bb.2: ; %end +; GFX90A-NEXT: s_endpgm +; ; GFX11-LABEL: negativeoffsetnullptr: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX11-NEXT: v_add_co_u32 v0, s0, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, -1 +; GFX11-NEXT: v_mov_b32_e32 v1, -1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll index 6eae905278f3e..a212b00263d98 100644 --- 
a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll @@ -30,7 +30,8 @@ define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1, ptr addrspace ; CHECK-NEXT: s_cbranch_vccz .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %bb19 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: ds_write_b32 v1, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, -1 +; CHECK-NEXT: ds_write_b32 v2, v1 ; CHECK-NEXT: .LBB0_2: ; %bb20 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CHECK-NEXT: s_mov_b32 s0, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll b/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll index ace4907670d37..4bf798e661661 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll @@ -11,7 +11,7 @@ define i32 @f() { ; CHECK-LABEL: f: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, -1 ; CHECK-NEXT: ds_read_b32 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index 8f16fcf6d0890..4ad72a3cf7c1c 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -390,19 +390,19 @@ define void @func_stackrestore_null() { ; WAVE32-OPT-LABEL: func_stackrestore_null: ; WAVE32-OPT: ; %bb.0: ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-OPT-NEXT: s_mov_b32 s32, 0 +; WAVE32-OPT-NEXT: s_movk_i32 s32, 0xffe0 ; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31] ; ; WAVE64-OPT-LABEL: func_stackrestore_null: ; WAVE64-OPT: ; %bb.0: ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE64-OPT-NEXT: s_mov_b32 s32, 0 +; WAVE64-OPT-NEXT: s_movk_i32 s32, 0xffc0 ; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31] ; ; WAVE32-O0-LABEL: func_stackrestore_null: ; 
WAVE32-O0: ; %bb.0: ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-O0-NEXT: s_mov_b32 s4, 0 +; WAVE32-O0-NEXT: s_mov_b32 s4, -1 ; WAVE32-O0-NEXT: s_lshl_b32 s4, s4, 5 ; WAVE32-O0-NEXT: s_mov_b32 s32, s4 ; WAVE32-O0-NEXT: s_setpc_b64 s[30:31] @@ -410,7 +410,7 @@ define void @func_stackrestore_null() { ; WAVE64-O0-LABEL: func_stackrestore_null: ; WAVE64-O0: ; %bb.0: ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE64-O0-NEXT: s_mov_b32 s4, 0 +; WAVE64-O0-NEXT: s_mov_b32 s4, -1 ; WAVE64-O0-NEXT: s_lshl_b32 s4, s4, 6 ; WAVE64-O0-NEXT: s_mov_b32 s32, s4 ; WAVE64-O0-NEXT: s_setpc_b64 s[30:31] @@ -418,7 +418,7 @@ define void @func_stackrestore_null() { ; WAVE32-WWM-PREALLOC-LABEL: func_stackrestore_null: ; WAVE32-WWM-PREALLOC: ; %bb.0: ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s4, 0 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: s_lshl_b32 s4, s4, 5 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s4 ; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll index c0587d260c6f2..fb3cbc218edca 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll @@ -22,7 +22,7 @@ define void @tail_call_i64_inreg_uniform_in_vgpr_convergence_tokens() #0 { ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[CONVERGENCECTRL_ENTRY:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ENTRY - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 killed [[V_MOV_B32_e32_]], 
0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DS_READ_B64_gfx9_]].sub1 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DS_READ_B64_gfx9_]].sub0 diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll index 80dae9142870a..c362c66805f50 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll @@ -56,7 +56,7 @@ define void @tail_call_i64_inreg_uniform_in_vgpr() { ; CHECK-LABEL: tail_call_i64_inreg_uniform_in_vgpr: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, -1 ; CHECK-NEXT: ds_read_b64 v[0:1], v0 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, void_func_i64_inreg@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll index ac449f972acb5..fd2d831f7c7f6 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll @@ -23,7 +23,7 @@ define void @tail_call_uniform_vgpr_value_convergence_tokens() #0 { ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[CONVERGENCECTRL_ENTRY:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ENTRY - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; 
CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DS_READ_B64_gfx9_]].sub1 ; CHECK-NEXT: CONVERGENCECTRL_GLUE [[CONVERGENCECTRL_ENTRY]] diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll index b5a68720dc19f..799e58f874445 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll @@ -9,7 +9,7 @@ define void @tail_call_uniform_vgpr_value() { ; CHECK-LABEL: tail_call_uniform_vgpr_value: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, -1 ; CHECK-NEXT: ds_read_b64 v[0:1], v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_readfirstlane_b32 s17, v1 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index dd78c2f46dde8..14fe9fda0cdaf 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -78,6 +78,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b32 s71, s15 ; GLOBALNESS1-NEXT: s_mov_b32 s72, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] +; GLOBALNESS1-NEXT: v_mov_b32_e32 v40, -1 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) @@ -113,10 +114,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0x80 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0 -; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1] -; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 -; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 ; GLOBALNESS1-NEXT: flat_load_dword v46, v[0:1] +; GLOBALNESS1-NEXT: 
s_add_u32 s8, s38, 40 +; GLOBALNESS1-NEXT: buffer_store_dword v42, v40, s[0:3], 0 offen +; GLOBALNESS1-NEXT: flat_load_dword v47, v[0:1] ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 @@ -180,8 +181,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 -; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v46, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -370,6 +371,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b32 s69, s15 ; GLOBALNESS0-NEXT: s_mov_b32 s70, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] +; GLOBALNESS0-NEXT: v_mov_b32_e32 v40, -1 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) @@ -405,10 +407,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0x80 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0 -; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1] -; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 -; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 ; GLOBALNESS0-NEXT: flat_load_dword v46, v[0:1] +; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS0-NEXT: buffer_store_dword v42, v40, s[0:3], 0 offen +; GLOBALNESS0-NEXT: flat_load_dword v47, v[0:1] ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, 
s4, wobble@gotpcrel32@lo+4 @@ -472,8 +474,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 -; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v46, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index d9df80ce6c1c0..54756a1581f16 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -257,41 +257,42 @@ define hidden void @blam() { ; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v45, s16, 26 +; GCN-NEXT: v_writelane_b32 v46, s16, 26 ; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v45, s30, 0 -; GCN-NEXT: v_writelane_b32 v45, s31, 1 -; GCN-NEXT: v_writelane_b32 v45, s34, 2 -; GCN-NEXT: 
v_writelane_b32 v45, s35, 3 -; GCN-NEXT: v_writelane_b32 v45, s36, 4 -; GCN-NEXT: v_writelane_b32 v45, s37, 5 -; GCN-NEXT: v_writelane_b32 v45, s38, 6 -; GCN-NEXT: v_writelane_b32 v45, s39, 7 -; GCN-NEXT: v_writelane_b32 v45, s40, 8 -; GCN-NEXT: v_writelane_b32 v45, s41, 9 -; GCN-NEXT: v_writelane_b32 v45, s42, 10 -; GCN-NEXT: v_writelane_b32 v45, s43, 11 -; GCN-NEXT: v_writelane_b32 v45, s44, 12 -; GCN-NEXT: v_writelane_b32 v45, s45, 13 -; GCN-NEXT: v_writelane_b32 v45, s46, 14 -; GCN-NEXT: v_writelane_b32 v45, s47, 15 -; GCN-NEXT: v_writelane_b32 v45, s48, 16 -; GCN-NEXT: v_writelane_b32 v45, s49, 17 -; GCN-NEXT: v_writelane_b32 v45, s50, 18 -; GCN-NEXT: v_writelane_b32 v45, s51, 19 -; GCN-NEXT: v_writelane_b32 v45, s52, 20 -; GCN-NEXT: v_writelane_b32 v45, s53, 21 -; GCN-NEXT: v_writelane_b32 v45, s54, 22 -; GCN-NEXT: v_writelane_b32 v45, s55, 23 -; GCN-NEXT: v_writelane_b32 v45, s56, 24 -; GCN-NEXT: v_writelane_b32 v45, s57, 25 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v46, s30, 0 +; GCN-NEXT: v_writelane_b32 v46, s31, 1 +; GCN-NEXT: v_writelane_b32 v46, s34, 2 +; GCN-NEXT: v_writelane_b32 v46, s35, 3 +; GCN-NEXT: v_writelane_b32 v46, s36, 4 +; GCN-NEXT: v_writelane_b32 v46, s37, 5 +; GCN-NEXT: v_writelane_b32 v46, s38, 6 +; GCN-NEXT: v_writelane_b32 v46, s39, 7 +; GCN-NEXT: v_writelane_b32 v46, s40, 8 +; GCN-NEXT: v_writelane_b32 v46, s41, 9 +; GCN-NEXT: v_writelane_b32 v46, s42, 10 +; GCN-NEXT: v_writelane_b32 v46, s43, 11 +; GCN-NEXT: v_writelane_b32 v46, s44, 12 +; GCN-NEXT: 
v_writelane_b32 v46, s45, 13 +; GCN-NEXT: v_writelane_b32 v46, s46, 14 +; GCN-NEXT: v_writelane_b32 v46, s47, 15 +; GCN-NEXT: v_writelane_b32 v46, s48, 16 +; GCN-NEXT: v_writelane_b32 v46, s49, 17 +; GCN-NEXT: v_writelane_b32 v46, s50, 18 +; GCN-NEXT: v_writelane_b32 v46, s51, 19 +; GCN-NEXT: v_writelane_b32 v46, s52, 20 +; GCN-NEXT: v_writelane_b32 v46, s53, 21 +; GCN-NEXT: v_writelane_b32 v46, s54, 22 +; GCN-NEXT: v_writelane_b32 v46, s55, 23 +; GCN-NEXT: v_writelane_b32 v46, s56, 24 +; GCN-NEXT: v_writelane_b32 v46, s57, 25 ; GCN-NEXT: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_mov_b32 s46, s15 ; GCN-NEXT: s_mov_b32 s47, s14 @@ -304,14 +305,15 @@ define hidden void @blam() { ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40 -; GCN-NEXT: flat_load_dword v43, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: flat_load_dword v43, v[0:1] ; GCN-NEXT: s_mov_b64 s[50:51], 0 +; GCN-NEXT: v_mov_b32_e32 v44, -1 ; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_f32_e64 s[52:53], 0, v43 ; GCN-NEXT: v_cmp_neq_f32_e64 s[42:43], 0, v43 -; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 +; GCN-NEXT: v_mov_b32_e32 v45, 0x7fc00000 ; GCN-NEXT: s_branch .LBB1_2 ; GCN-NEXT: .LBB1_1: ; %Flow7 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -323,7 +325,7 @@ define hidden void @blam() { ; GCN-NEXT: .LBB1_2: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: flat_load_dword v0, v[41:42] -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v42, v44, s[0:3], 0 offen ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 2, v0 @@ -362,7 +364,7 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_7 ; GCN-NEXT: ; %bb.6: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v45, v44, s[0:3], 0 offen ; GCN-NEXT: 
s_or_b64 s[8:9], s[52:53], exec ; GCN-NEXT: .LBB1_7: ; %Flow3 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -374,7 +376,7 @@ define hidden void @blam() { ; GCN-NEXT: ; %bb.8: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v43, v44, s[0:3], 0 offen ; GCN-NEXT: .LBB1_9: ; %Flow4 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] @@ -404,7 +406,7 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_15 ; GCN-NEXT: ; %bb.14: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v45, v44, s[0:3], 0 offen ; GCN-NEXT: s_or_b64 s[10:11], s[6:7], exec ; GCN-NEXT: .LBB1_15: ; %Flow6 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -422,46 +424,47 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_1 ; GCN-NEXT: ; %bb.17: ; %bb18 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v45, v44, s[0:3], 0 offen ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_branch .LBB1_1 ; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock ; GCN-NEXT: s_or_b64 exec, exec, s[50:51] -; GCN-NEXT: v_readlane_b32 s57, v45, 25 -; GCN-NEXT: v_readlane_b32 s56, v45, 24 -; GCN-NEXT: v_readlane_b32 s55, v45, 23 -; GCN-NEXT: v_readlane_b32 s54, v45, 22 -; GCN-NEXT: v_readlane_b32 s53, v45, 21 -; GCN-NEXT: v_readlane_b32 s52, v45, 20 -; GCN-NEXT: v_readlane_b32 s51, v45, 19 -; GCN-NEXT: v_readlane_b32 s50, v45, 18 -; GCN-NEXT: v_readlane_b32 s49, v45, 17 -; GCN-NEXT: v_readlane_b32 s48, v45, 16 -; GCN-NEXT: v_readlane_b32 s47, v45, 15 -; GCN-NEXT: v_readlane_b32 s46, v45, 14 -; GCN-NEXT: v_readlane_b32 s45, v45, 13 -; GCN-NEXT: v_readlane_b32 s44, v45, 12 -; GCN-NEXT: v_readlane_b32 s43, v45, 11 -; GCN-NEXT: v_readlane_b32 s42, v45, 10 -; GCN-NEXT: v_readlane_b32 s41, v45, 
9 -; GCN-NEXT: v_readlane_b32 s40, v45, 8 -; GCN-NEXT: v_readlane_b32 s39, v45, 7 -; GCN-NEXT: v_readlane_b32 s38, v45, 6 -; GCN-NEXT: v_readlane_b32 s37, v45, 5 -; GCN-NEXT: v_readlane_b32 s36, v45, 4 -; GCN-NEXT: v_readlane_b32 s35, v45, 3 -; GCN-NEXT: v_readlane_b32 s34, v45, 2 -; GCN-NEXT: v_readlane_b32 s31, v45, 1 -; GCN-NEXT: v_readlane_b32 s30, v45, 0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s57, v46, 25 +; GCN-NEXT: v_readlane_b32 s56, v46, 24 +; GCN-NEXT: v_readlane_b32 s55, v46, 23 +; GCN-NEXT: v_readlane_b32 s54, v46, 22 +; GCN-NEXT: v_readlane_b32 s53, v46, 21 +; GCN-NEXT: v_readlane_b32 s52, v46, 20 +; GCN-NEXT: v_readlane_b32 s51, v46, 19 +; GCN-NEXT: v_readlane_b32 s50, v46, 18 +; GCN-NEXT: v_readlane_b32 s49, v46, 17 +; GCN-NEXT: v_readlane_b32 s48, v46, 16 +; GCN-NEXT: v_readlane_b32 s47, v46, 15 +; GCN-NEXT: v_readlane_b32 s46, v46, 14 +; GCN-NEXT: v_readlane_b32 s45, v46, 13 +; GCN-NEXT: v_readlane_b32 s44, v46, 12 +; GCN-NEXT: v_readlane_b32 s43, v46, 11 +; GCN-NEXT: v_readlane_b32 s42, v46, 10 +; GCN-NEXT: v_readlane_b32 s41, v46, 9 +; GCN-NEXT: v_readlane_b32 s40, v46, 8 +; GCN-NEXT: v_readlane_b32 s39, v46, 7 +; GCN-NEXT: v_readlane_b32 s38, v46, 6 +; GCN-NEXT: v_readlane_b32 s37, v46, 5 +; GCN-NEXT: v_readlane_b32 s36, v46, 4 +; GCN-NEXT: v_readlane_b32 s35, v46, 3 +; GCN-NEXT: v_readlane_b32 s34, v46, 2 +; GCN-NEXT: v_readlane_b32 s31, v46, 1 +; GCN-NEXT: v_readlane_b32 s30, v46, 0 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: v_readlane_b32 s4, v45, 26 +; GCN-NEXT: v_readlane_b32 s4, v46, 26 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll index 5b40d53e0a81c..79798d5c83b3d 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @icmp_test() { ; CHECK-LABEL: icmp_test: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: v_cmp_eq_u16_e64 s[0:1], 0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, -1 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -24,7 +24,7 @@ define amdgpu_kernel void @fcmp_test(half %x, half %y) { ; CHECK-LABEL: fcmp_test: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_b32 s0, s[4:5], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, -1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -47,7 +47,7 @@ define amdgpu_kernel void @ballot_test(half %x, half %y) { ; CHECK-LABEL: ballot_test: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_b32 s0, s[4:5], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: 
v_mov_b32_e32 v2, -1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll index 6133cb4690723..8f072aff56f52 100644 --- a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll @@ -24,10 +24,11 @@ define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-NEXT: s_mov_b32 s15, 0xe00000 ; GFX906-NEXT: s_add_u32 s12, s12, s11 ; GFX906-NEXT: s_addc_u32 s13, s13, 0 -; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 -; GFX906-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:4 -; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:8 -; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:12 +; GFX906-NEXT: v_mov_b32_e32 v7, -1 +; GFX906-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:3 +; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:7 +; GFX906-NEXT: buffer_load_dword v3, v7, s[12:15], 0 offen +; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:11 ; GFX906-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1c ; GFX906-NEXT: s_mov_b32 s4, 0 @@ -43,12 +44,10 @@ define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-NEXT: s_mov_b64 s[2:3], exec ; GFX906-NEXT: ds_write_b64 v2, v[0:1] ; GFX906-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_readfirstlane_b32 s0, v3 -; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_readfirstlane_b32 s1, v4 ; GFX906-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[3:4] -; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_readfirstlane_b32 s0, v5 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_readfirstlane_b32 s1, v6 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index deab407581880..3a42593a65d96 100644 
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -3409,6 +3409,7 @@ define amdgpu_gs void @wqm_init_exec() { ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: ds_write_b32 v0, v1 ; GFX9-W64-NEXT: s_endpgm @@ -3416,20 +3417,21 @@ define amdgpu_gs void @wqm_init_exec() { ; GFX10-W32-LABEL: wqm_init_exec: ; GFX10-W32: ; %bb.0: ; %bb ; GFX10-W32-NEXT: s_mov_b32 exec_lo, -1 -; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, s0 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: s_mov_b32 s3, s0 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-W32-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v4, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-W32-NEXT: s_mov_b32 s2, s0 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GFX10-W32-NEXT: ds_write_b32 v0, v4 +; GFX10-W32-NEXT: ds_write_b32 v4, v5 ; GFX10-W32-NEXT: s_endpgm bb: call void @llvm.amdgcn.init.exec(i64 -1)