diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 05dd269d48921..c355d41d8e9a4 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4636,10 +4636,17 @@ bool CombinerHelper::matchBitfieldExtractFromSExtInReg(
   if (ShiftImm < 0 || ShiftImm + Width > Ty.getScalarSizeInBits())
     return false;
 
+  const RegisterBank *RB = getRegBank(ShiftSrc);
+
   MatchInfo = [=](MachineIRBuilder &B) {
     auto Cst1 = B.buildConstant(ExtractTy, ShiftImm);
     auto Cst2 = B.buildConstant(ExtractTy, Width);
     B.buildSbfx(Dst, ShiftSrc, Cst1, Cst2);
+
+    if (RB) {
+      MRI.setRegBank(Cst1.getReg(0), *RB);
+      MRI.setRegBank(Cst2.getReg(0), *RB);
+    }
   };
   return true;
 }
@@ -4674,10 +4681,18 @@ bool CombinerHelper::matchBitfieldExtractFromAnd(MachineInstr &MI,
     return false;
 
   uint64_t Width = APInt(Size, AndImm).countr_one();
+
+  const RegisterBank *RB = getRegBank(ShiftSrc);
+
   MatchInfo = [=](MachineIRBuilder &B) {
     auto WidthCst = B.buildConstant(ExtractTy, Width);
     auto LSBCst = B.buildConstant(ExtractTy, LSBImm);
     B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {ShiftSrc, LSBCst, WidthCst});
+
+    if (RB) {
+      MRI.setRegBank(WidthCst.getReg(0), *RB);
+      MRI.setRegBank(LSBCst.getReg(0), *RB);
+    }
   };
   return true;
 }
@@ -4724,10 +4739,17 @@ bool CombinerHelper::matchBitfieldExtractFromShr(
   const int64_t Pos = ShrAmt - ShlAmt;
   const int64_t Width = Size - ShrAmt;
 
+  const RegisterBank *RB = getRegBank(ShlSrc);
+
   MatchInfo = [=](MachineIRBuilder &B) {
     auto WidthCst = B.buildConstant(ExtractTy, Width);
     auto PosCst = B.buildConstant(ExtractTy, Pos);
     B.buildInstr(ExtrOpcode, {Dst}, {ShlSrc, PosCst, WidthCst});
+
+    if (RB) {
+      MRI.setRegBank(WidthCst.getReg(0), *RB);
+      MRI.setRegBank(PosCst.getReg(0), *RB);
+    }
   };
   return true;
 }
@@ -4782,10 +4804,17 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd(
   if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size)
     return false;
 
+  const RegisterBank *RB = getRegBank(AndSrc);
+
   MatchInfo = [=](MachineIRBuilder &B) {
     auto WidthCst = B.buildConstant(ExtractTy, Width);
     auto PosCst = B.buildConstant(ExtractTy, Pos);
     B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst});
+
+    if (RB) {
+      MRI.setRegBank(WidthCst.getReg(0), *RB);
+      MRI.setRegBank(PosCst.getReg(0), *RB);
+    }
   };
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 94e1175b06b14..96be17c487130 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -210,5 +210,5 @@ def AMDGPURegBankCombiner : GICombiner<
    fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
    identity_combines, redundant_and, constant_fold_cast_op,
    cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
-   lower_uniform_sbfx, lower_uniform_ubfx]> {
+   lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> {
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 353c09b4b0bfb..567116dab2665 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -811,16 +811,15 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ;
 ; GFX8-LABEL: s_ashr_v2i16:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_ashr_i32 s0, s0, s1
-; GFX8-NEXT: s_sext_i32_i16 s1, s2
-; GFX8-NEXT: s_ashr_i32 s1, s1, s3
-;
GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshr_b32 s2, s1, 16 +; GFX8-NEXT: s_sext_i32_i16 s3, s0 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_ashr_i32 s0, s0, s2 +; GFX8-NEXT: s_ashr_i32 s1, s3, s1 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ashr_v2i16: @@ -1014,26 +1013,24 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX8-LABEL: s_ashr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_ashr_i32 s0, s0, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s4 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_ashr_i32 s2, s2, s6 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_ashr_i32 s1, s1, s3 -; GFX8-NEXT: s_sext_i32_i16 s3, s5 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_ashr_i32 s3, s3, s7 +; GFX8-NEXT: s_lshr_b32 s4, s2, 16 +; GFX8-NEXT: s_sext_i32_i16 s6, s0 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_lshr_b32 s5, s3, 16 +; GFX8-NEXT: s_ashr_i32 s0, s0, s4 +; GFX8-NEXT: s_sext_i32_i16 s4, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_ashr_i32 s2, s6, s2 +; GFX8-NEXT: s_ashr_i32 s1, s1, s5 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_ashr_i32 s3, s4, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ashr_v4i16: @@ -1223,46 +1220,42 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX8-LABEL: s_ashr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s8 -; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_ashr_i32 s4, s4, s12 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: s_ashr_i32 s1, s1, s5 -; GFX8-NEXT: s_sext_i32_i16 s5, s9 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_ashr_i32 s5, s5, s13 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_sext_i32_i16 s12, s0 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_lshr_b32 s9, s5, 16 +; GFX8-NEXT: s_ashr_i32 s0, s0, s8 +; GFX8-NEXT: s_sext_i32_i16 s8, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_lshr_b32 s10, s6, 16 +; GFX8-NEXT: s_ashr_i32 s4, s12, s4 +; GFX8-NEXT: s_ashr_i32 s5, s8, s5 +; GFX8-NEXT: s_ashr_i32 s1, s1, s9 +; GFX8-NEXT: s_sext_i32_i16 s8, s2 +; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_lshr_b32 s14, s6, 16 -; GFX8-NEXT: s_ashr_i32 s2, s2, s6 -; GFX8-NEXT: s_sext_i32_i16 s6, s10 -; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 -; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: 
s_ashr_i32 s6, s6, s14 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_lshr_b32 s11, s7, 16 +; GFX8-NEXT: s_ashr_i32 s6, s8, s6 +; GFX8-NEXT: s_ashr_i32 s2, s2, s10 +; GFX8-NEXT: s_sext_i32_i16 s8, s3 +; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_ashr_i32 s3, s3, s7 -; GFX8-NEXT: s_sext_i32_i16 s7, s11 -; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_ashr_i32 s7, s7, s15 +; GFX8-NEXT: s_ashr_i32 s3, s3, s11 +; GFX8-NEXT: s_or_b32 s0, s4, s0 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_ashr_i32 s7, s8, s7 +; GFX8-NEXT: s_or_b32 s1, s4, s1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_or_b32 s2, s4, s2 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_lshl_b32 s3, s3, 16 +; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ashr_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index fc81e16d68e98..ad6477fd8b692 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -40,8 +40,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s2, s2, 0x7f -; GFX8-NEXT: s_and_b32 s1, s1, 0x7f -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7 @@ -70,8 +69,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7f -; GFX9-NEXT: s_and_b32 s1, s1, 0x7f -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7 @@ -99,8 +97,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX10-NEXT: s_and_b32 s2, s2, 0x7f -; GFX10-NEXT: s_and_b32 s1, s1, 0x7f -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -129,40 +126,38 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7f -; GFX11-NEXT: s_and_b32 s1, s1, 0x7f -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x60001 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 
0x4f7ffffe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u16 v1, 6, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -345,10 +340,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX8-LABEL: s_fshl_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX8-NEXT: s_and_b32 s3, s2, 7 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -356,10 +351,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX9-LABEL: s_fshl_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX9-NEXT: s_and_b32 s3, s2, 7 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -367,10 +362,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX10-LABEL: s_fshl_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX10-NEXT: s_and_b32 s3, s2, 7 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: 
s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -378,10 +373,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX11-LABEL: s_fshl_i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX11-NEXT: s_and_b32 s3, s2, 7 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -463,42 +458,17 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { } define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) { -; GFX6-LABEL: s_fshl_i8_4: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 4 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x40004 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshl_i8_4: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 4 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshl_i8_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_lshr_b32 s1, s1, 4 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshl_i8_4: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_lshr_b32 s1, s1, 4 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: s_fshl_i8_4: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 4 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x40004 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i8_4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_lshr_b32 s1, s1, 4 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x40004 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -556,42 +526,17 @@ define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) { } define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) { -; GFX6-LABEL: s_fshl_i8_5: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 5 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x50003 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshl_i8_5: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, 5 -; GFX8-NEXT: s_lshr_b32 s1, s1, 3 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshl_i8_5: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, 5 -; GFX9-NEXT: s_lshr_b32 s1, s1, 3 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshl_i8_5: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 5 -; GFX10-NEXT: s_lshr_b32 s1, s1, 3 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: s_fshl_i8_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 5 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x50003 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i8_5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_lshr_b32 s1, s1, 3 +; 
GFX11-NEXT: s_bfe_u32 s1, s1, 0x50003 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -674,23 +619,23 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX8-LABEL: s_fshl_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s1, 8 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshr_b32 s5, s2, 8 -; GFX8-NEXT: s_and_b32 s6, s2, 7 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s5, s2, 7 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s5, 7 -; GFX8-NEXT: s_and_b32 s2, s4, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 -; GFX8-NEXT: s_andn2_b32 s3, 7, s5 -; GFX8-NEXT: s_lshr_b32 s2, s2, s3 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s5 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x70001 +; GFX8-NEXT: s_lshr_b32 s4, s2, 8 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX8-NEXT: s_lshr_b32 s2, s5, s2 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s4, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_andn2_b32 s3, 7, s4 +; GFX8-NEXT: s_lshr_b32 s1, s1, s3 +; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 @@ -699,23 +644,23 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX9-LABEL: s_fshl_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s4, s1, 8 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshr_b32 s5, s2, 8 -; GFX9-NEXT: s_and_b32 s6, s2, 7 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s5, s2, 7 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s5, 7 -; GFX9-NEXT: s_and_b32 s2, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: s_andn2_b32 s3, 7, s5 -; GFX9-NEXT: s_lshr_b32 s2, s2, s3 -; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s5 +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x70001 +; GFX9-NEXT: s_lshr_b32 s4, s2, 8 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX9-NEXT: s_lshr_b32 s2, s5, s2 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s4, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_andn2_b32 s3, 7, s4 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 +; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 @@ -724,24 +669,24 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX10-LABEL: s_fshl_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_lshr_b32 s5, s2, 8 -; GFX10-NEXT: s_and_b32 s6, s2, 7 -; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_and_b32 s4, s2, 7 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10-NEXT: s_bfe_u32 s4, s1, 0x70001 
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_and_b32 s6, s5, 7 -; GFX10-NEXT: s_lshr_b32 s4, s4, 1 -; GFX10-NEXT: s_andn2_b32 s5, 7, s5 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_andn2_b32 s5, 7, s5 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s4, s4, s5 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_or_b32 s2, s3, s4 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s1, s2, 0xff +; GFX10-NEXT: s_lshr_b32 s1, s1, s5 +; GFX10-NEXT: s_lshr_b32 s2, s4, s2 +; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -749,24 +694,24 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX11-LABEL: s_fshl_v2i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 8 -; GFX11-NEXT: s_and_b32 s6, s2, 7 -; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s4, s2, 7 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-NEXT: s_lshl_b32 s0, s0, s4 +; GFX11-NEXT: s_bfe_u32 s4, s1, 0x70001 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_and_b32 s6, s5, 7 -; GFX11-NEXT: s_lshr_b32 s4, s4, 1 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s4, s4, s5 -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_or_b32 s2, s3, s4 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshr_b32 s1, s1, s5 +; GFX11-NEXT: s_lshr_b32 s2, s4, s2 +; GFX11-NEXT: s_or_b32 s1, s3, s1 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -974,47 +919,47 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX8-LABEL: s_fshl_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s6, s1, 8 -; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_lshr_b32 s8, s1, 24 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshr_b32 s9, s2, 8 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s11, s2, 24 -; GFX8-NEXT: s_and_b32 s12, s2, 7 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s10, s2, 7 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s9, 7 -; GFX8-NEXT: s_and_b32 s2, s6, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 -; GFX8-NEXT: s_andn2_b32 s3, 7, s9 -; GFX8-NEXT: s_lshr_b32 s2, s2, s3 -; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s10, 7 -; GFX8-NEXT: s_and_b32 s3, s7, 0xff -; GFX8-NEXT: s_lshl_b32 s2, s4, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s10 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x70001 +; GFX8-NEXT: s_lshr_b32 s7, s2, 8 +; GFX8-NEXT: s_lshr_b32 s8, s2, 
16 +; GFX8-NEXT: s_lshr_b32 s9, s2, 24 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX8-NEXT: s_lshr_b32 s2, s10, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s7, 7 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 +; GFX8-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_andn2_b32 s4, 7, s10 -; GFX8-NEXT: s_lshr_b32 s3, s3, s4 +; GFX8-NEXT: s_andn2_b32 s7, 7, s7 +; GFX8-NEXT: s_lshr_b32 s6, s1, 24 +; GFX8-NEXT: s_lshr_b32 s3, s3, s7 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s3, s8, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s3, s4, s3 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_andn2_b32 s4, 7, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, s4 +; GFX8-NEXT: s_or_b32 s1, s3, s1 +; GFX8-NEXT: s_and_b32 s3, s9, 7 ; GFX8-NEXT: s_lshl_b32 s3, s5, s3 -; GFX8-NEXT: s_lshr_b32 s4, s8, 1 -; GFX8-NEXT: s_andn2_b32 s5, 7, s11 -; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshr_b32 s4, s6, 1 +; GFX8-NEXT: s_andn2_b32 s5, 7, s9 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff ; GFX8-NEXT: s_lshr_b32 s4, s4, s5 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff @@ -1024,47 +969,47 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX9-LABEL: s_fshl_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s6, s1, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_lshr_b32 s8, s1, 24 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshr_b32 s9, s2, 8 -; GFX9-NEXT: s_lshr_b32 s10, s2, 16 -; GFX9-NEXT: s_lshr_b32 s11, s2, 24 -; GFX9-NEXT: s_and_b32 s12, s2, 7 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s10, s2, 7 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_lshl_b32 s0, s0, s12 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s9, 7 -; GFX9-NEXT: s_and_b32 s2, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: s_andn2_b32 s3, 7, s9 -; GFX9-NEXT: s_lshr_b32 s2, s2, s3 -; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s10, 7 -; GFX9-NEXT: s_and_b32 s3, s7, 0xff -; GFX9-NEXT: s_lshl_b32 s2, s4, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s10 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x70001 +; GFX9-NEXT: s_lshr_b32 s7, s2, 8 +; GFX9-NEXT: s_lshr_b32 s8, s2, 16 +; GFX9-NEXT: s_lshr_b32 s9, s2, 24 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX9-NEXT: s_lshr_b32 s2, s10, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s7, 7 +; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_andn2_b32 s4, 7, s10 -; GFX9-NEXT: s_lshr_b32 s3, s3, s4 +; GFX9-NEXT: s_andn2_b32 s7, 7, s7 +; GFX9-NEXT: s_lshr_b32 s6, s1, 24 +; GFX9-NEXT: s_lshr_b32 s3, s3, s7 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s2, s2, s3 -; GFX9-NEXT: 
s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s3, s8, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_andn2_b32 s4, 7, s8 +; GFX9-NEXT: s_lshr_b32 s1, s1, s4 +; GFX9-NEXT: s_or_b32 s1, s3, s1 +; GFX9-NEXT: s_and_b32 s3, s9, 7 ; GFX9-NEXT: s_lshl_b32 s3, s5, s3 -; GFX9-NEXT: s_lshr_b32 s4, s8, 1 -; GFX9-NEXT: s_andn2_b32 s5, 7, s11 -; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s4, s6, 1 +; GFX9-NEXT: s_andn2_b32 s5, 7, s9 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, 0xff +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s3, 0xff @@ -1074,48 +1019,48 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX10-LABEL: s_fshl_v4i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshr_b32 s9, s2, 8 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_bfe_u32 s11, s1, 0x70001 +; GFX10-NEXT: s_lshr_b32 s7, s2, 8 +; GFX10-NEXT: s_lshr_b32 s8, s2, 16 +; GFX10-NEXT: s_lshr_b32 s9, s2, 24 +; GFX10-NEXT: s_and_b32 s10, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s6, 0xff -; GFX10-NEXT: s_and_b32 s6, s9, 7 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s2, s2, s9 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s7, 0xff -; GFX10-NEXT: s_and_b32 s3, s10, 7 +; GFX10-NEXT: s_lshl_b32 s0, s0, s10 +; GFX10-NEXT: s_lshr_b32 s2, s11, s2 +; GFX10-NEXT: s_bfe_u32 s10, s1, 0x80008 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s10 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s10, s7, 7 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_andn2_b32 s6, 7, s10 +; GFX10-NEXT: s_andn2_b32 s7, 7, s7 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s3, s3, s10 +; GFX10-NEXT: s_lshr_b32 s2, s2, s7 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s3, s8, 7 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_andn2_b32 s7, 7, s8 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s4, s11, 7 -; GFX10-NEXT: s_lshr_b32 s6, s8, 1 -; GFX10-NEXT: s_andn2_b32 s7, 7, s11 +; GFX10-NEXT: s_lshr_b32 s1, s1, s7 +; GFX10-NEXT: s_and_b32 s4, s9, 7 +; GFX10-NEXT: s_lshr_b32 s6, s6, 1 +; GFX10-NEXT: s_andn2_b32 s7, 7, s9 ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 ; GFX10-NEXT: s_lshr_b32 s5, s6, s7 -; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: 
s_or_b32 s3, s4, s5 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_and_b32 s2, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 @@ -1124,48 +1069,48 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX11-LABEL: s_fshl_v4i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s1, 16 -; GFX11-NEXT: s_lshr_b32 s8, s1, 24 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s2, 8 -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_bfe_u32 s11, s1, 0x70001 +; GFX11-NEXT: s_lshr_b32 s7, s2, 8 +; GFX11-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-NEXT: s_lshr_b32 s9, s2, 24 +; GFX11-NEXT: s_and_b32 s10, s2, 7 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_and_b32 s2, s6, 0xff -; GFX11-NEXT: s_and_b32 s6, s9, 7 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s0, 24 -; GFX11-NEXT: s_lshl_b32 s0, s0, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s2, s2, s9 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_and_b32 s3, s10, 7 +; GFX11-NEXT: s_lshl_b32 s0, s0, s10 +; GFX11-NEXT: s_lshr_b32 s2, s11, s2 +; GFX11-NEXT: s_bfe_u32 s10, s1, 0x80008 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s10 +; GFX11-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-NEXT: s_and_b32 s10, s7, 7 ; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 +; GFX11-NEXT: s_and_not1_b32 s7, 7, s7 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s3, s3, s10 +; GFX11-NEXT: s_lshr_b32 s2, s2, s7 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s3, s8, 7 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_not1_b32 s7, 7, s8 ; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s2, s6 -; GFX11-NEXT: s_and_b32 s4, s11, 7 -; GFX11-NEXT: s_lshr_b32 s6, s8, 1 -; GFX11-NEXT: s_and_not1_b32 s7, 7, s11 +; GFX11-NEXT: s_lshr_b32 s1, s1, s7 +; GFX11-NEXT: s_and_b32 s4, s9, 7 +; GFX11-NEXT: s_lshr_b32 s6, s6, 1 +; GFX11-NEXT: s_and_not1_b32 s7, 7, s9 ; GFX11-NEXT: s_lshl_b32 s4, s5, s4 ; GFX11-NEXT: s_lshr_b32 s5, s6, s7 -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s1, s3, s1 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_or_b32 s3, s4, s5 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s2, s3, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_lshl_b32 s1, s2, 24 @@ -1768,63 +1713,63 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { define 
amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 inreg %amt.arg) { ; GFX6-LABEL: s_fshl_v2i24: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX6-NEXT: s_and_b32 s6, s0, 0xff +; GFX6-NEXT: s_lshl_b32 s7, s7, 8 +; GFX6-NEXT: s_or_b32 s6, s6, s7 +; GFX6-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 +; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_bfe_u32 s9, s0, 0x80008 -; GFX6-NEXT: s_lshr_b32 s6, s0, 16 -; GFX6-NEXT: s_and_b32 s8, s0, 0xff -; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: s_lshl_b32 s9, s9, 8 -; GFX6-NEXT: s_lshr_b32 s7, s1, 8 -; GFX6-NEXT: s_or_b32 s8, s8, s9 -; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_or_b32 s6, s6, s7 +; GFX6-NEXT: s_and_b32 s7, s1, 0xff ; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GFX6-NEXT: s_and_b32 s0, s7, 0xff -; GFX6-NEXT: v_not_b32_e32 v3, 23 -; GFX6-NEXT: s_or_b32 s6, s8, s6 +; GFX6-NEXT: s_bfe_u32 s0, s1, 0x80008 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 24 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80008 -; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3 +; GFX6-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_lshr_b32 s0, s2, 16 -; GFX6-NEXT: s_and_b32 s7, s2, 0xff -; GFX6-NEXT: s_lshl_b32 s8, s8, 8 -; GFX6-NEXT: s_lshr_b32 s1, s3, 8 -; GFX6-NEXT: s_or_b32 s7, s7, s8 -; GFX6-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NEXT: s_and_b32 s3, s3, 0xff +; GFX6-NEXT: s_and_b32 s0, s2, 0xff +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_bfe_u32 s1, s2, 0x80010 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: v_not_b32_e32 v3, 23 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: s_or_b32 s0, s7, s0 +; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3 +; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 24 +; GFX6-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX6-NEXT: s_bfe_u32 s2, s4, 0x80008 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 -; GFX6-NEXT: s_lshr_b32 s1, s4, 16 -; GFX6-NEXT: s_and_b32 s3, s4, 0xff -; GFX6-NEXT: s_lshl_b32 s7, s7, 8 -; GFX6-NEXT: s_or_b32 s3, s3, s7 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s1, s3, s1 +; GFX6-NEXT: s_and_b32 s1, s4, 0xff +; GFX6-NEXT: s_lshl_b32 s2, s2, 8 +; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_bfe_u32 s2, s4, 0x80010 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 
v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2 -; GFX6-NEXT: s_lshr_b32 s2, s5, 8 -; GFX6-NEXT: s_and_b32 s3, s5, 0xff +; GFX6-NEXT: s_and_b32 s2, s5, 0xff ; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_alignbit_b32 v5, s3, v5, 24 -; GFX6-NEXT: s_and_b32 s2, s2, 0xff +; GFX6-NEXT: v_alignbit_b32 v5, s2, v5, 24 +; GFX6-NEXT: s_bfe_u32 s2, s5, 0x80008 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 @@ -1879,67 +1824,67 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; ; GFX8-LABEL: s_fshl_v2i24: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s6, s0, 8 -; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX8-NEXT: s_lshr_b32 s7, s0, 16 -; GFX8-NEXT: s_lshr_b32 s8, s0, 24 -; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s7, 0xff -; GFX8-NEXT: s_lshr_b32 s9, s1, 8 -; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_and_b32 s6, s9, 0xff -; GFX8-NEXT: s_or_b32 s1, s8, s1 -; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX8-NEXT: s_lshr_b32 s6, s0, 24 +; GFX8-NEXT: s_and_b32 s7, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s8, s8, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s7, s7, s8 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX8-NEXT: s_or_b32 s1, s1, s6 -; GFX8-NEXT: s_lshr_b32 s6, s2, 8 +; GFX8-NEXT: s_or_b32 s0, s7, s0 +; GFX8-NEXT: s_and_b32 s7, s1, 0xff +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s6, s6, 0xff -; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshr_b32 s8, s2, 24 -; GFX8-NEXT: s_and_b32 s2, s2, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s6, s7, 0xff +; GFX8-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_or_b32 s6, s6, s7 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x80008 +; GFX8-NEXT: s_or_b32 s1, s6, s1 +; GFX8-NEXT: s_lshr_b32 s6, s2, 24 +; GFX8-NEXT: s_and_b32 s7, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s8, s8, 8 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX8-NEXT: v_not_b32_e32 v1, 23 -; GFX8-NEXT: s_lshr_b32 s9, s3, 8 -; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xff +; GFX8-NEXT: s_or_b32 s7, s7, s8 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, 8 -; GFX8-NEXT: s_and_b32 s6, s9, 0xff -; GFX8-NEXT: s_or_b32 s3, s8, s3 -; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_or_b32 s3, s3, s6 -; GFX8-NEXT: s_lshr_b32 s6, s4, 8 -; GFX8-NEXT: s_and_b32 s6, s6, 0xff +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s2, s7, s2 +; GFX8-NEXT: s_and_b32 s7, s3, 0xff +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX8-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_or_b32 s6, s6, s7 +; GFX8-NEXT: s_lshl_b32 s3, s3, 16 +; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80008 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX8-NEXT: s_lshr_b32 s7, s4, 16 -; GFX8-NEXT: s_lshr_b32 s8, s4, 24 -; GFX8-NEXT: 
s_and_b32 s4, s4, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_or_b32 s4, s4, s6 -; GFX8-NEXT: s_and_b32 s6, s7, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_or_b32 s4, s4, s6 +; GFX8-NEXT: s_or_b32 s3, s6, s3 +; GFX8-NEXT: s_lshr_b32 s6, s4, 24 +; GFX8-NEXT: s_and_b32 s7, s4, 0xff +; GFX8-NEXT: s_lshl_b32 s8, s8, 8 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX8-NEXT: s_or_b32 s7, s7, s8 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s4, s7, s4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 -; GFX8-NEXT: s_lshr_b32 s9, s5, 8 -; GFX8-NEXT: s_and_b32 s5, s5, 0xff -; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_and_b32 s7, s5, 0xff +; GFX8-NEXT: s_bfe_u32 s5, s5, 0x80008 +; GFX8-NEXT: s_lshl_b32 s7, s7, 8 ; GFX8-NEXT: v_mul_lo_u32 v2, v2, 24 -; GFX8-NEXT: s_and_b32 s6, s9, 0xff -; GFX8-NEXT: s_or_b32 s5, s8, s5 -; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_or_b32 s5, s5, s6 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX8-NEXT: s_or_b32 s6, s6, s7 +; GFX8-NEXT: s_lshl_b32 s5, s5, 16 +; GFX8-NEXT: s_or_b32 s5, s6, s5 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 @@ -1987,67 +1932,67 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; ; GFX9-LABEL: s_fshl_v2i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s6, s0, 8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xff ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX9-NEXT: s_lshr_b32 s7, s0, 16 -; GFX9-NEXT: s_lshr_b32 s8, s0, 24 -; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_or_b32 s0, s0, s6 -; GFX9-NEXT: s_and_b32 s6, s7, 0xff -; GFX9-NEXT: s_lshr_b32 s9, s1, 8 -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_or_b32 s0, s0, s6 -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_and_b32 s6, s9, 0xff -; GFX9-NEXT: s_or_b32 s1, s8, s1 -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: s_and_b32 s7, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: s_or_b32 s1, s1, s6 -; GFX9-NEXT: s_lshr_b32 s6, s2, 8 +; GFX9-NEXT: s_or_b32 s0, s7, s0 +; GFX9-NEXT: s_and_b32 s7, s1, 0xff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s6, s6, 0xff -; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_lshr_b32 s8, s2, 24 -; GFX9-NEXT: s_and_b32 s2, s2, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_or_b32 s2, s2, s6 -; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x80008 +; GFX9-NEXT: s_or_b32 s1, s6, s1 +; GFX9-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-NEXT: s_and_b32 s7, s2, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX9-NEXT: v_not_b32_e32 v1, 23 -; GFX9-NEXT: s_lshr_b32 s9, s3, 8 -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_and_b32 s3, s3, 0xff +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; 
GFX9-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX9-NEXT: s_or_b32 s2, s2, s6 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_and_b32 s6, s9, 0xff -; GFX9-NEXT: s_or_b32 s3, s8, s3 -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s3, s3, s6 -; GFX9-NEXT: s_lshr_b32 s6, s4, 8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s2, s7, s2 +; GFX9-NEXT: s_and_b32 s7, s3, 0xff +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_lshl_b32 s3, s3, 16 +; GFX9-NEXT: s_bfe_u32 s8, s4, 0x80008 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-NEXT: s_lshr_b32 s8, s4, 24 -; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s6, s7, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_or_b32 s3, s6, s3 +; GFX9-NEXT: s_lshr_b32 s6, s4, 24 +; GFX9-NEXT: s_and_b32 s7, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: s_or_b32 s4, s7, s4 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX9-NEXT: s_lshr_b32 s9, s5, 8 -; GFX9-NEXT: s_and_b32 s5, s5, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_and_b32 s7, s5, 0xff +; GFX9-NEXT: s_bfe_u32 s5, s5, 0x80008 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX9-NEXT: s_and_b32 s6, s9, 0xff -; GFX9-NEXT: s_or_b32 s5, s8, s5 -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s5, s6, s5 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1 @@ -2095,78 +2040,80 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-LABEL: s_fshl_v2i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX10-NEXT: s_lshr_b32 s10, s4, 8 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_and_b32 s10, s10, 0xff -; GFX10-NEXT: s_lshr_b32 s12, s4, 24 +; GFX10-NEXT: s_bfe_u32 s12, s4, 0x80008 +; GFX10-NEXT: s_lshr_b32 s10, s4, 24 +; GFX10-NEXT: s_and_b32 s11, s4, 0xff +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_and_b32 s11, s11, 0xff -; GFX10-NEXT: s_lshl_b32 s10, s10, 8 -; GFX10-NEXT: s_lshl_b32 s11, s11, 16 -; GFX10-NEXT: s_or_b32 s4, s4, s10 -; GFX10-NEXT: s_lshr_b32 s13, s5, 8 -; GFX10-NEXT: s_and_b32 s5, s5, 0xff -; GFX10-NEXT: s_or_b32 s4, s4, s11 -; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_or_b32 s11, s11, s12 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_and_b32 s11, 0xffff, s11 +; GFX10-NEXT: s_and_b32 s13, s5, 0xff +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x80008 +; GFX10-NEXT: s_or_b32 s4, s11, s4 +; GFX10-NEXT: s_lshl_b32 s13, s13, 8 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: s_and_b32 s13, s13, 0xff -; GFX10-NEXT: s_or_b32 s5, s12, s5 -; GFX10-NEXT: s_lshl_b32 s10, s13, 16 -; GFX10-NEXT: s_lshr_b32 s9, s1, 8 +; 
GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_or_b32 s10, s10, s13 +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: s_bfe_u32 s8, s0, 0x80008 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: s_or_b32 s5, s5, s10 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshr_b32 s10, s2, 8 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: s_or_b32 s5, s10, s5 +; GFX10-NEXT: s_bfe_u32 s12, s2, 0x80008 +; GFX10-NEXT: s_and_b32 s7, s0, 0xff +; GFX10-NEXT: s_lshr_b32 s10, s2, 24 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX10-NEXT: s_lshr_b32 s11, s2, 16 -; GFX10-NEXT: s_lshr_b32 s13, s3, 8 -; GFX10-NEXT: s_and_b32 s3, s3, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s10, s10, 0xff -; GFX10-NEXT: s_lshr_b32 s12, s2, 24 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_and_b32 s11, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_and_b32 s13, s3, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_or_b32 s1, s8, s1 -; GFX10-NEXT: s_lshl_b32 s8, s10, 8 -; GFX10-NEXT: s_or_b32 s3, s12, s3 -; GFX10-NEXT: s_or_b32 s2, s2, s8 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_lshr_b32 s6, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s9, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: s_and_b32 s6, s6, 0xff -; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_and_b32 s7, s7, 0xff -; GFX10-NEXT: s_and_b32 s9, s9, 0xff +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_lshl_b32 s7, s7, 16 -; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_or_b32 s6, s6, s9 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: s_or_b32 s0, s0, s7 +; GFX10-NEXT: s_or_b32 s1, s6, s1 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, 24 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0 -; GFX10-NEXT: s_and_b32 s4, s11, 0xff -; GFX10-NEXT: s_and_b32 s5, s13, 0xff -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_lshl_b32 s4, s13, 8 +; GFX10-NEXT: s_or_b32 s5, s7, s8 +; GFX10-NEXT: s_or_b32 s7, s11, s12 ; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0xffffffe8, v0 -; GFX10-NEXT: s_lshl_b32 s5, s5, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s4 -; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_or_b32 s4, s10, s4 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX10-NEXT: s_or_b32 s3, s4, s3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: s_or_b32 s2, s7, s2 ; GFX10-NEXT: s_lshr_b32 s3, s3, 1 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_or_b32 s0, s5, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 
0xffffffe8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 @@ -2178,8 +2125,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX10-NEXT: s_lshl_b32 s2, s9, 16 -; GFX10-NEXT: s_or_b32 s1, s1, s2 ; GFX10-NEXT: v_lshl_or_b32 v2, s0, v2, v3 ; GFX10-NEXT: v_lshrrev_b32_e64 v4, v4, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, 16 @@ -2200,120 +2145,118 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-LABEL: s_fshl_v2i24: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX11-NEXT: s_lshr_b32 s10, s4, 8 -; GFX11-NEXT: s_lshr_b32 s11, s4, 16 -; GFX11-NEXT: s_and_b32 s10, s10, 0xff -; GFX11-NEXT: s_lshr_b32 s12, s4, 24 +; GFX11-NEXT: s_bfe_u32 s12, s4, 0x80008 +; GFX11-NEXT: s_lshr_b32 s10, s4, 24 +; GFX11-NEXT: s_and_b32 s11, s4, 0xff +; GFX11-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s11, s11, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: s_lshl_b32 s11, s11, 16 -; GFX11-NEXT: s_or_b32 s4, s4, s10 -; GFX11-NEXT: s_lshr_b32 s13, s5, 8 -; GFX11-NEXT: s_and_b32 s5, s5, 0xff -; GFX11-NEXT: s_or_b32 s4, s4, s11 -; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_lshl_b32 s12, s12, 8 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_or_b32 s11, s11, s12 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_and_b32 s11, 0xffff, s11 +; GFX11-NEXT: s_and_b32 s13, s5, 0xff +; GFX11-NEXT: s_bfe_u32 s5, s5, 0x80008 +; GFX11-NEXT: s_or_b32 s4, s11, s4 +; GFX11-NEXT: s_lshl_b32 s13, s13, 8 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX11-NEXT: s_and_b32 s13, s13, 0xff -; GFX11-NEXT: s_or_b32 s5, s12, s5 -; GFX11-NEXT: s_lshl_b32 s10, s13, 16 -; GFX11-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_or_b32 s10, s10, s13 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_bfe_u32 s8, s0, 0x80008 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_or_b32 s5, s5, s10 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s10, s2, 8 -; GFX11-NEXT: s_lshr_b32 s8, s0, 24 +; GFX11-NEXT: s_or_b32 s5, s10, s5 +; GFX11-NEXT: s_bfe_u32 s12, s2, 0x80008 +; GFX11-NEXT: s_and_b32 s7, s0, 0xff +; GFX11-NEXT: s_lshr_b32 s10, s2, 24 ; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 -; GFX11-NEXT: s_lshr_b32 s11, s2, 16 -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s9, s9, 0xff -; GFX11-NEXT: s_and_b32 s10, s10, 0xff -; GFX11-NEXT: s_lshr_b32 s12, s2, 24 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_or_b32 s1, s8, s1 +; GFX11-NEXT: s_and_b32 s11, s2, 0xff +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX11-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-NEXT: s_lshl_b32 s12, s12, 8 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s13, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX11-NEXT: s_lshl_b32 s8, s9, 16 -; GFX11-NEXT: s_lshl_b32 s9, s10, 8 -; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s9 -; GFX11-NEXT: s_lshr_b32 s13, s3, 8 -; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX11-NEXT: s_lshr_b32 s6, s0, 24 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 +; 
GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_and_b32 s9, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_or_b32 s3, s12, s3 -; GFX11-NEXT: s_lshl_b32 s7, s7, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s6 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_or_b32 s0, s0, s7 -; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX11-NEXT: s_or_b32 s6, s6, s9 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0 -; GFX11-NEXT: s_and_b32 s4, s11, 0xff -; GFX11-NEXT: s_and_b32 s5, s13, 0xff -; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_or_b32 s7, s11, s12 +; GFX11-NEXT: s_lshl_b32 s4, s13, 8 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0 -; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX11-NEXT: s_or_b32 s4, s10, s4 +; GFX11-NEXT: s_or_b32 s2, s7, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_or_b32 s3, s3, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_or_b32 s3, s4, s3 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX11-NEXT: s_or_b32 s0, s5, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 23, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX11-NEXT: s_lshr_b32 s2, s3, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshl_or_b32 v1, s0, v1, v2 -; 
GFX11-NEXT: s_or_b32 s0, s1, s8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_or_b32 s0, s6, s1 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8 ; GFX11-NEXT: v_lshrrev_b32_e64 v3, v3, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v3 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5 ; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: ; return to shader part epilog %lhs = bitcast i48 %lhs.arg to <2 x i24> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 238cc06fc7f7c..9e3378a244270 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -463,42 +463,17 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { } define amdgpu_ps i8 @s_fshr_i8_4(i8 inreg %lhs, i8 inreg %rhs) { -; GFX6-LABEL: s_fshr_i8_4: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 4 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x40004 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshr_i8_4: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 4 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshr_i8_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_lshr_b32 s1, s1, 4 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshr_i8_4: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_lshr_b32 s1, s1, 4 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: s_fshr_i8_4: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 4 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x40004 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i8_4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_lshr_b32 s1, s1, 4 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x40004 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: 
s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -556,42 +531,17 @@ define i8 @v_fshr_i8_4(i8 %lhs, i8 %rhs) { } define amdgpu_ps i8 @s_fshr_i8_5(i8 inreg %lhs, i8 inreg %rhs) { -; GFX6-LABEL: s_fshr_i8_5: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 3 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x30005 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshr_i8_5: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: s_lshr_b32 s1, s1, 5 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshr_i8_5: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, 3 -; GFX9-NEXT: s_lshr_b32 s1, s1, 5 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshr_i8_5: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 3 -; GFX10-NEXT: s_lshr_b32 s1, s1, 5 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: s_fshr_i8_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 3 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x30005 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i8_5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s1, s1, 5 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x30005 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -676,22 +626,22 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX8-LABEL: s_fshr_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshr_b32 s4, s1, 8 -; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_andn2_b32 s6, 7, s2 +; GFX8-NEXT: s_andn2_b32 s5, 7, s2 +; GFX8-NEXT: s_lshr_b32 s4, s2, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s5 ; GFX8-NEXT: s_and_b32 s2, s2, 7 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s3, 1 -; GFX8-NEXT: s_andn2_b32 s2, 7, s5 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s5, 7 -; GFX8-NEXT: s_and_b32 s3, s4, 0xff -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s5, s1, 0xff +; GFX8-NEXT: s_lshr_b32 s2, s5, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s2, s3, 1 +; GFX8-NEXT: s_andn2_b32 s3, 7, s4 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s3, s4, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshr_b32 s1, s1, s3 +; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 @@ -701,22 +651,22 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX9-LABEL: s_fshr_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshr_b32 s4, s1, 8 -; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: s_andn2_b32 s6, 7, s2 +; GFX9-NEXT: s_andn2_b32 s5, 7, s2 +; GFX9-NEXT: s_lshr_b32 s4, s2, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, s5 ; GFX9-NEXT: s_and_b32 s2, s2, 7 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 
-; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_lshl_b32 s1, s3, 1 -; GFX9-NEXT: s_andn2_b32 s2, 7, s5 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s5, 7 -; GFX9-NEXT: s_and_b32 s3, s4, 0xff -; GFX9-NEXT: s_lshr_b32 s2, s3, s2 -; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s5, s1, 0xff +; GFX9-NEXT: s_lshr_b32 s2, s5, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s2, s3, 1 +; GFX9-NEXT: s_andn2_b32 s3, 7, s4 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s3, s4, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 +; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 @@ -726,23 +676,23 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX10-LABEL: s_fshr_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s4, s1, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_andn2_b32 s5, 7, s2 -; GFX10-NEXT: s_lshr_b32 s6, s2, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_andn2_b32 s4, 7, s2 +; GFX10-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10-NEXT: s_and_b32 s4, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 -; GFX10-NEXT: s_andn2_b32 s5, 7, s6 -; GFX10-NEXT: s_and_b32 s6, s6, 7 -; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_andn2_b32 s6, 7, s5 +; GFX10-NEXT: s_and_b32 s5, s5, 7 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_and_b32 s2, s2, 7 +; GFX10-NEXT: s_lshl_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s1, s1, s5 +; GFX10-NEXT: s_lshr_b32 s2, s4, s2 +; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s3, s3, s5 -; GFX10-NEXT: s_lshr_b32 s4, s4, s6 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_or_b32 s2, s3, s4 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -751,23 +701,23 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX11-LABEL: s_fshr_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s2 -; GFX11-NEXT: s_lshr_b32 s6, s2, 8 -; GFX11-NEXT: s_lshl_b32 s0, s0, s5 +; GFX11-NEXT: s_and_not1_b32 s4, 7, s2 +; GFX11-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, s4 +; GFX11-NEXT: s_and_b32 s4, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s6 -; GFX11-NEXT: s_and_b32 s6, s6, 7 -; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_not1_b32 s6, 7, s5 +; GFX11-NEXT: s_and_b32 s5, s5, 7 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_and_b32 s2, s2, 7 +; GFX11-NEXT: s_lshl_b32 s3, s3, s6 +; GFX11-NEXT: s_lshr_b32 s1, s1, s5 +; GFX11-NEXT: s_lshr_b32 s2, s4, s2 +; GFX11-NEXT: s_or_b32 s1, s3, s1 +; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, s5 -; GFX11-NEXT: s_lshr_b32 s4, s4, s6 -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_or_b32 s2, s3, s4 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_and_b32 s1, s2, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 ; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) @@ -980,44 +930,44 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 -; GFX8-NEXT: s_lshr_b32 s6, s1, 8 -; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_lshr_b32 s8, s1, 24 -; GFX8-NEXT: s_lshr_b32 s9, s2, 8 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s11, s2, 24 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_andn2_b32 s12, 7, s2 +; GFX8-NEXT: s_andn2_b32 s10, 7, s2 +; GFX8-NEXT: s_lshr_b32 s7, s2, 8 +; GFX8-NEXT: s_lshr_b32 s8, s2, 16 +; GFX8-NEXT: s_lshr_b32 s9, s2, 24 +; GFX8-NEXT: s_lshl_b32 s0, s0, s10 ; GFX8-NEXT: s_and_b32 s2, s2, 7 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s3, 1 -; GFX8-NEXT: s_andn2_b32 s2, 7, s9 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s9, 7 -; GFX8-NEXT: s_and_b32 s3, s6, 0xff -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 -; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s2, s4, 1 -; GFX8-NEXT: s_andn2_b32 s3, 7, s10 +; GFX8-NEXT: s_and_b32 s10, s1, 0xff +; GFX8-NEXT: s_lshr_b32 s2, s10, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s2, s3, 1 +; GFX8-NEXT: s_andn2_b32 s3, 7, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s3, s10, 7 -; GFX8-NEXT: s_and_b32 s4, s7, 0xff -; GFX8-NEXT: s_lshr_b32 s3, s4, s3 +; GFX8-NEXT: s_and_b32 s3, s7, 7 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX8-NEXT: s_lshr_b32 s3, s7, s3 +; GFX8-NEXT: s_lshr_b32 s6, s1, 24 ; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b32 s3, s4, 1 +; GFX8-NEXT: s_andn2_b32 s4, 7, s8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_lshl_b32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s4, s8, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshr_b32 s1, s1, s4 +; GFX8-NEXT: s_or_b32 s1, s3, s1 ; GFX8-NEXT: s_lshl_b32 s3, s5, 1 -; GFX8-NEXT: s_andn2_b32 s4, 7, s11 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_andn2_b32 s4, 7, s9 ; GFX8-NEXT: s_lshl_b32 s3, s3, s4 -; GFX8-NEXT: s_and_b32 s4, s11, 7 +; GFX8-NEXT: s_and_b32 s4, s9, 7 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshr_b32 s4, s8, s4 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff @@ -1030,44 +980,44 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_lshr_b32 s6, s1, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_lshr_b32 s8, s1, 24 -; GFX9-NEXT: s_lshr_b32 s9, s2, 8 -; GFX9-NEXT: s_lshr_b32 s10, s2, 16 -; GFX9-NEXT: s_lshr_b32 s11, s2, 24 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: s_andn2_b32 s12, 7, s2 +; GFX9-NEXT: s_andn2_b32 s10, 7, s2 +; GFX9-NEXT: s_lshr_b32 s7, s2, 8 +; GFX9-NEXT: s_lshr_b32 s8, s2, 16 +; GFX9-NEXT: s_lshr_b32 s9, s2, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, s10 ; GFX9-NEXT: s_and_b32 s2, s2, 7 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, 
s0, s12 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_lshl_b32 s1, s3, 1 -; GFX9-NEXT: s_andn2_b32 s2, 7, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s9, 7 -; GFX9-NEXT: s_and_b32 s3, s6, 0xff -; GFX9-NEXT: s_lshr_b32 s2, s3, s2 -; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s4, 1 -; GFX9-NEXT: s_andn2_b32 s3, 7, s10 +; GFX9-NEXT: s_and_b32 s10, s1, 0xff +; GFX9-NEXT: s_lshr_b32 s2, s10, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s2, s3, 1 +; GFX9-NEXT: s_andn2_b32 s3, 7, s7 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_and_b32 s3, s10, 7 -; GFX9-NEXT: s_and_b32 s4, s7, 0xff -; GFX9-NEXT: s_lshr_b32 s3, s4, s3 +; GFX9-NEXT: s_and_b32 s3, s7, 7 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_lshr_b32 s3, s7, s3 +; GFX9-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s3, s4, 1 +; GFX9-NEXT: s_andn2_b32 s4, 7, s8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_and_b32 s4, s8, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshr_b32 s1, s1, s4 +; GFX9-NEXT: s_or_b32 s1, s3, s1 ; GFX9-NEXT: s_lshl_b32 s3, s5, 1 -; GFX9-NEXT: s_andn2_b32 s4, 7, s11 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_andn2_b32 s4, 7, s9 ; GFX9-NEXT: s_lshl_b32 s3, s3, s4 -; GFX9-NEXT: s_and_b32 s4, s11, 7 +; GFX9-NEXT: s_and_b32 s4, s9, 7 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_lshr_b32 s4, s6, s4 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshr_b32 s4, s8, s4 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, 0xff +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s3, 0xff @@ -1077,48 +1027,48 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX10-LABEL: s_fshr_v4i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_lshr_b32 s9, s2, 8 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_andn2_b32 s12, 7, s2 +; GFX10-NEXT: s_lshr_b32 s7, s2, 8 +; GFX10-NEXT: s_lshr_b32 s8, s2, 16 +; GFX10-NEXT: s_lshr_b32 s9, s2, 24 +; GFX10-NEXT: s_andn2_b32 s10, 7, s2 ; GFX10-NEXT: s_and_b32 s2, s2, 7 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_and_b32 s11, s1, 0xff +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s3, 1 -; GFX10-NEXT: s_andn2_b32 s3, 7, s9 -; GFX10-NEXT: s_and_b32 s9, s9, 7 -; GFX10-NEXT: s_and_b32 s6, s6, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, s12 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshr_b32 s3, s6, s9 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s2, s3 -; GFX10-NEXT: s_lshl_b32 s2, s4, 1 -; GFX10-NEXT: s_andn2_b32 s3, 7, s10 -; GFX10-NEXT: s_and_b32 s4, s10, 7 -; GFX10-NEXT: s_and_b32 s6, s7, 0xff -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshr_b32 s3, s6, s4 +; GFX10-NEXT: s_lshr_b32 s2, s11, s2 +; GFX10-NEXT: s_bfe_u32 s11, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s0, s0, s10 +; GFX10-NEXT: s_lshl_b32 
s3, s3, 1 +; GFX10-NEXT: s_andn2_b32 s10, 7, s7 +; GFX10-NEXT: s_and_b32 s7, s7, 7 +; GFX10-NEXT: s_and_b32 s11, 0xffff, s11 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_lshl_b32 s3, s3, s10 +; GFX10-NEXT: s_lshr_b32 s7, s11, s7 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s2, s3, s7 +; GFX10-NEXT: s_lshl_b32 s3, s4, 1 +; GFX10-NEXT: s_andn2_b32 s4, 7, s8 +; GFX10-NEXT: s_and_b32 s7, s8, 7 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s3, s3, s4 +; GFX10-NEXT: s_lshr_b32 s1, s1, s7 ; GFX10-NEXT: s_lshl_b32 s4, s5, 1 -; GFX10-NEXT: s_andn2_b32 s5, 7, s11 -; GFX10-NEXT: s_and_b32 s6, s11, 7 +; GFX10-NEXT: s_andn2_b32 s5, 7, s9 +; GFX10-NEXT: s_and_b32 s7, s9, 7 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 -; GFX10-NEXT: s_lshr_b32 s5, s8, s6 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshr_b32 s5, s6, s7 +; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_and_b32 s2, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 @@ -1127,48 +1077,48 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX11-LABEL: s_fshr_v4i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s1, 16 -; GFX11-NEXT: s_lshr_b32 s8, s1, 24 -; GFX11-NEXT: s_lshr_b32 s9, s2, 8 -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_not1_b32 s12, 7, s2 +; GFX11-NEXT: s_lshr_b32 s7, s2, 8 +; GFX11-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-NEXT: s_lshr_b32 s9, s2, 24 +; GFX11-NEXT: s_and_not1_b32 s10, 7, s2 ; GFX11-NEXT: s_and_b32 s2, s2, 7 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_and_b32 s11, s1, 0xff +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s0, 24 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_lshl_b32 s2, s3, 1 -; GFX11-NEXT: s_and_not1_b32 s3, 7, s9 -; GFX11-NEXT: s_and_b32 s9, s9, 7 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, s12 -; GFX11-NEXT: s_lshl_b32 s2, s2, s3 -; GFX11-NEXT: s_lshr_b32 s3, s6, s9 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_lshl_b32 s2, s4, 1 -; GFX11-NEXT: s_and_not1_b32 s3, 7, s10 -; GFX11-NEXT: s_and_b32 s4, s10, 7 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, s3 -; GFX11-NEXT: s_lshr_b32 s3, s6, s4 +; GFX11-NEXT: s_lshr_b32 s2, s11, s2 +; GFX11-NEXT: s_bfe_u32 s11, s1, 0x80008 +; GFX11-NEXT: s_lshl_b32 s0, s0, s10 +; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_and_not1_b32 s10, 7, s7 +; GFX11-NEXT: s_and_b32 s7, s7, 7 +; GFX11-NEXT: s_and_b32 s11, 0xffff, s11 +; GFX11-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-NEXT: s_lshl_b32 s3, s3, s10 +; GFX11-NEXT: s_lshr_b32 s7, s11, s7 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s7 +; GFX11-NEXT: s_lshl_b32 s3, s4, 1 +; GFX11-NEXT: s_and_not1_b32 s4, 7, s8 +; GFX11-NEXT: s_and_b32 s7, s8, 7 +; GFX11-NEXT: 
s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s3, s3, s4 +; GFX11-NEXT: s_lshr_b32 s1, s1, s7 ; GFX11-NEXT: s_lshl_b32 s4, s5, 1 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s11 -; GFX11-NEXT: s_and_b32 s6, s11, 7 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s9 +; GFX11-NEXT: s_and_b32 s7, s9, 7 ; GFX11-NEXT: s_lshl_b32 s4, s4, s5 -; GFX11-NEXT: s_lshr_b32 s5, s8, s6 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshr_b32 s5, s6, s7 +; GFX11-NEXT: s_or_b32 s1, s3, s1 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_or_b32 s3, s4, s5 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s2, s3, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_lshl_b32 s1, s2, 24 @@ -1785,54 +1735,53 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_bfe_u32 s9, s0, 0x80008 -; GFX6-NEXT: v_not_b32_e32 v3, 23 -; GFX6-NEXT: s_lshr_b32 s7, s1, 8 +; GFX6-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX6-NEXT: s_and_b32 s8, s1, 0xff +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX6-NEXT: s_and_b32 s6, s0, 0xff +; GFX6-NEXT: s_lshl_b32 s7, s7, 8 +; GFX6-NEXT: v_alignbit_b32 v0, s8, v0, 24 +; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80008 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: s_and_b32 s8, s0, 0xff -; GFX6-NEXT: s_lshl_b32 s9, s9, 8 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0x80008 +; GFX6-NEXT: s_or_b32 s6, s6, s7 +; GFX6-NEXT: s_bfe_u32 s7, s0, 0x80010 +; GFX6-NEXT: s_bfe_u32 s0, s1, 0x80008 +; GFX6-NEXT: s_and_b32 s1, s2, 0xff +; GFX6-NEXT: s_lshl_b32 s8, s8, 8 +; GFX6-NEXT: s_or_b32 s1, s1, s8 +; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80010 +; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX6-NEXT: v_not_b32_e32 v3, 23 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3 -; GFX6-NEXT: s_or_b32 s8, s8, s9 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GFX6-NEXT: s_lshr_b32 s1, s2, 16 -; GFX6-NEXT: s_and_b32 s9, s2, 0xff -; GFX6-NEXT: s_lshl_b32 s10, s10, 8 -; GFX6-NEXT: s_lshr_b32 s6, s0, 16 -; GFX6-NEXT: s_and_b32 s0, s7, 0xff -; GFX6-NEXT: s_lshr_b32 s7, s3, 8 -; GFX6-NEXT: s_or_b32 s9, s9, s10 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: s_and_b32 s3, s3, 0xff +; GFX6-NEXT: s_or_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s8, s3, 0xff ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_and_b32 s9, 0xffff, s9 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24 -; GFX6-NEXT: s_and_b32 s2, s7, 0xff -; GFX6-NEXT: s_or_b32 s1, s9, s1 +; GFX6-NEXT: s_bfe_u32 s2, s3, 0x80008 +; GFX6-NEXT: v_alignbit_b32 v1, s8, v1, 24 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0x80008 ; GFX6-NEXT: v_or_b32_e32 v1, s2, v1 -; GFX6-NEXT: s_lshr_b32 s2, s4, 16 -; GFX6-NEXT: s_and_b32 s7, s4, 0xff -; GFX6-NEXT: s_lshl_b32 s9, s9, 8 -; GFX6-NEXT: s_or_b32 s7, s7, s9 -; GFX6-NEXT: s_and_b32 s2, s2, 0xff -; 
GFX6-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s2, s7, s2 +; GFX6-NEXT: s_and_b32 s2, s4, 0xff +; GFX6-NEXT: s_lshl_b32 s3, s3, 8 +; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0x80010 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX6-NEXT: s_lshr_b32 s3, s5, 8 -; GFX6-NEXT: s_and_b32 s5, s5, 0xff +; GFX6-NEXT: s_and_b32 s3, s5, 0xff ; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_alignbit_b32 v5, s5, v5, 24 -; GFX6-NEXT: s_and_b32 s3, s3, 0xff +; GFX6-NEXT: v_alignbit_b32 v5, s3, v5, 24 +; GFX6-NEXT: s_bfe_u32 s3, s5, 0x80008 +; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -1851,13 +1800,14 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v3 -; GFX6-NEXT: s_lshl_b32 s2, s6, 17 -; GFX6-NEXT: s_lshl_b32 s3, s8, 1 +; GFX6-NEXT: s_lshl_b32 s2, s7, 17 +; GFX6-NEXT: s_lshl_b32 s3, s6, 1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 @@ -1896,74 +1846,72 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_lshr_b32 s9, s1, 8 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshr_b32 s6, s0, 8 -; GFX8-NEXT: s_lshr_b32 s8, s0, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX8-NEXT: s_and_b32 s7, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s8, s8, 8 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX8-NEXT: s_and_b32 s6, s6, 0xff -; GFX8-NEXT: s_or_b32 s1, s8, s1 -; GFX8-NEXT: s_lshr_b32 s8, s2, 8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_lshr_b32 s7, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_and_b32 s8, s8, 0xff -; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s7, 0xff -; GFX8-NEXT: s_and_b32 s7, s9, 0xff -; GFX8-NEXT: s_lshr_b32 s9, s2, 16 -; GFX8-NEXT: s_lshr_b32 s10, s2, 24 -; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_or_b32 s7, s7, s8 +; GFX8-NEXT: s_and_b32 s8, s1, 0xff +; GFX8-NEXT: s_lshr_b32 s6, s0, 24 ; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: s_or_b32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s8, s9, 0xff +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x80008 +; GFX8-NEXT: s_or_b32 s6, s6, s8 +; GFX8-NEXT: s_lshr_b32 s8, s2, 24 +; GFX8-NEXT: s_and_b32 s9, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX8-NEXT: v_not_b32_e32 v1, 23 -; GFX8-NEXT: s_lshr_b32 s11, s3, 8 -; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xff +; GFX8-NEXT: s_or_b32 s9, s9, s10 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX8-NEXT: 
s_or_b32 s2, s2, s8 -; GFX8-NEXT: s_lshl_b32 s3, s3, 8 -; GFX8-NEXT: s_and_b32 s8, s11, 0xff -; GFX8-NEXT: s_or_b32 s3, s10, s3 -; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: s_or_b32 s3, s3, s8 -; GFX8-NEXT: s_lshr_b32 s8, s4, 8 -; GFX8-NEXT: s_and_b32 s8, s8, 0xff +; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s2, s9, s2 +; GFX8-NEXT: s_and_b32 s9, s3, 0xff +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX8-NEXT: s_lshl_b32 s9, s9, 8 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_or_b32 s8, s8, s9 +; GFX8-NEXT: s_lshl_b32 s3, s3, 16 +; GFX8-NEXT: s_bfe_u32 s10, s4, 0x80008 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX8-NEXT: s_lshr_b32 s9, s4, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 24 -; GFX8-NEXT: s_and_b32 s4, s4, 0xff -; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: s_and_b32 s8, s9, 0xff -; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: s_or_b32 s4, s4, s8 +; GFX8-NEXT: s_or_b32 s3, s8, s3 +; GFX8-NEXT: s_lshr_b32 s8, s4, 24 +; GFX8-NEXT: s_and_b32 s9, s4, 0xff +; GFX8-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX8-NEXT: s_or_b32 s9, s9, s10 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s4, s9, s4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 -; GFX8-NEXT: s_lshr_b32 s11, s5, 8 -; GFX8-NEXT: s_and_b32 s5, s5, 0xff -; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_and_b32 s9, s5, 0xff +; GFX8-NEXT: s_bfe_u32 s5, s5, 0x80008 +; GFX8-NEXT: s_lshl_b32 s9, s9, 8 ; GFX8-NEXT: v_mul_lo_u32 v2, v2, 24 -; GFX8-NEXT: s_and_b32 s8, s11, 0xff -; GFX8-NEXT: s_or_b32 s5, s10, s5 -; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: s_or_b32 s5, s5, s8 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX8-NEXT: s_or_b32 s8, s8, s9 +; GFX8-NEXT: s_lshl_b32 s5, s5, 16 +; GFX8-NEXT: s_or_b32 s5, s8, s5 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 -; GFX8-NEXT: s_lshl_b32 s4, s6, 17 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_or_b32 s0, s4, s0 +; GFX8-NEXT: s_lshl_b32 s0, s0, 17 +; GFX8-NEXT: s_lshl_b32 s4, s7, 1 +; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 @@ -1974,11 +1922,13 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 -; GFX8-NEXT: s_lshl_b32 s0, s7, 17 -; GFX8-NEXT: s_lshl_b32 s1, s1, 1 +; GFX8-NEXT: s_lshl_b32 s0, s1, 17 +; GFX8-NEXT: s_lshl_b32 s1, s6, 1 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 @@ 
-2004,75 +1954,73 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9: ; %bb.0: ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s9, s1, 8 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshr_b32 s6, s0, 8 -; GFX9-NEXT: s_lshr_b32 s8, s0, 24 -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX9-NEXT: s_and_b32 s7, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: s_and_b32 s6, s6, 0xff -; GFX9-NEXT: s_or_b32 s1, s8, s1 -; GFX9-NEXT: s_lshr_b32 s8, s2, 8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s7, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_and_b32 s8, s8, 0xff -; GFX9-NEXT: s_or_b32 s0, s0, s6 -; GFX9-NEXT: s_and_b32 s6, s7, 0xff -; GFX9-NEXT: s_and_b32 s7, s9, 0xff -; GFX9-NEXT: s_lshr_b32 s9, s2, 16 -; GFX9-NEXT: s_lshr_b32 s10, s2, 24 -; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s1, 0xff +; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_or_b32 s2, s2, s8 -; GFX9-NEXT: s_and_b32 s8, s9, 0xff +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x80008 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_lshr_b32 s8, s2, 24 +; GFX9-NEXT: s_and_b32 s9, s2, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX9-NEXT: v_not_b32_e32 v1, 23 -; GFX9-NEXT: s_lshr_b32 s11, s3, 8 -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_and_b32 s3, s3, 0xff +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX9-NEXT: s_or_b32 s2, s2, s8 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_and_b32 s8, s11, 0xff -; GFX9-NEXT: s_or_b32 s3, s10, s3 -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s3, s3, s8 -; GFX9-NEXT: s_lshr_b32 s8, s4, 8 -; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s2, s9, s2 +; GFX9-NEXT: s_and_b32 s9, s3, 0xff +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_lshl_b32 s3, s3, 16 +; GFX9-NEXT: s_bfe_u32 s10, s4, 0x80008 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_lshr_b32 s10, s4, 24 -; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s8 -; GFX9-NEXT: s_and_b32 s8, s9, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s8 +; GFX9-NEXT: s_or_b32 s3, s8, s3 +; GFX9-NEXT: s_lshr_b32 s8, s4, 24 +; GFX9-NEXT: s_and_b32 s9, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: s_or_b32 s4, s9, s4 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX9-NEXT: s_lshr_b32 s11, s5, 8 -; GFX9-NEXT: s_and_b32 s5, s5, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_and_b32 s8, s11, 0xff -; GFX9-NEXT: s_or_b32 s5, s10, s5 +; GFX9-NEXT: s_and_b32 s9, s5, 0xff +; GFX9-NEXT: s_bfe_u32 s5, s5, 0x80008 +; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, 
24 -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s5, s8, s5 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1 -; GFX9-NEXT: s_lshl_b32 s4, s6, 17 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 17 +; GFX9-NEXT: s_lshl_b32 s4, s7, 1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX9-NEXT: s_or_b32 s0, s4, s0 +; GFX9-NEXT: s_or_b32 s0, s0, s4 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s2 ; GFX9-NEXT: v_sub_u32_e32 v0, s5, v0 @@ -2080,12 +2028,14 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0 -; GFX9-NEXT: s_lshl_b32 s0, s7, 17 -; GFX9-NEXT: s_lshl_b32 s1, s1, 1 +; GFX9-NEXT: s_lshl_b32 s0, s1, 17 +; GFX9-NEXT: s_lshl_b32 s1, s6, 1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 @@ -2110,77 +2060,75 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-LABEL: s_fshr_v2i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX10-NEXT: s_lshr_b32 s13, s4, 8 -; GFX10-NEXT: s_lshr_b32 s14, s4, 16 -; GFX10-NEXT: s_and_b32 s13, s13, 0xff -; GFX10-NEXT: s_lshr_b32 s15, s4, 24 +; GFX10-NEXT: s_bfe_u32 s15, s4, 0x80008 +; GFX10-NEXT: s_lshr_b32 s13, s4, 24 +; GFX10-NEXT: s_and_b32 s14, s4, 0xff +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_and_b32 s14, s14, 0xff -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 -; GFX10-NEXT: s_lshl_b32 s14, s14, 16 -; GFX10-NEXT: s_or_b32 s4, s4, s13 -; GFX10-NEXT: s_lshr_b32 s16, s5, 8 -; GFX10-NEXT: s_and_b32 s5, s5, 0xff -; GFX10-NEXT: s_or_b32 s4, s4, s14 -; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_lshl_b32 s15, s15, 8 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_or_b32 s14, s14, s15 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_and_b32 s14, 0xffff, s14 +; GFX10-NEXT: s_and_b32 s16, s5, 0xff +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x80008 +; GFX10-NEXT: s_or_b32 s4, s14, s4 +; GFX10-NEXT: s_lshl_b32 s16, s16, 8 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: s_and_b32 s16, s16, 0xff -; GFX10-NEXT: s_or_b32 s5, s15, s5 -; GFX10-NEXT: s_lshl_b32 s13, s16, 16 -; GFX10-NEXT: s_lshr_b32 s10, s2, 8 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_or_b32 s13, s13, s16 +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: s_bfe_u32 s12, s2, 0x80008 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: s_or_b32 s5, s5, s13 -; GFX10-NEXT: s_lshr_b32 s9, s1, 8 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff 
-; GFX10-NEXT: s_lshr_b32 s11, s2, 16 +; GFX10-NEXT: s_or_b32 s5, s13, s5 +; GFX10-NEXT: s_and_b32 s9, s1, 0xff +; GFX10-NEXT: s_lshr_b32 s10, s2, 24 +; GFX10-NEXT: s_and_b32 s11, s2, 0xff ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX10-NEXT: s_lshr_b32 s13, s3, 8 -; GFX10-NEXT: s_and_b32 s3, s3, 0xff -; GFX10-NEXT: s_and_b32 s10, s10, 0xff -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshr_b32 s12, s2, 24 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_and_b32 s13, s3, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s6, s0, 24 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s11, s11, 0xff -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_and_b32 s13, s13, 0xff -; GFX10-NEXT: s_and_b32 s6, s6, 0xff -; GFX10-NEXT: s_or_b32 s1, s8, s1 -; GFX10-NEXT: s_or_b32 s3, s12, s3 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s13, s13, 8 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX10-NEXT: s_and_b32 s7, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: s_lshl_b32 s8, s13, 16 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX10-NEXT: s_or_b32 s7, s7, s8 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX10-NEXT: s_or_b32 s3, s3, s8 -; GFX10-NEXT: s_and_b32 s7, s7, 0xff -; GFX10-NEXT: s_and_b32 s9, s9, 0xff -; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_lshl_b32 s7, s7, 17 -; GFX10-NEXT: s_lshl_b32 s9, s9, 17 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX10-NEXT: s_lshl_b32 s0, s0, 17 +; GFX10-NEXT: s_lshl_b32 s1, s1, 17 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, 24 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_lshl_b32 s1, s1, 1 -; GFX10-NEXT: s_or_b32 s0, s7, s0 -; GFX10-NEXT: s_or_b32 s1, s9, s1 -; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0 -; GFX10-NEXT: s_lshl_b32 s4, s10, 8 -; GFX10-NEXT: s_lshl_b32 s5, s11, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_or_b32 s5, s11, s12 +; GFX10-NEXT: s_or_b32 s4, s6, s9 +; GFX10-NEXT: s_or_b32 s6, s10, s13 ; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0xffffffe8, v0 -; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_or_b32 s3, s6, s3 +; GFX10-NEXT: s_or_b32 s2, s5, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: s_lshl_b32 s5, s7, 1 +; GFX10-NEXT: s_or_b32 s0, s0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 @@ -2194,8 +2142,10 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX10-NEXT: v_lshrrev_b32_e64 
v2, v2, s2 +; GFX10-NEXT: s_lshl_b32 s2, s4, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s2 ; GFX10-NEXT: v_lshl_or_b32 v2, s0, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, 16 ; GFX10-NEXT: v_lshl_or_b32 v0, s1, v4, v0 @@ -2215,78 +2165,79 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-LABEL: s_fshr_v2i24: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX11-NEXT: s_lshr_b32 s14, s4, 8 -; GFX11-NEXT: s_lshr_b32 s15, s4, 16 -; GFX11-NEXT: s_and_b32 s14, s14, 0xff -; GFX11-NEXT: s_lshr_b32 s16, s4, 24 +; GFX11-NEXT: s_bfe_u32 s15, s4, 0x80008 +; GFX11-NEXT: s_lshr_b32 s13, s4, 24 +; GFX11-NEXT: s_and_b32 s14, s4, 0xff +; GFX11-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s15, s15, 0xff -; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: s_lshl_b32 s15, s15, 16 -; GFX11-NEXT: s_or_b32 s4, s4, s14 -; GFX11-NEXT: s_lshr_b32 s17, s5, 8 -; GFX11-NEXT: s_and_b32 s5, s5, 0xff -; GFX11-NEXT: s_or_b32 s4, s4, s15 -; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_or_b32 s14, s14, s15 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_and_b32 s14, 0xffff, s14 +; GFX11-NEXT: s_and_b32 s16, s5, 0xff +; GFX11-NEXT: s_bfe_u32 s5, s5, 0x80008 +; GFX11-NEXT: s_or_b32 s4, s14, s4 +; GFX11-NEXT: s_lshl_b32 s16, s16, 8 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX11-NEXT: s_and_b32 s17, s17, 0xff -; GFX11-NEXT: s_or_b32 s5, s16, s5 -; GFX11-NEXT: s_lshl_b32 s14, s17, 16 -; GFX11-NEXT: s_lshr_b32 s10, s2, 8 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_or_b32 s13, s13, s16 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s9, s1, 0xff ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_or_b32 s5, s5, s14 -; GFX11-NEXT: s_lshr_b32 s11, s2, 16 -; GFX11-NEXT: s_and_b32 s10, s10, 0xff -; GFX11-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-NEXT: s_or_b32 s5, s13, s5 +; GFX11-NEXT: s_bfe_u32 s12, s2, 0x80008 +; GFX11-NEXT: s_lshr_b32 s6, s0, 24 +; GFX11-NEXT: s_lshr_b32 s10, s2, 24 ; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 -; GFX11-NEXT: s_lshr_b32 s12, s2, 24 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_and_b32 s11, s11, 0xff -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 -; GFX11-NEXT: s_lshr_b32 s8, s0, 24 -; GFX11-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-NEXT: s_and_b32 s11, s2, 0xff +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_lshl_b32 s12, s12, 8 +; GFX11-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s7, s0, 0xff ; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s13, s3, 8 -; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s13, s3, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX11-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_and_b32 s13, s13, 0xff -; GFX11-NEXT: s_or_b32 
s0, s0, s6 -; GFX11-NEXT: s_or_b32 s1, s8, s1 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX11-NEXT: s_or_b32 s3, s12, s3 -; GFX11-NEXT: s_lshl_b32 s8, s13, 16 -; GFX11-NEXT: s_lshl_b32 s7, s7, 17 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_or_b32 s3, s3, s8 -; GFX11-NEXT: s_or_b32 s0, s7, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 17 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s1, s1, 17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX11-NEXT: s_and_b32 s9, s9, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 1 -; GFX11-NEXT: s_lshl_b32 s9, s9, 17 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0 -; GFX11-NEXT: s_lshl_b32 s4, s10, 8 -; GFX11-NEXT: s_lshl_b32 s5, s11, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_or_b32 s4, s6, s9 +; GFX11-NEXT: s_or_b32 s6, s11, s12 +; GFX11-NEXT: s_or_b32 s5, s10, s13 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0 -; GFX11-NEXT: s_or_b32 s2, s2, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_or_b32 s3, s5, s3 +; GFX11-NEXT: s_or_b32 s2, s6, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_lshl_b32 s5, s7, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_or_b32 s0, s0, s5 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 @@ -2299,34 +2250,35 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_lshl_b32 s2, s4, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v1, s0, v2, v1 -; GFX11-NEXT: s_or_b32 s0, s9, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_or_b32 s0, s1, s2 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshl_or_b32 v0, s0, v3, v0 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5 -; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: ; return to shader part epilog %lhs = bitcast i48 %lhs.arg to <2 x i24> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index 7ec27f47578c2..2166e1e348027 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -322,15 +322,14 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) { ; ; GFX8-LABEL: abs_sgpr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_abs_i32 s1, s1 +; GFX8-NEXT: s_sext_i32_i16 s1, s0 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 ; GFX8-NEXT: s_abs_i32 s0, s0 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_abs_i32 s1, s1 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: abs_sgpr_v2i16: @@ -392,17 +391,16 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) { ; ; GFX8-LABEL: abs_sgpr_v3i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_abs_i32 s2, s2 +; GFX8-NEXT: s_sext_i32_i16 s2, s0 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 ; GFX8-NEXT: s_abs_i32 s0, s0 +; GFX8-NEXT: s_abs_i32 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_abs_i32 s1, s1 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_abs_i32 s1, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NEXT: s_or_b32 s0, s2, s0 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 832f066adaa84..f317526e6de47 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -2831,28 +2831,27 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX8-LABEL: s_saddsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_max_i32 s5, s4, 0 -; 
GFX8-NEXT: s_min_i32 s4, s4, 0 -; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_sext_i32_i16 s3, s0 +; GFX8-NEXT: s_max_i32 s4, s3, 0 +; GFX8-NEXT: s_min_i32 s3, s3, 0 +; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 +; GFX8-NEXT: s_max_i32 s3, s3, s5 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 -; GFX8-NEXT: s_max_i32 s1, s4, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s4, s5 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_min_i32 s1, s1, s4 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s2 -; GFX8-NEXT: s_max_i32 s4, s1, 0 -; GFX8-NEXT: s_min_i32 s1, s1, 0 -; GFX8-NEXT: s_sub_i32 s1, 0x8000, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_min_i32 s3, s3, s4 +; GFX8-NEXT: s_add_i32 s0, s0, s3 +; GFX8-NEXT: s_sext_i32_i16 s3, s2 +; GFX8-NEXT: s_max_i32 s4, s3, 0 +; GFX8-NEXT: s_min_i32 s3, s3, 0 +; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 -; GFX8-NEXT: s_max_i32 s1, s1, s3 +; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_min_i32 s1, s1, s3 @@ -3187,56 +3186,54 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX8-LABEL: s_saddsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s8, s0 -; GFX8-NEXT: s_max_i32 s9, s8, 0 -; GFX8-NEXT: s_min_i32 s8, s8, 0 -; GFX8-NEXT: s_sub_i32 s8, 0x8000, s8 -; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s8, s8 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 -; GFX8-NEXT: s_max_i32 s2, s8, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s8, s9 +; GFX8-NEXT: s_sext_i32_i16 s6, s0 +; GFX8-NEXT: s_max_i32 s7, s6, 0 +; GFX8-NEXT: s_min_i32 s6, s6, 0 +; GFX8-NEXT: s_sub_i32 s6, 0x8000, s6 +; GFX8-NEXT: s_sext_i32_i16 s6, s6 +; GFX8-NEXT: s_sext_i32_i16 s8, s2 +; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7 +; GFX8-NEXT: s_max_i32 s6, s6, s8 +; GFX8-NEXT: s_sext_i32_i16 s6, s6 +; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_min_i32 s2, s2, s8 -; GFX8-NEXT: s_add_i32 s0, s0, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s4 -; GFX8-NEXT: s_max_i32 s8, s2, 0 -; GFX8-NEXT: s_min_i32 s2, s2, 0 -; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_min_i32 s6, s6, s7 +; GFX8-NEXT: s_add_i32 s0, s0, s6 +; GFX8-NEXT: s_sext_i32_i16 s6, s4 +; GFX8-NEXT: s_max_i32 s7, s6, 0 +; GFX8-NEXT: s_min_i32 s6, s6, 0 +; GFX8-NEXT: s_sub_i32 s6, 0x8000, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8 -; GFX8-NEXT: s_max_i32 s2, s2, s6 +; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 +; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7 +; GFX8-NEXT: s_max_i32 s2, s6, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s6, s8 +; GFX8-NEXT: s_sext_i32_i16 s6, s7 ; GFX8-NEXT: s_min_i32 s2, s2, s6 ; GFX8-NEXT: s_add_i32 s4, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s1 ; GFX8-NEXT: s_max_i32 s6, s2, 0 ; GFX8-NEXT: s_min_i32 s2, s2, 0 ; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_sext_i32_i16 s7, s3 ; GFX8-NEXT: s_sub_i32 s6, 
0x7fff, s6
-; GFX8-NEXT: s_max_i32 s2, s2, s3
+; GFX8-NEXT: s_max_i32 s2, s2, s7
 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s3, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16
-; GFX8-NEXT: s_min_i32 s2, s2, s3
+; GFX8-NEXT: s_min_i32 s2, s2, s6
 ; GFX8-NEXT: s_add_i32 s1, s1, s2
 ; GFX8-NEXT: s_sext_i32_i16 s2, s5
-; GFX8-NEXT: s_max_i32 s3, s2, 0
+; GFX8-NEXT: s_max_i32 s6, s2, 0
 ; GFX8-NEXT: s_min_i32 s2, s2, 0
 ; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s6, s7
-; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3
-; GFX8-NEXT: s_max_i32 s2, s2, s6
+; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
+; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
+; GFX8-NEXT: s_max_i32 s2, s2, s3
 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_sext_i32_i16 s3, s6
 ; GFX8-NEXT: s_min_i32 s2, s2, s3
 ; GFX8-NEXT: s_add_i32 s5, s5, s2
 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s4
@@ -3512,67 +3509,64 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX8-LABEL: s_saddsat_v6i16:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s12, s0
-; GFX8-NEXT: s_max_i32 s13, s12, 0
-; GFX8-NEXT: s_min_i32 s12, s12, 0
-; GFX8-NEXT: s_sub_i32 s12, 0x8000, s12
-; GFX8-NEXT: s_lshr_b32 s9, s3, 16
-; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sub_i32 s13, 0x7fff, s13
-; GFX8-NEXT: s_max_i32 s3, s12, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s12, s13
+; GFX8-NEXT: s_sext_i32_i16 s9, s0
+; GFX8-NEXT: s_max_i32 s10, s9, 0
+; GFX8-NEXT: s_min_i32 s9, s9, 0
+; GFX8-NEXT: s_sub_i32 s9, 0x8000, s9
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_sext_i32_i16 s11, s3
+; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10
+; GFX8-NEXT: s_max_i32 s9, s9, s11
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_sext_i32_i16 s10, s10
 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16
-; GFX8-NEXT: s_min_i32 s3, s3, s12
-; GFX8-NEXT: s_add_i32 s0, s0, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s6
-; GFX8-NEXT: s_max_i32 s12, s3, 0
-; GFX8-NEXT: s_min_i32 s3, s3, 0
-; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_min_i32 s9, s9, s10
+; GFX8-NEXT: s_add_i32 s0, s0, s9
+; GFX8-NEXT: s_sext_i32_i16 s9, s6
+; GFX8-NEXT: s_max_i32 s10, s9, 0
+; GFX8-NEXT: s_min_i32 s9, s9, 0
+; GFX8-NEXT: s_sub_i32 s9, 0x8000, s9
 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12
-; GFX8-NEXT: s_max_i32 s3, s3, s9
+; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
+; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10
+; GFX8-NEXT: s_max_i32 s3, s9, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s9, s12
+; GFX8-NEXT: s_sext_i32_i16 s9, s10
 ; GFX8-NEXT: s_min_i32 s3, s3, s9
 ; GFX8-NEXT: s_add_i32 s6, s6, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s1
 ; GFX8-NEXT: s_max_i32 s9, s3, 0
 ; GFX8-NEXT: s_min_i32 s3, s3, 0
 ; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
-; GFX8-NEXT: s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_sext_i32_i16 s10, s4
 ; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9
-; GFX8-NEXT: s_max_i32 s3, s3, s4
+; GFX8-NEXT: s_max_i32 s3, s3, s10
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s4, s9
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16
-; GFX8-NEXT: s_min_i32 s3, s3, s4
+; GFX8-NEXT: s_min_i32 s3, s3, s9
 ; GFX8-NEXT: s_add_i32 s1, s1, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s7
-; GFX8-NEXT: s_max_i32 s4, s3, 0
+; GFX8-NEXT: s_max_i32 s9, s3, 0
 ; GFX8-NEXT: s_min_i32 s3, s3, 0
 ; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s9, s10
-; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
-; GFX8-NEXT: s_max_i32 s3, s3, s9
+; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010
+; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9
+; GFX8-NEXT: s_max_i32 s3, s3, s4
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_sext_i32_i16 s4, s9
 ; GFX8-NEXT: s_min_i32 s3, s3, s4
 ; GFX8-NEXT: s_add_i32 s7, s7, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s2
 ; GFX8-NEXT: s_max_i32 s4, s3, 0
 ; GFX8-NEXT: s_min_i32 s3, s3, 0
 ; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
-; GFX8-NEXT: s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s9, s5
 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
-; GFX8-NEXT: s_max_i32 s3, s3, s5
+; GFX8-NEXT: s_max_i32 s3, s3, s9
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16
@@ -3583,7 +3577,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT: s_min_i32 s3, s3, 0
 ; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s5, s11
+; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010
 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
 ; GFX8-NEXT: s_max_i32 s3, s3, s5
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
@@ -3926,67 +3920,64 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX8-LABEL: s_saddsat_v8i16:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s16, s0
-; GFX8-NEXT: s_max_i32 s17, s16, 0
-; GFX8-NEXT: s_min_i32 s16, s16, 0
-; GFX8-NEXT: s_sub_i32 s16, 0x8000, s16
-; GFX8-NEXT: s_lshr_b32 s12, s4, 16
-; GFX8-NEXT: s_sext_i32_i16 s16, s16
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sub_i32 s17, 0x7fff, s17
-; GFX8-NEXT: s_max_i32 s4, s16, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s16, s17
+; GFX8-NEXT: s_sext_i32_i16 s12, s0
+; GFX8-NEXT: s_max_i32 s13, s12, 0
+; GFX8-NEXT: s_min_i32 s12, s12, 0
+; GFX8-NEXT: s_sub_i32 s12, 0x8000, s12
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_sext_i32_i16 s14, s4
+; GFX8-NEXT: s_sub_i32 s13, 0x7fff, s13
+; GFX8-NEXT: s_max_i32 s12, s12, s14
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_sext_i32_i16 s13, s13
 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16
-; GFX8-NEXT: s_min_i32 s4, s4, s16
-; GFX8-NEXT: s_add_i32 s0, s0, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s8
-; GFX8-NEXT: s_max_i32 s16, s4, 0
-; GFX8-NEXT: s_min_i32 s4, s4, 0
-; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_min_i32 s12, s12, s13
+; GFX8-NEXT: s_add_i32 s0, s0, s12
+; GFX8-NEXT: s_sext_i32_i16 s12, s8
+; GFX8-NEXT: s_max_i32 s13, s12, 0
+; GFX8-NEXT: s_min_i32 s12, s12, 0
+; GFX8-NEXT: s_sub_i32 s12, 0x8000, s12
 ; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_sub_i32 s16, 0x7fff, s16
-; GFX8-NEXT: s_max_i32 s4, s4, s12
+; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010
+; GFX8-NEXT: s_sub_i32 s13, 0x7fff, s13
+; GFX8-NEXT: s_max_i32 s4, s12, s4
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s12, s16
+; GFX8-NEXT: s_sext_i32_i16 s12, s13
 ; GFX8-NEXT: s_min_i32 s4, s4, s12
 ; GFX8-NEXT: s_add_i32 s8, s8, s4
 ; GFX8-NEXT: s_sext_i32_i16 s4, s1
 ; GFX8-NEXT: s_max_i32 s12, s4, 0
 ; GFX8-NEXT: s_min_i32 s4, s4, 0
 ; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
-; GFX8-NEXT: s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s13, s5
 ; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12
-; GFX8-NEXT: s_max_i32 s4, s4, s5
+; GFX8-NEXT: s_max_i32 s4, s4, s13
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s5, s12
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16
-; GFX8-NEXT: s_min_i32 s4, s4, s5
+; GFX8-NEXT: s_min_i32 s4, s4, s12
 ; GFX8-NEXT: s_add_i32 s1, s1, s4
 ; GFX8-NEXT: s_sext_i32_i16 s4, s9
-; GFX8-NEXT: s_max_i32 s5, s4, 0
+; GFX8-NEXT: s_max_i32 s12, s4, 0
 ; GFX8-NEXT: s_min_i32 s4, s4, 0
 ; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s12, s13
-; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
-; GFX8-NEXT: s_max_i32 s4, s4, s12
+; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010
+; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12
+; GFX8-NEXT: s_max_i32 s4, s4, s5
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s5, s12
 ; GFX8-NEXT: s_min_i32 s4, s4, s5
 ; GFX8-NEXT: s_add_i32 s9, s9, s4
 ; GFX8-NEXT: s_sext_i32_i16 s4, s2
 ; GFX8-NEXT: s_max_i32 s5, s4, 0
 ; GFX8-NEXT: s_min_i32 s4, s4, 0
 ; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
-; GFX8-NEXT: s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_sext_i32_i16 s12, s6
 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
-; GFX8-NEXT: s_max_i32 s4, s4, s6
+; GFX8-NEXT: s_max_i32 s4, s4, s12
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16
@@ -3997,7 +3988,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT: s_min_i32 s4, s4, 0
 ; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s6, s14
+; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010
 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT: s_max_i32 s4, s4, s6
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
@@ -4020,10 +4011,9 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT: s_sext_i32_i16 s4, s11
 ; GFX8-NEXT: s_max_i32 s5, s4, 0
 ; GFX8-NEXT: s_min_i32 s4, s4, 0
-; GFX8-NEXT: s_lshr_b32 s15, s7, 16
 ; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s6, s15
+; GFX8-NEXT: s_bfe_i32 s6, s7, 0x100010
 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT: s_max_i32 s4, s4, s6
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 2673ac4fb5bae..6873c9e6b9b4e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -2835,31 +2835,30 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX8-LABEL: s_ssubsat_v2i16:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s4, s0
-; GFX8-NEXT: s_max_i32 s5, s4, -1
-; GFX8-NEXT: s_add_i32 s5, s5, 0x8001
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_min_i32 s4, s4, -1
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
-; GFX8-NEXT: s_max_i32 s1, s5, s1
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
+; GFX8-NEXT: s_sext_i32_i16 s3, s0
+; GFX8-NEXT: s_max_i32 s4, s3, -1
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8001
+; GFX8-NEXT: s_min_i32 s3, s3, -1
+; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_sext_i32_i16 s5, s1
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8000
+; GFX8-NEXT: s_max_i32 s4, s4, s5
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_sext_i32_i16 s3, s3
 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_min_i32 s1, s1, s4
-; GFX8-NEXT: s_sub_i32 s0, s0, s1
-; GFX8-NEXT: s_sext_i32_i16 s1, s2
-; GFX8-NEXT: s_max_i32 s4, s1, -1
+; GFX8-NEXT: s_min_i32 s3, s4, s3
+; GFX8-NEXT: s_sub_i32 s0, s0, s3
+; GFX8-NEXT: s_sext_i32_i16 s3, s2
+; GFX8-NEXT: s_max_i32 s4, s3, -1
 ; GFX8-NEXT: s_add_i32 s4, s4, 0x8001
-; GFX8-NEXT: s_min_i32 s1, s1, -1
+; GFX8-NEXT: s_min_i32 s3, s3, -1
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_add_i32 s1, s1, 0x8000
-; GFX8-NEXT: s_max_i32 s3, s4, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8000
+; GFX8-NEXT: s_max_i32 s1, s4, s1
 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_min_i32 s1, s3, s1
+; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_min_i32 s1, s1, s3
 ; GFX8-NEXT: s_sub_i32 s1, s2, s1
 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
@@ -3193,57 +3192,55 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX8-LABEL: s_ssubsat_v4i16:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s8, s0
-; GFX8-NEXT: s_max_i32 s9, s8, -1
-; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
-; GFX8-NEXT: s_lshr_b32 s6, s2, 16
-; GFX8-NEXT: s_min_i32 s8, s8, -1
-; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
-; GFX8-NEXT: s_max_i32 s2, s9, s2
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_lshr_b32 s4, s0, 16
-; GFX8-NEXT: s_min_i32 s2, s2, s8
-; GFX8-NEXT: s_sub_i32 s0, s0, s2
-; GFX8-NEXT: s_sext_i32_i16 s2, s4
-; GFX8-NEXT: s_max_i32 s8, s2, -1
-; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
-; GFX8-NEXT: s_min_i32 s2, s2, -1
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_add_i32 s2, s2, 0x8000
-; GFX8-NEXT: s_max_i32 s6, s8, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s0
+; GFX8-NEXT: s_max_i32 s7, s6, -1
+; GFX8-NEXT: s_add_i32 s7, s7, 0x8001
+; GFX8-NEXT: s_min_i32 s6, s6, -1
+; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_sext_i32_i16 s8, s2
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
+; GFX8-NEXT: s_max_i32 s7, s7, s8
+; GFX8-NEXT: s_sext_i32_i16 s7, s7
 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NEXT: s_min_i32 s6, s7, s6
+; GFX8-NEXT: s_sub_i32 s0, s0, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s4
+; GFX8-NEXT: s_max_i32 s7, s6, -1
+; GFX8-NEXT: s_add_i32 s7, s7, 0x8001
+; GFX8-NEXT: s_min_i32 s6, s6, -1
+; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
+; GFX8-NEXT: s_max_i32 s2, s7, s2
 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_min_i32 s2, s6, s2
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_min_i32 s2, s2, s6
 ; GFX8-NEXT: s_sub_i32 s2, s4, s2
 ; GFX8-NEXT: s_sext_i32_i16 s4, s1
 ; GFX8-NEXT: s_max_i32 s6, s4, -1
 ; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
-; GFX8-NEXT: s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT: s_min_i32 s4, s4, -1
 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_sext_i32_i16 s7, s3
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
+; GFX8-NEXT: s_max_i32 s6, s6, s7
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_lshr_b32 s5, s1, 16
+; GFX8-NEXT: s_min_i32 s4, s6, s4
+; GFX8-NEXT: s_sub_i32 s1, s1, s4
+; GFX8-NEXT: s_sext_i32_i16 s4, s5
+; GFX8-NEXT: s_max_i32 s6, s4, -1
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
+; GFX8-NEXT: s_min_i32 s4, s4, -1
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
 ; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
 ; GFX8-NEXT: s_max_i32 s3, s6, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_lshr_b32 s5, s1, 16
 ; GFX8-NEXT: s_min_i32 s3, s3, s4
-; GFX8-NEXT: s_sub_i32 s1, s1, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s5
-; GFX8-NEXT: s_max_i32 s4, s3, -1
-; GFX8-NEXT: s_add_i32 s4, s4, 0x8001
-; GFX8-NEXT: s_min_i32 s3, s3, -1
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s6, s7
-; GFX8-NEXT: s_add_i32 s3, s3, 0x8000
-; GFX8-NEXT: s_max_i32 s4, s4, s6
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_min_i32 s3, s4, s3
 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT: s_sub_i32 s3, s5, s3
 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
@@ -3518,86 +3515,83 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX8-LABEL: s_ssubsat_v6i16:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s12, s0
-; GFX8-NEXT: s_max_i32 s13, s12, -1
-; GFX8-NEXT: s_add_i32 s13, s13, 0x8001
-; GFX8-NEXT: s_lshr_b32 s9, s3, 16
-; GFX8-NEXT: s_min_i32 s12, s12, -1
-; GFX8-NEXT: s_sext_i32_i16 s13, s13
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_add_i32 s12, s12, 0x8000
-; GFX8-NEXT: s_max_i32 s3, s13, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_lshr_b32 s6, s0, 16
-; GFX8-NEXT: s_min_i32 s3, s3, s12
-; GFX8-NEXT: s_sub_i32 s0, s0, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s6
-; GFX8-NEXT: s_max_i32 s12, s3, -1
-; GFX8-NEXT: s_add_i32 s12, s12, 0x8001
-; GFX8-NEXT: s_min_i32 s3, s3, -1
-; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_add_i32 s3, s3, 0x8000
-; GFX8-NEXT: s_max_i32 s9, s12, s9
+; GFX8-NEXT: s_sext_i32_i16 s9, s0
+; GFX8-NEXT: s_max_i32 s10, s9, -1
+; GFX8-NEXT: s_add_i32 s10, s10, 0x8001
+; GFX8-NEXT: s_min_i32 s9, s9, -1
+; GFX8-NEXT: s_sext_i32_i16 s10, s10
+; GFX8-NEXT: s_sext_i32_i16 s11, s3
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8000
+; GFX8-NEXT: s_max_i32 s10, s10, s11
+; GFX8-NEXT: s_sext_i32_i16 s10, s10
 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_lshr_b32 s6, s0, 16
+; GFX8-NEXT: s_min_i32 s9, s10, s9
+; GFX8-NEXT: s_sub_i32 s0, s0, s9
+; GFX8-NEXT: s_sext_i32_i16 s9, s6
+; GFX8-NEXT: s_max_i32 s10, s9, -1
+; GFX8-NEXT: s_add_i32 s10, s10, 0x8001
+; GFX8-NEXT: s_min_i32 s9, s9, -1
+; GFX8-NEXT: s_sext_i32_i16 s10, s10
+; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8000
+; GFX8-NEXT: s_max_i32 s3, s10, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_min_i32 s3, s9, s3
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_min_i32 s3, s3, s9
 ; GFX8-NEXT: s_sub_i32 s3, s6, s3
 ; GFX8-NEXT: s_sext_i32_i16 s6, s1
 ; GFX8-NEXT: s_max_i32 s9, s6, -1
 ; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
-; GFX8-NEXT: s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT: s_min_i32 s6, s6, -1
 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_sext_i32_i16 s10, s4
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
+; GFX8-NEXT: s_max_i32 s9, s9, s10
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_lshr_b32 s7, s1, 16
+; GFX8-NEXT: s_min_i32 s6, s9, s6
+; GFX8-NEXT: s_sub_i32 s1, s1, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s7
+; GFX8-NEXT: s_max_i32 s9, s6, -1
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
+; GFX8-NEXT: s_min_i32 s6, s6, -1
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010
 ; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
 ; GFX8-NEXT: s_max_i32 s4, s9, s4
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_lshr_b32 s7, s1, 16
 ; GFX8-NEXT: s_min_i32 s4, s4, s6
-; GFX8-NEXT: s_sub_i32 s1, s1, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s7
-; GFX8-NEXT: s_max_i32 s6, s4, -1
-; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
-; GFX8-NEXT: s_min_i32 s4, s4, -1
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_sext_i32_i16 s9, s10
-; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
-; GFX8-NEXT: s_max_i32 s6, s6, s9
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_min_i32 s4, s6, s4
 ; GFX8-NEXT: s_sext_i32_i16 s6, s2
 ; GFX8-NEXT: s_sub_i32 s4, s7, s4
 ; GFX8-NEXT: s_max_i32 s7, s6, -1
 ; GFX8-NEXT: s_add_i32 s7, s7, 0x8001
-; GFX8-NEXT: s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT: s_min_i32 s6, s6, -1
 ; GFX8-NEXT: s_sext_i32_i16 s7, s7
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s9, s5
 ; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
-; GFX8-NEXT: s_max_i32 s5, s7, s5
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_max_i32 s7, s7, s9
+; GFX8-NEXT: s_sext_i32_i16 s7, s7
 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16
-; GFX8-NEXT: s_min_i32 s5, s5, s6
-; GFX8-NEXT: s_sub_i32 s2, s2, s5
-; GFX8-NEXT: s_sext_i32_i16 s5, s8
-; GFX8-NEXT: s_max_i32 s6, s5, -1
-; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
-; GFX8-NEXT: s_min_i32 s5, s5, -1
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_sext_i32_i16 s7, s11
-; GFX8-NEXT: s_add_i32 s5, s5, 0x8000
-; GFX8-NEXT: s_max_i32 s6, s6, s7
+; GFX8-NEXT: s_min_i32 s6, s7, s6
+; GFX8-NEXT: s_sub_i32 s2, s2, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s8
+; GFX8-NEXT: s_max_i32 s7, s6, -1
+; GFX8-NEXT: s_add_i32 s7, s7, 0x8001
+; GFX8-NEXT: s_min_i32 s6, s6, -1
+; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
+; GFX8-NEXT: s_max_i32 s5, s7, s5
 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16
-; GFX8-NEXT: s_min_i32 s5, s6, s5
+; GFX8-NEXT: s_min_i32 s5, s5, s6
 ; GFX8-NEXT: s_or_b32 s0, s0, s3
 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
 ; GFX8-NEXT: s_sub_i32 s5, s8, s5
@@ -3932,116 +3926,112 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX8-LABEL: s_ssubsat_v8i16:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s16, s0
-; GFX8-NEXT: s_max_i32 s17, s16, -1
-; GFX8-NEXT: s_add_i32 s17, s17, 0x8001
-; GFX8-NEXT: s_lshr_b32 s12, s4, 16
-; GFX8-NEXT: s_min_i32 s16, s16, -1
-; GFX8-NEXT: s_sext_i32_i16 s17, s17
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_add_i32 s16, s16, 0x8000
-; GFX8-NEXT: s_max_i32 s4, s17, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s16, s16
-; GFX8-NEXT: s_lshr_b32 s8, s0, 16
-; GFX8-NEXT: s_min_i32 s4, s4, s16
-; GFX8-NEXT: s_sub_i32 s0, s0, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s8
-; GFX8-NEXT: s_max_i32 s16, s4, -1
-; GFX8-NEXT: s_add_i32 s16, s16, 0x8001
-; GFX8-NEXT: s_min_i32 s4, s4, -1
-; GFX8-NEXT: s_sext_i32_i16 s16, s16
-; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
-; GFX8-NEXT: s_max_i32 s12, s16, s12
+; GFX8-NEXT: s_sext_i32_i16 s12, s0
+; GFX8-NEXT: s_max_i32 s13, s12, -1
+; GFX8-NEXT: s_add_i32 s13, s13, 0x8001
+; GFX8-NEXT: s_min_i32 s12, s12, -1
+; GFX8-NEXT: s_sext_i32_i16 s13, s13
+; GFX8-NEXT: s_sext_i32_i16 s14, s4
+; GFX8-NEXT: s_add_i32 s12, s12, 0x8000
+; GFX8-NEXT: s_max_i32 s13, s13, s14
+; GFX8-NEXT: s_sext_i32_i16 s13, s13
 ; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_lshr_b32 s8, s0, 16
+; GFX8-NEXT: s_min_i32 s12, s13, s12
+; GFX8-NEXT: s_sub_i32 s0, s0, s12
+; GFX8-NEXT: s_sext_i32_i16 s12, s8
+; GFX8-NEXT: s_max_i32 s13, s12, -1
+; GFX8-NEXT: s_add_i32 s13, s13, 0x8001
+; GFX8-NEXT: s_min_i32 s12, s12, -1
+; GFX8-NEXT: s_sext_i32_i16 s13, s13
+; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010
+; GFX8-NEXT: s_add_i32 s12, s12, 0x8000
+; GFX8-NEXT: s_max_i32 s4, s13, s4
 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_min_i32 s4, s12, s4
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_min_i32 s4, s4, s12
 ; GFX8-NEXT: s_sub_i32 s4, s8, s4
 ; GFX8-NEXT: s_sext_i32_i16 s8, s1
 ; GFX8-NEXT: s_max_i32 s12, s8, -1
 ; GFX8-NEXT: s_add_i32 s12, s12, 0x8001
-; GFX8-NEXT: s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT: s_min_i32 s8, s8, -1
 ; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s13, s5
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
+; GFX8-NEXT: s_max_i32 s12, s12, s13
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_sext_i32_i16 s8, s8
+; GFX8-NEXT: s_lshr_b32 s9, s1, 16
+; GFX8-NEXT: s_min_i32 s8, s12, s8
+; GFX8-NEXT: s_sub_i32 s1, s1, s8
+; GFX8-NEXT: s_sext_i32_i16 s8, s9
+; GFX8-NEXT: s_max_i32 s12, s8, -1
+; GFX8-NEXT: s_add_i32 s12, s12, 0x8001
+; GFX8-NEXT: s_min_i32 s8, s8, -1
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010
 ; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
 ; GFX8-NEXT: s_max_i32 s5, s12, s5
 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_lshr_b32 s9, s1, 16
 ; GFX8-NEXT: s_min_i32 s5, s5, s8
-; GFX8-NEXT: s_sub_i32 s1, s1, s5
-; GFX8-NEXT: s_sext_i32_i16 s5, s9
-; GFX8-NEXT: s_max_i32 s8, s5, -1
-; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
-; GFX8-NEXT: s_min_i32 s5, s5, -1
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s12, s13
-; GFX8-NEXT: s_add_i32 s5, s5, 0x8000
-; GFX8-NEXT: s_max_i32 s8, s8, s12
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
-; GFX8-NEXT: s_min_i32 s5, s8, s5
 ; GFX8-NEXT: s_sext_i32_i16 s8, s2
 ; GFX8-NEXT: s_sub_i32 s5, s9, s5
 ; GFX8-NEXT: s_max_i32 s9, s8, -1
 ; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
-; GFX8-NEXT: s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT: s_min_i32 s8, s8, -1
 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_sext_i32_i16 s12, s6
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
+; GFX8-NEXT: s_max_i32 s9, s9, s12
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_sext_i32_i16 s8, s8
+; GFX8-NEXT: s_lshr_b32 s10, s2, 16
+; GFX8-NEXT: s_min_i32 s8, s9, s8
+; GFX8-NEXT: s_sub_i32 s2, s2, s8
+; GFX8-NEXT: s_sext_i32_i16 s8, s10
+; GFX8-NEXT: s_max_i32 s9, s8, -1
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
+; GFX8-NEXT: s_min_i32 s8, s8, -1
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010
 ; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
 ; GFX8-NEXT: s_max_i32 s6, s9, s6
 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_lshr_b32 s10, s2, 16
 ; GFX8-NEXT: s_min_i32 s6, s6, s8
-; GFX8-NEXT: s_sub_i32 s2, s2, s6
-; GFX8-NEXT: s_sext_i32_i16 s6, s10
-; GFX8-NEXT: s_max_i32 s8, s6, -1
-; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
-; GFX8-NEXT: s_min_i32 s6, s6, -1
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s9, s14
-; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
-; GFX8-NEXT: s_max_i32 s8, s8, s9
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_min_i32 s6, s8, s6
 ; GFX8-NEXT: s_sext_i32_i16 s8, s3
 ; GFX8-NEXT: s_max_i32 s9, s8, -1
 ; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
-; GFX8-NEXT: s_lshr_b32 s15, s7, 16
+; GFX8-NEXT: s_sub_i32 s6, s10, s6
 ; GFX8-NEXT: s_min_i32 s8, s8, -1
 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_sext_i32_i16 s10, s7
 ; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
-; GFX8-NEXT: s_max_i32 s7, s9, s7
-; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_max_i32 s9, s9, s10
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16
-; GFX8-NEXT: s_min_i32 s7, s7, s8
-; GFX8-NEXT: s_sub_i32 s3, s3, s7
-; GFX8-NEXT: s_sext_i32_i16 s7, s11
-; GFX8-NEXT: s_max_i32 s8, s7, -1
-; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
+; GFX8-NEXT: s_min_i32 s8, s9, s8
+; GFX8-NEXT: s_sub_i32 s3, s3, s8
+; GFX8-NEXT: s_sext_i32_i16 s8, s11
+; GFX8-NEXT: s_max_i32 s9, s8, -1
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX8-NEXT: s_min_i32 s7, s7, -1
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s9, s15
+; GFX8-NEXT: s_min_i32 s8, s8, -1
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_bfe_i32 s7, s7, 0x100010
 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_add_i32 s7, s7, 0x8000
-; GFX8-NEXT: s_max_i32 s8, s8, s9
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
+; GFX8-NEXT: s_max_i32 s7, s9, s7
 ; GFX8-NEXT: s_or_b32 s0, s0, s4
 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
-; GFX8-NEXT: s_sub_i32 s6, s10, s6
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
 ; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_sext_i32_i16 s8, s8
 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_min_i32 s7, s8, s7
+; GFX8-NEXT: s_min_i32 s7, s7, s8
 ; GFX8-NEXT: s_or_b32 s1, s1, s4
 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
 ; GFX8-NEXT: s_sub_i32 s7, s11, s7
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 6ab3022a91cd7..79c4cda2eeaef 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -742,19 +742,18 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
 ; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c
 ; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: s_lshr_b32 s3, s2, 16
-; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
-; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT: s_max_i32 s3, s3, 0
+; GISEL-VI-NEXT: s_sext_i32_i16 s3, s2
+; GISEL-VI-NEXT: s_bfe_i32 s2, s2, 0x100010
 ; GISEL-VI-NEXT: s_max_i32 s2, s2, 0
-; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
+; GISEL-VI-NEXT: s_max_i32 s3, s3, 0
 ; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT: s_min_i32 s3, s3, 0xff
+; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
 ; GISEL-VI-NEXT: s_min_i32 s2, s2, 0xff
-; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3
+; GISEL-VI-NEXT: s_min_i32 s3, s3, 0xff
 ; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2
-; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16
-; GISEL-VI-NEXT: s_or_b32 s2, s2, s3
+; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3
+; GISEL-VI-NEXT: s_lshl_b32 s2, s2, 16
+; GISEL-VI-NEXT: s_or_b32 s2, s3, s2
 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1