diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 365afc6bd8c61..4322eaef9f467 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1886,10 +1886,14 @@ multiclass PRMT { } let hasSideEffects = false in { - defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>; + // Order matters here: the signed and unsigned variants match the same + // patterns, so whichever is defined first wins. Preferring the unsigned + // extraction guarantees that the unused upper bits are zero, which makes + // some optimizations easier (e.g. no need to mask them). defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>; - defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>; + defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>; defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>; + defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>; defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>; defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>; @@ -2259,27 +2263,69 @@ def : Pat<(setueq Int1Regs:$a, Int1Regs:$b), (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; // comparisons of i8 extracted with BFE as i32 -def: Pat<(setgt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)), - (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGT)>; -def: Pat<(setge (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)), - (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGE)>; -def: Pat<(setlt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)), - (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLT)>; -def: Pat<(setle (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)), - (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLE)>; - -def: Pat<(setugt (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), - (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHI)>; -def: Pat<(setuge (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), - (SETP_u32rr 
Int32Regs:$a, Int32Regs:$b, CmpHS)>; -def: Pat<(setult (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), - (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLO)>; -def: Pat<(setule (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), - (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLS)>; -def: Pat<(seteq (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), - (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpEQ)>; -def: Pat<(setne (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), - (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpNE)>; +// It's faster to do comparison directly on i32 extracted by BFE, +// instead of the long conversion and sign extending. +def: Pat<(setgt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)), + (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))), + (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGT)>; +def: Pat<(setgt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)), + (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))), + (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGT)>; +def: Pat<(setge (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)), + (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))), + (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGE)>; +def: Pat<(setge (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)), + (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))), + (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGE)>; +def: Pat<(setlt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)), + (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))), + (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLT)>; +def: Pat<(setlt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, 
imm:$oa, 8)), i8)), + (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))), + (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLT)>; +def: Pat<(setle (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)), + (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))), + (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLE)>; +def: Pat<(setle (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)), + (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))), + (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLE)>; + +def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHI)>; +def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHI)>; +def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHS)>; +def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHS)>; +def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLO)>; +def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLO)>; +def: Pat<(setule (i16 (and (trunc (bfe 
Int32Regs:$a, Int32Regs:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLS)>; +def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLS)>; +def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpEQ)>; +def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpEQ)>; +def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpNE)>; +def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), + (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), + (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpNE)>; // i1 compare -> i32 def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 1ec68b4a271ba..6895699a1dfea 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -106,29 +106,29 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: 
cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; ; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; ; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; ; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; @@ -147,21 +147,21 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; -; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; ; CHECK-NEXT: add.s16 %rs4, %rs3, 2; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; ; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, 3; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; ; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 
24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; ; CHECK-NEXT: add.s16 %rs8, %rs7, 4; ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; @@ -180,21 +180,21 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; -; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; ; CHECK-NEXT: add.s16 %rs4, %rs3, 2; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; ; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, 3; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; ; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; ; CHECK-NEXT: add.s16 %rs8, %rs7, 4; ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; @@ -214,29 +214,29 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, 
%rs6; ; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; ; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; ; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; @@ -251,31 +251,39 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<27>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8; -; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; -; CHECK-NEXT: setp.gt.s32 %p1, %r3, %r4; -; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8; -; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; -; CHECK-NEXT: setp.gt.s32 %p2, %r5, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; -; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; -; CHECK-NEXT: setp.gt.s32 %p3, %r7, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; -; CHECK-NEXT: setp.gt.s32 %p4, %r9, %r10; -; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4; -; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; -; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2; -; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; 
+; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3; +; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; +; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5; +; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; +; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7; +; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; +; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; +; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; +; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; +; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; +; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b @@ -291,24 +299,24 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8; -; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; -; CHECK-NEXT: setp.hi.u32 %p1, %r3, %r4; -; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8; -; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; -; CHECK-NEXT: setp.hi.u32 %p2, %r5, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; -; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; -; CHECK-NEXT: setp.hi.u32 %p3, %r7, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; -; CHECK-NEXT: setp.hi.u32 %p4, %r9, %r10; -; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4; 
-; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; +; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; +; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; +; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; +; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; +; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; ; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; -; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2; +; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; ; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1; +; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; ; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; @@ -321,31 +329,39 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<27>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8; -; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; -; CHECK-NEXT: setp.le.s32 %p1, %r3, %r4; -; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8; -; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; -; CHECK-NEXT: setp.le.s32 %p2, %r5, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; -; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; -; CHECK-NEXT: setp.le.s32 %p3, %r7, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; -; CHECK-NEXT: setp.le.s32 %p4, %r9, %r10; -; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4; -; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; -; 
CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2; -; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3; +; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; +; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5; +; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; +; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7; +; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; +; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; +; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; +; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; +; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; +; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b @@ -361,24 +377,24 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8; -; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; -; CHECK-NEXT: setp.ls.u32 %p1, %r3, %r4; -; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8; -; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; -; CHECK-NEXT: setp.ls.u32 %p2, %r5, %r6; -; CHECK-NEXT: 
bfe.s32 %r7, %r1, 8, 8; -; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; -; CHECK-NEXT: setp.ls.u32 %p3, %r7, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; -; CHECK-NEXT: setp.ls.u32 %p4, %r9, %r10; -; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4; -; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; +; CHECK-NEXT: setp.ls.u32 %p2, %r6, %r5; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; +; CHECK-NEXT: setp.ls.u32 %p3, %r8, %r7; +; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; +; CHECK-NEXT: setp.ls.u32 %p4, %r10, %r9; +; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; ; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; -; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2; +; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; ; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1; +; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; ; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; @@ -397,27 +413,27 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0]; -; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; ; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r9, 
%r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; ; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; ; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.s32 %r14, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; ; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; -; CHECK-NEXT: bfe.s32 %r17, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; ; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; -; CHECK-NEXT: bfe.s32 %r20, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; ; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; ; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; @@ -437,27 +453,27 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0]; -; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; +; 
CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.s32 %r14, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; ; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; -; CHECK-NEXT: bfe.s32 %r17, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; ; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; -; CHECK-NEXT: bfe.s32 %r20, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; ; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; ; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; @@ -476,29 +492,29 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; ; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; ; CHECK-NEXT: bfi.b32 %r13, 
%r12, %r9, 16, 8; -; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; ; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; @@ -732,7 +748,7 @@ define void @test_ldst_v3i8(ptr %a, ptr %b) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: st.u16 [%rd2], %r1; -; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; ; CHECK-NEXT: st.u8 [%rd2+2], %r3; ; CHECK-NEXT: ret; %t1 = load <3 x i8>, ptr %a @@ -920,31 +936,31 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> ; CHECK-NEXT: ld.param.u32 %r3, [test_select_cc_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0]; -; CHECK-NEXT: bfe.s32 %r5, %r4, 24, 8; -; CHECK-NEXT: bfe.s32 %r6, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; +; CHECK-NEXT: bfe.u32 %r6, %r3, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r4, 16, 8; -; CHECK-NEXT: bfe.s32 %r8, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r3, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r4, 8, 8; -; CHECK-NEXT: bfe.s32 %r10, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r4, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r3, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r10, %r9; -; CHECK-NEXT: bfe.s32 %r11, %r4, 0, 8; -; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r4, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r12, %r11; -; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; ; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4; -; 
CHECK-NEXT: bfe.s32 %r16, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r17, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r16, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3; ; CHECK-NEXT: bfi.b32 %r19, %r18, %r15, 8, 8; -; CHECK-NEXT: bfe.s32 %r20, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r21, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r21, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2; ; CHECK-NEXT: bfi.b32 %r23, %r22, %r19, 16, 8; -; CHECK-NEXT: bfe.s32 %r24, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r25, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r24, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r25, %r1, 24, 8; ; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1; ; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r27; @@ -965,17 +981,17 @@ define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b, ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0]; ; CHECK-NEXT: ld.param.u32 %r10, [test_select_cc_i32_i8_param_3]; ; CHECK-NEXT: ld.param.u32 %r9, [test_select_cc_i32_i8_param_2]; -; CHECK-NEXT: bfe.s32 %r11, %r10, 0, 8; -; CHECK-NEXT: bfe.s32 %r12, %r9, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r10, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r9, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r12, %r11; -; CHECK-NEXT: bfe.s32 %r13, %r10, 8, 8; -; CHECK-NEXT: bfe.s32 %r14, %r9, 8, 8; +; CHECK-NEXT: bfe.u32 %r13, %r10, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r9, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r14, %r13; -; CHECK-NEXT: bfe.s32 %r15, %r10, 16, 8; -; CHECK-NEXT: bfe.s32 %r16, %r9, 16, 8; +; CHECK-NEXT: bfe.u32 %r15, %r10, 16, 8; +; CHECK-NEXT: bfe.u32 %r16, %r9, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r16, %r15; -; CHECK-NEXT: bfe.s32 %r17, %r10, 24, 8; -; CHECK-NEXT: bfe.s32 %r18, %r9, 24, 8; +; CHECK-NEXT: bfe.u32 %r17, %r10, 24, 8; +; CHECK-NEXT: bfe.u32 %r18, %r9, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r18, %r17; ; CHECK-NEXT: selp.b32 %r19, %r4, %r8, %p4; ; 
CHECK-NEXT: selp.b32 %r20, %r3, %r7, %p3; @@ -1004,19 +1020,19 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, ; CHECK-NEXT: setp.ne.s32 %p2, %r5, %r9; ; CHECK-NEXT: setp.ne.s32 %p3, %r4, %r8; ; CHECK-NEXT: setp.ne.s32 %p4, %r3, %r7; -; CHECK-NEXT: bfe.s32 %r11, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r12, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4; -; CHECK-NEXT: bfe.s32 %r14, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r15, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3; ; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 8, 8; -; CHECK-NEXT: bfe.s32 %r18, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r19, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r18, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r19, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2; ; CHECK-NEXT: bfi.b32 %r21, %r20, %r17, 16, 8; -; CHECK-NEXT: bfe.s32 %r22, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r23, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r22, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r23, %r1, 24, 8; ; CHECK-NEXT: selp.b32 %r24, %r23, %r22, %p1; ; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; @@ -1091,16 +1107,16 @@ define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi64_param_0]; -; CHECK-NEXT: bfe.s32 %r2, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u64.u32 %rd1, %r2; ; CHECK-NEXT: and.b64 %rd2, %rd1, 255; -; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; ; CHECK-NEXT: cvt.u64.u32 %rd3, %r3; ; CHECK-NEXT: and.b64 %rd4, %rd3, 255; -; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u64.u32 %rd5, %r4; ; CHECK-NEXT: and.b64 %rd6, %rd5, 255; -; CHECK-NEXT: bfe.s32 %r5, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; 
CHECK-NEXT: cvt.u64.u32 %rd7, %r5; ; CHECK-NEXT: and.b64 %rd8, %rd7, 255; ; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd8, %rd6}; @@ -1424,17 +1440,17 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.s32 %r11, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r12, -1, 0, %p3;