diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 8c0fd66225513..ab9460e09ebb0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/DataLayout.h"
@@ -6100,6 +6101,91 @@ bool InstCombinerImpl::replacedSelectWithOperand(SelectInst *SI,
   return false;
 }
 
+// Try to "strengthen" the RHS of the compare based on known bits.
+// For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
+// it is known that the two least significant bits of `%x` are zero.
+static Instruction *strengthenICmpUsingKnownBits(ICmpInst &I,
+                                                 KnownBits Op0Known,
+                                                 KnownBits Op1Known,
+                                                 unsigned BitWidth) {
+  if (!BitWidth)
+    return nullptr;
+  if (!(Op1Known.isConstant() && Op0Known.Zero.isMask()))
+    return nullptr;
+
+  Value *Op0 = I.getOperand(0);
+  ICmpInst::Predicate Pred = I.getPredicate();
+  Type *Ty = Op0->getType();
+  APInt RHSConst = Op1Known.getConstant();
+
+  ConstantRange Op0PredRange =
+      ConstantRange::makeExactICmpRegion(Pred, RHSConst);
+  int KnownZeroMaskLength = BitWidth - Op0Known.Zero.countLeadingZeros();
+  if (KnownZeroMaskLength == 0)
+    return nullptr;
+
+  APInt PowOf2(BitWidth, 1 << KnownZeroMaskLength);
+  APInt Op0MinAccordingToPred(BitWidth, 0);
+  APInt Op0MaxAccordingToPred(BitWidth, 0);
+  APInt Op0MinRefinedByKnownBits(BitWidth, 0);
+  APInt Op0MaxRefinedByKnownBits(BitWidth, 0);
+  APInt NewLower(BitWidth, 0);
+  APInt NewUpper(BitWidth, 0);
+  bool ImprovedLower = false;
+  bool ImprovedUpper = false;
+  if (I.isSigned()) {
+    Op0MinAccordingToPred = Op0PredRange.getSignedMin();
+    Op0MaxAccordingToPred = Op0PredRange.getSignedMax();
+    // Compute the smallest number satisfying the known-bits constraint
+    // that is greater than or equal to Op0MinAccordingToPred.
+    Op0MinRefinedByKnownBits =
+        PowOf2 * APIntOps::RoundingSDiv(Op0MinAccordingToPred, PowOf2,
+                                        APInt::Rounding::UP);
+    // Compute the largest number satisfying the known-bits constraint
+    // that is less than or equal to Op0MaxAccordingToPred.
+    Op0MaxRefinedByKnownBits =
+        PowOf2 * APIntOps::RoundingSDiv(Op0MaxAccordingToPred, PowOf2,
+                                        APInt::Rounding::DOWN);
+    NewLower = APIntOps::smax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
+    NewUpper = APIntOps::smin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
+    ImprovedLower = NewLower.sgt(Op0MinAccordingToPred);
+    ImprovedUpper = NewUpper.slt(Op0MaxAccordingToPred);
+  } else {
+    Op0MinAccordingToPred = Op0PredRange.getUnsignedMin();
+    Op0MaxAccordingToPred = Op0PredRange.getUnsignedMax();
+    Op0MinRefinedByKnownBits =
+        PowOf2 * APIntOps::RoundingUDiv(Op0MinAccordingToPred, PowOf2,
+                                        APInt::Rounding::UP);
+    Op0MaxRefinedByKnownBits =
+        PowOf2 * APIntOps::RoundingUDiv(Op0MaxAccordingToPred, PowOf2,
+                                        APInt::Rounding::DOWN);
+    NewLower = APIntOps::umax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
+    NewUpper = APIntOps::umin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
+    ImprovedLower = NewLower.ugt(Op0MinAccordingToPred);
+    ImprovedUpper = NewUpper.ult(Op0MaxAccordingToPred);
+  }
+
+  // Non-strict inequalities should have been canonicalized to strict ones
+  // by now.
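+  // Worked example for the unsigned case (matching the header comment): for
+  // `icmp ugt %x, 14` with the low two bits of %x known to be zero, PowOf2 is
+  // 4 and the exact ugt-14 region is [15, UMAX]. RoundingUDiv rounds 15 up to
+  // the next multiple of 4, so NewLower becomes 16, ImprovedLower is true,
+  // and the UGT case below emits `icmp ugt %x, 15`.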
+  switch (Pred) {
+  default:
+    break;
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT: {
+    if (ImprovedUpper)
+      return new ICmpInst(Pred, Op0, ConstantInt::get(Ty, NewUpper + 1));
+    break;
+  }
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT: {
+    if (ImprovedLower)
+      return new ICmpInst(Pred, Op0, ConstantInt::get(Ty, NewLower - 1));
+    break;
+  }
+  }
+  return nullptr;
+}
+
 /// Try to fold the comparison based on range information we can get by checking
 /// whether bits are known to be zero or one in the inputs.
 Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
@@ -6357,6 +6443,23 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
        (Op0Known.One.isNegative() && Op1Known.One.isNegative())))
     return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
 
+  // If the result of the compare is used only in conditional branches, try to
+  // "strengthen" the compare. This may allow us to deduce stronger results
+  // about the compared value in the blocks dominated by these branches.
+  bool AllUsesAreInBranches = true;
+  for (const Use &U : I.uses()) {
+    const Instruction *UI = cast<Instruction>(U.getUser());
+    if (!isa<BranchInst>(UI)) {
+      AllUsesAreInBranches = false;
+      break;
+    }
+  }
+  if (AllUsesAreInBranches) {
+    if (Instruction *Res =
+            strengthenICmpUsingKnownBits(I, Op0Known, Op1Known, BitWidth))
+      return Res;
+  }
+
   return nullptr;
 }
diff --git a/llvm/test/Transforms/InstCombine/assume-loop-align.ll b/llvm/test/Transforms/InstCombine/assume-loop-align.ll
index e7eb18c61b6bb..79af1b0fede4b 100644
--- a/llvm/test/Transforms/InstCombine/assume-loop-align.ll
+++ b/llvm/test/Transforms/InstCombine/assume-loop-align.ll
@@ -28,7 +28,7 @@ define void @foo(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 1648
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 1633
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll
index 7f76a94f395b6..6c6befcfe39e4 100644
--- a/llvm/test/Transforms/InstCombine/icmp-mul.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll
@@ -969,7 +969,7 @@ define i1 @mul_of_pow2_no_lz_other_op(i32 %x, i8 %y) {
 ; CHECK-NEXT:    [[B:%.*]] = and i32 [[X:%.*]], 2
 ; CHECK-NEXT:    [[S:%.*]] = sext i8 [[Y:%.*]] to i32
 ; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[B]], [[S]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[M]], 254
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[M]], 255
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %b = and i32 %x, 2
diff --git a/llvm/test/Transforms/InstCombine/icmp-or.ll b/llvm/test/Transforms/InstCombine/icmp-or.ll
index 922845c1e7e2d..587df66417eb0 100644
--- a/llvm/test/Transforms/InstCombine/icmp-or.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-or.ll
@@ -308,7 +308,7 @@ define i1 @decrement_sgt_n1_commute_use1(i8 %px) {
 ; CHECK-NEXT:    [[X:%.*]] = mul i8 [[PX:%.*]], 42
 ; CHECK-NEXT:    [[DEC:%.*]] = add i8 [[X]], -1
 ; CHECK-NEXT:    call void @use(i8 [[DEC]])
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i8 [[X]], 0
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i8 [[X]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = mul i8 %px, 42 ; thwart complexity-based canonicalization
diff --git a/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll b/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll index
6b9ea1f8ef97e..5b827c839a4e4 100644 --- a/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll +++ b/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll @@ -136,7 +136,7 @@ define i1 @icmp_sgt6(i8 %x) { define i1 @icmp_sgt7(i8 %x) { ; CHECK-LABEL: @icmp_sgt7( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[X:%.*]], 62 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], 63 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 @@ -224,7 +224,7 @@ define i1 @icmp_sle1(i8 %x) { define i1 @icmp_sle2(i8 %x) { ; CHECK-LABEL: @icmp_sle2( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], -63 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], -64 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index 10ab1fe118348..e3a861c55649d 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -1490,8 +1490,8 @@ define <2 x i1> @test70vec(<2 x i32> %X) { define i1 @icmp_sext16trunc(i32 %x) { ; CHECK-LABEL: @icmp_sext16trunc( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[TMP1]], 36 +; CHECK-NEXT: [[SEXT1:%.*]] = shl i32 [[X:%.*]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SEXT1]], 2293761 ; CHECK-NEXT: ret i1 [[CMP]] ; %trunc = trunc i32 %x to i16 @@ -1502,8 +1502,8 @@ define i1 @icmp_sext16trunc(i32 %x) { define i1 @icmp_sext8trunc(i32 %x) { ; CHECK-LABEL: @icmp_sext8trunc( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 36 +; CHECK-NEXT: [[SEXT1:%.*]] = shl i32 [[X:%.*]], 24 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SEXT1]], 587202561 ; CHECK-NEXT: ret i1 [[CMP]] ; %trunc = trunc i32 %x to i8 @@ -1515,8 +1515,8 @@ define i1 @icmp_sext8trunc(i32 %x) { ; Vectors should fold the same way. 
define <2 x i1> @icmp_sext8trunc_vec(<2 x i32> %x) { ; CHECK-LABEL: @icmp_sext8trunc_vec( -; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8> -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[TMP1]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %trunc = trunc <2 x i32> %x to <2 x i8> @@ -1527,8 +1527,8 @@ define <2 x i1> @icmp_sext8trunc_vec(<2 x i32> %x) { define i1 @icmp_shl16(i32 %x) { ; CHECK-LABEL: @icmp_shl16( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[TMP1]], 36 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2293761 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i32 %x, 16 @@ -1541,7 +1541,7 @@ define i1 @icmp_shl16(i32 %x) { define i1 @icmp_shl17(i32 %x) { ; CHECK-LABEL: @icmp_shl17( ; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 17 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2359296 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2228225 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i32 %x, 17 @@ -1551,8 +1551,8 @@ define i1 @icmp_shl17(i32 %x) { define <2 x i1> @icmp_shl16_vec(<2 x i32> %x) { ; CHECK-LABEL: @icmp_shl16_vec( -; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i16> -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i16> [[TMP1]], +; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[SHL]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %shl = shl <2 x i32> %x, @@ -1562,8 +1562,8 @@ define <2 x i1> @icmp_shl16_vec(<2 x i32> %x) { define i1 @icmp_shl24(i32 %x) { ; CHECK-LABEL: @icmp_shl24( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 36 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 24 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 587202561 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i32 %x, 24 @@ -2199,7 +2199,7 @@ define i1 @icmp_ashr_and_overshift(i8 %X) { define i1 @icmp_and_ashr_neg_and_legal(i8 %x) { ; CHECK-LABEL: @icmp_and_ashr_neg_and_legal( ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 1 ; CHECK-NEXT: ret i1 [[CMP]] ; %ashr = ashr i8 %x, 4 @@ -2225,7 +2225,7 @@ define i1 @icmp_and_ashr_mixed_and_shiftout(i8 %x) { define i1 @icmp_and_ashr_neg_cmp_slt_legal(i8 %x) { ; CHECK-LABEL: @icmp_and_ashr_neg_cmp_slt_legal( ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], -64 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], -95 ; CHECK-NEXT: ret i1 [[CMP]] ; %ashr = ashr i8 %x, 4 @@ -2239,7 +2239,7 @@ define i1 @icmp_and_ashr_neg_cmp_slt_shiftout(i8 %x) { ; CHECK-LABEL: @icmp_and_ashr_neg_cmp_slt_shiftout( ; CHECK-NEXT: [[ASHR:%.*]] = ashr i8 [[X:%.*]], 4 ; CHECK-NEXT: [[AND:%.*]] = and i8 [[ASHR]], -2 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[AND]], -68 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[AND]], -69 ; CHECK-NEXT: ret i1 [[CMP]] ; %ashr = ashr i8 %x, 4 @@ -5183,3 +5183,121 @@ entry: %cmp = icmp eq i8 %add2, %add1 ret i1 %cmp } + +define i1 @strengthen_icmp_using_known_bits_ugt(i16 %a) { +; CHECK-LABEL: @strengthen_icmp_using_known_bits_ugt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i16 [[A:%.*]], 15 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %and_ = and i16 %a, 65532 + %cmp = icmp ugt i16 %and_, 14 + ret i1 %cmp +} + +define i1 
@strengthen_icmp_using_known_bits_ult(i16 %a) { +; CHECK-LABEL: @strengthen_icmp_using_known_bits_ult( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AND_:%.*]] = and i16 [[A:%.*]], -4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[AND_]], 17 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %and_ = and i16 %a, 65532 + %cmp = icmp ult i16 %and_, 18 + ret i1 %cmp +} + +define i1 @strengthen_icmp_using_known_bits_sgt(i16 %a) { +; CHECK-LABEL: @strengthen_icmp_using_known_bits_sgt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i16 [[A:%.*]], -1 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %and_ = and i16 %a, 65520 + %cmp = icmp sgt i16 %and_, -15 + ret i1 %cmp +} + +define i1 @strengthen_icmp_using_known_bits_slt(i16 %a) { +; CHECK-LABEL: @strengthen_icmp_using_known_bits_slt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AND_:%.*]] = and i16 [[A:%.*]], -4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[AND_]], -15 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %and_ = and i16 %a, 65532 + %cmp = icmp slt i16 %and_, -14 + ret i1 %cmp +} + +define i1 @dont_strengthen_icmp_in_sign_bit_check(i8 %a) { +; CHECK-LABEL: @dont_strengthen_icmp_in_sign_bit_check( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ICMP_:%.*]] = icmp sgt i8 [[A:%.*]], -1 +; CHECK-NEXT: ret i1 [[ICMP_]] +; +entry: + %shl_ = and i8 %a, 252 + %icmp_ = icmp sgt i8 %shl_, -1 + ret i1 %icmp_ +} + +define i8 @dont_strengthen_icmp_in_smin(i8 %a) { +; CHECK-LABEL: @dont_strengthen_icmp_in_smin( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2 +; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.smin.i8(i8 [[SHL_]], i8 7) +; CHECK-NEXT: ret i8 [[SELECT_]] +; +entry: + %shl_ = shl i8 %a, 2 + %icmp_ = icmp slt i8 %shl_, 7 + %select_ = select i1 %icmp_, i8 %shl_, i8 7 + ret i8 %select_ +} + +define i8 @dont_strengthen_icmp_in_umin(i8 %a) { +; CHECK-LABEL: @dont_strengthen_icmp_in_umin( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2 +; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.umin.i8(i8 [[SHL_]], i8 7) +; CHECK-NEXT: ret i8 [[SELECT_]] +; +entry: + %shl_ = shl i8 %a, 2 + %icmp_ = icmp ult i8 %shl_, 7 + %select_ = select i1 %icmp_, i8 %shl_, i8 7 + ret i8 %select_ +} + +define i8 @dont_strengthen_icmp_in_smax(i8 %a) { +; CHECK-LABEL: @dont_strengthen_icmp_in_smax( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2 +; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.smax.i8(i8 [[SHL_]], i8 6) +; CHECK-NEXT: ret i8 [[SELECT_]] +; +entry: + %shl_ = shl i8 %a, 2 + %icmp_ = icmp sgt i8 %shl_, 6 + %select_ = select i1 %icmp_, i8 %shl_, i8 6 + ret i8 %select_ +} + +define i8 @dont_strengthen_icmp_in_umax(i8 %a) { +; CHECK-LABEL: @dont_strengthen_icmp_in_umax( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2 +; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.umax.i8(i8 [[SHL_]], i8 6) +; CHECK-NEXT: ret i8 [[SELECT_]] +; +entry: + %shl_ = shl i8 %a, 2 + %icmp_ = icmp ugt i8 %shl_, 6 + %select_ = select i1 %icmp_, i8 %shl_, i8 6 + ret i8 %select_ +} diff --git a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll index 2b5b3fce70535..bd32a9270f224 100644 --- a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll +++ b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll @@ -11,7 +11,7 @@ define ptr@test1(ptr %A, i32 %Offset) { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp 
sgt i32 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[RHS_IDX]] @@ -40,7 +40,7 @@ define ptr@test2(i32 %A, i32 %Offset) { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[A_PTR:%.*]] = inttoptr i32 [[A:%.*]] to ptr @@ -164,7 +164,7 @@ define ptr@test4(i16 %A, i32 %Offset) { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 @@ -203,7 +203,7 @@ define ptr@test5(i32 %Offset) personality ptr @__gxx_personality_v0 { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[CONT]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[RHS_IDX]] @@ -248,7 +248,7 @@ define ptr@test6(i32 %Offset) personality ptr @__gxx_personality_v0 { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[CONT]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[A_PTR:%.*]] = inttoptr i32 [[A]] to ptr diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll index 4d38e2cd37c95..f92c27cd6b07d 100644 --- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll +++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll @@ -387,7 +387,7 @@ define ptr @indexed_compare(ptr %A, i64 %offset) { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i64 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i64 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[RHS_IDX]] @@ -416,7 +416,7 @@ define ptr @indexed_compare_different_types(ptr %A, i64 %offset) { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i64 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i64 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 800 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 803 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr 
[[A:%.*]], i64 [[RHS_IDX]] diff --git a/llvm/test/Transforms/InstCombine/pr17827.ll b/llvm/test/Transforms/InstCombine/pr17827.ll index 6c6110aa073a5..d87909a283495 100644 --- a/llvm/test/Transforms/InstCombine/pr17827.ll +++ b/llvm/test/Transforms/InstCombine/pr17827.ll @@ -6,7 +6,7 @@ define i1 @test_shift_and_cmp_not_changed1(i8 %p) { ; CHECK-LABEL: @test_shift_and_cmp_not_changed1( ; CHECK-NEXT: [[SHLP:%.*]] = shl i8 [[P:%.*]], 5 ; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1 ; CHECK-NEXT: ret i1 [[CMP]] ; %shlp = shl i8 %p, 5 @@ -20,7 +20,7 @@ define i1 @test_shift_and_cmp_not_changed2(i8 %p) { ; CHECK-LABEL: @test_shift_and_cmp_not_changed2( ; CHECK-NEXT: [[SHLP:%.*]] = ashr i8 [[P:%.*]], 5 ; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1 ; CHECK-NEXT: ret i1 [[CMP]] ; %shlp = ashr i8 %p, 5 @@ -35,7 +35,7 @@ define i1 @test_shift_and_cmp_changed1(i8 %p, i8 %q) { ; CHECK-LABEL: @test_shift_and_cmp_changed1( ; CHECK-NEXT: [[ANDP:%.*]] = shl i8 [[P:%.*]], 5 ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[ANDP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 1 ; CHECK-NEXT: ret i1 [[CMP]] ; %andp = and i8 %p, 6 @@ -51,7 +51,7 @@ define <2 x i1> @test_shift_and_cmp_changed1_vec(<2 x i8> %p, <2 x i8> %q) { ; CHECK-LABEL: @test_shift_and_cmp_changed1_vec( ; CHECK-NEXT: [[ANDP:%.*]] = shl <2 x i8> [[P:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[ANDP]], -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %andp = and <2 x i8> %p, @@ -66,8 +66,8 @@ define <2 x i1> @test_shift_and_cmp_changed1_vec(<2 x i8> %p, <2 x i8> %q) { ; Unsigned compare allows a transformation to compare against 0. 
define i1 @test_shift_and_cmp_changed2(i8 %p) { ; CHECK-LABEL: @test_shift_and_cmp_changed2( -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[P:%.*]], 6 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[SHLP:%.*]] = shl i8 [[P:%.*]], 5 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHLP]], 33 ; CHECK-NEXT: ret i1 [[CMP]] ; %shlp = shl i8 %p, 5 @@ -78,8 +78,8 @@ define i1 @test_shift_and_cmp_changed2(i8 %p) { define <2 x i1> @test_shift_and_cmp_changed2_vec(<2 x i8> %p) { ; CHECK-LABEL: @test_shift_and_cmp_changed2_vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[P:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[SHLP:%.*]] = shl <2 x i8> [[P:%.*]], +; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i8> [[SHLP]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %shlp = shl <2 x i8> %p, @@ -93,7 +93,7 @@ define i1 @test_shift_and_cmp_changed3(i8 %p) { ; CHECK-LABEL: @test_shift_and_cmp_changed3( ; CHECK-NEXT: [[SHLP:%.*]] = shl nsw i8 [[P:%.*]], 5 ; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1 ; CHECK-NEXT: ret i1 [[CMP]] ; %shlp = shl nsw i8 %p, 5 diff --git a/llvm/test/Transforms/InstCombine/pr27343.ll b/llvm/test/Transforms/InstCombine/pr27343.ll index e67d0b34056bf..f16affde2ce41 100644 --- a/llvm/test/Transforms/InstCombine/pr27343.ll +++ b/llvm/test/Transforms/InstCombine/pr27343.ll @@ -6,7 +6,7 @@ define i32 @__isnan(float %x) alwaysinline nounwind optsize { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTCAST:%.*]] = bitcast float [[X:%.*]] to i32 ; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[DOTCAST]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SHL]], -16777216 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SHL]], -16777215 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -24,7 +24,7 @@ entry: define i1 @icmp_shl7(i32 %x) { ; CHECK-LABEL: @icmp_shl7( ; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 7 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 4608 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 4481 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i32 %x, 7 diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index c5f1b77c6d740..65b2c978c36c0 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -3423,7 +3423,7 @@ define @scalable_sign_bits( %x) { define @scalable_non_zero( %x) { ; CHECK-LABEL: @scalable_non_zero( ; CHECK-NEXT: [[A:%.*]] = or [[X:%.*]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[CMP:%.*]] = icmp ule [[A]], shufflevector ( insertelement ( poison, i32 56, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[CMP:%.*]] = icmp ule [[A]], shufflevector ( insertelement ( poison, i32 55, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[CMP]] ; %a = or %x, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll index d783adbe93863..dda06351482fb 100644 --- a/llvm/test/Transforms/InstCombine/shift.ll +++ b/llvm/test/Transforms/InstCombine/shift.ll @@ -658,8 +658,8 @@ define i8 @test39(i32 %a0) { ; CHECK-NEXT: [[I51:%.*]] = xor i8 [[I50]], [[I5]] ; CHECK-NEXT: [[TMP0:%.*]] = lshr exact i8 [[I5]], 3 ; CHECK-NEXT: [[I54:%.*]] = and i8 [[TMP0]], 16 -; CHECK-NEXT: [[I551:%.*]] = or disjoint i8 [[I54]], [[I51]] -; CHECK-NEXT: ret i8 [[I551]] 
+; CHECK-NEXT: [[I55:%.*]] = or disjoint i8 [[I54]], [[I51]] +; CHECK-NEXT: ret i8 [[I55]] ; entry: %i4 = trunc i32 %a0 to i8 diff --git a/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll b/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll index 25b26770c366d..9e1473a621d27 100644 --- a/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll +++ b/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll @@ -9,8 +9,8 @@ ; C2 Shift amount smaller than C1 trailing zeros. define i1 @scalar_i8_shl_ult_const_1(i8 %x) { ; CHECK-LABEL: @scalar_i8_shl_ult_const_1( -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], 6 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i8 %x, 5 @@ -45,8 +45,8 @@ define i1 @scalar_i8_shl_ult_const_3(i8 %x) { ; C2 Shift amount smaller than C1 trailing zeros. define i1 @scalar_i16_shl_ult_const(i16 %x) { ; CHECK-LABEL: @scalar_i16_shl_ult_const( -; CHECK-NEXT: [[TMP1:%.*]] = and i16 [[X:%.*]], 252 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i16 [[X:%.*]], 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[SHL]], 769 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i16 %x, 8 @@ -56,8 +56,8 @@ define i1 @scalar_i16_shl_ult_const(i16 %x) { define i1 @scalar_i32_shl_ult_const(i32 %x) { ; CHECK-LABEL: @scalar_i32_shl_ult_const( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 2097088 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 11 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[SHL]], 129025 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i32 %x, 11 @@ -67,8 +67,8 @@ define i1 @scalar_i32_shl_ult_const(i32 %x) { define i1 @scalar_i64_shl_ult_const(i64 %x) { ; CHECK-LABEL: @scalar_i64_shl_ult_const( -; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 549755813632 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[X:%.*]], 25 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[SHL]], 8556380161 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i64 %x, 25 @@ -91,8 +91,8 @@ define i1 @scalar_i8_shl_uge_const(i8 %x) { ; Check 'ule' predicate define i1 @scalar_i8_shl_ule_const(i8 %x) { ; CHECK-LABEL: @scalar_i8_shl_ule_const( -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], 6 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i8 %x, 5 @@ -116,8 +116,8 @@ define i1 @scalar_i8_shl_ugt_const(i8 %x) { define <4 x i1> @vector_4xi32_shl_ult_const(<4 x i32> %x) { ; CHECK-LABEL: @vector_4xi32_shl_ult_const( -; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[CMP]] ; %shl = shl <4 x i32> %x, @@ -173,8 +173,8 @@ define <4 x i1> @vector_4xi32_shl_uge_const(<4 x i32> %x) { ; Check 'ule' predicate define <4 x i1> @vector_4xi32_shl_ule_const(<4 x i32> %x) { ; CHECK-LABEL: @vector_4xi32_shl_ule_const( -; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[CMP]] ; %shl = shl 
<4 x i32> %x, @@ -201,7 +201,7 @@ define i1 @scalar_i8_shl_ult_const_extra_use_shl(i8 %x, ptr %p) { ; CHECK-LABEL: @scalar_i8_shl_ult_const_extra_use_shl( ; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5 ; CHECK-NEXT: store i8 [[SHL]], ptr [[P:%.*]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 64 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i8 %x, 5 @@ -216,7 +216,7 @@ define i1 @scalar_i8_shl_ult_const_extra_use_shl(i8 %x, ptr %p) { define i1 @scalar_i8_shl_slt_const(i8 %x) { ; CHECK-LABEL: @scalar_i8_shl_slt_const( ; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SHL]], 64 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SHL]], 33 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i8 %x, 5 @@ -227,7 +227,7 @@ define i1 @scalar_i8_shl_slt_const(i8 %x) { define i1 @scalar_i8_shl_ugt_const_not_power_of_2(i8 %x) { ; CHECK-LABEL: @scalar_i8_shl_ugt_const_not_power_of_2( ; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[SHL]], 66 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[SHL]], 95 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i8 %x, 5 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 3ba91360850e7..082d8146bfce2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -28,45 +28,45 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALAR_TAIL_FOLDING: vector.ph: ; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP8]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, 
i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) -; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl i32 [[INDEX]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP9]], [[TMP9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP12]], i32 1, [[INTERLEAVED_MASK]], poison) ; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP8]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1 -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK1]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = or disjoint i32 [[TMP10]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = sub zeroinitializer, [[TMP16]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sext i32 [[TMP15]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], 
i64 [[TMP18]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 -1 +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP16]], [[TMP17]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP9]], [[TMP9]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -111,16 +111,18 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP9]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -128,31 +130,31 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ 
[[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, [[INTERLEAVED_MASK]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = shl i32 [[INDEX]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP13]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP11]], [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP14]], i32 1, [[INTERLEAVED_MASK]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP7]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1 -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK1]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint i32 [[TMP12]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = sub zeroinitializer, [[TMP18]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sext i32 [[TMP17]] to i64 
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP20]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP21]], i64 -1 +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP18]], [[TMP19]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP11]], [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: @@ -220,36 +222,36 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALAR_TAIL_FOLDING: vector.ph: ; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP8]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg [[TMP7]] to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP8]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP9]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = or disjoint [[TMP7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg [[TMP11]] to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP12]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP13]], i32 1, [[TMP10]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP15]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = zext nneg [[TMP9]] to +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP10]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP11]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint [[TMP9]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg [[TMP13]] to +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP14]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP15]], i32 1, [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -286,16 +288,18 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] 
= zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP9]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -303,22 +307,22 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = zext nneg [[TMP6]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP7]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl nuw nsw [[VEC_IND]], 
shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP13]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg <vscale x 16 x i32> [[TMP15]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP16]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PREDICATED_TAIL_FOLDING: scalar.ph:
@@ -382,19 +386,19 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32
; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALAR_TAIL_FOLDING: vector.ph:
; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = shl i32 [[TMP15]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP6]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP8]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -403,18 +407,18 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALAR_TAIL_FOLDING: vector.body:
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP10]], i32 1, <vscale x 16 x i1> [[TMP8]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP11]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP16]]
+; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP10]])
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = or disjoint <vscale x 16 x i32> [[TMP9]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP14]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP13]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -457,16 +461,18 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.ph:
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD1]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP9]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -476,24 +482,24 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.body:
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP10]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP15]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP12]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = zext nneg <vscale x 16 x i32> [[TMP17]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP18]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP16]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP20]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PREDICATED_TAIL_FOLDING: scalar.ph: