diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 8c0fd66225513..ab9460e09ebb0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/DataLayout.h"
@@ -6100,6 +6101,91 @@ bool InstCombinerImpl::replacedSelectWithOperand(SelectInst *SI,
   return false;
 }
 
+// Try to "strengthen" the RHS of the compare based on known bits.
+// For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
+// it is known that the two least significant bits of `%x` are zero.
+static Instruction *strengthenICmpUsingKnownBits(ICmpInst &I,
+                                                 KnownBits Op0Known,
+                                                 KnownBits Op1Known,
+                                                 unsigned BitWidth) {
+  if (!BitWidth)
+    return nullptr;
+  if (!(Op1Known.isConstant() && Op0Known.Zero.isMask()))
+    return nullptr;
+
+  Value *Op0 = I.getOperand(0);
+  ICmpInst::Predicate Pred = I.getPredicate();
+  Type *Ty = Op0->getType();
+  APInt RHSConst = Op1Known.getConstant();
+
+  ConstantRange Op0PredRange =
+      ConstantRange::makeExactICmpRegion(Pred, RHSConst);
+  int KnownZeroMaskLength = BitWidth - Op0Known.Zero.countLeadingZeros();
+  if (KnownZeroMaskLength == 0)
+    return nullptr;
+
+  APInt PowOf2(BitWidth, 1 << KnownZeroMaskLength);
+  APInt Op0MinAccordingToPred(BitWidth, 0);
+  APInt Op0MaxAccordingToPred(BitWidth, 0);
+  APInt Op0MinRefinedByKnownBits(BitWidth, 0);
+  APInt Op0MaxRefinedByKnownBits(BitWidth, 0);
+  APInt NewLower(BitWidth, 0);
+  APInt NewUpper(BitWidth, 0);
+  bool ImprovedLower = false;
+  bool ImprovedUpper = false;
+  if (I.isSigned()) {
+    Op0MinAccordingToPred = Op0PredRange.getSignedMin();
+    Op0MaxAccordingToPred = Op0PredRange.getSignedMax();
+    // Compute the smallest number satisfying the known-bits constraint
+    // that is greater than or equal to Op0MinAccordingToPred.
+    Op0MinRefinedByKnownBits =
+        PowOf2 * APIntOps::RoundingSDiv(Op0MinAccordingToPred, PowOf2,
+                                        APInt::Rounding::UP);
+    // Compute the largest number satisfying the known-bits constraint
+    // that is less than or equal to Op0MaxAccordingToPred.
+    Op0MaxRefinedByKnownBits =
+        PowOf2 * APIntOps::RoundingSDiv(Op0MaxAccordingToPred, PowOf2,
+                                        APInt::Rounding::DOWN);
+    NewLower = APIntOps::smax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
+    NewUpper = APIntOps::smin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
+    ImprovedLower = NewLower.sgt(Op0MinAccordingToPred);
+    ImprovedUpper = NewUpper.slt(Op0MaxAccordingToPred);
+  } else {
+    Op0MinAccordingToPred = Op0PredRange.getUnsignedMin();
+    Op0MaxAccordingToPred = Op0PredRange.getUnsignedMax();
+    Op0MinRefinedByKnownBits =
+        PowOf2 * APIntOps::RoundingUDiv(Op0MinAccordingToPred, PowOf2,
+                                        APInt::Rounding::UP);
+    Op0MaxRefinedByKnownBits =
+        PowOf2 * APIntOps::RoundingUDiv(Op0MaxAccordingToPred, PowOf2,
+                                        APInt::Rounding::DOWN);
+    NewLower = APIntOps::umax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
+    NewUpper = APIntOps::umin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
+    ImprovedLower = NewLower.ugt(Op0MinAccordingToPred);
+    ImprovedUpper = NewUpper.ult(Op0MaxAccordingToPred);
+  }
+
+  // Non-strict inequalities should have been canonicalized to strict ones
+  // by now.
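+  // Worked example for the unsigned case (matching the header comment): for
+  // `icmp ugt %x, 14` with the low two bits of %x known to be zero, PowOf2 is
+  // 4 and the exact ugt-14 region is [15, UMAX]. RoundingUDiv rounds 15 up to
+  // the next multiple of 4, so NewLower becomes 16, ImprovedLower is true,
+  // and the UGT case below emits `icmp ugt %x, 15`.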
+  switch (Pred) {
+  default:
+    break;
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT: {
+    if (ImprovedUpper)
+      return new ICmpInst(Pred, Op0, ConstantInt::get(Ty, NewUpper + 1));
+    break;
+  }
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT: {
+    if (ImprovedLower)
+      return new ICmpInst(Pred, Op0, ConstantInt::get(Ty, NewLower - 1));
+    break;
+  }
+  }
+  return nullptr;
+}
+
 /// Try to fold the comparison based on range information we can get by checking
 /// whether bits are known to be zero or one in the inputs.
 Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
@@ -6357,6 +6443,23 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
        (Op0Known.One.isNegative() && Op1Known.One.isNegative())))
     return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
 
+  // If the result of the compare is used only in conditional branches, try to
+  // "strengthen" the compare. This may allow us to deduce stronger results
+  // about the compared value in the blocks dominated by these branches.
+  bool AllUsesAreInBranches = true;
+  for (const Use &U : I.uses()) {
+    const Instruction *UI = cast<Instruction>(U.getUser());
+    if (!isa<BranchInst>(UI)) {
+      AllUsesAreInBranches = false;
+      break;
+    }
+  }
+  if (AllUsesAreInBranches) {
+    if (Instruction *Res =
+            strengthenICmpUsingKnownBits(I, Op0Known, Op1Known, BitWidth))
+      return Res;
+  }
+
   return nullptr;
 }
diff --git a/llvm/test/Transforms/InstCombine/assume-loop-align.ll b/llvm/test/Transforms/InstCombine/assume-loop-align.ll
index e7eb18c61b6bb..79af1b0fede4b 100644
--- a/llvm/test/Transforms/InstCombine/assume-loop-align.ll
+++ b/llvm/test/Transforms/InstCombine/assume-loop-align.ll
@@ -28,7 +28,7 @@ define void @foo(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 1648
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 1633
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll
index 7f76a94f395b6..6c6befcfe39e4 100644
--- a/llvm/test/Transforms/InstCombine/icmp-mul.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll
@@ -969,7 +969,7 @@ define i1 @mul_of_pow2_no_lz_other_op(i32 %x, i8 %y) {
 ; CHECK-NEXT:    [[B:%.*]] = and i32 [[X:%.*]], 2
 ; CHECK-NEXT:    [[S:%.*]] = sext i8 [[Y:%.*]] to i32
 ; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[B]], [[S]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[M]], 254
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[M]], 255
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %b = and i32 %x, 2
diff --git a/llvm/test/Transforms/InstCombine/icmp-or.ll b/llvm/test/Transforms/InstCombine/icmp-or.ll
index 922845c1e7e2d..587df66417eb0 100644
--- a/llvm/test/Transforms/InstCombine/icmp-or.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-or.ll
@@ -308,7 +308,7 @@ define i1 @decrement_sgt_n1_commute_use1(i8 %px) {
 ; CHECK-NEXT:    [[X:%.*]] = mul i8 [[PX:%.*]], 42
 ; CHECK-NEXT:    [[DEC:%.*]] = add i8 [[X]], -1
 ; CHECK-NEXT:    call void @use(i8 [[DEC]])
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i8 [[X]], 0
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i8 [[X]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = mul i8 %px, 42 ; thwart complexity-based canonicalization
diff --git a/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll b/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll index
6b9ea1f8ef97e..5b827c839a4e4 100644 --- a/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll +++ b/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll @@ -136,7 +136,7 @@ define i1 @icmp_sgt6(i8 %x) { define i1 @icmp_sgt7(i8 %x) { ; CHECK-LABEL: @icmp_sgt7( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[X:%.*]], 62 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], 63 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 @@ -224,7 +224,7 @@ define i1 @icmp_sle1(i8 %x) { define i1 @icmp_sle2(i8 %x) { ; CHECK-LABEL: @icmp_sle2( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], -63 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], -64 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index 10ab1fe118348..e3a861c55649d 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -1490,8 +1490,8 @@ define <2 x i1> @test70vec(<2 x i32> %X) { define i1 @icmp_sext16trunc(i32 %x) { ; CHECK-LABEL: @icmp_sext16trunc( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[TMP1]], 36 +; CHECK-NEXT: [[SEXT1:%.*]] = shl i32 [[X:%.*]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SEXT1]], 2293761 ; CHECK-NEXT: ret i1 [[CMP]] ; %trunc = trunc i32 %x to i16 @@ -1502,8 +1502,8 @@ define i1 @icmp_sext16trunc(i32 %x) { define i1 @icmp_sext8trunc(i32 %x) { ; CHECK-LABEL: @icmp_sext8trunc( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 36 +; CHECK-NEXT: [[SEXT1:%.*]] = shl i32 [[X:%.*]], 24 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SEXT1]], 587202561 ; CHECK-NEXT: ret i1 [[CMP]] ; %trunc = trunc i32 %x to i8 @@ -1515,8 +1515,8 @@ define i1 @icmp_sext8trunc(i32 %x) { ; Vectors should fold the same way. 
define <2 x i1> @icmp_sext8trunc_vec(<2 x i32> %x) { ; CHECK-LABEL: @icmp_sext8trunc_vec( -; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8> -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[TMP1]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %trunc = trunc <2 x i32> %x to <2 x i8> @@ -1527,8 +1527,8 @@ define <2 x i1> @icmp_sext8trunc_vec(<2 x i32> %x) { define i1 @icmp_shl16(i32 %x) { ; CHECK-LABEL: @icmp_shl16( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[TMP1]], 36 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2293761 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i32 %x, 16 @@ -1541,7 +1541,7 @@ define i1 @icmp_shl16(i32 %x) { define i1 @icmp_shl17(i32 %x) { ; CHECK-LABEL: @icmp_shl17( ; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 17 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2359296 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2228225 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i32 %x, 17 @@ -1551,8 +1551,8 @@ define i1 @icmp_shl17(i32 %x) { define <2 x i1> @icmp_shl16_vec(<2 x i32> %x) { ; CHECK-LABEL: @icmp_shl16_vec( -; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i16> -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i16> [[TMP1]], +; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[SHL]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %shl = shl <2 x i32> %x, @@ -1562,8 +1562,8 @@ define <2 x i1> @icmp_shl16_vec(<2 x i32> %x) { define i1 @icmp_shl24(i32 %x) { ; CHECK-LABEL: @icmp_shl24( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 36 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 24 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 587202561 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i32 %x, 24 @@ -2199,7 +2199,7 @@ define i1 @icmp_ashr_and_overshift(i8 %X) { define i1 @icmp_and_ashr_neg_and_legal(i8 %x) { ; CHECK-LABEL: @icmp_and_ashr_neg_and_legal( ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 1 ; CHECK-NEXT: ret i1 [[CMP]] ; %ashr = ashr i8 %x, 4 @@ -2225,7 +2225,7 @@ define i1 @icmp_and_ashr_mixed_and_shiftout(i8 %x) { define i1 @icmp_and_ashr_neg_cmp_slt_legal(i8 %x) { ; CHECK-LABEL: @icmp_and_ashr_neg_cmp_slt_legal( ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], -64 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], -95 ; CHECK-NEXT: ret i1 [[CMP]] ; %ashr = ashr i8 %x, 4 @@ -2239,7 +2239,7 @@ define i1 @icmp_and_ashr_neg_cmp_slt_shiftout(i8 %x) { ; CHECK-LABEL: @icmp_and_ashr_neg_cmp_slt_shiftout( ; CHECK-NEXT: [[ASHR:%.*]] = ashr i8 [[X:%.*]], 4 ; CHECK-NEXT: [[AND:%.*]] = and i8 [[ASHR]], -2 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[AND]], -68 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[AND]], -69 ; CHECK-NEXT: ret i1 [[CMP]] ; %ashr = ashr i8 %x, 4 @@ -5183,3 +5183,121 @@ entry: %cmp = icmp eq i8 %add2, %add1 ret i1 %cmp } + +define i1 @strengthen_icmp_using_known_bits_ugt(i16 %a) { +; CHECK-LABEL: @strengthen_icmp_using_known_bits_ugt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i16 [[A:%.*]], 15 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %and_ = and i16 %a, 65532 + %cmp = icmp ugt i16 %and_, 14 + ret i1 %cmp +} + +define i1 
@strengthen_icmp_using_known_bits_ult(i16 %a) { +; CHECK-LABEL: @strengthen_icmp_using_known_bits_ult( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AND_:%.*]] = and i16 [[A:%.*]], -4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[AND_]], 17 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %and_ = and i16 %a, 65532 + %cmp = icmp ult i16 %and_, 18 + ret i1 %cmp +} + +define i1 @strengthen_icmp_using_known_bits_sgt(i16 %a) { +; CHECK-LABEL: @strengthen_icmp_using_known_bits_sgt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i16 [[A:%.*]], -1 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %and_ = and i16 %a, 65520 + %cmp = icmp sgt i16 %and_, -15 + ret i1 %cmp +} + +define i1 @strengthen_icmp_using_known_bits_slt(i16 %a) { +; CHECK-LABEL: @strengthen_icmp_using_known_bits_slt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AND_:%.*]] = and i16 [[A:%.*]], -4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[AND_]], -15 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %and_ = and i16 %a, 65532 + %cmp = icmp slt i16 %and_, -14 + ret i1 %cmp +} + +define i1 @dont_strengthen_icmp_in_sign_bit_check(i8 %a) { +; CHECK-LABEL: @dont_strengthen_icmp_in_sign_bit_check( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ICMP_:%.*]] = icmp sgt i8 [[A:%.*]], -1 +; CHECK-NEXT: ret i1 [[ICMP_]] +; +entry: + %shl_ = and i8 %a, 252 + %icmp_ = icmp sgt i8 %shl_, -1 + ret i1 %icmp_ +} + +define i8 @dont_strengthen_icmp_in_smin(i8 %a) { +; CHECK-LABEL: @dont_strengthen_icmp_in_smin( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2 +; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.smin.i8(i8 [[SHL_]], i8 7) +; CHECK-NEXT: ret i8 [[SELECT_]] +; +entry: + %shl_ = shl i8 %a, 2 + %icmp_ = icmp slt i8 %shl_, 7 + %select_ = select i1 %icmp_, i8 %shl_, i8 7 + ret i8 %select_ +} + +define i8 @dont_strengthen_icmp_in_umin(i8 %a) { +; CHECK-LABEL: @dont_strengthen_icmp_in_umin( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2 +; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.umin.i8(i8 [[SHL_]], i8 7) +; CHECK-NEXT: ret i8 [[SELECT_]] +; +entry: + %shl_ = shl i8 %a, 2 + %icmp_ = icmp ult i8 %shl_, 7 + %select_ = select i1 %icmp_, i8 %shl_, i8 7 + ret i8 %select_ +} + +define i8 @dont_strengthen_icmp_in_smax(i8 %a) { +; CHECK-LABEL: @dont_strengthen_icmp_in_smax( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2 +; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.smax.i8(i8 [[SHL_]], i8 6) +; CHECK-NEXT: ret i8 [[SELECT_]] +; +entry: + %shl_ = shl i8 %a, 2 + %icmp_ = icmp sgt i8 %shl_, 6 + %select_ = select i1 %icmp_, i8 %shl_, i8 6 + ret i8 %select_ +} + +define i8 @dont_strengthen_icmp_in_umax(i8 %a) { +; CHECK-LABEL: @dont_strengthen_icmp_in_umax( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2 +; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.umax.i8(i8 [[SHL_]], i8 6) +; CHECK-NEXT: ret i8 [[SELECT_]] +; +entry: + %shl_ = shl i8 %a, 2 + %icmp_ = icmp ugt i8 %shl_, 6 + %select_ = select i1 %icmp_, i8 %shl_, i8 6 + ret i8 %select_ +} diff --git a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll index 2b5b3fce70535..bd32a9270f224 100644 --- a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll +++ b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll @@ -11,7 +11,7 @@ define ptr@test1(ptr %A, i32 %Offset) { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp 
sgt i32 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[RHS_IDX]] @@ -40,7 +40,7 @@ define ptr@test2(i32 %A, i32 %Offset) { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[A_PTR:%.*]] = inttoptr i32 [[A:%.*]] to ptr @@ -164,7 +164,7 @@ define ptr@test4(i16 %A, i32 %Offset) { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 @@ -203,7 +203,7 @@ define ptr@test5(i32 %Offset) personality ptr @__gxx_personality_v0 { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[CONT]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[RHS_IDX]] @@ -248,7 +248,7 @@ define ptr@test6(i32 %Offset) personality ptr @__gxx_personality_v0 { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[CONT]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[A_PTR:%.*]] = inttoptr i32 [[A]] to ptr diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll index 4d38e2cd37c95..f92c27cd6b07d 100644 --- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll +++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll @@ -387,7 +387,7 @@ define ptr @indexed_compare(ptr %A, i64 %offset) { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i64 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i64 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 403 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[RHS_IDX]] @@ -416,7 +416,7 @@ define ptr @indexed_compare_different_types(ptr %A, i64 %offset) { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i64 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nsw i64 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 800 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 803 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr 
[[A:%.*]], i64 [[RHS_IDX]] diff --git a/llvm/test/Transforms/InstCombine/pr17827.ll b/llvm/test/Transforms/InstCombine/pr17827.ll index 6c6110aa073a5..d87909a283495 100644 --- a/llvm/test/Transforms/InstCombine/pr17827.ll +++ b/llvm/test/Transforms/InstCombine/pr17827.ll @@ -6,7 +6,7 @@ define i1 @test_shift_and_cmp_not_changed1(i8 %p) { ; CHECK-LABEL: @test_shift_and_cmp_not_changed1( ; CHECK-NEXT: [[SHLP:%.*]] = shl i8 [[P:%.*]], 5 ; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1 ; CHECK-NEXT: ret i1 [[CMP]] ; %shlp = shl i8 %p, 5 @@ -20,7 +20,7 @@ define i1 @test_shift_and_cmp_not_changed2(i8 %p) { ; CHECK-LABEL: @test_shift_and_cmp_not_changed2( ; CHECK-NEXT: [[SHLP:%.*]] = ashr i8 [[P:%.*]], 5 ; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1 ; CHECK-NEXT: ret i1 [[CMP]] ; %shlp = ashr i8 %p, 5 @@ -35,7 +35,7 @@ define i1 @test_shift_and_cmp_changed1(i8 %p, i8 %q) { ; CHECK-LABEL: @test_shift_and_cmp_changed1( ; CHECK-NEXT: [[ANDP:%.*]] = shl i8 [[P:%.*]], 5 ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[ANDP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 1 ; CHECK-NEXT: ret i1 [[CMP]] ; %andp = and i8 %p, 6 @@ -51,7 +51,7 @@ define <2 x i1> @test_shift_and_cmp_changed1_vec(<2 x i8> %p, <2 x i8> %q) { ; CHECK-LABEL: @test_shift_and_cmp_changed1_vec( ; CHECK-NEXT: [[ANDP:%.*]] = shl <2 x i8> [[P:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[ANDP]], -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %andp = and <2 x i8> %p, @@ -66,8 +66,8 @@ define <2 x i1> @test_shift_and_cmp_changed1_vec(<2 x i8> %p, <2 x i8> %q) { ; Unsigned compare allows a transformation to compare against 0. 
define i1 @test_shift_and_cmp_changed2(i8 %p) { ; CHECK-LABEL: @test_shift_and_cmp_changed2( -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[P:%.*]], 6 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[SHLP:%.*]] = shl i8 [[P:%.*]], 5 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHLP]], 33 ; CHECK-NEXT: ret i1 [[CMP]] ; %shlp = shl i8 %p, 5 @@ -78,8 +78,8 @@ define i1 @test_shift_and_cmp_changed2(i8 %p) { define <2 x i1> @test_shift_and_cmp_changed2_vec(<2 x i8> %p) { ; CHECK-LABEL: @test_shift_and_cmp_changed2_vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[P:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[SHLP:%.*]] = shl <2 x i8> [[P:%.*]], +; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i8> [[SHLP]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %shlp = shl <2 x i8> %p, @@ -93,7 +93,7 @@ define i1 @test_shift_and_cmp_changed3(i8 %p) { ; CHECK-LABEL: @test_shift_and_cmp_changed3( ; CHECK-NEXT: [[SHLP:%.*]] = shl nsw i8 [[P:%.*]], 5 ; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1 ; CHECK-NEXT: ret i1 [[CMP]] ; %shlp = shl nsw i8 %p, 5 diff --git a/llvm/test/Transforms/InstCombine/pr27343.ll b/llvm/test/Transforms/InstCombine/pr27343.ll index e67d0b34056bf..f16affde2ce41 100644 --- a/llvm/test/Transforms/InstCombine/pr27343.ll +++ b/llvm/test/Transforms/InstCombine/pr27343.ll @@ -6,7 +6,7 @@ define i32 @__isnan(float %x) alwaysinline nounwind optsize { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTCAST:%.*]] = bitcast float [[X:%.*]] to i32 ; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[DOTCAST]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SHL]], -16777216 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SHL]], -16777215 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -24,7 +24,7 @@ entry: define i1 @icmp_shl7(i32 %x) { ; CHECK-LABEL: @icmp_shl7( ; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 7 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 4608 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 4481 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i32 %x, 7 diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index c5f1b77c6d740..65b2c978c36c0 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -3423,7 +3423,7 @@ define @scalable_sign_bits( %x) { define @scalable_non_zero( %x) { ; CHECK-LABEL: @scalable_non_zero( ; CHECK-NEXT: [[A:%.*]] = or [[X:%.*]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[CMP:%.*]] = icmp ule [[A]], shufflevector ( insertelement ( poison, i32 56, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[CMP:%.*]] = icmp ule [[A]], shufflevector ( insertelement ( poison, i32 55, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[CMP]] ; %a = or %x, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll index d783adbe93863..dda06351482fb 100644 --- a/llvm/test/Transforms/InstCombine/shift.ll +++ b/llvm/test/Transforms/InstCombine/shift.ll @@ -658,8 +658,8 @@ define i8 @test39(i32 %a0) { ; CHECK-NEXT: [[I51:%.*]] = xor i8 [[I50]], [[I5]] ; CHECK-NEXT: [[TMP0:%.*]] = lshr exact i8 [[I5]], 3 ; CHECK-NEXT: [[I54:%.*]] = and i8 [[TMP0]], 16 -; CHECK-NEXT: [[I551:%.*]] = or disjoint i8 [[I54]], [[I51]] -; CHECK-NEXT: ret i8 [[I551]] 
+; CHECK-NEXT: [[I55:%.*]] = or disjoint i8 [[I54]], [[I51]] +; CHECK-NEXT: ret i8 [[I55]] ; entry: %i4 = trunc i32 %a0 to i8 diff --git a/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll b/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll index 25b26770c366d..9e1473a621d27 100644 --- a/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll +++ b/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll @@ -9,8 +9,8 @@ ; C2 Shift amount smaller than C1 trailing zeros. define i1 @scalar_i8_shl_ult_const_1(i8 %x) { ; CHECK-LABEL: @scalar_i8_shl_ult_const_1( -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], 6 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i8 %x, 5 @@ -45,8 +45,8 @@ define i1 @scalar_i8_shl_ult_const_3(i8 %x) { ; C2 Shift amount smaller than C1 trailing zeros. define i1 @scalar_i16_shl_ult_const(i16 %x) { ; CHECK-LABEL: @scalar_i16_shl_ult_const( -; CHECK-NEXT: [[TMP1:%.*]] = and i16 [[X:%.*]], 252 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i16 [[X:%.*]], 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[SHL]], 769 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i16 %x, 8 @@ -56,8 +56,8 @@ define i1 @scalar_i16_shl_ult_const(i16 %x) { define i1 @scalar_i32_shl_ult_const(i32 %x) { ; CHECK-LABEL: @scalar_i32_shl_ult_const( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 2097088 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 11 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[SHL]], 129025 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i32 %x, 11 @@ -67,8 +67,8 @@ define i1 @scalar_i32_shl_ult_const(i32 %x) { define i1 @scalar_i64_shl_ult_const(i64 %x) { ; CHECK-LABEL: @scalar_i64_shl_ult_const( -; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 549755813632 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[X:%.*]], 25 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[SHL]], 8556380161 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i64 %x, 25 @@ -91,8 +91,8 @@ define i1 @scalar_i8_shl_uge_const(i8 %x) { ; Check 'ule' predicate define i1 @scalar_i8_shl_ule_const(i8 %x) { ; CHECK-LABEL: @scalar_i8_shl_ule_const( -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], 6 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i8 %x, 5 @@ -116,8 +116,8 @@ define i1 @scalar_i8_shl_ugt_const(i8 %x) { define <4 x i1> @vector_4xi32_shl_ult_const(<4 x i32> %x) { ; CHECK-LABEL: @vector_4xi32_shl_ult_const( -; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[CMP]] ; %shl = shl <4 x i32> %x, @@ -173,8 +173,8 @@ define <4 x i1> @vector_4xi32_shl_uge_const(<4 x i32> %x) { ; Check 'ule' predicate define <4 x i1> @vector_4xi32_shl_ule_const(<4 x i32> %x) { ; CHECK-LABEL: @vector_4xi32_shl_ule_const( -; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[CMP]] ; %shl = shl 
<4 x i32> %x, @@ -201,7 +201,7 @@ define i1 @scalar_i8_shl_ult_const_extra_use_shl(i8 %x, ptr %p) { ; CHECK-LABEL: @scalar_i8_shl_ult_const_extra_use_shl( ; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5 ; CHECK-NEXT: store i8 [[SHL]], ptr [[P:%.*]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 64 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i8 %x, 5 @@ -216,7 +216,7 @@ define i1 @scalar_i8_shl_ult_const_extra_use_shl(i8 %x, ptr %p) { define i1 @scalar_i8_shl_slt_const(i8 %x) { ; CHECK-LABEL: @scalar_i8_shl_slt_const( ; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SHL]], 64 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SHL]], 33 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i8 %x, 5 @@ -227,7 +227,7 @@ define i1 @scalar_i8_shl_slt_const(i8 %x) { define i1 @scalar_i8_shl_ugt_const_not_power_of_2(i8 %x) { ; CHECK-LABEL: @scalar_i8_shl_ugt_const_not_power_of_2( ; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[SHL]], 66 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[SHL]], 95 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl i8 %x, 5 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 3ba91360850e7..082d8146bfce2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -28,45 +28,45 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALAR_TAIL_FOLDING: vector.ph: ; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP8]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, 
i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) -; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl i32 [[INDEX]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP9]], [[TMP9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP12]], i32 1, [[INTERLEAVED_MASK]], poison) ; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP8]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1 -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK1]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = or disjoint i32 [[TMP10]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = sub zeroinitializer, [[TMP16]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sext i32 [[TMP15]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], 
i64 [[TMP18]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 -1 +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP16]], [[TMP17]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP9]], [[TMP9]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -111,16 +111,18 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP9]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -128,31 +130,31 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ 
[[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, [[INTERLEAVED_MASK]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = shl i32 [[INDEX]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP13]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP11]], [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP14]], i32 1, [[INTERLEAVED_MASK]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP7]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1 -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK1]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint i32 [[TMP12]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = sub zeroinitializer, [[TMP18]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sext i32 [[TMP17]] to i64 
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP20]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP21]], i64 -1 +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP18]], [[TMP19]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP11]], [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: @@ -220,36 +222,36 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALAR_TAIL_FOLDING: vector.ph: ; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP8]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg [[TMP7]] to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP8]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP9]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = or disjoint [[TMP7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg [[TMP11]] to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP12]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP13]], i32 1, [[TMP10]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP15]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = zext nneg [[TMP9]] to +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP10]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP11]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint [[TMP9]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg [[TMP13]] to +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP14]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP15]], i32 1, [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -286,16 +288,18 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] 
= zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP9]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -303,22 +307,22 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = zext nneg [[TMP6]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP7]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl nuw nsw [[VEC_IND]], 
shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP13]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg <vscale x 16 x i32> [[TMP15]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP16]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PREDICATED_TAIL_FOLDING: scalar.ph:
@@ -382,19 +386,19 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32
; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALAR_TAIL_FOLDING: vector.ph:
; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = shl i32 [[TMP15]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP6]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP8]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -403,18 +407,18 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALAR_TAIL_FOLDING: vector.body:
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP10]], i32 1, <vscale x 16 x i1> [[TMP8]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP11]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP16]]
+; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP10]])
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = or disjoint <vscale x 16 x i32> [[TMP9]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP14]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP13]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -457,16 +461,18 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.ph:
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD1]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP9]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -476,24 +482,24 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.body:
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP10]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP15]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP12]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = zext nneg <vscale x 16 x i32> [[TMP17]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP18]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP16]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP20]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PREDICATED_TAIL_FOLDING: scalar.ph: