diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 54ccaefdad246..5c8d01a5d9396 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7659,14 +7659,17 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( RdxDesc.getRecurrenceKind())) { using namespace llvm::PatternMatch; - Value *Cmp, *OrigResumeV; + Value *Cmp, *OrigResumeV, *CmpOp; bool IsExpectedPattern = match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)), m_Specific(RdxDesc.getSentinelValue()), m_Value(OrigResumeV))) && - match(Cmp, - m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), - m_Specific(RdxDesc.getRecurrenceStartValue()))); + (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), + m_Value(CmpOp))) && + (match(CmpOp, + m_Freeze(m_Specific(RdxDesc.getRecurrenceStartValue()))) || + (CmpOp == RdxDesc.getRecurrenceStartValue() && + isGuaranteedNotToBeUndefOrPoison(CmpOp)))); assert(IsExpectedPattern && "Unexpected reduction resume pattern"); (void)IsExpectedPattern; MainResumeValue = OrigResumeV; @@ -10365,6 +10368,36 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { VPlanTransforms::runPass(VPlanTransforms::removeDeadRecipes, MainPlan); using namespace VPlanPatternMatch; + // When vectorizing the epilogue, FindLastIV reductions can introduce multiple + // uses of undef/poison. If the reduction start value may be undef or poison + // it needs to be frozen and the frozen start has to be used when computing + // the reduction result. We also need to use the frozen value in the resume + // phi generated by the main vector loop, as this is also used to compute the + // reduction result after the epilogue vector loop. 
+  auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
+                                             bool UpdateResumePhis) {
+    VPBuilder Builder(Plan.getEntry());
+    for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
+      auto *VPI = dyn_cast<VPInstruction>(&R);
+      if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindLastIVResult)
+        continue;
+      VPValue *OrigStart = VPI->getOperand(1); // Reduction start value.
+      if (isGuaranteedNotToBeUndefOrPoison(OrigStart->getLiveInIRValue()))
+        continue; // Not undef/poison, so no freeze is needed.
+      VPInstruction *Freeze =
+          Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
+      VPI->setOperand(1, Freeze);
+      if (UpdateResumePhis)
+        OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
+          return Freeze != &U && isa<VPInstruction>(&U) &&
+                 cast<VPInstruction>(&U)->getOpcode() ==
+                     VPInstruction::ResumePhi;
+        });
+    }
+  };
+  AddFreezeForFindLastIVReductions(MainPlan, true);
+  AddFreezeForFindLastIVReductions(EpiPlan, false);
+
   VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
   VPValue *VectorTC = &MainPlan.getVectorTripCount();
   // If there is a suitable resume value for the canonical induction in the
@@ -10392,24 +10425,7 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
   Header->setName("vec.epilog.vector.body");
 
-  // Re-use the trip count and steps expanded for the main loop, as
-  // skeleton creation needs it as a value that dominates both the scalar
-  // and vector epilogue loops
-  // TODO: This is a workaround needed for epilogue vectorization and it
-  // should be removed once induction resume value creation is done
-  // directly in VPlan.
- for (auto &R : make_early_inc_range(*Plan.getEntry())) { - auto *ExpandR = dyn_cast(&R); - if (!ExpandR) - continue; - auto *ExpandedVal = - Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second); - ExpandR->replaceAllUsesWith(ExpandedVal); - if (Plan.getTripCount() == ExpandR) - Plan.resetTripCount(ExpandedVal); - ExpandR->eraseFromParent(); - } - + DenseMap ToFrozen; // Ensure that the start values for all header phi recipes are updated before // vectorizing the epilogue loop. for (VPRecipeBase &R : Header->phis()) { @@ -10475,6 +10491,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, ResumeV = Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { + ToFrozen[RdxDesc.getRecurrenceStartValue()] = + cast(ResumeV)->getIncomingValueForBlock( + EPI.MainLoopIterationCountCheck); + // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment // to the resume value. The resume value is adjusted to the sentinel // value when the final value from the main vector loop equals the start @@ -10483,8 +10503,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, // variable. BasicBlock *ResumeBB = cast(ResumeV)->getParent(); IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt()); - Value *Cmp = - Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue()); + Value *Cmp = Builder.CreateICmpEQ( + ResumeV, ToFrozen[RdxDesc.getRecurrenceStartValue()]); ResumeV = Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV); } @@ -10500,6 +10520,35 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV); cast(&R)->setStartValue(StartVal); } + + // For some VPValues in the epilogue plan we must re-use the generated IR + // values from the main plan. Replace them with live-in VPValues. 
+ // TODO: This is a workaround needed for epilogue vectorization and it + // should be removed once induction resume value creation is done + // directly in VPlan. + for (auto &R : make_early_inc_range(*Plan.getEntry())) { + // Re-use frozen values from the main plan for Freeze VPInstructions in the + // epilogue plan. This ensures all users use the same frozen value. + auto *VPI = dyn_cast(&R); + if (VPI && VPI->getOpcode() == Instruction::Freeze) { + VPI->replaceAllUsesWith(Plan.getOrAddLiveIn( + ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue()))); + continue; + } + + // Re-use the trip count and steps expanded for the main loop, as + // skeleton creation needs it as a value that dominates both the scalar + // and vector epilogue loops + auto *ExpandR = dyn_cast(&R); + if (!ExpandR) + continue; + auto *ExpandedVal = + Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second); + ExpandR->replaceAllUsesWith(ExpandedVal); + if (Plan.getTripCount() == ExpandR) + Plan.resetTripCount(ExpandedVal); + ExpandR->eraseFromParent(); + } } // Generate bypass values from the additional bypass block. 
Note that when the diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index b16a8fc563f4c..a117d82e64ef7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -423,6 +423,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { if (isSingleScalar() || isVectorToScalar()) return true; switch (Opcode) { + case Instruction::Freeze: case Instruction::ICmp: case Instruction::PHI: case Instruction::Select: @@ -474,6 +475,10 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); return Builder.CreateExtractElement(Vec, Idx, Name); } + case Instruction::Freeze: { + Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); + return Builder.CreateFreeze(Op, Name); + } case Instruction::ICmp: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); @@ -909,6 +914,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { return false; switch (getOpcode()) { case Instruction::ExtractElement: + case Instruction::Freeze: case Instruction::ICmp: case Instruction::Select: case VPInstruction::AnyOf: @@ -941,6 +947,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case Instruction::ICmp: case Instruction::Select: case Instruction::Or: + case Instruction::Freeze: // TODO: Cover additional opcodes. 
return vputils::onlyFirstLaneUsed(this); case VPInstruction::ActiveLaneMask: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll index 7296cc0840dc0..c0806ea16a5fc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll @@ -9,6 +9,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 8 +; CHECK-NEXT: [[FR:%.*]] = freeze i8 [[START]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] ; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 32 @@ -42,7 +43,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[RDX_MINMAX]]) ; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP13]], -128 -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[START]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[FR]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: @@ -53,8 +54,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 
0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]] ; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i8 -128, i8 [[BC_MERGE_RDX]] ; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i32 [[TMP2]], 8 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF4]] @@ -82,7 +83,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP22:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[TMP20]]) ; CHECK-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp ne i8 [[TMP22]], -128 -; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[START]] +; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[FR]] ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: @@ -128,6 +129,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[N_POS:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-NEXT: call void @llvm.assume(i1 [[N_POS]]) ; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[START]] ; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[N_EXT]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] @@ -166,7 +168,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[RDX_MINMAX6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> 
[[RDX_MINMAX5]], <4 x i32> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX6]]) ; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP8]], -2147483648 -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[START]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[FR]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: @@ -175,8 +177,8 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[START]] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[FR]] ; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -2147483648, i32 [[BC_MERGE_RDX]] ; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF7]] @@ -203,7 +205,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP14]]) ; CHECK-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp ne i32 [[TMP16]], -2147483648 -; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[START]] +; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 
[[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[FR]] ; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]] ; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index ee154ea5a169a..800b6f3f28b7d 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -217,6 +217,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4 +; CHECK-NEXT: [[FR:%.*]] = freeze i8 [[START]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] ; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 4 @@ -243,7 +244,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP10:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP8]]) ; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i8 [[TMP10]], -128 -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[START]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[FR]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: @@ -254,8 +255,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], 
%[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]] ; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i8 -128, i8 [[BC_MERGE_RDX]] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP2]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF2]] @@ -283,7 +284,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP19:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP17]]) ; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP19]], -128 -; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[START]] +; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[FR]] ; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N14]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: