diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 88086f24dfdce..778d928252e05 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7562,67 +7562,62 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) { } } -// Check if \p RedResult is a ComputeReductionResult instruction, and if it is -// create a merge phi node for it. -static void createAndCollectMergePhiForReduction( - VPInstruction *RedResult, - VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, - bool VectorizingEpilogue) { - if (!RedResult || - RedResult->getOpcode() != VPInstruction::ComputeReductionResult) +// If \p R is a ComputeReductionResult when vectorizing the epilog loop, +// fix the reduction's scalar PHI node by adding the incoming value from the +// main vector loop. +static void fixReductionScalarResumeWhenVectorizingEpilog( + VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock) { + auto *EpiRedResult = dyn_cast(R); + if (!EpiRedResult || + EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult) return; - auto *PhiR = cast(RedResult->getOperand(0)); - const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - - Value *FinalValue = State.get(RedResult, VPLane(VPLane::getFirstLane())); - auto *ResumePhi = - dyn_cast(PhiR->getStartValue()->getUnderlyingValue()); - if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind( - RdxDesc.getRecurrenceKind())) { - auto *Cmp = cast(PhiR->getStartValue()->getUnderlyingValue()); - assert(Cmp->getPredicate() == CmpInst::ICMP_NE); - assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue()); - ResumePhi = cast(Cmp->getOperand(0)); - } - assert((!VectorizingEpilogue || ResumePhi) && - "when vectorizing the epilogue loop, we need a resume phi from main " - "vector loop"); - - // TODO: bc.merge.rdx should not be created here, instead it should be - // modeled in VPlan. - BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); - // Create a phi node that merges control-flow from the backedge-taken check - // block and the middle block. - auto *BCBlockPhi = - PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", - LoopScalarPreHeader->getTerminator()->getIterator()); - - // If we are fixing reductions in the epilogue loop then we should already - // have created a bc.merge.rdx Phi after the main vector body. Ensure that - // we carry over the incoming values correctly. + auto *EpiRedHeaderPhi = + cast(EpiRedResult->getOperand(0)); + const RecurrenceDescriptor &RdxDesc = + EpiRedHeaderPhi->getRecurrenceDescriptor(); + Value *MainResumeValue = + EpiRedHeaderPhi->getStartValue()->getUnderlyingValue(); + if (RecurrenceDescriptor::isAnyOfRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + auto *Cmp = cast(MainResumeValue); + assert(Cmp->getPredicate() == CmpInst::ICMP_NE && + "AnyOf expected to start with ICMP_NE"); + assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() && + "AnyOf expected to start by comparing main resume value to original " + "start value"); + MainResumeValue = Cmp->getOperand(0); + } + PHINode *MainResumePhi = cast(MainResumeValue); + + // When fixing reductions in the epilogue loop we should already have + // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry + // over the incoming values correctly. + using namespace VPlanPatternMatch; + auto IsResumePhi = [](VPUser *U) { + return match( + U, m_VPInstruction(m_VPValue(), m_VPValue())); + }; + assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 && + "ResumePhi must have a single user"); + auto *EpiResumePhiVPI = + cast(*find_if(EpiRedResult->users(), IsResumePhi)); + auto *EpiResumePhi = cast(State.get(EpiResumePhiVPI, true)); + BasicBlock *LoopScalarPreHeader = EpiResumePhi->getParent(); + bool Updated = false; for (auto *Incoming : predecessors(LoopScalarPreHeader)) { - if (Incoming == LoopMiddleBlock) - BCBlockPhi->addIncoming(FinalValue, Incoming); - else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) - BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), - Incoming); - else - BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming); + if (is_contained(MainResumePhi->blocks(), Incoming)) { + assert(EpiResumePhi->getIncomingValueForBlock(Incoming) == + RdxDesc.getRecurrenceStartValue() && + "Trying to reset unexpected value"); + assert(!Updated && "Should update at most 1 incoming value"); + EpiResumePhi->setIncomingValueForBlock( + Incoming, MainResumePhi->getIncomingValueForBlock(Incoming)); + Updated = true; + } } - - auto *OrigPhi = cast(PhiR->getUnderlyingValue()); - // TODO: This fixup should instead be modeled in VPlan. - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = - OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); - assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); - // Pick the other block. - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); + assert(Updated && "Must update EpiResumePhi."); + (void)Updated; } DenseMap LoopVectorizationPlanner::executePlan( @@ -7713,11 +7708,11 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.5 Collect reduction resume values. auto *ExitVPBB = cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); - for (VPRecipeBase &R : *ExitVPBB) { - createAndCollectMergePhiForReduction( - dyn_cast(&R), State, OrigLoop, - State.CFG.VPBB2IRBB[ExitVPBB], VectorizingEpilogue); - } + if (VectorizingEpilogue) + for (VPRecipeBase &R : *ExitVPBB) { + fixReductionScalarResumeWhenVectorizingEpilog( + &R, State, State.CFG.VPBB2IRBB[ExitVPBB]); + } // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll @@ -9518,6 +9513,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( }); FinalReductionResult->insertBefore(*MiddleVPBB, IP); + // Order is strict: if there are multiple successors, the first is the exit + // block, second is the scalar preheader. + VPBasicBlock *ScalarPHVPBB = + cast(MiddleVPBB->getSuccessors().back()); + VPBuilder ScalarPHBuilder(ScalarPHVPBB); + auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( + VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()}, + {}, "bc.merge.rdx"); + auto *RedPhi = cast(PhiR->getUnderlyingInstr()); + Plan->addLiveOut(RedPhi, ResumePhiRecipe); + // Adjust AnyOf reductions; replace the reduction phi for the selected value // with a boolean reduction phi node to check if the condition is true in // any iteration. The final value is selected by the final diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 1326751a847d7..59db6c197ef8c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -65,7 +65,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: scalar.ph: +; IF-EVL-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; IF-EVL-INLOOP-NEXT: } ; @@ -104,7 +107,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: scalar.ph: +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; NO-VP-OUTLOOP-NEXT: No successors +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; NO-VP-OUTLOOP-NEXT: } ; @@ -143,7 +149,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: scalar.ph: +; NO-VP-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; NO-VP-INLOOP-NEXT: No successors +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; NO-VP-INLOOP-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 8e56614a2e3d5..b05980bef1b38 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -232,9 +232,11 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> +; CHECK-NEXT: EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234> ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> +; CHECK-NEXT: Live-out i32 %and.red = vp<[[RESUME_RED]]> ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 0dde507d08be7..2247295295663 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -165,7 +165,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]> ; CHECK-NEXT: } ; entry: @@ -221,7 +224,10 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]> ; CHECK-NEXT: } ; entry: @@ -447,7 +453,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %sum.07 = vp<[[RED_RESUME]]> ; CHECK-NEXT:} entry: