diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8c41f896ad622..73f7b86dffa1a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9298,6 +9298,7 @@ static void addExitUsersForFirstOrderRecurrences(
 VPlanPtr
 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
+  using namespace llvm::VPlanPatternMatch;
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
 
   // ---------------------------------------------------------------------------
@@ -9321,6 +9322,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       PSE, RequiresScalarEpilogueCheck, CM.foldTailByMasking(), OrigLoop);
 
+  // Build hierarchical CFG.
+  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
+  HCFGBuilder.buildHierarchicalCFG();
+
   // Don't use getDecisionAndClampRange here, because we don't know the UF
   // so this function is better to be conservative, rather than to split
   // it up into different VPlans.
@@ -9371,12 +9376,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   // Construct recipes for the instructions in the loop
   // ---------------------------------------------------------------------------
 
-  // Scan the body of the loop in a topological order to visit each basic block
-  // after having visited its predecessor basic blocks.
-  LoopBlocksDFS DFS(OrigLoop);
-  DFS.perform(LI);
-
-  VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
+  VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
+  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
   VPBasicBlock *VPBB = HeaderVPBB;
   BasicBlock *HeaderBB = OrigLoop->getHeader();
   bool NeedsMasks =
@@ -9389,26 +9390,70 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   RecipeBuilder.collectScaledReductions(Range);
 
   auto *MiddleVPBB = Plan->getMiddleBlock();
+
+  // Scan the body of the loop in a topological order to visit each basic block
+  // after having visited its predecessor basic blocks.
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      HeaderVPBB);
+
   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
-  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
-    // Relevant instructions from basic block BB will be grouped into VPRecipe
-    // ingredients and fill a new VPBasicBlock.
-    if (VPBB != HeaderVPBB)
-      VPBB->setName(BB->getName());
-    Builder.setInsertPoint(VPBB);
+  VPBlockBase *PrevVPBB = nullptr;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    // Handle VPBBs down to the latch.
+    if (VPBB == LoopRegion->getExiting()) {
+      assert(!HCFGBuilder.getIRBBForVPB(VPBB) &&
+             "the latch block shouldn't have a corresponding IRBB");
+      VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+      break;
+    }
 
-    if (VPBB == HeaderVPBB)
+    // Create mask based on the IR BB corresponding to VPBB.
+    // TODO: Predicate directly based on VPlan.
+    Builder.setInsertPoint(VPBB, VPBB->begin());
+    if (VPBB == HeaderVPBB) {
+      Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
       RecipeBuilder.createHeaderMask();
-    else if (NeedsMasks)
-      RecipeBuilder.createBlockInMask(BB);
+    } else if (NeedsMasks) {
+      // FIXME: At the moment, masks need to be placed at the beginning of the
+      // block, as blends introduced for phi nodes need to use it. The created
+      // blends should be sunk after the mask recipes.
+      RecipeBuilder.createBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB));
+    }
+
+    // Convert input VPInstructions to widened recipes.
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *SingleDef = cast<VPSingleDefRecipe>(&R);
+      auto *UnderlyingValue = SingleDef->getUnderlyingValue();
+      // Skip recipes that do not need transforming, including canonical IV,
+      // wide canonical IV and VPInstructions without underlying values. The
+      // latter are added above for masking.
+      // FIXME: Migrate code relying on the underlying instruction from VPlan0
+      // to construct recipes below to not use the underlying instruction.
+      if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
+          (isa<VPInstruction>(&R) && !UnderlyingValue))
+        continue;
 
-    // Introduce each ingredient into VPlan.
-    // TODO: Model and preserve debug intrinsics in VPlan.
-    for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
-      Instruction *Instr = &I;
+      // FIXME: VPlan0, which models a copy of the original scalar loop, should
+      // not use VPWidenPHIRecipe to model the phis.
+      assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
+             UnderlyingValue && "unsupported recipe");
+
+      if (isa<VPInstruction>(&R) &&
+          (cast<VPInstruction>(&R)->getOpcode() ==
+               VPInstruction::BranchOnCond ||
+           (cast<VPInstruction>(&R)->getOpcode() == Instruction::Switch))) {
+        R.eraseFromParent();
+        break;
+      }
+
+      // TODO: Gradually replace uses of underlying instruction by analyses on
+      // VPlan.
+      Instruction *Instr = cast<Instruction>(UnderlyingValue);
+      Builder.setInsertPoint(SingleDef);
       SmallVector<VPValue *, 4> Operands;
       auto *Phi = dyn_cast<PHINode>(Instr);
       if (Phi && Phi->getParent() == HeaderBB) {
+        // The backedge value will be added in fixHeaderPhis later.
         Operands.push_back(Plan->getOrAddLiveIn(
             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
       } else {
@@ -9420,15 +9465,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       // in the exit block, a uniform store recipe will be created for the final
      // invariant store of the reduction.
       StoreInst *SI;
-      if ((SI = dyn_cast<StoreInst>(&I)) &&
+      if ((SI = dyn_cast<StoreInst>(Instr)) &&
           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
         // Only create recipe for the final invariant store of the reduction.
-        if (!Legal->isInvariantStoreOfReduction(SI))
-          continue;
-        auto *Recipe = new VPReplicateRecipe(
-            SI, make_range(Operands.begin(), Operands.end()),
-            true /* IsUniform */);
-        Recipe->insertBefore(*MiddleVPBB, MBIP);
+        if (Legal->isInvariantStoreOfReduction(SI)) {
+          auto *Recipe = new VPReplicateRecipe(
+              SI, make_range(Operands.begin(), Operands.end()),
+              true /* IsUniform */);
+          Recipe->insertBefore(*MiddleVPBB, MBIP);
+        }
+        R.eraseFromParent();
         continue;
       }
 
@@ -9438,25 +9484,29 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
 
       RecipeBuilder.setRecipe(Instr, Recipe);
-      if (isa<VPHeaderPHIRecipe>(Recipe)) {
-        // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
-        // the following cases, VPHeaderPHIRecipes may be created after non-phi
-        // recipes and need to be moved to the phi section of HeaderVPBB:
-        // * tail-folding (non-phi recipes computing the header mask are
-        // introduced earlier than regular header phi recipes, and should appear
-        // after them)
-        // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
-
-        assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
-                CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
-               "unexpected recipe needs moving");
+      if (isa<VPHeaderPHIRecipe>(Recipe) && isa<TruncInst>(Instr)) {
+        // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
+        // moved to the phi section in the header.
         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
-      } else
-        VPBB->appendRecipe(Recipe);
-    }
-
-    VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
-    VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
+      } else {
+        Builder.insert(Recipe);
+      }
+      if (Recipe->getNumDefinedValues() == 1)
+        SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
+      else
+        assert(Recipe->getNumDefinedValues() == 0 &&
+               "Unexpected multidef recipe");
+      R.eraseFromParent();
+    }
+
+    // Flatten the CFG in the loop. Masks for blocks have already been generated
+    // and added to recipes as needed. To do so, first disconnect VPBB from its
+    // successors. Then connect VPBB to the previously visited VPBB.
+    for (auto *Succ : to_vector(VPBB->getSuccessors()))
+      VPBlockUtils::disconnectBlocks(VPBB, Succ);
+    if (PrevVPBB)
+      VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+    PrevVPBB = VPBB;
   }
 
   // After here, VPBB should not be used.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1332e50252978..cd111365c134c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -600,16 +600,25 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
   }
 
   const VPRecipeBase *R = &VPBB->back();
+  bool IsSwitch = isa<VPInstruction>(R) &&
+                  cast<VPInstruction>(R)->getOpcode() == Instruction::Switch;
   bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) ||
                       match(R, m_BranchOnCond(m_VPValue())) ||
                       match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
   (void)IsCondBranch;
-
-  if (VPBB->getNumSuccessors() >= 2 ||
+  (void)IsSwitch;
+  if (VPBB->getNumSuccessors() == 2 ||
       (VPBB->isExiting() && !VPBB->getParent()->isReplicator())) {
-    assert(IsCondBranch && "block with multiple successors not terminated by "
-                           "conditional branch recipe");
+    assert((IsCondBranch || IsSwitch) &&
+           "block with multiple successors not terminated by "
+           "conditional branch nor switch recipe");
+
+    return true;
+  }
+  if (VPBB->getNumSuccessors() > 2) {
+    assert(IsSwitch && "block with more than 2 successors not terminated by "
+                       "a switch recipe");
 
     return true;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 5a2e5d7cfee48..6d6755c93830c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -75,7 +75,7 @@ class PlainCFGBuilder {
       : TheLoop(Lp), LI(LI), Plan(P) {}
 
   /// Build plain CFG for TheLoop and connects it to Plan's entry.
-  void buildPlainCFG();
+  void buildPlainCFG(DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB);
 };
 } // anonymous namespace
 
@@ -237,10 +237,10 @@ bool PlainCFGBuilder::isExternalDef(Value *Val) {
     // Instruction definition is in outermost loop PH.
     return false;
 
-  // Check whether Instruction definition is in the loop exit.
-  BasicBlock *Exit = TheLoop->getUniqueExitBlock();
-  assert(Exit && "Expected loop with single exit.");
-  if (InstParent == Exit) {
+  // Check whether Instruction definition is in a loop exit.
+  SmallVector<BasicBlock *> ExitBlocks;
+  TheLoop->getExitBlocks(ExitBlocks);
+  if (is_contained(ExitBlocks, InstParent)) {
     // Instruction definition is in outermost loop exit.
     return false;
   }
@@ -283,6 +283,7 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
 void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
                                                   BasicBlock *BB) {
   VPIRBuilder.setInsertPoint(VPBB);
+  // TODO: Model and preserve debug intrinsics in VPlan.
   for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) {
     Instruction *Inst = &InstRef;
@@ -308,6 +309,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
       continue;
     }
 
+    if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
+      SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
+      for (auto Case : SI->cases())
+        Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
+      VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst);
+      continue;
+    }
+
     VPValue *NewVPV;
     if (auto *Phi = dyn_cast<PHINode>(Inst)) {
       // Phi node's operands may have not been visited at this point. We create
@@ -334,7 +343,8 @@
 }
 
 // Main interface to build the plain CFG.
-void PlainCFGBuilder::buildPlainCFG() {
+void PlainCFGBuilder::buildPlainCFG(
+    DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB) {
   // 0. Reuse the top-level region, vector-preheader and exit VPBBs from the
   // skeleton. These were created directly rather than via getOrCreateVPBB(),
   // revisit them now to update BB2VPBB. Note that header/entry and
@@ -423,6 +433,14 @@ void PlainCFGBuilder::buildPlainCFG() {
     // Set VPBB successors. We create empty VPBBs for successors if they don't
     // exist already. Recipes will be created when the successor is visited
     // during the RPO traversal.
+    if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+      SmallVector<VPBlockBase *> Succs = {
+          getOrCreateVPBB(SI->getDefaultDest())};
+      for (auto Case : SI->cases())
+        Succs.push_back(getOrCreateVPBB(Case.getCaseSuccessor()));
+      VPBB->setSuccessors(Succs);
+      continue;
+    }
     auto *BI = cast<BranchInst>(BB->getTerminator());
     unsigned NumSuccs = succ_size(BB);
     if (NumSuccs == 1) {
@@ -476,11 +494,14 @@ void PlainCFGBuilder::buildPlainCFG() {
   // have a VPlan couterpart. Fix VPlan phi nodes by adding their corresponding
   // VPlan operands.
   fixPhiNodes();
+
+  for (const auto &[IRBB, VPB] : BB2VPBB)
+    VPB2IRBB[VPB] = IRBB;
 }
 
 void VPlanHCFGBuilder::buildPlainCFG() {
   PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
-  PCFGBuilder.buildPlainCFG();
+  PCFGBuilder.buildPlainCFG(VPB2IRBB);
 }
 
 // Public interface to build a H-CFG.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index ad6e2ad90a961..bc853bf7a1395 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -53,6 +53,10 @@ class VPlanHCFGBuilder {
   // are introduced.
   VPDominatorTree VPDomTree;
 
+  /// Map of created VP blocks to their input IR basic blocks, if they have
+  /// been created for an input IR basic block.
+  DenseMap<const VPBlockBase *, BasicBlock *> VPB2IRBB;
+
   /// Build plain CFG for TheLoop and connects it to Plan's entry.
   void buildPlainCFG();
 
@@ -62,6 +66,14 @@ class VPlanHCFGBuilder {
 
   /// Build H-CFG for TheLoop and update Plan accordingly.
   void buildHierarchicalCFG();
+
+  /// Return the input IR BasicBlock corresponding to \p VPB. Returns nullptr if
+  /// there is no such corresponding block.
+  /// FIXME: This is a temporary workaround to drive the createBlockInMask.
+  /// Remove once mask creation is done on VPlan.
+  BasicBlock *getIRBBForVPB(const VPBlockBase *VPB) const {
+    return VPB2IRBB.lookup(VPB);
+  }
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index ebb5d46cd8438..4e862bf2f7480 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -46,7 +46,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT: LV: Using user VF vscale x 4.
 ; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
 ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
@@ -295,7 +295,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT: LV: Using user VF vscale x 4.
 ; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
 ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
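
Note for readers of the diff: the structural core of the LoopVectorize.cpp change is that recipe construction now walks VPlan0's own blocks in RPO and flattens the loop region as it goes ("first disconnect VPBB from its successors. Then connect VPBB to the previously visited VPBB"), with block masks carrying the control dependence the dropped edges used to encode. The snippet below is a minimal standalone sketch of just that flattening idea; it is not the VPlan API, and `Block`, `connect`, `disconnect`, and `flatten` are made-up illustrative helpers.

```cpp
// Toy model of the RPO flattening step (assumed names, not LLVM/VPlan code).
#include <algorithm>
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

struct Block {
  std::string Name;
  std::vector<Block *> Succs, Preds;
};

static void connect(Block *From, Block *To) {
  From->Succs.push_back(To);
  To->Preds.push_back(From);
}

static void disconnect(Block *From, Block *To) {
  From->Succs.erase(std::find(From->Succs.begin(), From->Succs.end(), To));
  To->Preds.erase(std::find(To->Preds.begin(), To->Preds.end(), From));
}

// Visit blocks in reverse post-order, drop every edge inside the region and
// chain each block to the previously visited one; the latch stays last,
// mirroring the LoopRegion->getExiting() special case in the patch.
static void flatten(const std::vector<Block *> &RPO, Block *Latch) {
  Block *Prev = nullptr;
  for (Block *B : RPO) {
    if (B == Latch) {
      connect(Prev, B);
      break;
    }
    // Copy the successor list, since disconnect() mutates it while iterating.
    for (Block *Succ : std::vector<Block *>(B->Succs))
      disconnect(B, Succ);
    if (Prev)
      connect(Prev, B);
    Prev = B;
  }
}

int main() {
  // Diamond-shaped body: header -> {then, else} -> latch, as after predication.
  Block H{"header"}, T{"then"}, E{"else"}, L{"latch"};
  connect(&H, &T); connect(&H, &E); connect(&T, &L); connect(&E, &L);

  flatten({&H, &T, &E, &L}, &L);

  // Expect a single chain: header -> then -> else -> latch.
  for (Block *B = &H; B; B = B->Succs.empty() ? nullptr : B->Succs[0])
    std::cout << B->Name << (B->Succs.empty() ? "\n" : " -> ");
  assert(H.Succs.size() == 1 && T.Succs.size() == 1 && E.Succs.size() == 1);
}
```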